10 Commits
v0.1.0 ... main

Author SHA1 Message Date
Bijit Mondal
bf4ba8ea77 1.0.1 2026-02-23 16:16:03 +05:30
Bijit Mondal
5e7eb469ae feat: Introduce new core components for conversation and speech management
- Added ConversationManager for managing conversation history with configurable limits.
- Implemented InputQueue for serial processing of input items.
- Created SpeechManager for handling text-to-speech generation and streaming.
- Developed StreamProcessor for processing LLM streams and forwarding events.
- Added TranscriptionManager for audio transcription using AI SDK.
- Introduced WebSocketManager for managing WebSocket connections and messaging.
- Updated VoiceAgent to support new architecture and improved socket handling.
- Refactored index files to export new core components.
2026-02-23 16:15:49 +05:30
Bijit Mondal
4dd30b89c0 Refactor code structure for improved readability and maintainability 2026-02-20 16:19:08 +05:30
Bijit Mondal
97a3078578 1.0.0 2026-02-20 16:17:14 +05:30
Bijit Mondal
990d17abe7 refactor(VideoAgent): remove unnecessary console logs for cleaner output 2026-02-20 16:16:18 +05:30
Bijit Mondal
c5542fc156 feat(example): video streaming 2026-02-19 18:42:06 +05:30
Bijit Mondal
bbe354b70b 0.2.1-beta.0 2026-02-19 16:06:17 +05:30
Bijit Mondal
6ab04788e1 0.2.0 2026-02-19 16:03:56 +05:30
Bijit Mondal
ac505c4ed9 Refactor VoiceAgent: Extract types and default configurations into separate types.ts file; remove unused StreamBuffer file 2026-02-19 16:01:25 +05:30
Bijit Mondal
ce10d521f3 feat: add dist directory with compiled files and type definitions
- Created dist/index.js and dist/index.d.ts for main entry points.
- Added source maps for index.js and index.d.ts.
- Introduced dist/utils/StreamBuffer.js and StreamBuffer.d.ts with source maps.
- Updated package.json to point main and types to dist files.
- Included additional files in package.json for distribution.
- Added peerDependencies and updated devDependencies.
2026-02-14 14:39:23 +05:30
89 changed files with 11159 additions and 61 deletions

4
.gitignore vendored
View File

@@ -4,4 +4,6 @@ node_modules
.marscode
dist
# dist
HOW_*.md

View File

@@ -1,5 +1,7 @@
# voice-agent-ai-sdk
[![npm version](https://badge.fury.io/js/voice-agent-ai-sdk.svg)](https://www.npmjs.com/package/voice-agent-ai-sdk)
Streaming voice/text agent SDK built on [AI SDK](https://sdk.vercel.ai/) with optional WebSocket transport.
## Features

343
dist/VideoAgent.d.ts vendored Normal file
View File

@@ -0,0 +1,343 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
/**
 * Trigger reasons for frame capture.
 */
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
/**
 * Video frame data structure sent to/from the client.
 */
interface VideoFrame {
    /** Discriminant for the WebSocket message union. */
    type: "video_frame";
    /** Session this frame belongs to (see VideoAgent.getSessionId). */
    sessionId: string;
    /** Monotonically increasing frame counter. */
    sequence: number;
    /** Capture time in milliseconds since the Unix epoch. */
    timestamp: number;
    /** Why this frame was captured. */
    triggerReason: FrameTriggerReason;
    /** Hash of the previous frame, when known. */
    previousFrameRef?: string;
    image: {
        /** Base64-encoded image payload. */
        data: string;
        /** Image encoding, e.g. "webp". */
        format: string;
        width: number;
        height: number;
    };
}
/**
 * Audio data structure.
 */
interface AudioData {
    /** Discriminant for the WebSocket message union. */
    type: "audio";
    sessionId: string;
    /** Base64-encoded audio payload. */
    data: string;
    format: string;
    sampleRate?: number;
    /** Duration — presumably seconds; TODO confirm units against the client. */
    duration?: number;
    timestamp: number;
}
/**
 * Backend configuration for video processing.
 */
interface VideoAgentConfig {
    /** Maximum frames to keep in context buffer for conversation history */
    maxContextFrames: number;
}
/**
 * Frame context for maintaining visual conversation history.
 */
interface FrameContext {
    sequence: number;
    timestamp: number;
    triggerReason: FrameTriggerReason;
    /** Cheap, non-cryptographic hash of the frame image data. */
    frameHash: string;
    description?: string;
}
export interface VideoAgentOptions {
    /**
     * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
     * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
     */
    model: LanguageModel;
    /** Optional model used for speech-to-text on incoming audio. */
    transcriptionModel?: TranscriptionModel;
    /** Optional model used for text-to-speech on responses. */
    speechModel?: SpeechModel;
    /** System prompt; a built-in multimodal assistant prompt is used when omitted. */
    instructions?: string;
    /** Stop condition forwarded to streamText. */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Initial tool set; more can be merged later via registerTools(). */
    tools?: Record<string, Tool>;
    /** Default WebSocket URL used by connect() when no URL is passed. */
    endpoint?: string;
    voice?: string;
    speechInstructions?: string;
    outputFormat?: string;
    /** Configuration for streaming speech generation */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Configuration for conversation history memory limits */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes (default: 10 MB) */
    maxAudioInputSize?: number;
    /** Maximum frame input size in bytes (default: 5 MB) */
    maxFrameInputSize?: number;
    /** Maximum frames to keep in context buffer (default: 10) */
    maxContextFrames?: number;
    /** Session ID for this video agent instance */
    sessionId?: string;
}
export declare class VideoAgent extends EventEmitter {
    // ── Transport & model wiring ──
    private socket?;
    private tools;
    private model;
    private transcriptionModel?;
    private speechModel?;
    private instructions;
    private stopWhen;
    private endpoint?;
    private isConnected;
    // ── Conversation state ──
    private conversationHistory;
    // ── Speech output settings ──
    private voice;
    private speechInstructions?;
    private outputFormat;
    // ── Lifecycle flags ──
    private isProcessing;
    private isDestroyed;
    // ── Video / frame state ──
    private sessionId;
    private frameSequence;
    private lastFrameTimestamp;
    private lastFrameHash?;
    private frameContextBuffer;
    private currentFrameData?;
    private videoConfig;
    // ── Input queueing & stream control ──
    private inputQueue;
    private processingQueue;
    private currentStreamAbortController?;
    // ── Limits & configuration ──
    private historyConfig;
    private maxAudioInputSize;
    private maxFrameInputSize;
    // ── Streaming speech pipeline state ──
    private streamingSpeechConfig;
    private currentSpeechAbortController?;
    private speechChunkQueue;
    private nextChunkId;
    private isSpeaking;
    private pendingTextBuffer;
    private speechQueueDonePromise?;
    private speechQueueDoneResolve?;
    constructor(options: VideoAgentOptions);
    /**
     * Generate a unique session ID
     */
    private generateSessionId;
    /**
     * Simple hash function for frame comparison
     */
    private hashFrame;
    /**
     * Ensure the agent has not been destroyed. Throws if it has.
     */
    private ensureNotDestroyed;
    /**
     * Get current video agent configuration
     */
    getConfig(): VideoAgentConfig;
    /**
     * Update video agent configuration
     */
    updateConfig(config: Partial<VideoAgentConfig>): void;
    private setupListeners;
    /**
     * Handle client ready signal
     */
    private handleClientReady;
    /**
     * Handle incoming video frame
     */
    private handleVideoFrame;
    /**
     * Add frame to context buffer
     */
    private addFrameToContext;
    /**
     * Request client to capture and send a frame
     */
    requestFrameCapture(reason: FrameTriggerReason): void;
    /**
     * Clean up all in-flight state when the connection drops.
     */
    private cleanupOnDisconnect;
    registerTools(tools: Record<string, Tool>): void;
    /**
     * Transcribe audio data to text using the configured transcription model
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /**
     * Generate speech from text using the configured speech model
     */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /**
     * Interrupt ongoing speech generation and playback
     */
    interruptSpeech(reason?: string): void;
    /**
     * Interrupt both the current LLM stream and ongoing speech
     */
    interruptCurrentResponse(reason?: string): void;
    /**
     * Extract complete sentences from text buffer
     */
    private extractSentences;
    /**
     * Trim conversation history to stay within configured limits
     */
    private trimHistory;
    /**
     * Queue a text chunk for speech generation
     */
    private queueSpeechChunk;
    /**
     * Generate audio for a single chunk
     */
    private generateChunkAudio;
    /**
     * Process the speech queue and send audio chunks in order
     */
    private processSpeechQueue;
    /**
     * Process text delta for streaming speech
     */
    private processTextForStreamingSpeech;
    /**
     * Flush any remaining text in the buffer to speech
     */
    private flushStreamingSpeech;
    /**
     * Process incoming audio data: transcribe and generate response
     */
    private processAudioInput;
    connect(url?: string): Promise<void>;
    /**
     * Attach an existing WebSocket (server-side usage)
     */
    handleSocket(socket: WebSocket): void;
    /**
     * Send text input for processing (bypasses transcription)
     */
    sendText(text: string): Promise<string>;
    /**
     * Send audio data to be transcribed and processed
     */
    sendAudio(audioData: string): Promise<void>;
    /**
     * Send raw audio buffer to be transcribed and processed
     */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Send a video frame with optional text query for vision analysis
     */
    sendFrame(frameData: string, query?: string, options?: {
        width?: number;
        height?: number;
        format?: string;
    }): Promise<string>;
    /**
     * Enqueue a text input for serial processing
     */
    private enqueueTextInput;
    /**
     * Enqueue a multimodal input (text + frame) for serial processing
     */
    private enqueueMultimodalInput;
    /**
     * Drain the input queue, processing one request at a time
     */
    private drainInputQueue;
    /**
     * Build the message content array for multimodal input
     */
    private buildMultimodalContent;
    /**
     * Process multimodal input (text + video frame)
     */
    private processMultimodalInput;
    /**
     * Process user input with streaming text generation
     */
    private processUserInput;
    /**
     * Handle individual stream chunks
     */
    private handleStreamChunk;
    /**
     * Process the full stream result and return the response text
     */
    private processStreamResult;
    /**
     * Send a message via WebSocket if connected
     */
    private sendWebSocketMessage;
    /**
     * Start listening for voice/video input
     */
    startListening(): void;
    /**
     * Stop listening for voice/video input
     */
    stopListening(): void;
    /**
     * Clear conversation history
     */
    clearHistory(): void;
    /**
     * Get current conversation history
     */
    getHistory(): ModelMessage[];
    /**
     * Set conversation history
     */
    setHistory(history: ModelMessage[]): void;
    /**
     * Get frame context buffer
     */
    getFrameContext(): FrameContext[];
    /**
     * Get session ID
     */
    getSessionId(): string;
    /**
     * Internal helper to close and clean up the current socket
     */
    private disconnectSocket;
    /**
     * Disconnect from WebSocket and stop all in-flight work
     */
    disconnect(): void;
    /**
     * Permanently destroy the agent, releasing all resources
     */
    destroy(): void;
    /**
     * Check if agent is connected to WebSocket
     */
    get connected(): boolean;
    /**
     * Check if agent is currently processing a request
     */
    get processing(): boolean;
    /**
     * Check if agent is currently speaking
     */
    get speaking(): boolean;
    /**
     * Get the number of pending speech chunks in the queue
     */
    get pendingSpeechChunks(): number;
    /**
     * Check if agent has been permanently destroyed
     */
    get destroyed(): boolean;
    /**
     * Get current frame sequence number
     */
    get currentFrameSequence(): number;
    /**
     * Check if there is visual context available
     */
    get hasVisualContext(): boolean;
}
export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
export type { StreamingSpeechConfig, HistoryConfig } from "./types";
//# sourceMappingURL=VideoAgent.d.ts.map

1
dist/VideoAgent.d.ts.map vendored Normal file

File diff suppressed because one or more lines are too long

1317
dist/VideoAgent.js vendored Normal file

File diff suppressed because it is too large Load Diff

1
dist/VideoAgent.js.map vendored Normal file

File diff suppressed because one or more lines are too long

175
dist/VideoAgent.new.d.ts vendored Normal file
View File

@@ -0,0 +1,175 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
/**
 * Trigger reasons for frame capture.
 */
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
/**
 * Video frame data structure sent to/from the client.
 */
interface VideoFrame {
    /** Discriminant for the WebSocket message union. */
    type: "video_frame";
    /** Session this frame belongs to. */
    sessionId: string;
    /** Monotonically increasing frame counter. */
    sequence: number;
    /** Capture time in milliseconds since the Unix epoch. */
    timestamp: number;
    /** Why this frame was captured. */
    triggerReason: FrameTriggerReason;
    /** Hash of the previous frame, when known. */
    previousFrameRef?: string;
    image: {
        /** Base64-encoded image payload. */
        data: string;
        /** Image encoding, e.g. "webp". */
        format: string;
        width: number;
        height: number;
    };
}
/**
 * Audio data structure.
 */
interface AudioData {
    /** Discriminant for the WebSocket message union. */
    type: "audio";
    sessionId: string;
    /** Base64-encoded audio payload. */
    data: string;
    format: string;
    sampleRate?: number;
    /** Duration — presumably seconds; TODO confirm units against the client. */
    duration?: number;
    timestamp: number;
}
/**
 * Backend configuration for video processing.
 */
interface VideoAgentConfig {
    /** Maximum frames to keep in context buffer for conversation history */
    maxContextFrames: number;
}
/**
 * Frame context for maintaining visual conversation history.
 */
interface FrameContext {
    sequence: number;
    timestamp: number;
    triggerReason: FrameTriggerReason;
    /** Cheap, non-cryptographic hash of the frame image data. */
    frameHash: string;
    description?: string;
}
export interface VideoAgentOptions {
    /**
     * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
     * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
     */
    model: LanguageModel;
    /** Optional model used for speech-to-text on incoming audio. */
    transcriptionModel?: TranscriptionModel;
    /** Optional model used for text-to-speech on responses. */
    speechModel?: SpeechModel;
    /** System prompt; a built-in multimodal assistant prompt is used when omitted. */
    instructions?: string;
    /** Stop condition forwarded to streamText. */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Initial tool set; more can be merged later via registerTools(). */
    tools?: Record<string, Tool>;
    /** Default WebSocket URL used by connect() when no URL is passed. */
    endpoint?: string;
    voice?: string;
    speechInstructions?: string;
    outputFormat?: string;
    /** Configuration for streaming speech generation. */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Configuration for conversation history memory limits. */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes — default applied by TranscriptionManager; TODO confirm value. */
    maxAudioInputSize?: number;
    /** Maximum frame input size in bytes (default: 5 MB) */
    maxFrameInputSize?: number;
    /** Maximum frames to keep in context buffer (default: 10) */
    maxContextFrames?: number;
    /** Session ID for this video agent instance */
    sessionId?: string;
}
export declare class VideoAgent extends EventEmitter {
    // ── Model / prompt configuration ──
    private model;
    private instructions;
    private stopWhen;
    private endpoint?;
    private tools;
    // ── Lifecycle flags ──
    private isDestroyed;
    private _isProcessing;
    // Abort controller for the in-flight LLM stream, if any.
    private currentStreamAbortController?;
    // ── Core managers (see ./core) ──
    private ws;
    private speech;
    private conversation;
    private transcription;
    private inputQueue;
    // ── Video-specific state ──
    private sessionId;
    private frameSequence;
    private lastFrameTimestamp;
    private lastFrameHash?;
    private frameContextBuffer;
    private currentFrameData?;
    private videoConfig;
    private maxFrameInputSize;
    constructor(options: VideoAgentOptions);
    /** Merge additional tools into the set passed to the model. */
    registerTools(tools: Record<string, Tool>): void;
    /** Transcribe raw audio to text via the transcription manager. */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /** Generate speech audio for text via the speech manager. */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /** Stop any in-flight speech output. */
    interruptSpeech(reason?: string): void;
    /** Abort the current LLM stream (if any) and stop speech. */
    interruptCurrentResponse(reason?: string): void;
    /** Connect to a WebSocket server (url, then options.endpoint, then a local default). */
    connect(url?: string): Promise<void>;
    /** Attach an already-accepted socket (server-side usage). */
    handleSocket(socket: WebSocket): void;
    /** Queue a text turn; resolves with the generated response text. */
    sendText(text: string): Promise<string>;
    /** Transcribe base64 audio, then queue the resulting text turn. */
    sendAudio(audioData: string): Promise<void>;
    /** Same as sendAudio, but accepts a raw audio buffer. */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Send a video frame with optional text query for vision analysis
     */
    sendFrame(frameData: string, query?: string, options?: {
        width?: number;
        height?: number;
        format?: string;
    }): Promise<string>;
    /**
     * Request client to capture and send a frame
     */
    requestFrameCapture(reason: FrameTriggerReason): void;
    getConfig(): VideoAgentConfig;
    updateConfig(config: Partial<VideoAgentConfig>): void;
    startListening(): void;
    stopListening(): void;
    clearHistory(): void;
    getHistory(): ModelMessage[];
    setHistory(history: ModelMessage[]): void;
    getFrameContext(): FrameContext[];
    getSessionId(): string;
    disconnect(): void;
    destroy(): void;
    get connected(): boolean;
    get processing(): boolean;
    get speaking(): boolean;
    get pendingSpeechChunks(): number;
    get destroyed(): boolean;
    get currentFrameSequence(): number;
    get hasVisualContext(): boolean;
    private handleMessage;
    private handleClientReady;
    private handleAudioInput;
    private handleVideoFrame;
    private addFrameToContext;
    private hashFrame;
    private generateSessionId;
    private enqueueTextInput;
    private enqueueMultimodalInput;
    /**
     * Route queued items to the correct processor.
     */
    private processQueueItem;
    private buildMultimodalContent;
    /**
     * Shared streamText invocation used by both processUserInput and processMultimodalInput.
     */
    private runStream;
    /**
     * Process text-only input (with optional visual context from latest frame).
     */
    private processUserInput;
    /**
     * Process multimodal input (text + explicit video frame).
     */
    private processMultimodalInput;
    private ensureNotDestroyed;
    private cleanupOnDisconnect;
    private bubbleEvents;
}
export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
export type { StreamingSpeechConfig, HistoryConfig } from "./types";
//# sourceMappingURL=VideoAgent.new.d.ts.map

1
dist/VideoAgent.new.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"VideoAgent.new.d.ts","sourceRoot":"","sources":["../src/VideoAgent.new.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACH,UAAU,EACV,KAAK,aAAa,EAElB,KAAK,IAAI,EACT,KAAK,YAAY,EACjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACnB,MAAM,IAAI,CAAC;AACZ,OAAO,EACH,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EACrB,MAAM,SAAS,CAAC;AAcjB;;GAEG;AACH,KAAK,kBAAkB,GAAG,cAAc,GAAG,cAAc,GAAG,OAAO,GAAG,SAAS,CAAC;AAEhF;;GAEG;AACH,UAAU,UAAU;IAChB,IAAI,EAAE,aAAa,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,kBAAkB,CAAC;IAClC,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,KAAK,EAAE;QACH,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;KAClB,CAAC;CACL;AAED;;GAEG;AACH,UAAU,SAAS;IACf,IAAI,EAAE,OAAO,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,UAAU,gBAAgB;IACtB,wEAAwE;IACxE,gBAAgB,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,UAAU,YAAY;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,kBAAkB,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;CACxB;AAYD,MAAM,WAAW,iBAAiB;IAC9B;;;OAGG;IACH,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wDAAwD;IACxD,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,6DAA6D;IAC7D,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;CACtB;AAUD,qBAAa,UAAW,SAAQ,YAAY;IACxC,OAAO,CAAC,KAAK,C
AAgB;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,aAAa,CAAS;IAG9B,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,EAAE,CAAmB;IAC7B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAA6B;IAG/C,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,aAAa,CAAK;IAC1B,OAAO,CAAC,kBAAkB,CAAK;IAC/B,OAAO,CAAC,aAAa,CAAC,CAAS;IAC/B,OAAO,CAAC,kBAAkB,CAAsB;IAChD,OAAO,CAAC,gBAAgB,CAAC,CAAS;IAClC,OAAO,CAAC,WAAW,CAAmB;IACtC,OAAO,CAAC,iBAAiB,CAAS;gBAEtB,OAAO,EAAE,iBAAiB;IAmF/B,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAInC,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAIhE,sBAAsB,CAC/B,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GAC1B,OAAO,CAAC,UAAU,CAAC;IAIf,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAIrD,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAQxD,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAM1C,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAK/B,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQvC,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAK3C,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;OAEG;IACU,SAAS,CAClB,SAAS,EAAE,MAAM,EACjB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,CAAC,EAAE;QAAE,KAAK,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,GAC/D,OAAO,CAAC,MAAM,CAAC;IA4BlB;;OAEG;IACI,mBAAmB,CAAC,MAAM,EAAE,kBAAkB,GAAG,IAAI;IASrD,SAAS,IAAI,gBAAgB;IAI7B,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,gBAAgB,CAAC,GAAG,IAAI;IAK5D,cAAc;IAId,aAAa;IAIb,YAAY;IAKZ,UAAU,IAAI,YAAY,EAAE;IAI5B,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC,eAAe,IAAI,YAAY,EAAE;IAIjC,YAAY,IAAI,MAAM;IAItB,UAAU;IAIV,OAAO;IAYP,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,oBAAoB,IAAI,MAAM,CAEjC;IAED,IAAI,gBAAgB,IAAI,OAAO,CAE9B;YAMa,aAAa;IA4C3B,OAAO,CAAC,iBAAiB;YAYX,gBAAgB;YAchB,gBAAgB;IAgD9B,OAAO,CAAC,iBAAiB;IAOzB,OAAO,CAAC,SAAS;IAU
jB,OAAO,CAAC,iBAAiB;IAUzB,OAAO,CAAC,gBAAgB;IAMxB,OAAO,CAAC,sBAAsB;IAM9B;;OAEG;YACW,gBAAgB;IAa9B,OAAO,CAAC,sBAAsB;IA0B9B;;OAEG;YACW,SAAS;IAqEvB;;OAEG;YACW,gBAAgB;IAsC9B;;OAEG;YACW,sBAAsB;IAuCpC,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,mBAAmB;IAW3B,OAAO,CAAC,YAAY;CAKvB;AAGD,YAAY,EACR,UAAU,EACV,SAAS,EACT,gBAAgB,EAChB,YAAY,EACZ,kBAAkB,GACrB,CAAC;AAGF,YAAY,EAAE,qBAAqB,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC"}

571
dist/VideoAgent.new.js vendored Normal file
View File

@@ -0,0 +1,571 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.VideoAgent = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const core_1 = require("./core");
/** Default maximum frame input size (5 MB) */
const DEFAULT_MAX_FRAME_SIZE = 5 * 1024 * 1024;
/** Default video agent config */
const DEFAULT_VIDEO_AGENT_CONFIG = {
    // Frames retained in the visual-context buffer (see addFrameToContext).
    maxContextFrames: 10,
};
// ── VideoAgent class ────────────────────────────────────
class VideoAgent extends events_1.EventEmitter {
    model;
    instructions;
    stopWhen;
    endpoint;
    tools = {};
    isDestroyed = false;
    _isProcessing = false;
    // Abort controller for the current LLM stream
    currentStreamAbortController;
    // ── Managers (all constructed in the constructor from ./core) ──
    ws; // WebSocketManager: connection + messaging
    speech; // SpeechManager: TTS generation/streaming
    conversation; // ConversationManager: history with limits
    transcription; // TranscriptionManager: STT for audio input
    inputQueue; // InputQueue: serializes queued turns
    // ── Video-specific state ────────────────────────────
    sessionId;
    frameSequence = 0; // counter used by sendFrame (post-incremented)
    lastFrameTimestamp = 0;
    lastFrameHash;
    frameContextBuffer = []; // capped at videoConfig.maxContextFrames
    currentFrameData; // latest base64 frame, used as implicit visual context
    videoConfig;
    maxFrameInputSize;
    /**
     * Wire together the core managers (WebSocket, speech, conversation,
     * transcription, input queue), apply option defaults, and re-emit
     * manager events through this EventEmitter.
     */
    constructor(options) {
        super();
        this.model = options.model;
        this.instructions =
            options.instructions ||
                `You are a helpful multimodal AI assistant that can see through the user's camera and hear their voice.
When analyzing images, be concise but informative. Describe what you see when asked.
Keep responses conversational since they will be spoken aloud.
Use tools when needed to provide accurate information.`;
        // Default stop condition: at most 5 streamText steps.
        this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
        this.endpoint = options.endpoint;
        this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
        this.sessionId = options.sessionId || this.generateSessionId();
        this.videoConfig = {
            ...DEFAULT_VIDEO_AGENT_CONFIG,
            maxContextFrames: options.maxContextFrames ?? DEFAULT_VIDEO_AGENT_CONFIG.maxContextFrames,
        };
        if (options.tools) {
            this.tools = { ...options.tools };
        }
        // ── Initialize managers ─────────────────────────
        this.ws = new core_1.WebSocketManager();
        this.speech = new core_1.SpeechManager({
            speechModel: options.speechModel,
            voice: options.voice,
            speechInstructions: options.speechInstructions,
            outputFormat: options.outputFormat,
            streamingSpeech: options.streamingSpeech,
        });
        this.conversation = new core_1.ConversationManager({
            history: options.history,
        });
        this.transcription = new core_1.TranscriptionManager({
            transcriptionModel: options.transcriptionModel,
            maxAudioInputSize: options.maxAudioInputSize,
        });
        this.inputQueue = new core_1.InputQueue();
        // ── Wire managers to WebSocket send ─────────────
        // Managers emit outbound messages through a single shared sender.
        const sendMsg = (msg) => this.ws.send(msg);
        this.speech.sendMessage = sendMsg;
        this.transcription.sendMessage = sendMsg;
        // ── Wire input queue processor ──────────────────
        this.inputQueue.processor = (item) => this.processQueueItem(item);
        // ── Bubble events from managers ─────────────────
        this.bubbleEvents(this.ws, ["connected", "error"]);
        this.bubbleEvents(this.speech, [
            "speech_start",
            "speech_complete",
            "speech_interrupted",
            "speech_chunk_queued",
            "audio_chunk",
            "audio",
            "error",
        ]);
        this.bubbleEvents(this.conversation, [
            "history_cleared",
            "history_trimmed",
        ]);
        this.bubbleEvents(this.transcription, [
            "transcription",
            "audio_received",
            "error",
            "warning",
        ]);
        // ── Handle WebSocket lifecycle ──────────────────
        this.ws.on("disconnected", () => {
            this.cleanupOnDisconnect();
            this.emit("disconnected");
        });
        this.ws.on("message", (message) => this.handleMessage(message));
    }
    // ══════════════════════════════════════════════════════
    // Public API
    // ══════════════════════════════════════════════════════
    /** Merge additional tools into the set passed to streamText. */
    registerTools(tools) {
        this.tools = { ...this.tools, ...tools };
    }
    /** Transcribe raw audio to text via the TranscriptionManager. */
    async transcribeAudio(audioData) {
        return this.transcription.transcribeAudio(audioData);
    }
    /** Generate speech audio for `text` via the SpeechManager. */
    async generateSpeechFromText(text, abortSignal) {
        return this.speech.generateSpeechFromText(text, abortSignal);
    }
    /** Stop any in-flight speech output. */
    interruptSpeech(reason = "interrupted") {
        this.speech.interruptSpeech(reason);
    }
    /** Abort the current LLM stream (if any), then stop speech. */
    interruptCurrentResponse(reason = "interrupted") {
        if (this.currentStreamAbortController) {
            this.currentStreamAbortController.abort();
            this.currentStreamAbortController = undefined;
        }
        this.speech.interruptSpeech(reason);
    }
    /** Connect to a WebSocket server: explicit url > options.endpoint > local default. */
    async connect(url) {
        this.ensureNotDestroyed();
        const wsUrl = url || this.endpoint || "ws://localhost:8080";
        await this.ws.connect(wsUrl);
    }
    /** Attach an already-accepted socket (server-side usage). */
    handleSocket(socket) {
        this.ensureNotDestroyed();
        this.ws.handleSocket(socket);
    }
    /** Queue a text turn; resolves once the queue processes it (with the response text). */
    async sendText(text) {
        this.ensureNotDestroyed();
        if (!text || !text.trim()) {
            throw new Error("Text input cannot be empty");
        }
        return this.enqueueTextInput(text);
    }
    /** Transcribe base64 audio and queue the resulting text turn. */
    async sendAudio(audioData) {
        this.ensureNotDestroyed();
        await this.handleAudioInput(audioData);
    }
    /** Same as sendAudio, but accepts a raw buffer (base64-encoded here). */
    async sendAudioBuffer(audioBuffer) {
        this.ensureNotDestroyed();
        const base64Audio = Buffer.from(audioBuffer).toString("base64");
        await this.handleAudioInput(base64Audio);
    }
/**
* Send a video frame with optional text query for vision analysis
*/
async sendFrame(frameData, query, options) {
this.ensureNotDestroyed();
const frame = {
type: "video_frame",
sessionId: this.sessionId,
sequence: this.frameSequence++,
timestamp: Date.now(),
triggerReason: "user_request",
previousFrameRef: this.lastFrameHash,
image: {
data: frameData,
format: options?.format || "webp",
width: options?.width || 640,
height: options?.height || 480,
},
};
// Update local frame state
await this.handleVideoFrame(frame);
if (query) {
return this.enqueueMultimodalInput(query, frame);
}
return "";
}
    /**
     * Request client to capture and send a frame
     */
    requestFrameCapture(reason) {
        this.ws.send({
            type: "capture_frame",
            reason,
            timestamp: Date.now(),
        });
        this.emit("frame_requested", { reason });
    }
    /** Snapshot of the current video config (a copy; safe for callers to mutate). */
    getConfig() {
        return { ...this.videoConfig };
    }
    /** Shallow-merge new config values, then announce the change. */
    updateConfig(config) {
        this.videoConfig = { ...this.videoConfig, ...config };
        this.emit("config_changed", this.videoConfig);
    }
    /** Event-only signal that input capture should start. */
    startListening() {
        this.emit("listening");
    }
    /** Event-only signal that input capture should stop. */
    stopListening() {
        this.emit("stopped");
    }
    /** Clear both conversation history and the visual-context buffer. */
    clearHistory() {
        this.conversation.clearHistory();
        this.frameContextBuffer = [];
    }
    getHistory() {
        return this.conversation.getHistory();
    }
    setHistory(history) {
        this.conversation.setHistory(history);
    }
    /** Copy of the frame-context buffer. */
    getFrameContext() {
        return [...this.frameContextBuffer];
    }
    getSessionId() {
        return this.sessionId;
    }
    /** Close the socket; per-connection state is reset by the "disconnected" handler. */
    disconnect() {
        this.ws.disconnect();
    }
    /**
     * Permanently tear down the agent: clears history, frames, tools and
     * listeners. NOTE(review): relies on cleanupOnDisconnect() to abort any
     * in-flight stream/speech — confirm it does.
     */
    destroy() {
        this.isDestroyed = true;
        this.cleanupOnDisconnect();
        this.ws.disconnect();
        this.conversation.clearHistory();
        this.frameContextBuffer = [];
        this.tools = {};
        this.removeAllListeners();
    }
    // ── Getters ─────────────────────────────────────────
    /** True while the WebSocketManager reports an open connection. */
    get connected() {
        return this.ws.isConnected;
    }
    /** True while a queued input is being processed. */
    get processing() {
        return this._isProcessing;
    }
    /** True while the SpeechManager is producing/streaming audio. */
    get speaking() {
        return this.speech.isSpeaking;
    }
    /** Number of speech chunks waiting in the speech queue. */
    get pendingSpeechChunks() {
        return this.speech.pendingChunkCount;
    }
    /** True once destroy() has been called. */
    get destroyed() {
        return this.isDestroyed;
    }
    /** Current frame sequence counter (the next value sendFrame will assign). */
    get currentFrameSequence() {
        return this.frameSequence;
    }
    /** True when at least one frame image is cached as visual context. */
    get hasVisualContext() {
        return !!this.currentFrameData;
    }
    // ══════════════════════════════════════════════════════
    // Private — message handling
    // ══════════════════════════════════════════════════════
    /**
     * Dispatch an inbound WebSocket message by its `type` discriminant.
     * Unknown types are ignored; failures are surfaced via the "error" event.
     */
    async handleMessage(message) {
        try {
            switch (message.type) {
                case "transcript":
                    if (typeof message.text !== "string" || !message.text.trim()) {
                        this.emit("warning", "Received empty or invalid transcript message");
                        return;
                    }
                    // New user speech supersedes the current stream/speech, and a
                    // fresh frame is requested for up-to-date visual context.
                    this.interruptCurrentResponse("user_speaking");
                    this.requestFrameCapture("user_request");
                    await this.enqueueTextInput(message.text);
                    break;
                case "audio":
                    if (typeof message.data !== "string" || !message.data) {
                        this.emit("warning", "Received empty or invalid audio message");
                        return;
                    }
                    this.interruptCurrentResponse("user_speaking");
                    this.requestFrameCapture("user_request");
                    try {
                        await this.handleAudioInput(message.data, message.format);
                    }
                    catch (audioError) {
                        // Keep the message loop alive even if transcription fails.
                        this.emit("error", audioError);
                    }
                    break;
                case "video_frame":
                    await this.handleVideoFrame(message);
                    break;
                case "interrupt":
                    this.interruptCurrentResponse(message.reason || "client_request");
                    break;
                case "client_ready":
                    this.handleClientReady(message);
                    break;
            }
        }
        catch (err) {
            this.emit("error", err);
        }
    }
    /** Reply to a client handshake with this session's id. */
    handleClientReady(message) {
        this.ws.send({
            type: "session_init",
            sessionId: this.sessionId,
        });
        this.emit("client_ready", message.capabilities);
    }
    // ══════════════════════════════════════════════════════
    // Private — audio
    // ══════════════════════════════════════════════════════
    /** Transcribe base64 audio; queue a text turn only when transcription yields text. */
    async handleAudioInput(base64Audio, format) {
        const text = await this.transcription.processAudioInput(base64Audio, format);
        if (text) {
            await this.enqueueTextInput(text);
        }
    }
    // ══════════════════════════════════════════════════════
    // Private — video frames
    // ══════════════════════════════════════════════════════
    /**
     * Validate, record and acknowledge an incoming frame: enforces the size
     * limit, updates the latest-frame state and context buffer, then acks.
     * Never throws — all failures are emitted as "error"/"warning".
     */
    async handleVideoFrame(frame) {
        try {
            if (!frame.image?.data) {
                this.emit("warning", "Received empty or invalid video frame");
                return;
            }
            // NOTE(review): decoding purely to measure size allocates the whole
            // buffer; the decoded size could be derived from the base64 length.
            const frameSize = Buffer.from(frame.image.data, "base64").length;
            if (frameSize > this.maxFrameInputSize) {
                const sizeMB = (frameSize / (1024 * 1024)).toFixed(1);
                const maxMB = (this.maxFrameInputSize / (1024 * 1024)).toFixed(1);
                this.emit("error", new Error(`Frame too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`));
                return;
            }
            const frameHash = this.hashFrame(frame.image.data);
            // Latest-frame state consumed later by buildMultimodalContent.
            this.lastFrameTimestamp = frame.timestamp;
            this.lastFrameHash = frameHash;
            this.currentFrameData = frame.image.data;
            this.addFrameToContext({
                sequence: frame.sequence,
                timestamp: frame.timestamp,
                triggerReason: frame.triggerReason,
                frameHash,
            });
            this.emit("frame_received", {
                sequence: frame.sequence,
                timestamp: frame.timestamp,
                triggerReason: frame.triggerReason,
                size: frameSize,
                dimensions: { width: frame.image.width, height: frame.image.height },
            });
            this.ws.send({
                type: "frame_ack",
                sequence: frame.sequence,
                timestamp: Date.now(),
            });
        }
        catch (error) {
            this.emit("error", error);
        }
    }
addFrameToContext(context) {
this.frameContextBuffer.push(context);
if (this.frameContextBuffer.length > this.videoConfig.maxContextFrames) {
this.frameContextBuffer.shift();
}
}
    /**
     * Cheap 32-bit string hash (shift/add, i.e. hash*31 + char) used for
     * frame comparison; not cryptographic.
     */
    hashFrame(data) {
        let hash = 0;
        for (let i = 0; i < data.length; i++) {
            const char = data.charCodeAt(i);
            hash = ((hash << 5) - hash) + char; // hash * 31 + char
            hash = hash & hash; // coerce to a 32-bit signed integer
        }
        // NOTE(review): the prefix uses this.frameSequence (the agent's counter),
        // not the frame's own sequence — for client-originated frames these may
        // differ; confirm this is intentional.
        return `frame_${this.frameSequence}_${Math.abs(hash).toString(16)}`;
    }
generateSessionId() {
const timestamp = Date.now().toString(36);
const randomPart = Math.random().toString(36).substring(2, 10);
return `vs_${timestamp}_${randomPart}`;
}
    // ══════════════════════════════════════════════════════
    // Private — input queue
    // ══════════════════════════════════════════════════════
    /** Wrap a text turn in a promise settled by the queue processor. */
    enqueueTextInput(text) {
        return new Promise((resolve, reject) => {
            this.inputQueue.enqueue({ text, resolve, reject });
        });
    }
    /** Wrap a text+frame turn in a promise settled by the queue processor. */
    enqueueMultimodalInput(text, frame) {
        return new Promise((resolve, reject) => {
            this.inputQueue.enqueue({ text, frame, resolve, reject });
        });
    }
/**
* Route queued items to the correct processor.
*/
async processQueueItem(item) {
if (item.frame && item.text) {
return this.processMultimodalInput(item.text, item.frame);
}
else if (item.text) {
return this.processUserInput(item.text);
}
return "";
}
// ══════════════════════════════════════════════════════
// Private — multimodal content building
// ══════════════════════════════════════════════════════
buildMultimodalContent(text, frameData) {
const content = [];
if (this.frameContextBuffer.length > 0) {
const contextSummary = `[Visual context: ${this.frameContextBuffer.length} frames captured, latest at ${new Date(this.lastFrameTimestamp).toISOString()}]`;
content.push({ type: "text", text: contextSummary });
}
const imageData = frameData || this.currentFrameData;
if (imageData) {
content.push({ type: "image", image: imageData });
}
content.push({ type: "text", text });
return content;
}
// ══════════════════════════════════════════════════════
// Private — LLM processing
// ══════════════════════════════════════════════════════
/**
 * Shared streamText invocation used by both processUserInput and
 * processMultimodalInput.
 *
 * Streams the LLM response, re-emits chunk/tool events to listeners,
 * pipes text deltas into the speech pipeline, records the assistant
 * reply in conversation history, and resolves with the full response
 * text once the speech queue has drained.
 */
async runStream(messages, abortSignal) {
    const result = (0, ai_1.streamText)({
        model: this.model,
        system: this.instructions,
        messages,
        tools: this.tools,
        stopWhen: this.stopWhen,
        abortSignal,
        // Re-emit each raw stream chunk as agent-level events.
        onChunk: ({ chunk }) => {
            (0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
        },
        // Surface every tool result from every step once the stream finishes.
        onFinish: async (event) => {
            for (const step of event.steps) {
                for (const toolResult of step.toolResults) {
                    this.emit("tool_result", {
                        name: toolResult.toolName,
                        toolCallId: toolResult.toolCallId,
                        result: toolResult.output,
                    });
                }
            }
        },
        onError: ({ error }) => {
            this.emit("error", error);
        },
    });
    // Drain the full stream: deltas feed TTS, messages go out on the
    // socket, and frame-context metadata (if any) is attached.
    const streamResult = await (0, core_1.processFullStream)(result, {
        onTextDelta: (delta) => this.speech.processTextDelta(delta),
        onTextEnd: () => this.speech.flushPendingText(),
        sendMessage: (msg) => this.ws.send(msg),
        emitEvent: (event, data) => this.emit(event, data),
    }, {
        sessionId: this.sessionId,
        frameContext: this.frameContextBuffer.length > 0
            ? {
                frameCount: this.frameContextBuffer.length,
                lastFrameSequence: this.frameContextBuffer[this.frameContextBuffer.length - 1]
                    ?.sequence,
            }
            : undefined,
    });
    // Add assistant response to history
    if (streamResult.fullText) {
        this.conversation.addMessage({
            role: "assistant",
            content: streamResult.fullText,
        });
    }
    // Flush remaining speech & wait for queue
    this.speech.flushPendingText();
    if (this.speech.queueDonePromise) {
        await this.speech.queueDonePromise;
    }
    return streamResult.fullText;
}
/**
 * Process text-only input (with optional visual context from latest frame).
 *
 * When a frame has been received earlier, the latest image is attached
 * to the outgoing model message while the stored history keeps a
 * text-only placeholder — so raw image data is never retained in history.
 */
async processUserInput(text) {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    try {
        this.emit("text", { role: "user", text });
        const hasVisual = !!this.currentFrameData;
        let messages;
        if (hasVisual) {
            const content = this.buildMultimodalContent(text);
            // History stores a lightweight text marker instead of the image.
            this.conversation.addMessage({
                role: "user",
                content: [{ type: "text", text: `[Visual context] ${text}` }],
            });
            // Replace that placeholder with the real multimodal content for
            // this one call only (slice(0, -1) drops the marker just added).
            messages = [
                ...this.conversation.getHistoryRef().slice(0, -1),
                { role: "user", content },
            ];
        }
        else {
            this.conversation.addMessage({ role: "user", content: text });
            messages = this.conversation.getHistoryRef();
        }
        return await this.runStream(messages, this.currentStreamAbortController.signal);
    }
    catch (error) {
        // Drop any queued/pending speech so the next turn starts clean.
        this.speech.reset();
        throw error;
    }
    finally {
        this._isProcessing = false;
        this.currentStreamAbortController = undefined;
    }
}
/**
 * Process multimodal input (text + explicit video frame).
 *
 * Mirrors processUserInput: history keeps a text-only "[Image attached]"
 * placeholder while the actual image bytes are sent to the model for
 * this call only, keeping raw frame data out of stored history.
 */
async processMultimodalInput(text, frame) {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    try {
        this.emit("text", { role: "user", text, hasImage: true });
        const content = this.buildMultimodalContent(text, frame.image.data);
        // History stores a lightweight marker instead of the image bytes.
        this.conversation.addMessage({
            role: "user",
            content: [{ type: "text", text: `[Image attached] ${text}` }],
        });
        // slice(0, -1) drops the marker; the real multimodal content is
        // substituted for this single model call.
        const messages = [
            ...this.conversation.getHistoryRef().slice(0, -1),
            { role: "user", content },
        ];
        return await this.runStream(messages, this.currentStreamAbortController.signal);
    }
    catch (error) {
        // Drop any queued/pending speech so the next turn starts clean.
        this.speech.reset();
        throw error;
    }
    finally {
        this._isProcessing = false;
        this.currentStreamAbortController = undefined;
    }
}
// ══════════════════════════════════════════════════════
// Private — helpers
// ══════════════════════════════════════════════════════
ensureNotDestroyed() {
if (this.isDestroyed) {
throw new Error("VideoAgent has been destroyed and cannot be used");
}
}
cleanupOnDisconnect() {
if (this.currentStreamAbortController) {
this.currentStreamAbortController.abort();
this.currentStreamAbortController = undefined;
}
this.speech.reset();
this._isProcessing = false;
this.currentFrameData = undefined;
this.inputQueue.rejectAll(new Error("Connection closed"));
}
bubbleEvents(source, events) {
for (const event of events) {
source.on(event, (...args) => this.emit(event, ...args));
}
}
}
exports.VideoAgent = VideoAgent;
//# sourceMappingURL=VideoAgent.new.js.map

1
dist/VideoAgent.new.js.map vendored Normal file

File diff suppressed because one or more lines are too long

243
dist/VoiceAgent.d.ts vendored Normal file
View File

@@ -0,0 +1,243 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
/** Constructor options for the VoiceAgent. */
export interface VoiceAgentOptions {
    /** Language model used to generate responses. */
    model: LanguageModel;
    /** Model used to transcribe incoming audio. */
    transcriptionModel?: TranscriptionModel;
    /** Model used for text-to-speech output. */
    speechModel?: SpeechModel;
    /** System prompt given to the language model. */
    instructions?: string;
    /** Stop condition forwarded to `streamText`. */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Tools the model may call, keyed by tool name. */
    tools?: Record<string, Tool>;
    /** WebSocket URL used by `connect()` when no explicit URL is given. */
    endpoint?: string;
    /** Voice identifier — presumably forwarded to the speech model; confirm against SpeechManager. */
    voice?: string;
    /** Extra instructions for speech synthesis. */
    speechInstructions?: string;
    /** Output format for generated audio. */
    outputFormat?: string;
    /** Configuration for streaming speech generation */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Configuration for conversation history memory limits */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes (default: 10 MB) */
    maxAudioInputSize?: number;
}
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export declare class VoiceAgent extends EventEmitter {
    // ── Connection & model configuration ──────────────────
    /** Active WebSocket, if one is attached. */
    private socket?;
    /** Tools the model may call, keyed by tool name. */
    private tools;
    /** Language model used to generate responses. */
    private model;
    private transcriptionModel?;
    private speechModel?;
    private instructions;
    private stopWhen;
    private endpoint?;
    private isConnected;
    // ── Conversation state ────────────────────────────────
    private conversationHistory;
    private voice;
    private speechInstructions?;
    private outputFormat;
    // ── Lifecycle flags ───────────────────────────────────
    private isProcessing;
    private isDestroyed;
    // ── Serial input queue (one request processed at a time) ──
    private inputQueue;
    private processingQueue;
    /** Abort controller for the current LLM stream. */
    private currentStreamAbortController?;
    private historyConfig;
    private maxAudioInputSize;
    // ── Streaming speech state ────────────────────────────
    private streamingSpeechConfig;
    private currentSpeechAbortController?;
    private speechChunkQueue;
    private nextChunkId;
    private isSpeaking;
    private pendingTextBuffer;
    private speechQueueDonePromise?;
    private speechQueueDoneResolve?;
    constructor(options: VoiceAgentOptions);
    /**
     * Ensure the agent has not been destroyed. Throws if it has.
     */
    private ensureNotDestroyed;
    /** Attach message/close/error handlers to the current socket. */
    private setupListeners;
    /**
     * Clean up all in-flight state when the connection drops.
     */
    private cleanupOnDisconnect;
    /** Merge additional tools into the agent's tool set. */
    registerTools(tools: Record<string, Tool>): void;
    /**
     * Transcribe audio data to text using the configured transcription model
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /**
     * Generate speech from text using the configured speech model
     * @param abortSignal Optional signal to cancel the speech generation
     */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     * This only interrupts TTS — the LLM stream is left running.
     */
    interruptSpeech(reason?: string): void;
    /**
     * Interrupt both the current LLM stream and ongoing speech.
     * Use this for barge-in scenarios where the entire response should be cancelled.
     */
    interruptCurrentResponse(reason?: string): void;
    /**
     * Extract complete sentences from text buffer
     * Returns [extractedSentences, remainingBuffer]
     */
    private extractSentences;
    /**
     * Trim conversation history to stay within configured limits.
     * Removes oldest messages (always in pairs to preserve user/assistant turns).
     */
    private trimHistory;
    /**
     * Queue a text chunk for speech generation
     */
    private queueSpeechChunk;
    /**
     * Generate audio for a single chunk
     */
    private generateChunkAudio;
    /**
     * Process the speech queue and send audio chunks in order
     */
    private processSpeechQueue;
    /**
     * Process text delta for streaming speech.
     * Call this as text chunks arrive from LLM.
     */
    private processTextForStreamingSpeech;
    /**
     * Flush any remaining text in the buffer to speech
     * Call this when stream ends
     */
    private flushStreamingSpeech;
    /**
     * Process incoming audio data: transcribe and generate response
     */
    private processAudioInput;
    /** Connect to a WebSocket server (falls back to the configured endpoint). */
    connect(url?: string): Promise<void>;
    /**
     * Attach an existing WebSocket (server-side usage).
     * Use this when a WS server accepts a connection and you want the
     * agent to handle messages on that socket.
     *
     * **Note:** Calling this while a socket is already attached will cleanly
     * tear down the previous connection first. Each `VoiceAgent` instance
     * supports only one socket at a time — create a new agent per user.
     */
    handleSocket(socket: WebSocket): void;
    /**
     * Send text input for processing (bypasses transcription).
     * Requests are queued and processed serially to prevent race conditions.
     */
    sendText(text: string): Promise<string>;
    /**
     * Send audio data to be transcribed and processed
     * @param audioData Base64 encoded audio data
     */
    sendAudio(audioData: string): Promise<void>;
    /**
     * Send raw audio buffer to be transcribed and processed
     */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Enqueue a text input for serial processing.
     * This ensures only one processUserInput runs at a time, preventing
     * race conditions on conversationHistory, fullText accumulation, etc.
     */
    private enqueueInput;
    /**
     * Drain the input queue, processing one request at a time.
     */
    private drainInputQueue;
    /**
     * Process user input with streaming text generation.
     * Handles the full pipeline: text -> LLM (streaming) -> TTS -> WebSocket.
     *
     * This method is designed to be called serially via drainInputQueue().
     */
    private processUserInput;
    /**
     * Generate speech for full text at once (non-streaming fallback)
     * Useful when you want to bypass streaming speech for short responses
     */
    generateAndSendSpeechFull(text: string): Promise<void>;
    /**
     * Send a message via WebSocket if connected.
     * Gracefully handles send failures (e.g., socket closing mid-send).
     */
    private sendWebSocketMessage;
    /**
     * Start listening for voice input
     */
    startListening(): void;
    /**
     * Stop listening for voice input
     */
    stopListening(): void;
    /**
     * Clear conversation history
     */
    clearHistory(): void;
    /**
     * Get current conversation history
     */
    getHistory(): ModelMessage[];
    /**
     * Set conversation history (useful for restoring sessions)
     */
    setHistory(history: ModelMessage[]): void;
    /**
     * Internal helper to close and clean up the current socket.
     */
    private disconnectSocket;
    /**
     * Disconnect from WebSocket and stop all in-flight work.
     */
    disconnect(): void;
    /**
     * Permanently destroy the agent, releasing all resources.
     * After calling this, the agent cannot be reused.
     */
    destroy(): void;
    /**
     * Check if agent is connected to WebSocket
     */
    get connected(): boolean;
    /**
     * Check if agent is currently processing a request
     */
    get processing(): boolean;
    /**
     * Check if agent is currently speaking (generating/playing audio)
     */
    get speaking(): boolean;
    /**
     * Get the number of pending speech chunks in the queue
     */
    get pendingSpeechChunks(): number;
    /**
     * Check if agent has been permanently destroyed
     */
    get destroyed(): boolean;
}
//# sourceMappingURL=VoiceAgent.d.ts.map

1
dist/VoiceAgent.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAInB,MAAM,SAAS,CAAC;AAEjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IA8BtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC
;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;;;;;OAQG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}

1111
dist/VoiceAgent.js vendored Normal file

File diff suppressed because it is too large Load Diff

1
dist/VoiceAgent.js.map vendored Normal file

File diff suppressed because one or more lines are too long

137
dist/VoiceAgent.new.d.ts vendored Normal file
View File

@@ -0,0 +1,137 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
/** Constructor options for the VoiceAgent. */
export interface VoiceAgentOptions {
    /** Language model used to generate responses. */
    model: LanguageModel;
    /** Model used to transcribe incoming audio. */
    transcriptionModel?: TranscriptionModel;
    /** Model used for text-to-speech output. */
    speechModel?: SpeechModel;
    /** System prompt given to the language model. */
    instructions?: string;
    /** Stop condition forwarded to `streamText`. */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Tools the model may call, keyed by tool name. */
    tools?: Record<string, Tool>;
    /** WebSocket URL used by `connect()` when no explicit URL is given. */
    endpoint?: string;
    /** Voice identifier — presumably forwarded to the speech model; confirm against SpeechManager. */
    voice?: string;
    /** Extra instructions for speech synthesis. */
    speechInstructions?: string;
    /** Output format for generated audio. */
    outputFormat?: string;
    /** Configuration for streaming speech generation */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Configuration for conversation history memory limits */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes (default: 10 MB) */
    maxAudioInputSize?: number;
}
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export declare class VoiceAgent extends EventEmitter {
    // ── Model configuration ───────────────────────────────
    private model;
    private instructions;
    private stopWhen;
    private endpoint?;
    /** Tools the model may call, keyed by tool name. */
    private tools;
    // ── Lifecycle flags ───────────────────────────────────
    private isDestroyed;
    private _isProcessing;
    /** Abort controller for the current LLM stream. */
    private currentStreamAbortController?;
    // ── Delegated managers (see src/core) ─────────────────
    private ws;
    private speech;
    private conversation;
    private transcription;
    private inputQueue;
    constructor(options: VoiceAgentOptions);
    /** Merge additional tools into the agent's tool set. */
    registerTools(tools: Record<string, Tool>): void;
    /**
     * Transcribe audio data to text using the configured transcription model.
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /**
     * Generate speech from text using the configured speech model.
     */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     */
    interruptSpeech(reason?: string): void;
    /**
     * Interrupt both the current LLM stream and ongoing speech.
     */
    interruptCurrentResponse(reason?: string): void;
    /**
     * Connect to a WebSocket server by URL.
     */
    connect(url?: string): Promise<void>;
    /**
     * Attach an existing WebSocket (server-side usage).
     */
    handleSocket(socket: WebSocket): void;
    /**
     * Send text input for processing (bypasses transcription).
     */
    sendText(text: string): Promise<string>;
    /**
     * Send base64 audio data to be transcribed and processed.
     */
    sendAudio(audioData: string): Promise<void>;
    /**
     * Send raw audio buffer to be transcribed and processed.
     */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Generate speech for full text at once (non-streaming fallback).
     */
    generateAndSendSpeechFull(text: string): Promise<void>;
    /** Start listening for voice input */
    startListening(): void;
    /** Stop listening for voice input */
    stopListening(): void;
    /** Clear conversation history */
    clearHistory(): void;
    /** Get current conversation history */
    getHistory(): ModelMessage[];
    /** Set conversation history (useful for restoring sessions) */
    setHistory(history: ModelMessage[]): void;
    /** Disconnect from WebSocket and stop all in-flight work */
    disconnect(): void;
    /**
     * Permanently destroy the agent, releasing all resources.
     */
    destroy(): void;
    /** Whether a WebSocket is currently connected. */
    get connected(): boolean;
    /** Whether a request is currently being processed. */
    get processing(): boolean;
    /** Whether speech audio is currently being generated/played. */
    get speaking(): boolean;
    /** Number of speech chunks waiting in the queue. */
    get pendingSpeechChunks(): number;
    /** Whether destroy() has been called. */
    get destroyed(): boolean;
    /** Dispatch incoming WebSocket messages (transcript/audio/interrupt). */
    private handleMessage;
    /** Transcribe base64 audio and enqueue the resulting text. */
    private handleAudioInput;
    /** Wrap a text request in a promise and enqueue it for serial processing. */
    private enqueueInput;
    /**
     * Process user input with streaming text generation.
     * Called serially by the input queue.
     */
    private processUserInput;
    /** Throw if destroy() has already been called. */
    private ensureNotDestroyed;
    /**
     * Clean up all in-flight state when the connection drops.
     */
    private cleanupOnDisconnect;
    /**
     * Forward select events from a child emitter to this agent.
     */
    private bubbleEvents;
}
//# sourceMappingURL=VoiceAgent.new.d.ts.map

1
dist/VoiceAgent.new.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"VoiceAgent.new.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.new.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,KAAK,aAAa,EAElB,KAAK,IAAI,EACT,KAAK,YAAY,EACjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EACL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EACnB,MAAM,SAAS,CAAC;AAYjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAOD;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,aAAa,CAAS;IAG9B,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,EAAE,CAAmB;IAC7B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAA6B;gBAEnC,OAAO,EAAE,iBAAiB;IAyE/B,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7E;;OAEG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAItB;;OAEG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAI5D;;OAEG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAQrE;;OAEG;IACU,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAMjD;;OAEG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAK5C;;OAEG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;OAEG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,
CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;OAEG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAInE,sCAAsC;IACtC,cAAc;IAKd,qCAAqC;IACrC,aAAa;IAKb,iCAAiC;IACjC,YAAY;IAIZ,uCAAuC;IACvC,UAAU,IAAI,YAAY,EAAE;IAI5B,+DAA+D;IAC/D,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC,4DAA4D;IAC5D,UAAU;IAIV;;OAEG;IACH,OAAO;IAWP,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;YAIa,aAAa;YAoCb,gBAAgB;IAe9B,OAAO,CAAC,YAAY;IAQpB;;;OAGG;YACW,gBAAgB;IAyE9B,OAAO,CAAC,kBAAkB;IAM1B;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAU3B;;OAEG;IACH,OAAO,CAAC,YAAY;CAKrB"}

379
dist/VoiceAgent.new.js vendored Normal file
View File

@@ -0,0 +1,379 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.VoiceAgent = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const core_1 = require("./core");
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
class VoiceAgent extends events_1.EventEmitter {
    // ── Model configuration ───────────────────────────────
    model;
    instructions;
    stopWhen;
    endpoint;
    // Tools the model may call, keyed by tool name.
    tools = {};
    // Lifecycle flags.
    isDestroyed = false;
    _isProcessing = false;
    // Abort controller for the current LLM stream
    currentStreamAbortController;
    // ── Managers ──────────────────────────────────────────
    ws;
    speech;
    conversation;
    transcription;
    inputQueue;
    /**
     * Build the agent: store model config, construct the five managers
     * (socket, speech, conversation, transcription, input queue), wire
     * them together, and re-emit their events on this agent.
     */
    constructor(options) {
        super();
        this.model = options.model;
        this.instructions =
            options.instructions || "You are a helpful voice assistant.";
        // Default: stop after 5 model steps.
        this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
        this.endpoint = options.endpoint;
        if (options.tools) {
            this.tools = { ...options.tools };
        }
        // ── Initialize managers ──────────────────────────────
        this.ws = new core_1.WebSocketManager();
        this.speech = new core_1.SpeechManager({
            speechModel: options.speechModel,
            voice: options.voice,
            speechInstructions: options.speechInstructions,
            outputFormat: options.outputFormat,
            streamingSpeech: options.streamingSpeech,
        });
        this.conversation = new core_1.ConversationManager({
            history: options.history,
        });
        this.transcription = new core_1.TranscriptionManager({
            transcriptionModel: options.transcriptionModel,
            maxAudioInputSize: options.maxAudioInputSize,
        });
        this.inputQueue = new core_1.InputQueue();
        // ── Wire managers to the WebSocket send function ─────
        const sendMsg = (msg) => this.ws.send(msg);
        this.speech.sendMessage = sendMsg;
        this.transcription.sendMessage = sendMsg;
        // ── Wire the input queue processor ───────────────────
        this.inputQueue.processor = (item) => this.processUserInput(item.text);
        // ── Bubble events from managers ──────────────────────
        this.bubbleEvents(this.ws, [
            "connected",
            "error",
        ]);
        this.bubbleEvents(this.speech, [
            "speech_start",
            "speech_complete",
            "speech_interrupted",
            "speech_chunk_queued",
            "audio_chunk",
            "audio",
            "error",
        ]);
        this.bubbleEvents(this.conversation, [
            "history_cleared",
            "history_trimmed",
        ]);
        this.bubbleEvents(this.transcription, [
            "transcription",
            "audio_received",
            "error",
            "warning",
        ]);
        // ── Handle WebSocket lifecycle events ────────────────
        this.ws.on("disconnected", () => {
            this.cleanupOnDisconnect();
            this.emit("disconnected");
        });
        this.ws.on("message", (message) => this.handleMessage(message));
    }
    // ── Public API ────────────────────────────────────────
    /** Merge additional tools into the agent's tool set (later names win). */
    registerTools(tools) {
        this.tools = { ...this.tools, ...tools };
    }
    /**
     * Transcribe audio data to text using the configured transcription model.
     */
    async transcribeAudio(audioData) {
        return this.transcription.transcribeAudio(audioData);
    }
    /**
     * Generate speech from text using the configured speech model.
     */
    async generateSpeechFromText(text, abortSignal) {
        return this.speech.generateSpeechFromText(text, abortSignal);
    }
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     */
    interruptSpeech(reason = "interrupted") {
        this.speech.interruptSpeech(reason);
    }
    /**
     * Interrupt both the current LLM stream and ongoing speech.
     */
    interruptCurrentResponse(reason = "interrupted") {
        if (this.currentStreamAbortController) {
            this.currentStreamAbortController.abort();
            this.currentStreamAbortController = undefined;
        }
        this.speech.interruptSpeech(reason);
    }
    /**
     * Connect to a WebSocket server by URL.
     * Falls back to the configured endpoint, then ws://localhost:8080.
     */
    async connect(url) {
        this.ensureNotDestroyed();
        const wsUrl = url || this.endpoint || "ws://localhost:8080";
        await this.ws.connect(wsUrl);
    }
    /**
     * Attach an existing WebSocket (server-side usage).
     */
    handleSocket(socket) {
        this.ensureNotDestroyed();
        this.ws.handleSocket(socket);
    }
    /**
     * Send text input for processing (bypasses transcription).
     */
    async sendText(text) {
        this.ensureNotDestroyed();
        if (!text || !text.trim()) {
            throw new Error("Text input cannot be empty");
        }
        return this.enqueueInput(text);
    }
    /**
     * Send base64 audio data to be transcribed and processed.
     */
    async sendAudio(audioData) {
        this.ensureNotDestroyed();
        await this.handleAudioInput(audioData);
    }
    /**
     * Send raw audio buffer to be transcribed and processed.
     */
    async sendAudioBuffer(audioBuffer) {
        this.ensureNotDestroyed();
        const base64Audio = Buffer.from(audioBuffer).toString("base64");
        await this.handleAudioInput(base64Audio);
    }
    /**
     * Generate speech for full text at once (non-streaming fallback).
     */
    async generateAndSendSpeechFull(text) {
        return this.speech.generateAndSendSpeechFull(text);
    }
    /** Start listening for voice input */
    startListening() {
        console.log("Starting voice agent...");
        this.emit("listening");
    }
    /** Stop listening for voice input */
    stopListening() {
        console.log("Stopping voice agent...");
        this.emit("stopped");
    }
    /** Clear conversation history */
    clearHistory() {
        this.conversation.clearHistory();
    }
    /** Get current conversation history */
    getHistory() {
        return this.conversation.getHistory();
    }
    /** Set conversation history (useful for restoring sessions) */
    setHistory(history) {
        this.conversation.setHistory(history);
    }
    /** Disconnect from WebSocket and stop all in-flight work */
    disconnect() {
        this.ws.disconnect();
    }
    /**
     * Permanently destroy the agent, releasing all resources.
     * The destroyed flag is set first so no new work can start mid-teardown.
     */
    destroy() {
        this.isDestroyed = true;
        this.cleanupOnDisconnect();
        this.ws.disconnect();
        this.conversation.clearHistory();
        this.tools = {};
        this.removeAllListeners();
    }
    // ── Getters ───────────────────────────────────────────
    get connected() {
        return this.ws.isConnected;
    }
    get processing() {
        return this._isProcessing;
    }
    get speaking() {
        return this.speech.isSpeaking;
    }
    get pendingSpeechChunks() {
        return this.speech.pendingChunkCount;
    }
    get destroyed() {
        return this.isDestroyed;
    }
    // ── Private: message handling ─────────────────────────
    /**
     * Dispatch an incoming WebSocket message by type:
     * "transcript" and "audio" interrupt the current response (barge-in)
     * and enqueue new input; "interrupt" cancels the current response.
     */
    async handleMessage(message) {
        try {
            console.log(`Received WebSocket message of type: ${message.type}`);
            if (message.type === "transcript") {
                if (typeof message.text !== "string" || !message.text.trim()) {
                    this.emit("warning", "Received empty or invalid transcript message");
                    return;
                }
                this.interruptCurrentResponse("user_speaking");
                console.log(`Processing transcript: "${message.text}"`);
                await this.enqueueInput(message.text);
            }
            else if (message.type === "audio") {
                if (typeof message.data !== "string" || !message.data) {
                    this.emit("warning", "Received empty or invalid audio message");
                    return;
                }
                this.interruptCurrentResponse("user_speaking");
                // NOTE(review): `data.length / 1000` counts base64 characters,
                // not decoded bytes — the "KB" figure is approximate.
                console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`);
                await this.handleAudioInput(message.data, message.format);
            }
            else if (message.type === "interrupt") {
                console.log(`Received interrupt request: ${message.reason || "client_request"}`);
                this.interruptCurrentResponse(message.reason || "client_request");
            }
        }
        catch (err) {
            console.error("Failed to process message:", err);
            this.emit("error", err);
        }
    }
    // ── Private: audio ────────────────────────────────────
    /** Transcribe base64 audio; enqueue the text only if transcription produced any. */
    async handleAudioInput(base64Audio, format) {
        const text = await this.transcription.processAudioInput(base64Audio, format);
        if (text) {
            await this.enqueueInput(text);
        }
    }
    // ── Private: input queue ──────────────────────────────
    /** Wrap a text request in a promise and hand it to the serial input queue. */
    enqueueInput(text) {
        return new Promise((resolve, reject) => {
            this.inputQueue.enqueue({ text, resolve, reject });
        });
    }
    // ── Private: LLM processing ───────────────────────────
    /**
     * Process user input with streaming text generation.
     * Called serially by the input queue.
     *
     * Pipeline: record the user turn -> stream the LLM response (chunks
     * re-emitted as events, text deltas fed to TTS) -> record the
     * assistant turn -> wait for the speech queue to drain.
     */
    async processUserInput(text) {
        this._isProcessing = true;
        this.currentStreamAbortController = new AbortController();
        const streamAbortSignal = this.currentStreamAbortController.signal;
        try {
            this.emit("text", { role: "user", text });
            this.conversation.addMessage({ role: "user", content: text });
            const result = (0, ai_1.streamText)({
                model: this.model,
                system: this.instructions,
                messages: this.conversation.getHistoryRef(),
                tools: this.tools,
                stopWhen: this.stopWhen,
                abortSignal: streamAbortSignal,
                // Re-emit each raw stream chunk as agent-level events.
                onChunk: ({ chunk }) => {
                    (0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
                },
                // Surface every tool result from every step at stream end.
                onFinish: async (event) => {
                    for (const step of event.steps) {
                        for (const toolResult of step.toolResults) {
                            this.emit("tool_result", {
                                name: toolResult.toolName,
                                toolCallId: toolResult.toolCallId,
                                result: toolResult.output,
                            });
                        }
                    }
                },
                onError: ({ error }) => {
                    console.error("Stream error:", error);
                    this.emit("error", error);
                },
            });
            const streamResult = await (0, core_1.processFullStream)(result, {
                onTextDelta: (delta) => this.speech.processTextDelta(delta),
                onTextEnd: () => this.speech.flushPendingText(),
                sendMessage: (msg) => this.ws.send(msg),
                emitEvent: (event, data) => this.emit(event, data),
            });
            // Add assistant response to history
            if (streamResult.fullText) {
                this.conversation.addMessage({
                    role: "assistant",
                    content: streamResult.fullText,
                });
            }
            // Flush any remaining speech
            this.speech.flushPendingText();
            // Wait for all speech chunks to complete
            if (this.speech.queueDonePromise) {
                await this.speech.queueDonePromise;
            }
            return streamResult.fullText;
        }
        catch (error) {
            // Clean up speech state on error
            this.speech.reset();
            throw error;
        }
        finally {
            this._isProcessing = false;
            this.currentStreamAbortController = undefined;
        }
    }
    // ── Private: helpers ──────────────────────────────────
    /** Throw if destroy() has already been called on this agent. */
    ensureNotDestroyed() {
        if (this.isDestroyed) {
            throw new Error("VoiceAgent has been destroyed and cannot be used");
        }
    }
    /**
     * Clean up all in-flight state when the connection drops.
     */
    cleanupOnDisconnect() {
        if (this.currentStreamAbortController) {
            this.currentStreamAbortController.abort();
            this.currentStreamAbortController = undefined;
        }
        this.speech.reset();
        this._isProcessing = false;
        this.inputQueue.rejectAll(new Error("Connection closed"));
    }
    /**
     * Forward select events from a child emitter to this agent.
     */
    bubbleEvents(source, events) {
        for (const event of events) {
            source.on(event, (...args) => this.emit(event, ...args));
        }
    }
}
exports.VoiceAgent = VoiceAgent;
//# sourceMappingURL=VoiceAgent.new.js.map

1
dist/VoiceAgent.new.js.map vendored Normal file

File diff suppressed because one or more lines are too long

46
dist/core/ConversationManager.d.ts vendored Normal file
View File

@@ -0,0 +1,46 @@
import { EventEmitter } from "events";
import { type ModelMessage } from "ai";
import { type HistoryConfig } from "../types";
export interface ConversationManagerOptions {
    /** Overrides for the default history limits (message count / char size). */
    history?: Partial<HistoryConfig>;
}
/**
 * Manages conversation history (ModelMessage[]) with configurable
 * limits on message count and total character size.
 *
 * Emits "history_trimmed" when limits force removal and
 * "history_cleared" when the history is wiped.
 */
export declare class ConversationManager extends EventEmitter {
    private conversationHistory;
    private historyConfig;
    constructor(options?: ConversationManagerOptions);
    /**
     * Add a message to history and trim if needed.
     */
    addMessage(message: ModelMessage): void;
    /**
     * Get a copy of the current history.
     */
    getHistory(): ModelMessage[];
    /**
     * Get a direct reference to the history array.
     * Use with caution — prefer getHistory() for safety.
     */
    getHistoryRef(): ModelMessage[];
    /**
     * Replace the entire conversation history.
     */
    setHistory(history: ModelMessage[]): void;
    /**
     * Clear all conversation history.
     */
    clearHistory(): void;
    /**
     * Get the number of messages in history.
     */
    get length(): number;
    /**
     * Trim conversation history to stay within configured limits.
     * Removes oldest messages (always in pairs to preserve user/assistant turns).
     */
    private trimHistory;
}
//# sourceMappingURL=ConversationManager.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ConversationManager.d.ts","sourceRoot":"","sources":["../../src/core/ConversationManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAAE,KAAK,YAAY,EAAE,MAAM,IAAI,CAAC;AACvC,OAAO,EAAE,KAAK,aAAa,EAA0B,MAAM,UAAU,CAAC;AAEtE,MAAM,WAAW,0BAA0B;IACzC,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;CAClC;AAED;;;GAGG;AACH,qBAAa,mBAAoB,SAAQ,YAAY;IACnD,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,aAAa,CAAgB;gBAEzB,OAAO,GAAE,0BAA+B;IAQpD;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,GAAG,IAAI;IAKvC;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;;OAGG;IACH,aAAa,IAAI,YAAY,EAAE;IAI/B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,IAAI;IAIzC;;OAEG;IACH,YAAY,IAAI,IAAI;IAKpB;;OAEG;IACH,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED;;;OAGG;IACH,OAAO,CAAC,WAAW;CAgDpB"}

106
dist/core/ConversationManager.js vendored Normal file
View File

@@ -0,0 +1,106 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ConversationManager = void 0;
const events_1 = require("events");
const types_1 = require("../types");
/**
* Manages conversation history (ModelMessage[]) with configurable
* limits on message count and total character size.
*/
/**
 * Keeps the rolling conversation history (ModelMessage[]) and enforces
 * the configured caps on message count and total character volume.
 */
class ConversationManager extends events_1.EventEmitter {
    conversationHistory = [];
    historyConfig;
    constructor(options = {}) {
        super();
        this.historyConfig = { ...types_1.DEFAULT_HISTORY_CONFIG, ...options.history };
    }
    /** Append a message, then enforce the configured history limits. */
    addMessage(message) {
        this.conversationHistory.push(message);
        this.trimHistory();
    }
    /** Shallow copy of the history — safe for callers to mutate. */
    getHistory() {
        return this.conversationHistory.slice();
    }
    /** The live history array itself; mutations affect this manager. */
    getHistoryRef() {
        return this.conversationHistory;
    }
    /** Overwrite the history with a copy of the given messages. */
    setHistory(history) {
        this.conversationHistory = history.slice();
    }
    /** Drop every stored message and announce the wipe. */
    clearHistory() {
        this.conversationHistory = [];
        this.emit("history_cleared");
    }
    /** Number of messages currently stored. */
    get length() {
        return this.conversationHistory.length;
    }
    /**
     * Enforce the history limits, oldest-first.
     * Count-based trimming removes an even number of messages to keep
     * user/assistant turn pairs intact; character-based trimming then
     * sheds single messages while more than two remain.
     */
    trimHistory() {
        const { maxMessages, maxTotalChars } = this.historyConfig;
        // Trim by message count
        if (maxMessages > 0 && this.conversationHistory.length > maxMessages) {
            const overflow = this.conversationHistory.length - maxMessages;
            // Round up to an even count so turn pairs stay intact.
            const dropCount = overflow % 2 === 0 ? overflow : overflow + 1;
            this.conversationHistory.splice(0, dropCount);
            this.emit("history_trimmed", {
                removedCount: dropCount,
                reason: "max_messages",
            });
        }
        // Trim by total character count
        if (maxTotalChars > 0) {
            const sizeOf = (msg) => typeof msg.content === "string"
                ? msg.content.length
                : JSON.stringify(msg.content).length;
            let totalChars = 0;
            for (const msg of this.conversationHistory) {
                totalChars += sizeOf(msg);
            }
            let dropped = 0;
            while (totalChars > maxTotalChars &&
                this.conversationHistory.length > 2) {
                const oldest = this.conversationHistory.shift();
                if (oldest) {
                    totalChars -= sizeOf(oldest);
                    dropped++;
                }
            }
            if (dropped > 0) {
                this.emit("history_trimmed", {
                    removedCount: dropped,
                    reason: "max_total_chars",
                });
            }
        }
    }
}
exports.ConversationManager = ConversationManager;
//# sourceMappingURL=ConversationManager.js.map

1
dist/core/ConversationManager.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"ConversationManager.js","sourceRoot":"","sources":["../../src/core/ConversationManager.ts"],"names":[],"mappings":";;;AAAA,mCAAsC;AAEtC,oCAAsE;AAMtE;;;GAGG;AACH,MAAa,mBAAoB,SAAQ,qBAAY;IAC3C,mBAAmB,GAAmB,EAAE,CAAC;IACzC,aAAa,CAAgB;IAErC,YAAY,UAAsC,EAAE;QAClD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,aAAa,GAAG;YACnB,GAAG,8BAAsB;YACzB,GAAG,OAAO,CAAC,OAAO;SACnB,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,OAAqB;QAC9B,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,CAAC,WAAW,EAAE,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,CAAC,GAAG,IAAI,CAAC,mBAAmB,CAAC,CAAC;IACvC,CAAC;IAED;;;OAGG;IACH,aAAa;QACX,OAAO,IAAI,CAAC,mBAAmB,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,OAAuB;QAChC,IAAI,CAAC,mBAAmB,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC;IAC1C,CAAC;IAED;;OAEG;IACH,YAAY;QACV,IAAI,CAAC,mBAAmB,GAAG,EAAE,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC;IACzC,CAAC;IAED;;;OAGG;IACK,WAAW;QACjB,MAAM,EAAE,WAAW,EAAE,aAAa,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC;QAE1D,wBAAwB;QACxB,IAAI,WAAW,GAAG,CAAC,IAAI,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,WAAW,EAAE,CAAC;YACrE,MAAM,MAAM,GAAG,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,WAAW,CAAC;YAC7D,iDAAiD;YACjD,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YACxD,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;YAC7C,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;gBAC3B,YAAY,EAAE,QAAQ;gBACtB,MAAM,EAAE,cAAc;aACvB,CAAC,CAAC;QACL,CAAC;QAED,gCAAgC;QAChC,IAAI,aAAa,GAAG,CAAC,EAAE,CAAC;YACtB,IAAI,UAAU,GAAG,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;gBAC5D,MAAM,OAAO,GACX,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ;oBAC7B,CAAC,CAAC,GAAG,CAAC,OAAO;oBACb,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBAClC,OAAO,GAAG,GAAG,OAAO,CAAC,MAAM,CAAC;YAC9B,CAAC,EAAE,CAAC,CAAC,CAAC;YAEN,IAAI,YAAY,GAAG,CAAC,CAAC;YACrB,OACE,UAAU,GAAG,aAAa;gBAC1B,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,CAAC,EACnC,CAAC;gBACD,MAAM,OAAO,GAAG,IAAI,CAAC,mBAAmB,CAAC,KAAK,EAAE,CAAC;gBACjD,IAAI,OAAO,E
AAE,CAAC;oBACZ,MAAM,OAAO,GACX,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ;wBACjC,CAAC,CAAC,OAAO,CAAC,OAAO;wBACjB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;oBACtC,UAAU,IAAI,OAAO,CAAC,MAAM,CAAC;oBAC7B,YAAY,EAAE,CAAC;gBACjB,CAAC;YACH,CAAC;YACD,IAAI,YAAY,GAAG,CAAC,EAAE,CAAC;gBACrB,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;oBAC3B,YAAY;oBACZ,MAAM,EAAE,iBAAiB;iBAC1B,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;CACF;AA7GD,kDA6GC"}

33
dist/core/InputQueue.d.ts vendored Normal file
View File

@@ -0,0 +1,33 @@
/**
 * A generic serial input queue that ensures only one processor runs at a time.
 * Items are handled strictly in FIFO order; each item's resolve/reject
 * callback receives the processor's outcome.
 *
 * @template T The shape of each queued item (must include resolve/reject)
 */
export interface QueueItem<T = string> {
    resolve: (v: T) => void;
    reject: (e: unknown) => void;
}
export declare class InputQueue<T extends QueueItem<any>> {
    private queue;
    private processing;
    /** Callback invoked for each item — must return a resolved value */
    processor: (item: T) => Promise<any>;
    /**
     * Enqueue an item for serial processing.
     */
    enqueue(item: T): void;
    /**
     * Reject all pending items (used on disconnect/destroy).
     */
    rejectAll(reason: Error): void;
    /**
     * Number of items waiting in the queue.
     */
    get length(): number;
    /**
     * Whether the queue is currently processing an item.
     */
    get isProcessing(): boolean;
    private drain;
}
//# sourceMappingURL=InputQueue.d.ts.map

1
dist/core/InputQueue.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"InputQueue.d.ts","sourceRoot":"","sources":["../../src/core/InputQueue.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,SAAS,CAAC,CAAC,GAAG,MAAM;IACnC,OAAO,EAAE,CAAC,CAAC,EAAE,CAAC,KAAK,IAAI,CAAC;IACxB,MAAM,EAAE,CAAC,CAAC,EAAE,OAAO,KAAK,IAAI,CAAC;CAC9B;AAED,qBAAa,UAAU,CAAC,CAAC,SAAS,SAAS,CAAC,GAAG,CAAC;IAC9C,OAAO,CAAC,KAAK,CAAW;IACxB,OAAO,CAAC,UAAU,CAAS;IAE3B,oEAAoE;IAC7D,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,OAAO,CAAC,GAAG,CAAC,CAAkB;IAE7D;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,CAAC,GAAG,IAAI;IAKtB;;OAEG;IACH,SAAS,CAAC,MAAM,EAAE,KAAK,GAAG,IAAI;IAQ9B;;OAEG;IACH,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED;;OAEG;IACH,IAAI,YAAY,IAAI,OAAO,CAE1B;YAIa,KAAK;CAkBpB"}

61
dist/core/InputQueue.js vendored Normal file
View File

@@ -0,0 +1,61 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.InputQueue = void 0;
/**
 * Serial FIFO queue: items are handed one at a time to `processor`,
 * and each item's resolve/reject callbacks receive the outcome.
 */
class InputQueue {
    queue = [];
    processing = false;
    /** Callback invoked for each item — must return a resolved value */
    processor = async () => "";
    /** Add an item; draining starts immediately if the queue is idle. */
    enqueue(item) {
        this.queue.push(item);
        this.drain();
    }
    /** Fail every queued item with `reason` and reset queue state. */
    rejectAll(reason) {
        this.queue.forEach((pending) => pending.reject(reason));
        this.queue = [];
        this.processing = false;
    }
    /** Count of items still waiting to be processed. */
    get length() {
        return this.queue.length;
    }
    /** True while an item is being processed. */
    get isProcessing() {
        return this.processing;
    }
    // ── Private ──────────────────────────────────────────
    /** Pull items off the queue one at a time until it is empty. */
    async drain() {
        if (this.processing)
            return;
        this.processing = true;
        try {
            for (;;) {
                const next = this.queue.shift();
                if (!next)
                    break;
                try {
                    next.resolve(await this.processor(next));
                }
                catch (error) {
                    next.reject(error);
                }
            }
        }
        finally {
            this.processing = false;
        }
    }
}
exports.InputQueue = InputQueue;
//# sourceMappingURL=InputQueue.js.map

1
dist/core/InputQueue.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"InputQueue.js","sourceRoot":"","sources":["../../src/core/InputQueue.ts"],"names":[],"mappings":";;;AAUA,MAAa,UAAU;IACb,KAAK,GAAQ,EAAE,CAAC;IAChB,UAAU,GAAG,KAAK,CAAC;IAE3B,oEAAoE;IAC7D,SAAS,GAA8B,KAAK,IAAI,EAAE,CAAC,EAAE,CAAC;IAE7D;;OAEG;IACH,OAAO,CAAC,IAAO;QACb,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtB,IAAI,CAAC,KAAK,EAAE,CAAC;IACf,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,MAAa;QACrB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC9B,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QAChB,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,IAAI,YAAY;QACd,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;IAED,wDAAwD;IAEhD,KAAK,CAAC,KAAK;QACjB,IAAI,IAAI,CAAC,UAAU;YAAE,OAAO;QAC5B,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;QAEvB,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;gBACjC,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;oBAC1C,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;gBAAS,CAAC;YACT,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;QAC1B,CAAC;IACH,CAAC;CACF;AA5DD,gCA4DC"}

83
dist/core/SpeechManager.d.ts vendored Normal file
View File

@@ -0,0 +1,83 @@
import { EventEmitter } from "events";
import { type SpeechModel } from "ai";
import { type StreamingSpeechConfig } from "../types";
export interface SpeechManagerOptions {
    /** AI SDK speech model; TTS is disabled entirely when omitted. */
    speechModel?: SpeechModel;
    /** TTS voice id (defaults to "alloy"). */
    voice?: string;
    /** Optional style/delivery instructions passed to the TTS model. */
    speechInstructions?: string;
    /** Audio format sent to clients (defaults to "opus"). */
    outputFormat?: string;
    /** Overrides for streaming-speech chunking and parallelism defaults. */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
}
/**
 * Manages text-to-speech generation, streaming speech chunking,
 * parallel TTS requests, and speech interruption.
 */
export declare class SpeechManager extends EventEmitter {
    private speechModel?;
    private voice;
    private speechInstructions?;
    private outputFormat;
    private streamingSpeechConfig;
    private currentSpeechAbortController?;
    private speechChunkQueue;
    private nextChunkId;
    private _isSpeaking;
    private pendingTextBuffer;
    private speechQueueDonePromise?;
    private speechQueueDoneResolve?;
    /** Callback to send messages over the WebSocket */
    sendMessage: (message: Record<string, unknown>) => void;
    constructor(options: SpeechManagerOptions);
    get isSpeaking(): boolean;
    get pendingChunkCount(): number;
    get hasSpeechModel(): boolean;
    /**
     * Returns a promise that resolves when the speech queue is fully drained.
     * Returns undefined if there is nothing queued.
     */
    get queueDonePromise(): Promise<void> | undefined;
    /**
     * Generate speech from text using the configured speech model.
     */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /**
     * Generate speech for full text at once (non-streaming fallback).
     */
    generateAndSendSpeechFull(text: string): Promise<void>;
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     */
    interruptSpeech(reason?: string): void;
    /**
     * Process a text delta for streaming speech.
     * Call this as text chunks arrive from the LLM.
     */
    processTextDelta(textDelta: string): void;
    /**
     * Flush any remaining text in the buffer to speech.
     * Call this when the LLM stream ends.
     */
    flushPendingText(): void;
    /**
     * Reset all speech state (used on disconnect / cleanup).
     */
    reset(): void;
    /**
     * Extract complete sentences from text buffer.
     * Returns [extractedSentences, remainingBuffer].
     */
    private extractSentences;
    /**
     * Queue a text chunk for speech generation.
     */
    private queueSpeechChunk;
    /**
     * Generate audio for a single chunk.
     */
    private generateChunkAudio;
    /**
     * Process the speech queue and send audio chunks in order.
     */
    private processSpeechQueue;
}
//# sourceMappingURL=SpeechManager.d.ts.map

1
dist/core/SpeechManager.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"SpeechManager.d.ts","sourceRoot":"","sources":["../../src/core/SpeechManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAEL,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAE3B,MAAM,UAAU,CAAC;AAElB,MAAM,WAAW,oBAAoB;IACnC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;CAClD;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,YAAY;IAC7C,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,qBAAqB,CAAwB;IAErD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;IAE5C,mDAAmD;IAC5C,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAa;gBAE/D,OAAO,EAAE,oBAAoB;IAYzC,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,iBAAiB,IAAI,MAAM,CAE9B;IAED,IAAI,cAAc,IAAI,OAAO,CAE5B;IAED;;;OAGG;IACH,IAAI,gBAAgB,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,SAAS,CAEhD;IAED;;OAEG;IACG,sBAAsB,CAC1B,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;OAEG;IACG,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA4B5D;;OAEG;IACH,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgCrD;;;OAGG;IACH,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;IAazC;;;OAGG;IACH,gBAAgB,IAAI,IAAI;IAOxB;;OAEG;IACH,KAAK,IAAI,IAAI;IAkBb;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA+CxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAwCxB;;OAEG;YACW,kBAAkB;IAiChC;;OAEG;YACW,kBAAkB;CA0GjC"}

356
dist/core/SpeechManager.js vendored Normal file
View File

@@ -0,0 +1,356 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SpeechManager = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const types_1 = require("../types");
/**
* Manages text-to-speech generation, streaming speech chunking,
* parallel TTS requests, and speech interruption.
*/
/**
 * Manages text-to-speech generation, streaming speech chunking,
 * parallel TTS requests, and speech interruption (barge-in).
 */
class SpeechManager extends events_1.EventEmitter {
    speechModel; // optional TTS model; when absent, speech methods no-op or throw
    voice; // TTS voice id, defaults to "alloy"
    speechInstructions; // optional delivery instructions for TTS
    outputFormat; // audio format sent to clients, defaults to "opus"
    streamingSpeechConfig; // chunking + parallel-generation settings
    currentSpeechAbortController; // shared abort handle for in-flight TTS requests
    speechChunkQueue = []; // FIFO of text chunks, each optionally carrying an audioPromise
    nextChunkId = 0;
    _isSpeaking = false;
    pendingTextBuffer = ""; // streamed LLM text not yet split into speakable chunks
    // Promise-based signal for speech queue completion
    speechQueueDonePromise;
    speechQueueDoneResolve;
    /** Callback to send messages over the WebSocket */
    sendMessage = () => { };
    constructor(options) {
        super();
        this.speechModel = options.speechModel;
        this.voice = options.voice || "alloy";
        this.speechInstructions = options.speechInstructions;
        this.outputFormat = options.outputFormat || "opus";
        this.streamingSpeechConfig = {
            ...types_1.DEFAULT_STREAMING_SPEECH_CONFIG,
            ...options.streamingSpeech,
        };
    }
    /** True while the queue processor is actively sending audio chunks. */
    get isSpeaking() {
        return this._isSpeaking;
    }
    /** Number of chunks still waiting in the speech queue. */
    get pendingChunkCount() {
        return this.speechChunkQueue.length;
    }
    /** Whether a TTS model was configured. */
    get hasSpeechModel() {
        return !!this.speechModel;
    }
    /**
     * Returns a promise that resolves when the speech queue is fully drained.
     * Returns undefined if there is nothing queued.
     */
    get queueDonePromise() {
        return this.speechQueueDonePromise;
    }
    /**
     * Generate speech from text using the configured speech model.
     *
     * @param text Text to synthesize.
     * @param abortSignal Optional signal to cancel the TTS request.
     * @returns The synthesized audio bytes.
     * @throws Error when no speech model is configured.
     */
    async generateSpeechFromText(text, abortSignal) {
        if (!this.speechModel) {
            throw new Error("Speech model not configured");
        }
        const result = await (0, ai_1.experimental_generateSpeech)({
            model: this.speechModel,
            text,
            voice: this.voice,
            instructions: this.speechInstructions,
            outputFormat: this.outputFormat,
            abortSignal,
        });
        return result.audio.uint8Array;
    }
    /**
     * Generate speech for full text at once (non-streaming fallback).
     * Best-effort: failures are logged and emitted as "error", not thrown.
     */
    async generateAndSendSpeechFull(text) {
        if (!this.speechModel)
            return;
        try {
            this.emit("speech_start", { text, streaming: false });
            const audioData = await this.generateSpeechFromText(text);
            const base64Audio = Buffer.from(audioData).toString("base64");
            this.sendMessage({
                type: "audio",
                data: base64Audio,
                format: this.outputFormat,
            });
            this.emit("audio", {
                data: base64Audio,
                format: this.outputFormat,
                uint8Array: audioData,
            });
            this.emit("speech_complete", { text, streaming: false });
        }
        catch (error) {
            console.error("Failed to generate speech:", error);
            this.emit("error", error);
        }
    }
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     * No-op when nothing is speaking and nothing is queued.
     */
    interruptSpeech(reason = "interrupted") {
        if (!this._isSpeaking && this.speechChunkQueue.length === 0) {
            return;
        }
        // Abort any pending speech generation requests
        if (this.currentSpeechAbortController) {
            this.currentSpeechAbortController.abort();
            this.currentSpeechAbortController = undefined;
        }
        // Clear the speech queue (also makes the processing loop bail out)
        this.speechChunkQueue = [];
        this.pendingTextBuffer = "";
        this._isSpeaking = false;
        // Resolve any pending speech-done waiters so callers can finish
        if (this.speechQueueDoneResolve) {
            this.speechQueueDoneResolve();
            this.speechQueueDoneResolve = undefined;
            this.speechQueueDonePromise = undefined;
        }
        // Notify clients to stop audio playback
        this.sendMessage({
            type: "speech_interrupted",
            reason,
        });
        this.emit("speech_interrupted", { reason });
    }
    /**
     * Process a text delta for streaming speech.
     * Call this as text chunks arrive from the LLM.
     */
    processTextDelta(textDelta) {
        if (!this.speechModel)
            return;
        this.pendingTextBuffer += textDelta;
        // Pull out complete sentences; keep the trailing fragment buffered.
        const [sentences, remaining] = this.extractSentences(this.pendingTextBuffer);
        this.pendingTextBuffer = remaining;
        for (const sentence of sentences) {
            this.queueSpeechChunk(sentence);
        }
    }
    /**
     * Flush any remaining text in the buffer to speech.
     * Call this when the LLM stream ends.
     */
    flushPendingText() {
        if (!this.speechModel || !this.pendingTextBuffer.trim())
            return;
        this.queueSpeechChunk(this.pendingTextBuffer);
        this.pendingTextBuffer = "";
    }
    /**
     * Reset all speech state (used on disconnect / cleanup).
     * Unlike interruptSpeech(), this sends nothing to the client.
     */
    reset() {
        if (this.currentSpeechAbortController) {
            this.currentSpeechAbortController.abort();
            this.currentSpeechAbortController = undefined;
        }
        this.speechChunkQueue = [];
        this.pendingTextBuffer = "";
        this._isSpeaking = false;
        if (this.speechQueueDoneResolve) {
            this.speechQueueDoneResolve();
            this.speechQueueDoneResolve = undefined;
            this.speechQueueDonePromise = undefined;
        }
    }
    // ── Private helpers ─────────────────────────────────────────
    /**
     * Extract complete sentences from text buffer.
     * Returns [extractedSentences, remainingBuffer].
     *
     * Sentences shorter than minChunkSize are merged with a neighbor
     * (appended to the previous sentence, or left in the buffer to
     * accumulate with following text).
     */
    extractSentences(text) {
        const sentences = [];
        let remaining = text;
        // Match sentences ending with . ! ? followed by space or end of string
        const sentenceEndPattern = /[.!?]+(?:\s+|$)/g;
        let lastIndex = 0;
        let match;
        while ((match = sentenceEndPattern.exec(text)) !== null) {
            const sentence = text
                .slice(lastIndex, match.index + match[0].length)
                .trim();
            if (sentence.length >= this.streamingSpeechConfig.minChunkSize) {
                sentences.push(sentence);
                lastIndex = match.index + match[0].length;
            }
            else if (sentences.length > 0) {
                // Append short sentence to previous one
                sentences[sentences.length - 1] += " " + sentence;
                lastIndex = match.index + match[0].length;
            }
            // else: too short with no predecessor — leave it to accumulate.
        }
        remaining = text.slice(lastIndex);
        // If remaining text is too long, force split at clause boundaries
        if (remaining.length > this.streamingSpeechConfig.maxChunkSize) {
            const clausePattern = /[,;:]\s+/g;
            let clauseMatch;
            let splitIndex = 0;
            while ((clauseMatch = clausePattern.exec(remaining)) !== null) {
                if (clauseMatch.index >= this.streamingSpeechConfig.minChunkSize) {
                    splitIndex = clauseMatch.index + clauseMatch[0].length;
                    break;
                }
            }
            if (splitIndex > 0) {
                sentences.push(remaining.slice(0, splitIndex).trim());
                remaining = remaining.slice(splitIndex);
            }
        }
        return [sentences, remaining];
    }
    /**
     * Queue a text chunk for speech generation.
     * May start TTS immediately (parallel generation) and kicks the
     * queue processor if it is not already running.
     */
    queueSpeechChunk(text) {
        if (!this.speechModel || !text.trim())
            return;
        // Wrap chunk ID to prevent unbounded growth in very long sessions
        if (this.nextChunkId >= Number.MAX_SAFE_INTEGER) {
            this.nextChunkId = 0;
        }
        const chunk = {
            id: this.nextChunkId++,
            text: text.trim(),
        };
        // Create the speech-done promise if not already present
        if (!this.speechQueueDonePromise) {
            this.speechQueueDonePromise = new Promise((resolve) => {
                this.speechQueueDoneResolve = resolve;
            });
        }
        // Start generating audio immediately (parallel generation)
        if (this.streamingSpeechConfig.parallelGeneration) {
            const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
            if (activeRequests < this.streamingSpeechConfig.maxParallelRequests) {
                chunk.audioPromise = this.generateChunkAudio(chunk);
            }
        }
        this.speechChunkQueue.push(chunk);
        this.emit("speech_chunk_queued", { id: chunk.id, text: chunk.text });
        // Start processing queue if not already
        if (!this._isSpeaking) {
            this.processSpeechQueue();
        }
    }
    /**
     * Generate audio for a single chunk.
     * Returns null on abort or failure (failures also emit "error").
     */
    async generateChunkAudio(chunk) {
        // Lazily create the shared abort controller for this speaking session.
        if (!this.currentSpeechAbortController) {
            this.currentSpeechAbortController = new AbortController();
        }
        try {
            console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`);
            const audioData = await this.generateSpeechFromText(chunk.text, this.currentSpeechAbortController.signal);
            console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
            return audioData;
        }
        catch (error) {
            if (error.name === "AbortError") {
                console.log(`Audio generation aborted for chunk ${chunk.id}`);
                return null;
            }
            console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
            this.emit("error", error);
            return null;
        }
    }
    /**
     * Process the speech queue and send audio chunks in order.
     * Runs until the queue drains or an interrupt clears _isSpeaking;
     * always signals queue completion and stream end in the finally block.
     */
    async processSpeechQueue() {
        if (this._isSpeaking)
            return;
        this._isSpeaking = true;
        console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
        this.emit("speech_start", { streaming: true });
        this.sendMessage({ type: "speech_stream_start" });
        try {
            while (this.speechChunkQueue.length > 0) {
                const chunk = this.speechChunkQueue[0];
                console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`);
                // Ensure audio generation has started
                if (!chunk.audioPromise) {
                    chunk.audioPromise = this.generateChunkAudio(chunk);
                }
                // Wait for this chunk's audio
                const audioData = await chunk.audioPromise;
                // Check if we were interrupted while waiting
                if (!this._isSpeaking) {
                    console.log(`Speech interrupted during chunk #${chunk.id}`);
                    break;
                }
                // Remove from queue after processing
                this.speechChunkQueue.shift();
                if (audioData) {
                    const base64Audio = Buffer.from(audioData).toString("base64");
                    console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`);
                    // Send audio chunk via WebSocket
                    this.sendMessage({
                        type: "audio_chunk",
                        chunkId: chunk.id,
                        data: base64Audio,
                        format: this.outputFormat,
                        text: chunk.text,
                    });
                    // Emit for local handling
                    this.emit("audio_chunk", {
                        chunkId: chunk.id,
                        data: base64Audio,
                        format: this.outputFormat,
                        text: chunk.text,
                        uint8Array: audioData,
                    });
                }
                else {
                    console.log(`No audio data generated for chunk #${chunk.id}`);
                }
                // Start generating next chunks in parallel
                if (this.streamingSpeechConfig.parallelGeneration) {
                    const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
                    const toStart = Math.min(this.streamingSpeechConfig.maxParallelRequests - activeRequests, this.speechChunkQueue.length);
                    if (toStart > 0) {
                        console.log(`Starting parallel generation for ${toStart} more chunks`);
                        for (let i = 0; i < toStart; i++) {
                            const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise);
                            if (nextChunk) {
                                nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
                            }
                        }
                    }
                }
            }
        }
        catch (error) {
            console.error("Error in speech queue processing:", error);
            this.emit("error", error);
        }
        finally {
            this._isSpeaking = false;
            this.currentSpeechAbortController = undefined;
            // Signal that the speech queue is fully drained
            if (this.speechQueueDoneResolve) {
                this.speechQueueDoneResolve();
                this.speechQueueDoneResolve = undefined;
                this.speechQueueDonePromise = undefined;
            }
            console.log(`Speech queue processing complete`);
            this.sendMessage({ type: "speech_stream_end" });
            this.emit("speech_complete", { streaming: true });
        }
    }
}
exports.SpeechManager = SpeechManager;
//# sourceMappingURL=SpeechManager.js.map

1
dist/core/SpeechManager.js.map vendored Normal file

File diff suppressed because one or more lines are too long

42
dist/core/StreamProcessor.d.ts vendored Normal file
View File

@@ -0,0 +1,42 @@
import { type streamText } from "ai";
/**
 * Result of processing a full LLM stream.
 */
export interface StreamResult {
    fullText: string;
    fullReasoning: string;
    allToolCalls: Array<{
        toolName: string;
        toolCallId: string;
        input: unknown;
    }>;
    allToolResults: Array<{
        toolName: string;
        toolCallId: string;
        output: unknown;
    }>;
    allSources: Array<unknown>;
    allFiles: Array<unknown>;
}
export interface StreamProcessorCallbacks {
    /** Called when a text delta arrives (for streaming speech, etc.) */
    onTextDelta?: (text: string) => void;
    /** Called when a text-end part arrives (flush speech, etc.) */
    onTextEnd?: () => void;
    /** Send a WebSocket message */
    sendMessage: (message: Record<string, unknown>) => void;
    /** Emit an event on the agent */
    emitEvent: (event: string, data?: unknown) => void;
}
/**
 * Processes the fullStream from an AI SDK `streamText` call,
 * forwarding events to WebSocket clients and collecting the complete response.
 *
 * This is a standalone function (not a class) because it has no persistent state.
 *
 * @param result The value returned by `streamText` (its fullStream is consumed).
 * @param callbacks Hooks for forwarding deltas, messages, and events.
 * @param extraResponseFields Optional extra fields merged into outgoing messages.
 */
export declare function processFullStream(result: ReturnType<typeof streamText>, callbacks: StreamProcessorCallbacks, extraResponseFields?: Record<string, unknown>): Promise<StreamResult>;
/**
 * Handle onChunk callback events and emit them.
 */
export declare function handleStreamChunk(chunk: any, emitEvent: (event: string, data?: unknown) => void): void;
//# sourceMappingURL=StreamProcessor.d.ts.map

1
dist/core/StreamProcessor.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"StreamProcessor.d.ts","sourceRoot":"","sources":["../../src/core/StreamProcessor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,UAAU,EAAE,MAAM,IAAI,CAAC;AAErC;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,aAAa,EAAE,MAAM,CAAC;IACtB,YAAY,EAAE,KAAK,CAAC;QAClB,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,KAAK,EAAE,OAAO,CAAC;KAChB,CAAC,CAAC;IACH,cAAc,EAAE,KAAK,CAAC;QACpB,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC,CAAC;IACH,UAAU,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAC3B,QAAQ,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;CAC1B;AAED,MAAM,WAAW,wBAAwB;IACvC,oEAAoE;IACpE,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;IACrC,+DAA+D;IAC/D,SAAS,CAAC,EAAE,MAAM,IAAI,CAAC;IACvB,+BAA+B;IAC/B,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAC;IACxD,iCAAiC;IACjC,SAAS,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,OAAO,KAAK,IAAI,CAAC;CACpD;AAED;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,MAAM,EAAE,UAAU,CAAC,OAAO,UAAU,CAAC,EACrC,SAAS,EAAE,wBAAwB,EACnC,mBAAmB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC5C,OAAO,CAAC,YAAY,CAAC,CAkMvB;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,KAAK,EAAE,GAAG,EACV,SAAS,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,OAAO,KAAK,IAAI,GACjD,IAAI,CA+CN"}

228
dist/core/StreamProcessor.js vendored Normal file
View File

@@ -0,0 +1,228 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.processFullStream = processFullStream;
exports.handleStreamChunk = handleStreamChunk;
/**
 * Processes the fullStream from an AI SDK `streamText` call,
 * forwarding events to WebSocket clients and collecting the complete response.
 *
 * This is a standalone function (not a class) because it has no persistent state.
 *
 * @param result   Result object of `streamText`; only its async-iterable
 *                 `fullStream` property is consumed here.
 * @param callbacks  Object with `sendMessage` (forwards a JSON payload to the
 *                 WebSocket client), `emitEvent` (raises a local event), and
 *                 optional `onTextDelta` / `onTextEnd` hooks for text parts.
 * @param extraResponseFields  Optional extra fields spread into the final
 *                 "response_complete" message.
 * @returns Aggregate of everything seen on the stream: the full assistant
 *          text, full reasoning text, and all tool calls, tool results,
 *          sources, and files.
 */
async function processFullStream(result, callbacks, extraResponseFields) {
    const { onTextDelta, onTextEnd, sendMessage, emitEvent } = callbacks;
    // Accumulators for the final "response_complete" message / return value.
    let fullText = "";
    let fullReasoning = "";
    const allToolCalls = [];
    const allToolResults = [];
    const allSources = [];
    const allFiles = [];
    // Each stream part is forwarded to the client as a snake_case message;
    // ordering of the sent messages follows the stream exactly.
    for await (const part of result.fullStream) {
        switch (part.type) {
            // ── Stream lifecycle ──────────────────────────────
            case "start":
                sendMessage({ type: "stream_start" });
                break;
            case "finish":
                // Emit the assembled assistant text locally before telling the
                // client the stream finished.
                emitEvent("text", { role: "assistant", text: fullText });
                sendMessage({
                    type: "stream_finish",
                    finishReason: part.finishReason,
                    usage: part.totalUsage,
                });
                break;
            case "error":
                emitEvent("error", part.error);
                sendMessage({
                    type: "stream_error",
                    // Stringified because part.error may not be JSON-serializable.
                    error: String(part.error),
                });
                break;
            case "abort":
                emitEvent("abort", { reason: part.reason });
                sendMessage({
                    type: "stream_abort",
                    reason: part.reason,
                });
                break;
            // ── Step lifecycle ────────────────────────────────
            case "start-step":
                sendMessage({
                    type: "step_start",
                    warnings: part.warnings,
                });
                break;
            case "finish-step":
                sendMessage({
                    type: "step_finish",
                    finishReason: part.finishReason,
                    usage: part.usage,
                });
                break;
            // ── Text streaming ────────────────────────────────
            case "text-start":
                sendMessage({ type: "text_start", id: part.id });
                break;
            case "text-delta":
                fullText += part.text;
                onTextDelta?.(part.text);
                sendMessage({
                    type: "text_delta",
                    id: part.id,
                    text: part.text,
                });
                break;
            case "text-end":
                onTextEnd?.();
                sendMessage({ type: "text_end", id: part.id });
                break;
            // ── Reasoning streaming ───────────────────────────
            case "reasoning-start":
                sendMessage({ type: "reasoning_start", id: part.id });
                break;
            case "reasoning-delta":
                fullReasoning += part.text;
                sendMessage({
                    type: "reasoning_delta",
                    id: part.id,
                    text: part.text,
                });
                break;
            case "reasoning-end":
                sendMessage({ type: "reasoning_end", id: part.id });
                break;
            // ── Tool input streaming ──────────────────────────
            case "tool-input-start":
                sendMessage({
                    type: "tool_input_start",
                    id: part.id,
                    toolName: part.toolName,
                });
                break;
            case "tool-input-delta":
                sendMessage({
                    type: "tool_input_delta",
                    id: part.id,
                    delta: part.delta,
                });
                break;
            case "tool-input-end":
                sendMessage({ type: "tool_input_end", id: part.id });
                break;
            // ── Tool execution ────────────────────────────────
            case "tool-call":
                allToolCalls.push({
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    input: part.input,
                });
                sendMessage({
                    type: "tool_call",
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    input: part.input,
                });
                break;
            case "tool-result":
                allToolResults.push({
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    output: part.output,
                });
                sendMessage({
                    type: "tool_result",
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    // Sent as "result" while stored internally as "output".
                    result: part.output,
                });
                break;
            case "tool-error":
                // Tool errors are forwarded but not collected in the aggregate.
                sendMessage({
                    type: "tool_error",
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    error: String(part.error),
                });
                break;
            // ── Sources and files ─────────────────────────────
            case "source":
                // Note: the whole part object is collected here (unlike "file",
                // which stores only part.file).
                allSources.push(part);
                sendMessage({
                    type: "source",
                    source: part,
                });
                break;
            case "file":
                allFiles.push(part.file);
                sendMessage({
                    type: "file",
                    file: part.file,
                });
                break;
        }
    }
    // Send the complete response
    sendMessage({
        type: "response_complete",
        text: fullText,
        reasoning: fullReasoning || undefined,
        toolCalls: allToolCalls,
        toolResults: allToolResults,
        // Empty collections are elided from the wire message.
        sources: allSources.length > 0 ? allSources : undefined,
        files: allFiles.length > 0 ? allFiles : undefined,
        ...extraResponseFields,
    });
    return {
        fullText,
        fullReasoning,
        allToolCalls,
        allToolResults,
        allSources,
        allFiles,
    };
}
/**
* Handle onChunk callback events and emit them.
*/
function handleStreamChunk(chunk, emitEvent) {
switch (chunk.type) {
case "text-delta":
emitEvent("chunk:text_delta", { id: chunk.id, text: chunk.text });
break;
case "reasoning-delta":
emitEvent("chunk:reasoning_delta", {
id: chunk.id,
text: chunk.text,
});
break;
case "tool-call":
emitEvent("chunk:tool_call", {
toolName: chunk.toolName,
toolCallId: chunk.toolCallId,
input: chunk.input,
});
break;
case "tool-result":
emitEvent("chunk:tool_result", {
toolName: chunk.toolName,
toolCallId: chunk.toolCallId,
result: chunk.output,
});
break;
case "tool-input-start":
emitEvent("chunk:tool_input_start", {
id: chunk.id,
toolName: chunk.toolName,
});
break;
case "tool-input-delta":
emitEvent("chunk:tool_input_delta", {
id: chunk.id,
delta: chunk.delta,
});
break;
case "source":
emitEvent("chunk:source", chunk);
break;
}
}
//# sourceMappingURL=StreamProcessor.js.map

1
dist/core/StreamProcessor.js.map vendored Normal file

File diff suppressed because one or more lines are too long

28
dist/core/TranscriptionManager.d.ts vendored Normal file
View File

@@ -0,0 +1,28 @@
import { EventEmitter } from "events";
import { type TranscriptionModel } from "ai";
/** Constructor options for {@link TranscriptionManager}. */
export interface TranscriptionManagerOptions {
    /** AI SDK transcription model; audio input is rejected when absent. */
    transcriptionModel?: TranscriptionModel;
    /** Maximum accepted audio payload size in bytes. */
    maxAudioInputSize?: number;
}
/**
 * Handles audio transcription using the AI SDK transcription model
 * and validation of incoming audio data.
 */
export declare class TranscriptionManager extends EventEmitter {
    private transcriptionModel?;
    private maxAudioInputSize;
    /** Callback to send messages over the WebSocket */
    sendMessage: (message: Record<string, unknown>) => void;
    constructor(options?: TranscriptionManagerOptions);
    /** True when a transcription model has been configured. */
    get hasTranscriptionModel(): boolean;
    /**
     * Transcribe audio data to text.
     * Rejects when no model is configured or the provider call fails.
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /**
     * Process incoming base64-encoded audio: validate, decode, transcribe.
     * Returns the transcribed text, or null if invalid / empty.
     */
    processAudioInput(base64Audio: string, format?: string): Promise<string | null>;
}
//# sourceMappingURL=TranscriptionManager.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TranscriptionManager.d.ts","sourceRoot":"","sources":["../../src/core/TranscriptionManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAEL,KAAK,kBAAkB,EACxB,MAAM,IAAI,CAAC;AAGZ,MAAM,WAAW,2BAA2B;IAC1C,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;GAGG;AACH,qBAAa,oBAAqB,SAAQ,YAAY;IACpD,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,iBAAiB,CAAS;IAElC,mDAAmD;IAC5C,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAY;gBAE9D,OAAO,GAAE,2BAAgC;IAOrD,IAAI,qBAAqB,IAAI,OAAO,CAEnC;IAED;;OAEG;IACG,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAsCtE;;;OAGG;IACG,iBAAiB,CACrB,WAAW,EAAE,MAAM,EACnB,MAAM,CAAC,EAAE,MAAM,GACd,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;CA2D1B"}

106
dist/core/TranscriptionManager.js vendored Normal file
View File

@@ -0,0 +1,106 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TranscriptionManager = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const types_1 = require("../types");
/**
 * Handles audio transcription using the AI SDK transcription model
 * and validation of incoming audio data.
 *
 * Events emitted: "transcription" { text, language }, "audio_received"
 * { size, format }, "warning" (string), "error" (Error).
 */
class TranscriptionManager extends events_1.EventEmitter {
    transcriptionModel;
    maxAudioInputSize;
    /** Callback to send messages over the WebSocket */
    sendMessage = () => { };
    /**
     * @param options.transcriptionModel  AI SDK transcription model; when
     *        absent, all audio input is rejected with an error.
     * @param options.maxAudioInputSize   Maximum accepted audio payload in
     *        bytes (defaults to DEFAULT_MAX_AUDIO_SIZE).
     */
    constructor(options = {}) {
        super();
        this.transcriptionModel = options.transcriptionModel;
        this.maxAudioInputSize =
            options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
    }
    /** True when a transcription model has been configured. */
    get hasTranscriptionModel() {
        return !!this.transcriptionModel;
    }
    /**
     * Transcribe audio data to text.
     *
     * Emits "transcription" and sends a "transcription_result" message to
     * the client on success.
     * @throws when no model is configured or the provider call fails.
     */
    async transcribeAudio(audioData) {
        if (!this.transcriptionModel) {
            throw new Error("Transcription model not configured");
        }
        console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
        try {
            const result = await (0, ai_1.experimental_transcribe)({
                model: this.transcriptionModel,
                audio: audioData,
            });
            console.log(`Whisper transcription result: "${result.text}", language: ${result.language || "unknown"}`);
            this.emit("transcription", {
                text: result.text,
                language: result.language,
            });
            // Send transcription to client for immediate feedback
            this.sendMessage({
                type: "transcription_result",
                text: result.text,
                language: result.language,
            });
            return result.text;
        }
        catch (error) {
            console.error("Whisper transcription failed:", error);
            throw error;
        }
    }
    /**
     * Process incoming base64-encoded audio: validate, decode, transcribe.
     * Returns the transcribed text, or null if invalid / empty.
     *
     * All failure modes are reported both locally (events) and to the
     * client (sendMessage).
     */
    async processAudioInput(base64Audio, format) {
        if (!this.transcriptionModel) {
            const error = new Error("Transcription model not configured for audio input");
            this.emit("error", error);
            this.sendMessage({ type: "error", error: error.message });
            return null;
        }
        try {
            const audioBuffer = Buffer.from(base64Audio, "base64");
            // Validate audio size
            if (audioBuffer.length > this.maxAudioInputSize) {
                const sizeMB = (audioBuffer.length / (1024 * 1024)).toFixed(1);
                const maxMB = (this.maxAudioInputSize / (1024 * 1024)).toFixed(1);
                const message = `Audio input too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`;
                this.emit("error", new Error(message));
                // Fix: previously this failure was only emitted locally; notify
                // the client too, matching the other failure paths in this method.
                this.sendMessage({ type: "transcription_error", error: message });
                return null;
            }
            if (audioBuffer.length === 0) {
                // Deliberately a silent no-op toward the client: nothing to transcribe.
                this.emit("warning", "Received empty audio data");
                return null;
            }
            this.emit("audio_received", { size: audioBuffer.length, format });
            console.log(`Processing audio input: ${audioBuffer.length} bytes, format: ${format || "unknown"}`);
            const transcribedText = await this.transcribeAudio(audioBuffer);
            console.log(`Transcribed text: "${transcribedText}"`);
            if (!transcribedText.trim()) {
                this.emit("warning", "Transcription returned empty text");
                this.sendMessage({
                    type: "transcription_error",
                    error: "Whisper returned empty text",
                });
                return null;
            }
            return transcribedText;
        }
        catch (error) {
            console.error("Failed to process audio input:", error);
            this.emit("error", error);
            this.sendMessage({
                type: "transcription_error",
                error: `Transcription failed: ${error.message || String(error)}`,
            });
            return null;
        }
    }
}
exports.TranscriptionManager = TranscriptionManager;
//# sourceMappingURL=TranscriptionManager.js.map

1
dist/core/TranscriptionManager.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"TranscriptionManager.js","sourceRoot":"","sources":["../../src/core/TranscriptionManager.ts"],"names":[],"mappings":";;;AAAA,mCAAsC;AACtC,2BAGY;AACZ,oCAAkD;AAOlD;;;GAGG;AACH,MAAa,oBAAqB,SAAQ,qBAAY;IAC5C,kBAAkB,CAAsB;IACxC,iBAAiB,CAAS;IAElC,mDAAmD;IAC5C,WAAW,GAA+C,GAAG,EAAE,GAAE,CAAC,CAAC;IAE1E,YAAY,UAAuC,EAAE;QACnD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,CAAC;QACrD,IAAI,CAAC,iBAAiB;YACpB,OAAO,CAAC,iBAAiB,IAAI,8BAAsB,CAAC;IACxD,CAAC;IAED,IAAI,qBAAqB;QACvB,OAAO,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC;IACnC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,eAAe,CAAC,SAA8B;QAClD,IAAI,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,CAAC,GAAG,CACT,WAAW,SAAS,CAAC,UAAU,qCAAqC,CACrE,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAA,4BAAU,EAAC;gBAC9B,KAAK,EAAE,IAAI,CAAC,kBAAkB;gBAC9B,KAAK,EAAE,SAAS;aACjB,CAAC,CAAC;YAEH,OAAO,CAAC,GAAG,CACT,kCAAkC,MAAM,CAAC,IAAI,gBAAgB,MAAM,CAAC,QAAQ,IAAI,SAAS,EAAE,CAC5F,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE;gBACzB,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC1B,CAAC,CAAC;YAEH,sDAAsD;YACtD,IAAI,CAAC,WAAW,CAAC;gBACf,IAAI,EAAE,sBAAsB;gBAC5B,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC1B,CAAC,CAAC;YAEH,OAAO,MAAM,CAAC,IAAI,CAAC;QACrB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,+BAA+B,EAAE,KAAK,CAAC,CAAC;YACtD,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,iBAAiB,CACrB,WAAmB,EACnB,MAAe;QAEf,IAAI,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,KAAK,CACrB,oDAAoD,CACrD,CAAC;YACF,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC1B,IAAI,CAAC,WAAW,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YAC1D,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;YAEvD,sBAAsB;YACtB,IAAI,WAAW,CAAC,MAAM,GAAG,IAAI,CAAC,iBAAiB,EAAE,CAAC;gBAChD,MAAM,MAAM,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBAC/D,MAAM,KAAK,GAAG,CAAC,IA
AI,CAAC,iBAAiB,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBAClE,IAAI,CAAC,IAAI,CACP,OAAO,EACP,IAAI,KAAK,CACP,0BAA0B,MAAM,0BAA0B,KAAK,KAAK,CACrE,CACF,CAAC;gBACF,OAAO,IAAI,CAAC;YACd,CAAC;YAED,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC7B,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,2BAA2B,CAAC,CAAC;gBAClD,OAAO,IAAI,CAAC;YACd,CAAC;YAED,IAAI,CAAC,IAAI,CAAC,gBAAgB,EAAE,EAAE,IAAI,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;YAClE,OAAO,CAAC,GAAG,CACT,2BAA2B,WAAW,CAAC,MAAM,mBAAmB,MAAM,IAAI,SAAS,EAAE,CACtF,CAAC;YAEF,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,WAAW,CAAC,CAAC;YAChE,OAAO,CAAC,GAAG,CAAC,sBAAsB,eAAe,GAAG,CAAC,CAAC;YAEtD,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;gBAC5B,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,mCAAmC,CAAC,CAAC;gBAC1D,IAAI,CAAC,WAAW,CAAC;oBACf,IAAI,EAAE,qBAAqB;oBAC3B,KAAK,EAAE,6BAA6B;iBACrC,CAAC,CAAC;gBACH,OAAO,IAAI,CAAC;YACd,CAAC;YAED,OAAO,eAAe,CAAC;QACzB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,gCAAgC,EAAE,KAAK,CAAC,CAAC;YACvD,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC1B,IAAI,CAAC,WAAW,CAAC;gBACf,IAAI,EAAE,qBAAqB;gBAC3B,KAAK,EAAE,yBAA0B,KAAe,CAAC,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,EAAE;aAC5E,CAAC,CAAC;YACH,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;CACF;AA7HD,oDA6HC"}

35
dist/core/WebSocketManager.d.ts vendored Normal file
View File

@@ -0,0 +1,35 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
/**
 * Manages a single WebSocket connection lifecycle.
 * Handles connecting, attaching existing sockets, sending messages,
 * and clean disconnection.
 *
 * Extends EventEmitter; the implementation emits "connected",
 * "disconnected", "message" (parsed JSON), and "error".
 */
export declare class WebSocketManager extends EventEmitter {
    /** The managed socket, when one is connected or attached. */
    private socket?;
    /** Connection flag maintained by open/close handlers. */
    private _isConnected;
    /** True while the managed socket is considered connected. */
    get isConnected(): boolean;
    /** The underlying socket, or undefined when disconnected. */
    get currentSocket(): WebSocket | undefined;
    /**
     * Connect to a WebSocket server by URL.
     * Resolves once the socket opens; rejects on connection failure.
     */
    connect(url: string): Promise<void>;
    /**
     * Attach an existing WebSocket (server-side usage).
     */
    handleSocket(socket: WebSocket): void;
    /**
     * Send a JSON message via WebSocket if connected.
     * Gracefully handles send failures (e.g., socket closing mid-send).
     */
    send(message: Record<string, unknown>): void;
    /**
     * Disconnect and clean up the current socket.
     */
    disconnect(): void;
    /**
     * Attach internal event listeners on the current socket.
     */
    private attachListeners;
}
//# sourceMappingURL=WebSocketManager.d.ts.map

1
dist/core/WebSocketManager.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"WebSocketManager.d.ts","sourceRoot":"","sources":["../../src/core/WebSocketManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AAEtC;;;;GAIG;AACH,qBAAa,gBAAiB,SAAQ,YAAY;IAChD,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,YAAY,CAAS;IAE7B,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,aAAa,IAAI,SAAS,GAAG,SAAS,CAEzC;IAED;;OAEG;IACH,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA0BnC;;OAEG;IACH,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAYrC;;;OAGG;IACH,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI;IAgB5C;;OAEG;IACH,UAAU,IAAI,IAAI;IAmBlB;;OAEG;IACH,OAAO,CAAC,eAAe;CAuBxB"}

126
dist/core/WebSocketManager.js vendored Normal file
View File

@@ -0,0 +1,126 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.WebSocketManager = void 0;
const ws_1 = require("ws");
const events_1 = require("events");
/**
 * Manages a single WebSocket connection lifecycle.
 * Handles connecting, attaching existing sockets, sending messages,
 * and clean disconnection.
 *
 * Events emitted: "connected", "disconnected", "message" (parsed JSON),
 * "error".
 */
class WebSocketManager extends events_1.EventEmitter {
    socket;
    _isConnected = false;
    /** True while the managed socket is considered connected. */
    get isConnected() {
        return this._isConnected;
    }
    /** The underlying socket, or undefined when disconnected. */
    get currentSocket() {
        return this.socket;
    }
    /**
     * Connect to a WebSocket server by URL.
     *
     * Fix: if the connection fails before opening, the dead socket is now
     * torn down (listeners removed, state cleared) before the promise
     * rejects — previously it stayed attached as `this.socket`. The paired
     * once-handlers unhook each other, so a post-open socket error can no
     * longer invoke reject on an already-settled promise.
     */
    connect(url) {
        // Clean up any existing connection first
        if (this.socket) {
            this.disconnect();
        }
        return new Promise((resolve, reject) => {
            try {
                const socket = new ws_1.WebSocket(url);
                this.socket = socket;
                this.attachListeners();
                const onOpen = () => {
                    socket.off("error", onError);
                    this._isConnected = true;
                    this.emit("connected");
                    resolve();
                };
                const onError = (error) => {
                    socket.off("open", onOpen);
                    // Tear down the failed socket so it is not left attached.
                    if (this.socket === socket) {
                        this.disconnect();
                    }
                    reject(error);
                };
                socket.once("open", onOpen);
                socket.once("error", onError);
            }
            catch (error) {
                reject(error);
            }
        });
    }
    /**
     * Attach an existing WebSocket (server-side usage).
     * The socket is assumed to be already open.
     */
    handleSocket(socket) {
        // Clean up any existing connection first
        if (this.socket) {
            this.disconnect();
        }
        this.socket = socket;
        this._isConnected = true;
        this.attachListeners();
        this.emit("connected");
    }
    /**
     * Send a JSON message via WebSocket if connected.
     * Gracefully handles send failures (e.g., socket closing mid-send).
     */
    send(message) {
        if (!this.socket || !this._isConnected)
            return;
        try {
            if (this.socket.readyState === ws_1.WebSocket.OPEN) {
                this.socket.send(JSON.stringify(message));
            }
            else {
                console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
            }
        }
        catch (error) {
            // Socket may have closed between the readyState check and send()
            console.error("Failed to send WebSocket message:", error);
            this.emit("error", error);
        }
    }
    /**
     * Disconnect and clean up the current socket.
     * Safe to call repeatedly; a missing socket is a no-op.
     */
    disconnect() {
        if (!this.socket)
            return;
        try {
            this.socket.removeAllListeners();
            if (this.socket.readyState === ws_1.WebSocket.OPEN ||
                this.socket.readyState === ws_1.WebSocket.CONNECTING) {
                this.socket.close();
            }
        }
        catch {
            // Ignore close errors — socket may already be dead
        }
        this.socket = undefined;
        this._isConnected = false;
    }
    /**
     * Attach internal event listeners on the current socket.
     */
    attachListeners() {
        if (!this.socket)
            return;
        // Incoming frames are parsed as JSON; malformed frames surface as "error".
        this.socket.on("message", (data) => {
            try {
                const message = JSON.parse(data.toString());
                this.emit("message", message);
            }
            catch (err) {
                console.error("Failed to parse WebSocket message:", err);
                this.emit("error", err);
            }
        });
        this.socket.on("close", () => {
            this._isConnected = false;
            this.emit("disconnected");
        });
        this.socket.on("error", (error) => {
            console.error("WebSocket error:", error);
            this.emit("error", error);
        });
    }
}
exports.WebSocketManager = WebSocketManager;
//# sourceMappingURL=WebSocketManager.js.map

1
dist/core/WebSocketManager.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"WebSocketManager.js","sourceRoot":"","sources":["../../src/core/WebSocketManager.ts"],"names":[],"mappings":";;;AAAA,2BAA+B;AAC/B,mCAAsC;AAEtC;;;;GAIG;AACH,MAAa,gBAAiB,SAAQ,qBAAY;IACxC,MAAM,CAAa;IACnB,YAAY,GAAG,KAAK,CAAC;IAE7B,IAAI,WAAW;QACb,OAAO,IAAI,CAAC,YAAY,CAAC;IAC3B,CAAC;IAED,IAAI,aAAa;QACf,OAAO,IAAI,CAAC,MAAM,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,GAAW;QACjB,yCAAyC;QACzC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC;QAED,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,IAAI,CAAC;gBACH,IAAI,CAAC,MAAM,GAAG,IAAI,cAAS,CAAC,GAAG,CAAC,CAAC;gBACjC,IAAI,CAAC,eAAe,EAAE,CAAC;gBAEvB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE;oBAC5B,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;oBACzB,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;oBACvB,OAAO,EAAE,CAAC;gBACZ,CAAC,CAAC,CAAC;gBAEH,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;oBAClC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAChB,CAAC,CAAC,CAAC;YACL,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,CAAC,KAAK,CAAC,CAAC;YAChB,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,MAAiB;QAC5B,yCAAyC;QACzC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC;QAED,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QACzB,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACzB,CAAC;IAED;;;OAGG;IACH,IAAI,CAAC,OAAgC;QACnC,IAAI,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,YAAY;YAAE,OAAO;QAE/C,IAAI,CAAC;YACH,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU,KAAK,cAAS,CAAC,IAAI,EAAE,CAAC;gBAC9C,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;YAC5C,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,sCAAsC,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/E,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,iEAAiE;YACjE,OAAO,CAAC,KAAK,CAAC,mCAAmC,EAAE,KAAK,CAAC,CAAC;YAC1D,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,OAAO;QAEzB,IAAI,CAAC;YACH,IAAI,CAAC,MAAM,CAAC,kBAAkB,EAAE,CAAC;YACjC,IACE,IAAI,CAAC,MAAM,C
AAC,UAAU,KAAK,cAAS,CAAC,IAAI;gBACzC,IAAI,CAAC,MAAM,CAAC,UAAU,KAAK,cAAS,CAAC,UAAU,EAC/C,CAAC;gBACD,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;YACtB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,mDAAmD;QACrD,CAAC;QAED,IAAI,CAAC,MAAM,GAAG,SAAS,CAAC;QACxB,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,eAAe;QACrB,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,OAAO;QAEzB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,EAAE;YACjC,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAC5C,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAChC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,CAAC,KAAK,CAAC,oCAAoC,EAAE,GAAG,CAAC,CAAC;gBACzD,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YAC3B,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;YAC1B,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YAChC,OAAO,CAAC,KAAK,CAAC,kBAAkB,EAAE,KAAK,CAAC,CAAC;YACzC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5B,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AA5HD,4CA4HC"}

7
dist/core/index.d.ts vendored Normal file
View File

@@ -0,0 +1,7 @@
// Type declarations for the core barrel: re-exports the core managers,
// the stream-processing helpers, and their option/result types.
export { WebSocketManager } from "./WebSocketManager";
export { SpeechManager, type SpeechManagerOptions } from "./SpeechManager";
export { ConversationManager, type ConversationManagerOptions, } from "./ConversationManager";
export { TranscriptionManager, type TranscriptionManagerOptions, } from "./TranscriptionManager";
export { processFullStream, handleStreamChunk, type StreamResult, type StreamProcessorCallbacks, } from "./StreamProcessor";
export { InputQueue, type QueueItem } from "./InputQueue";
//# sourceMappingURL=index.d.ts.map

1
dist/core/index.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,KAAK,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAC3E,OAAO,EACL,mBAAmB,EACnB,KAAK,0BAA0B,GAChC,MAAM,uBAAuB,CAAC;AAC/B,OAAO,EACL,oBAAoB,EACpB,KAAK,2BAA2B,GACjC,MAAM,wBAAwB,CAAC;AAChC,OAAO,EACL,iBAAiB,EACjB,iBAAiB,EACjB,KAAK,YAAY,EACjB,KAAK,wBAAwB,GAC9B,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,UAAU,EAAE,KAAK,SAAS,EAAE,MAAM,cAAc,CAAC"}

17
dist/core/index.js vendored Normal file
View File

@@ -0,0 +1,17 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.InputQueue = exports.handleStreamChunk = exports.processFullStream = exports.TranscriptionManager = exports.ConversationManager = exports.SpeechManager = exports.WebSocketManager = void 0;
var WebSocketManager_1 = require("./WebSocketManager");
Object.defineProperty(exports, "WebSocketManager", { enumerable: true, get: function () { return WebSocketManager_1.WebSocketManager; } });
var SpeechManager_1 = require("./SpeechManager");
Object.defineProperty(exports, "SpeechManager", { enumerable: true, get: function () { return SpeechManager_1.SpeechManager; } });
var ConversationManager_1 = require("./ConversationManager");
Object.defineProperty(exports, "ConversationManager", { enumerable: true, get: function () { return ConversationManager_1.ConversationManager; } });
var TranscriptionManager_1 = require("./TranscriptionManager");
Object.defineProperty(exports, "TranscriptionManager", { enumerable: true, get: function () { return TranscriptionManager_1.TranscriptionManager; } });
var StreamProcessor_1 = require("./StreamProcessor");
Object.defineProperty(exports, "processFullStream", { enumerable: true, get: function () { return StreamProcessor_1.processFullStream; } });
Object.defineProperty(exports, "handleStreamChunk", { enumerable: true, get: function () { return StreamProcessor_1.handleStreamChunk; } });
var InputQueue_1 = require("./InputQueue");
Object.defineProperty(exports, "InputQueue", { enumerable: true, get: function () { return InputQueue_1.InputQueue; } });
//# sourceMappingURL=index.js.map

1
dist/core/index.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":";;;AAAA,uDAAsD;AAA7C,oHAAA,gBAAgB,OAAA;AACzB,iDAA2E;AAAlE,8GAAA,aAAa,OAAA;AACtB,6DAG+B;AAF7B,0HAAA,mBAAmB,OAAA;AAGrB,+DAGgC;AAF9B,4HAAA,oBAAoB,OAAA;AAGtB,qDAK2B;AAJzB,oHAAA,iBAAiB,OAAA;AACjB,oHAAA,iBAAiB,OAAA;AAInB,2CAA0D;AAAjD,wGAAA,UAAU,OAAA"}

4
dist/index.d.ts vendored Normal file
View File

@@ -0,0 +1,4 @@
// Public package entry point: the two agents plus the shared configuration
// types and their defaults.
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent.new";
export { VideoAgent, type VideoAgentOptions, type VideoFrame, type AudioData, type VideoAgentConfig, type FrameContext, type FrameTriggerReason, } from "./VideoAgent.new";
export { type SpeechChunk, type StreamingSpeechConfig, type HistoryConfig, type StopWhenCondition, DEFAULT_STREAMING_SPEECH_CONFIG, DEFAULT_HISTORY_CONFIG, DEFAULT_MAX_AUDIO_SIZE, } from "./types";
//# sourceMappingURL=index.d.ts.map

1
dist/index.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AACtE,OAAO,EACH,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,KAAK,kBAAkB,GAC1B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACH,KAAK,WAAW,EAChB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,+BAA+B,EAC/B,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,SAAS,CAAC"}

14
dist/index.js vendored Normal file
View File

@@ -0,0 +1,14 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = exports.VideoAgent = exports.VoiceAgent = void 0;
// Agents
var VoiceAgent_new_1 = require("./VoiceAgent.new");
Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_new_1.VoiceAgent; } });
var VideoAgent_new_1 = require("./VideoAgent.new");
Object.defineProperty(exports, "VideoAgent", { enumerable: true, get: function () { return VideoAgent_new_1.VideoAgent; } });
// Shared types
var types_1 = require("./types");
Object.defineProperty(exports, "DEFAULT_STREAMING_SPEECH_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_STREAMING_SPEECH_CONFIG; } });
Object.defineProperty(exports, "DEFAULT_HISTORY_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_HISTORY_CONFIG; } });
Object.defineProperty(exports, "DEFAULT_MAX_AUDIO_SIZE", { enumerable: true, get: function () { return types_1.DEFAULT_MAX_AUDIO_SIZE; } });
//# sourceMappingURL=index.js.map

1
dist/index.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,SAAS;AACT,mDAAsE;AAA7D,4GAAA,UAAU,OAAA;AACnB,mDAQ0B;AAPtB,4GAAA,UAAU,OAAA;AASd,eAAe;AACf,iCAQiB;AAHb,wHAAA,+BAA+B,OAAA;AAC/B,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA"}

46
dist/types.d.ts vendored Normal file
View File

@@ -0,0 +1,46 @@
import type { streamText } from "ai";
/**
 * Represents a chunk of text to be converted to speech
 */
export interface SpeechChunk {
    /** Sequence number of this chunk. */
    id: number;
    /** Text to synthesize for this chunk. */
    text: string;
    /** Optional in-flight TTS result; resolves to audio bytes or null. */
    audioPromise?: Promise<Uint8Array | null>;
}
/**
 * Configuration for streaming speech behavior
 */
export interface StreamingSpeechConfig {
    /** Minimum characters before generating speech for a chunk */
    minChunkSize: number;
    /** Maximum characters per chunk (will split at sentence boundary before this) */
    maxChunkSize: number;
    /** Whether to enable parallel TTS generation */
    parallelGeneration: boolean;
    /** Maximum number of parallel TTS requests */
    maxParallelRequests: number;
}
/**
 * Configuration for conversation history memory management
 */
export interface HistoryConfig {
    /** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
    maxMessages: number;
    /** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
    maxTotalChars: number;
}
/**
 * Default streaming speech configuration
 */
export declare const DEFAULT_STREAMING_SPEECH_CONFIG: StreamingSpeechConfig;
/**
 * Default history configuration
 */
export declare const DEFAULT_HISTORY_CONFIG: HistoryConfig;
/** Default maximum audio input size (10 MB) */
export declare const DEFAULT_MAX_AUDIO_SIZE: number;
/**
 * Default stop condition type from streamText.
 * Derived from the `stopWhen` option of the AI SDK's `streamText` so it
 * stays in sync with the installed SDK version.
 */
export type StopWhenCondition = NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
//# sourceMappingURL=types.d.ts.map

1
dist/types.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAErC;;GAEG;AACH,MAAM,WAAW,WAAW;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IAClC,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8CAA8C;IAC9C,mBAAmB,EAAE,MAAM,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC1B,yHAAyH;IACzH,WAAW,EAAE,MAAM,CAAC;IACpB,6HAA6H;IAC7H,aAAa,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,eAAO,MAAM,+BAA+B,EAAE,qBAK7C,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,sBAAsB,EAAE,aAGpC,CAAC;AAEF,+CAA+C;AAC/C,eAAO,MAAM,sBAAsB,QAAmB,CAAC;AAEvD;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC"}

22
dist/types.js vendored Normal file
View File

@@ -0,0 +1,22 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = void 0;
/**
* Default streaming speech configuration
*/
exports.DEFAULT_STREAMING_SPEECH_CONFIG = {
minChunkSize: 50,
maxChunkSize: 200,
parallelGeneration: true,
maxParallelRequests: 3,
};
/**
* Default history configuration
*/
exports.DEFAULT_HISTORY_CONFIG = {
maxMessages: 100,
maxTotalChars: 0, // unlimited by default
};
/** Default maximum audio input size (10 MB) */
exports.DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
//# sourceMappingURL=types.js.map

1
dist/types.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";;;AAmCA;;GAEG;AACU,QAAA,+BAA+B,GAA0B;IAClE,YAAY,EAAE,EAAE;IAChB,YAAY,EAAE,GAAG;IACjB,kBAAkB,EAAE,IAAI;IACxB,mBAAmB,EAAE,CAAC;CACzB,CAAC;AAEF;;GAEG;AACU,QAAA,sBAAsB,GAAkB;IACjD,WAAW,EAAE,GAAG;IAChB,aAAa,EAAE,CAAC,EAAE,uBAAuB;CAC5C,CAAC;AAEF,+CAA+C;AAClC,QAAA,sBAAsB,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC"}

1
dist/utils/StreamBuffer.d.ts vendored Normal file
View File

@@ -0,0 +1 @@
//# sourceMappingURL=StreamBuffer.d.ts.map

1
dist/utils/StreamBuffer.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"StreamBuffer.d.ts","sourceRoot":"","sources":["../../src/utils/StreamBuffer.ts"],"names":[],"mappings":""}

2
dist/utils/StreamBuffer.js vendored Normal file
View File

@@ -0,0 +1,2 @@
"use strict";
//# sourceMappingURL=StreamBuffer.js.map

1
dist/utils/StreamBuffer.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"StreamBuffer.js","sourceRoot":"","sources":["../../src/utils/StreamBuffer.ts"],"names":[],"mappings":""}

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

View File

@@ -2,12 +2,12 @@ const http = require('http');
const fs = require('fs');
const path = require('path');
const PORT = 3000;
const PORT = 3102;
// Create a simple HTTP server to serve the voice client HTML
const server = http.createServer((req, res) => {
if (req.url === '/' || req.url === '/index.html') {
const htmlPath = path.join(__dirname, 'voice-client.html');
const htmlPath = path.join(__dirname, 'video-client.html');
fs.readFile(htmlPath, (err, data) => {
if (err) {
res.writeHead(500);

998
example/video-client.html Normal file
View File

@@ -0,0 +1,998 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Video + Voice Agent Client</title>
<style>
body {
font-family: system-ui, sans-serif;
max-width: 1000px;
margin: 20px auto;
padding: 0 16px;
background: #f9fafb;
color: #111827;
}
h1 {
margin-bottom: 8px;
}
.subtitle {
color: #6b7280;
font-size: 0.95rem;
margin-bottom: 24px;
}
.card {
background: white;
border: 1px solid #e5e7eb;
border-radius: 12px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
}
.row {
display: flex;
flex-wrap: wrap;
gap: 12px;
align-items: center;
margin-bottom: 16px;
}
video {
width: 100%;
max-width: 520px;
border-radius: 10px;
background: #000;
aspect-ratio: 4 / 3;
}
button {
padding: 10px 16px;
border-radius: 8px;
border: 1px solid #d1d5db;
background: white;
cursor: pointer;
font-weight: 500;
}
button.primary {
background: #2563eb;
color: white;
border-color: #2563eb;
}
button.danger {
background: #dc2626;
color: white;
border-color: #dc2626;
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.status {
font-weight: 600;
margin: 8px 0;
font-size: 0.95rem;
}
.dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 50%;
margin-right: 8px;
}
.dot.disconnected {
background: #9ca3af;
}
.dot.connected {
background: #22c55e;
}
.dot.listening {
background: #f59e0b;
animation: pulse 1.5s infinite;
}
.dot.speaking {
background: #3b82f6;
animation: pulse 1.2s infinite;
}
@keyframes pulse {
0%,
100% {
opacity: 1
}
50% {
opacity: 0.6
}
}
#transcript,
#assistant,
#reasoning,
#tools {
min-height: 48px;
padding: 12px;
border-radius: 8px;
background: #f3f4f6;
border-left: 4px solid #9ca3af;
margin-bottom: 16px;
white-space: pre-wrap;
}
#transcript {
border-left-color: #2563eb;
}
#assistant {
border-left-color: #22c55e;
}
#reasoning {
border-left-color: #f59e0b;
font-style: italic;
color: #4b5563;
}
#tools {
border-left-color: #8b5cf6;
font-size: 0.9rem;
}
#log {
background: #0f172a;
color: #e2e8f0;
font-family: 'SF Mono', monospace;
font-size: 0.82rem;
padding: 12px;
border-radius: 8px;
max-height: 240px;
overflow-y: auto;
white-space: pre-wrap;
}
.hidden {
display: none;
}
/* ── Mic selector & level meter ── */
#micRow {
margin-bottom: 12px;
}
#micSelect {
flex: 1;
min-width: 180px;
padding: 6px 8px;
border-radius: 6px;
border: 1px solid #d1d5db;
}
#refreshMicsBtn {
padding: 6px 12px;
font-size: 0.85rem;
}
.meter-wrap {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 12px;
}
.meter-wrap label {
font-size: 0.85rem;
white-space: nowrap;
}
#levelMeter {
flex: 1;
height: 14px;
border-radius: 7px;
background: #e5e7eb;
overflow: hidden;
}
#levelBar {
height: 100%;
width: 0%;
border-radius: 7px;
background: #22c55e;
transition: width 60ms linear;
}
#levelBar.hot {
background: #ef4444;
}
#rmsValue {
font-family: monospace;
font-size: 0.8rem;
width: 56px;
text-align: right;
}
/* ── Push-to-talk ── */
#pttBtn {
padding: 10px 20px;
font-size: 1rem;
font-weight: 600;
border-radius: 10px;
border: 2px solid #2563eb;
background: #eff6ff;
color: #2563eb;
cursor: pointer;
user-select: none;
touch-action: none;
}
#pttBtn:active,
#pttBtn.active {
background: #dc2626;
color: white;
border-color: #dc2626;
}
#pttBtn:disabled {
opacity: 0.4;
cursor: not-allowed;
}
</style>
</head>
<body>
<h1>📹 Video + Voice Agent</h1>
<p class="subtitle">Webcam + microphone → multimodal AI (vision + speech)</p>
<div class="card">
<video id="localVideo" autoplay playsinline muted></video>
<canvas id="frameCanvas" style="display:none"></canvas>
<div class="row" style="margin-top:16px">
<input type="text" id="wsEndpoint" value="ws://localhost:8081" style="flex:1; min-width:260px" />
<button id="connectBtn" class="primary">Connect</button>
<button id="disconnectBtn" disabled>Disconnect</button>
</div>
<!-- ── Mic selector ── -->
<div class="row" id="micRow">
<label>Microphone:</label>
<select id="micSelect">
<option value="">-- click Refresh --</option>
</select>
<button id="refreshMicsBtn">🔄 Refresh</button>
</div>
<!-- ── Live level meter ── -->
<div class="meter-wrap">
<label>Mic level:</label>
<div id="levelMeter">
<div id="levelBar"></div>
</div>
<span id="rmsValue">0.000</span>
</div>
<div class="row">
<label>Input mode:</label>
<select id="inputMode">
<option value="browser-stt">Browser STT</option>
<option value="server-whisper">Server Whisper (VAD)</option>
<option value="push-to-talk" selected>Push-to-Talk</option>
</select>
<label>Frames:</label>
<select id="frameInterval">
<option value="3000">every 3s</option>
<option value="5000" selected>every 5s</option>
<option value="10000">every 10s</option>
<option value="0">manual only</option>
</select>
</div>
<div class="row">
<button id="startMediaBtn" disabled>📹🎤 Start Camera + Mic</button>
<button id="stopMediaBtn" disabled>⏹ Stop</button>
<button id="captureBtn" disabled>Capture Frame Now</button>
<button id="pttBtn" disabled>🎙 Hold to Talk</button>
<button id="interruptBtn" class="danger" disabled>✋ Interrupt</button>
</div>
<div class="status" id="status">
<span class="dot disconnected"></span>Disconnected
</div>
</div>
<h3>👤 You said</h3>
<div id="transcript"></div>
<h3>🤖 Assistant</h3>
<div id="assistant"></div>
<div id="reasoningSection" class="hidden">
<h3>💭 Reasoning</h3>
<div id="reasoning"></div>
</div>
<div id="toolsSection" class="hidden">
<h3>🛠️ Tools</h3>
<div id="tools"></div>
</div>
<h3>📜 Log</h3>
<div id="log"></div>
<script>
// ────────────────────────────────────────────────────────────────
// State & Elements
// ────────────────────────────────────────────────────────────────
const els = {
wsEndpoint: document.getElementById('wsEndpoint'),
connectBtn: document.getElementById('connectBtn'),
disconnectBtn: document.getElementById('disconnectBtn'),
inputMode: document.getElementById('inputMode'),
frameInterval: document.getElementById('frameInterval'),
startMediaBtn: document.getElementById('startMediaBtn'),
stopMediaBtn: document.getElementById('stopMediaBtn'),
captureBtn: document.getElementById('captureBtn'),
pttBtn: document.getElementById('pttBtn'),
interruptBtn: document.getElementById('interruptBtn'),
status: document.getElementById('status'),
transcript: document.getElementById('transcript'),
assistant: document.getElementById('assistant'),
reasoningSec: document.getElementById('reasoningSection'),
reasoning: document.getElementById('reasoning'),
toolsSec: document.getElementById('toolsSection'),
tools: document.getElementById('tools'),
log: document.getElementById('log'),
video: document.getElementById('localVideo'),
canvas: document.getElementById('frameCanvas'),
micSelect: document.getElementById('micSelect'),
refreshMicsBtn: document.getElementById('refreshMicsBtn'),
levelBar: document.getElementById('levelBar'),
rmsValue: document.getElementById('rmsValue'),
};
let ws = null;
let localStream = null;
let audioOnlyStream = null; // ← ADD THIS
let mediaRecorder = null;
let audioChunks = [];
let frameTimer = null;
let audioQueue = [];
let isPlaying = false;
let currentSource = null;
// Level-meter / VAD audio nodes (use browser-native sample rate)
let meterCtx = null; // AudioContext for the meter (always running when media is on)
let meterAnalyser = null;
let meterSource = null;
let meterRafId = null;
// VAD-specific
let silenceStart = null;
let recordingStartTime = null;
const SPEECH_THRESHOLD = 0.015;
const SILENCE_THRESHOLD = 0.008;
const SILENCE_DURATION = 1400; // ms
const MIN_RECORDING_TIME = 600; // ms
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
let recognition = null;
// ────────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────────
// Append a timestamped line to the on-page log panel, keeping it scrolled
// to the newest entry.
function log(...args) {
  const stamp = new Date().toLocaleTimeString([], { hour12: false });
  els.log.textContent += '[' + stamp + '] ' + args.join(' ') + '\n';
  els.log.scrollTop = els.log.scrollHeight;
}
// Render the status line with a coloured state dot ('disconnected',
// 'connected', 'listening', 'speaking' — styled in the CSS above).
function setStatus(text, state = 'disconnected') {
  els.status.innerHTML = `<span class="dot ${state}"></span>${text}`;
}
// Shared helper: flip the disabled flag on each named button that exists.
function setButtonsDisabled(names, value) {
  for (const name of names) {
    const btn = els[name];
    if (btn) btn.disabled = value;
  }
}
function enable(...btns) { setButtonsDisabled(btns, false); }
function disable(...btns) { setButtonsDisabled(btns, true); }
// Clear the assistant output panes before a new response starts streaming.
function resetUI() {
  for (const key of ['assistant', 'reasoning', 'tools']) {
    els[key].textContent = '';
  }
  els.reasoningSec.classList.add('hidden');
  els.toolsSec.classList.add('hidden');
}
// ────────────────────────────────────────────────────────────────
// Mic enumeration
// ────────────────────────────────────────────────────────────────
// Populate the microphone <select> with the user's audio input devices.
// A throwaway getUserMedia call is made first because browsers only expose
// meaningful device labels after microphone permission has been granted.
async function refreshMics() {
  try {
    // Need a temporary stream to get labelled device list
    const tmp = await navigator.mediaDevices.getUserMedia({ audio: true });
    tmp.getTracks().forEach(t => t.stop());
    const devices = await navigator.mediaDevices.enumerateDevices();
    const mics = devices.filter(d => d.kind === 'audioinput');
    els.micSelect.innerHTML = '';
    mics.forEach((m, i) => {
      const opt = document.createElement('option');
      opt.value = m.deviceId;
      // Fall back to a generic name when the label is empty.
      opt.textContent = m.label || `Microphone ${i + 1}`;
      els.micSelect.appendChild(opt);
    });
    log(`Found ${mics.length} microphone(s)`);
  } catch (err) {
    // Permission denied or no devices — leave the select as-is and log.
    log('Mic enumeration failed:', err.message);
  }
}
els.refreshMicsBtn.onclick = refreshMics;
// Auto-populate on page load
refreshMics();
// ────────────────────────────────────────────────────────────────
// Live audio level meter (always-on when media is active)
// Uses AnalyserNode + requestAnimationFrame — no ScriptProcessorNode needed.
// ────────────────────────────────────────────────────────────────
// Start the live input-level meter for the given MediaStream.
// Reads time-domain samples from an AnalyserNode once per animation frame,
// computes RMS, updates the meter UI, and (in server-whisper mode) feeds the
// RMS value into the VAD state machine via vadTick().
function startLevelMeter(stream) {
  // Use the browser's native sample rate (NO custom sampleRate!)
  meterCtx = new (window.AudioContext || window.webkitAudioContext)();
  meterSource = meterCtx.createMediaStreamSource(stream);
  meterAnalyser = meterCtx.createAnalyser();
  meterAnalyser.fftSize = 1024;
  meterSource.connect(meterAnalyser);
  // Do NOT connect to destination — we don't want to hear ourselves
  const buf = new Float32Array(meterAnalyser.fftSize);
  function tick() {
    meterAnalyser.getFloatTimeDomainData(buf);
    let sum = 0;
    for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
    const rms = Math.sqrt(sum / buf.length);
    // Update UI
    const pct = Math.min(rms / 0.15, 1) * 100; // 0.15 is "loud"
    els.levelBar.style.width = pct + '%';
    els.levelBar.classList.toggle('hot', rms > SPEECH_THRESHOLD);
    els.rmsValue.textContent = rms.toFixed(4);
    // If VAD mode is active, drive it from here
    if (els.inputMode.value === 'server-whisper') {
      vadTick(rms);
    }
    meterRafId = requestAnimationFrame(tick);
  }
  tick();
  log(`Level meter started (sampleRate=${meterCtx.sampleRate})`);
}
// Tear down the meter: cancel the rAF loop, disconnect the audio graph,
// close the AudioContext, and reset the meter UI to zero.
function stopLevelMeter() {
  if (meterRafId) { cancelAnimationFrame(meterRafId); meterRafId = null; }
  if (meterSource) { meterSource.disconnect(); meterSource = null; }
  if (meterAnalyser) { meterAnalyser.disconnect(); meterAnalyser = null; }
  if (meterCtx) { meterCtx.close(); meterCtx = null; }
  els.levelBar.style.width = '0%';
  els.rmsValue.textContent = '0.000';
}
// ────────────────────────────────────────────────────────────────
// Frame capture & send
// ────────────────────────────────────────────────────────────────
// Grab the current webcam frame, encode it as WebP, and send it to the
// server as a 'video_frame' message. No-op until the camera is running
// or when the socket is not open.
function captureFrame(reason = 'timer') {
  const { video, canvas } = els;
  if (!video.videoWidth) return; // camera not started yet
  canvas.width = video.videoWidth;
  canvas.height = video.videoHeight;
  canvas.getContext('2d').drawImage(video, 0, 0);
  const base64 = canvas.toDataURL('image/webp', 0.78).split(',')[1];
  if (ws?.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({
      type: 'video_frame',
      sessionId: 'client-main',
      sequence: Date.now(),
      timestamp: Date.now(),
      triggerReason: reason,
      image: {
        data: base64,
        format: 'webp',
        width: canvas.width,
        height: canvas.height
      }
    }));
    log(`Frame sent (${(base64.length / 1000).toFixed(1)} kB) — ${reason}`);
  }
}
// ────────────────────────────────────────────────────────────────
// Audio playback queue
// ────────────────────────────────────────────────────────────────
// Play the next queued audio chunk, serialised so chunks never overlap.
// Each chunk gets its own short-lived AudioContext; when playback ends the
// context is closed and playNext() is re-invoked to drain the queue.
async function playNext() {
  if (isPlaying || audioQueue.length === 0) return;
  isPlaying = true;
  const { bytes, format } = audioQueue.shift();
  try {
    const ctx = new (window.AudioContext || window.webkitAudioContext)();
    // decodeAudioData wants a standalone ArrayBuffer, so slice out exactly
    // the region of the underlying buffer that this Uint8Array views.
    const buffer = await ctx.decodeAudioData(
      bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.length)
    );
    const source = ctx.createBufferSource();
    source.buffer = buffer;
    source.connect(ctx.destination);
    // Track the live source so interrupt() can stop it mid-playback.
    currentSource = source;
    source.onended = () => {
      currentSource = null;
      isPlaying = false;
      ctx.close();
      playNext();
    };
    source.start(0);
    log(`Playing audio chunk (${bytes.length} bytes, ${format})`);
  } catch (err) {
    // Decode failure: drop this chunk and keep draining the queue.
    console.error('Audio decode/play error:', err);
    isPlaying = false;
    playNext();
  }
}
// ────────────────────────────────────────────────────────────────
// WebSocket
// ────────────────────────────────────────────────────────────────
// Open a WebSocket to the endpoint typed in the UI and wire up its
// lifecycle handlers. Buttons are enabled/disabled to match connection state.
function connect() {
  const url = els.wsEndpoint.value.trim();
  if (!url) return log('No endpoint');
  setStatus('Connecting...', 'disconnected');
  ws = new WebSocket(url);
  ws.onopen = () => {
    setStatus('Connected', 'connected');
    enable('startMediaBtn', 'interruptBtn', 'captureBtn');
    disable('connectBtn');
    enable('disconnectBtn');
    log(`Connected to ${url}`);
  };
  ws.onclose = () => {
    // Fires for both clean and unexpected closes; tear down media either way.
    setStatus('Disconnected', 'disconnected');
    disable('startMediaBtn', 'stopMediaBtn', 'captureBtn', 'interruptBtn', 'pttBtn');
    enable('connectBtn');
    disable('disconnectBtn');
    stopAllMedia();
    log('Disconnected');
    ws = null;
  };
  ws.onerror = (e) => {
    log('WebSocket error', e);
    setStatus('Error', 'disconnected');
  };
  ws.onmessage = (event) => {
    // All server traffic is JSON; malformed payloads are logged, not thrown.
    try {
      const msg = JSON.parse(event.data);
      handleMessage(msg);
    } catch (err) {
      log('Parse error:', err);
    }
  };
}
// Close the socket (onclose does the UI/media cleanup) and stop local media.
function disconnect() {
  if (ws) ws.close();
  stopAllMedia();
}
// ────────────────────────────────────────────────────────────────
// Media (camera + mic)
// ────────────────────────────────────────────────────────────────
// Acquire camera + microphone, start the level meter, schedule periodic
// frame captures, and initialise the selected input mode.
async function startMedia() {
  try {
    // Honour the mic chosen in the dropdown, else let the browser pick.
    const audioConstraint = els.micSelect.value
      ? { deviceId: { exact: els.micSelect.value } }
      : true;
    localStream = await navigator.mediaDevices.getUserMedia({
      video: { width: { ideal: 640 }, height: { ideal: 480 } },
      audio: audioConstraint,
    });
    // Audio-only view of the stream — MediaRecorder records from this so
    // video tracks are not included in the uploaded audio.
    audioOnlyStream = new MediaStream(localStream.getAudioTracks()); // ← ADD THIS
    // Log which mic was actually selected
    const audioTrack = localStream.getAudioTracks()[0];
    log(`Mic active: "${audioTrack?.label || 'unknown'}"`);
    els.video.srcObject = localStream;
    await els.video.play();
    enable('stopMediaBtn', 'pttBtn');
    disable('startMediaBtn');
    // Start the always-on level meter
    startLevelMeter(localStream);
    // Periodic frames
    const intervalMs = Number(els.frameInterval.value);
    if (intervalMs > 0) {
      frameTimer = setInterval(() => captureFrame('timer'), intervalMs);
      log(`Frame capture every ${intervalMs / 1000}s`);
    }
    // Start the selected input mode
    const mode = els.inputMode.value;
    if (mode === 'browser-stt') {
      startBrowserSTT();
    }
    // VAD and push-to-talk don't need extra init — they're driven by
    // the level-meter tick and button events respectively.
    setStatus('Listening...', 'listening');
    log(`Camera + Mic started, input mode: ${mode}`);
  } catch (err) {
    log('getUserMedia failed:', err.message);
  }
}
// Stop every media-related resource: frame timer, level meter, tracks,
// recorder, and speech recognition; then restore the idle UI state.
function stopAllMedia() {
  if (frameTimer) { clearInterval(frameTimer); frameTimer = null; }
  stopLevelMeter();
  if (localStream) {
    localStream.getTracks().forEach(t => t.stop());
    audioOnlyStream = null;
    localStream = null;
  }
  els.video.srcObject = null;
  if (mediaRecorder?.state === 'recording') mediaRecorder.stop();
  mediaRecorder = null;
  if (recognition) recognition.stop();
  recognition = null;
  silenceStart = null;
  recordingStartTime = null;
  audioChunks = [];
  disable('stopMediaBtn', 'pttBtn');
  enable('startMediaBtn');
  setStatus('Connected', 'connected');
  log('Media stopped');
}
// ────────────────────────────────────────────────────────────────
// Shared: record a segment from localStream and send it
// ────────────────────────────────────────────────────────────────
// Pick the first MediaRecorder MIME type this browser supports, preferring
// Opus-in-WebM. An empty string tells MediaRecorder to use its default.
function chosenMimeType() {
  const candidates = [
    'audio/webm;codecs=opus',
    'audio/webm',
    'audio/ogg;codecs=opus',
    'audio/mp4',
  ];
  return candidates.find(mt => MediaRecorder.isTypeSupported(mt)) ?? '';
}
// Begin recording from the audio-only stream. When the recorder later stops,
// the collected chunks are assembled into a Blob, base64-encoded, and sent
// to the server as an 'audio' message. Shared by VAD and push-to-talk.
function startRecording() {
  if (mediaRecorder?.state === 'recording') return;
  if (!audioOnlyStream) { log('No audio stream!'); return; }
  audioChunks = [];
  recordingStartTime = Date.now();
  silenceStart = null;
  const mimeType = chosenMimeType();
  const opts = mimeType ? { mimeType } : undefined;
  mediaRecorder = new MediaRecorder(audioOnlyStream, opts);
  mediaRecorder.ondataavailable = e => {
    if (e.data.size > 0) audioChunks.push(e.data);
  };
  mediaRecorder.onstop = async () => {
    const usedMime = mediaRecorder?.mimeType || mimeType || 'audio/webm';
    if (audioChunks.length === 0) {
      log('No audio chunks recorded');
      setStatus('Listening...', 'listening');
      return;
    }
    const blob = new Blob(audioChunks, { type: usedMime });
    // Tiny blobs are almost certainly silence/noise — don't bother uploading.
    if (blob.size < 800) {
      log(`Audio too short (${blob.size} bytes), skipping`);
      setStatus('Listening...', 'listening');
      return;
    }
    const arrayBuffer = await blob.arrayBuffer();
    // Manual byte-by-byte base64: avoids the spread/apply argument limit
    // that String.fromCharCode(...bigArray) would hit on long recordings.
    const base64 = btoa(
      new Uint8Array(arrayBuffer).reduce((d, b) => d + String.fromCharCode(b), '')
    );
    if (ws?.readyState === WebSocket.OPEN) {
      ws.send(JSON.stringify({ type: 'audio', data: base64, format: usedMime }));
      log(`Sent audio (${(base64.length / 1000).toFixed(1)} kB, ${usedMime})`);
      els.transcript.textContent = 'Transcribing...';
    } else {
      log('WS not connected, audio dropped');
    }
    setStatus('Listening...', 'listening');
  };
  mediaRecorder.start(100); // timeslice 100ms
  setStatus('🔴 Recording...', 'speaking');
  log('Recording started');
}
// Stop an in-progress recording; the onstop handler above does the send.
function stopRecording() {
  if (mediaRecorder?.state === 'recording') {
    mediaRecorder.stop();
    silenceStart = null;
    recordingStartTime = null;
    setStatus('Processing...', 'connected');
    log('Recording stopped, sending...');
  }
}
// ────────────────────────────────────────────────────────────────
// VAD (driven from the level-meter rAF loop)
// ────────────────────────────────────────────────────────────────
// Voice-activity detection, called once per animation frame with the
// current RMS level. Loud input starts (or keeps) a recording; sustained
// quiet input stops it — provided the take isn't suspiciously short.
function vadTick(rms) {
  const recording = mediaRecorder?.state === 'recording';
  if (rms > SPEECH_THRESHOLD) {
    // Speech: cancel any pending silence window, ensure we're recording.
    silenceStart = null;
    if (!recording) startRecording();
    return;
  }
  if (rms >= SILENCE_THRESHOLD || !recording) return;
  const now = Date.now();
  if (!silenceStart) {
    silenceStart = now; // silence window opens
    return;
  }
  const silentLongEnough = now - silenceStart > SILENCE_DURATION;
  const recordedLongEnough =
    recordingStartTime && now - recordingStartTime > MIN_RECORDING_TIME;
  if (silentLongEnough && recordedLongEnough) {
    log('Silence → stopping');
    stopRecording();
  }
}
// ────────────────────────────────────────────────────────────────
// Push-to-Talk
// ────────────────────────────────────────────────────────────────
// Push-to-talk: pointer, touch, and spacebar input all funnel into this pair.
function pttDown() {
  if (!localStream) return; // need an active mic first
  els.pttBtn.classList.add('active');
  startRecording();
}
function pttUp() {
  els.pttBtn.classList.remove('active');
  stopRecording();
}
// Pointer and touch bindings.
els.pttBtn.addEventListener('mousedown', pttDown);
for (const ev of ['mouseup', 'mouseleave']) {
  els.pttBtn.addEventListener(ev, pttUp);
}
els.pttBtn.addEventListener('touchstart', e => { e.preventDefault(); pttDown(); });
els.pttBtn.addEventListener('touchend', e => { e.preventDefault(); pttUp(); });
// Spacebar acts as push-to-talk, but only in push-to-talk mode, with media
// running, and never while typing in a form control.
let spaceHeld = false;
const spaceEligible = e =>
  e.code === 'Space'
  && els.inputMode.value === 'push-to-talk'
  && localStream
  && !e.target.matches('input, textarea, select');
document.addEventListener('keydown', e => {
  if (spaceHeld || !spaceEligible(e)) return;
  e.preventDefault();
  spaceHeld = true;
  pttDown();
});
document.addEventListener('keyup', e => {
  if (e.code !== 'Space' || !spaceHeld) return;
  e.preventDefault();
  spaceHeld = false;
  pttUp();
});
// ────────────────────────────────────────────────────────────────
// Browser STT
// ────────────────────────────────────────────────────────────────
// Continuous in-browser speech recognition (Web Speech API). Shows interim
// text live and sends the transcript to the server when the *latest* result
// becomes final.
function startBrowserSTT() {
  if (!SpeechRecognition) { log('Web Speech API not supported'); return; }
  recognition = new SpeechRecognition();
  recognition.continuous = true;
  recognition.interimResults = true;
  recognition.lang = 'en-US';
  recognition.onresult = e => {
    const transcript = Array.from(e.results).map(r => r[0].transcript).join('');
    els.transcript.textContent = transcript;
    // BUG FIX: with continuous recognition, results accumulate across the
    // session, so e.results[0].isFinal stays true forever after the first
    // utterance — which re-sent the transcript on every interim event.
    // Check the newest result instead.
    const latest = e.results[e.results.length - 1];
    if (latest.isFinal) sendTranscript(transcript);
  };
  recognition.onerror = e => log('STT error:', e.error);
  recognition.start();
  log('Browser STT started');
}
// ────────────────────────────────────────────────────────────────
// Sending transcript / interrupt
// ────────────────────────────────────────────────────────────────
// Forward a finished transcript to the server and clear the output panes
// so the next response streams into a fresh UI.
function sendTranscript(text) {
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
  ws.send(JSON.stringify({ type: 'transcript', text }));
  log(`Sent transcript: ${text}`);
  resetUI();
}
// Tell the server to stop generating, and silence any local playback:
// drop the queue, stop the live source, and mark the player idle.
function interrupt() {
  if (ws?.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({ type: 'interrupt', reason: 'user_button' }));
    log('Interrupt sent');
  }
  audioQueue = [];
  if (currentSource) {
    currentSource.stop();
    currentSource = null;
  }
  isPlaying = false;
}
// ────────────────────────────────────────────────────────────────
// Server → Client messages
// ────────────────────────────────────────────────────────────────
// Dispatch a parsed server → client message to the matching UI handler.
function handleMessage(msg) {
  switch (msg.type) {
    case 'transcription_result':
      els.transcript.textContent = msg.text || '(empty)';
      log(`Transcription: ${msg.text}`);
      break;
    case 'text_delta':
      els.assistant.textContent += msg.text || '';
      break;
    case 'reasoning_delta':
      els.reasoningSec.classList.remove('hidden');
      els.reasoning.textContent += msg.text || '';
      break;
    case 'tool_call':
    case 'tool_result': {
      els.toolsSec.classList.remove('hidden');
      // SECURITY FIX: build the entry via textContent instead of
      // innerHTML += — tool names and results come from the server and
      // could contain markup, which the old code would have injected.
      const entry = document.createElement('div');
      entry.textContent =
        `${msg.type}: ${msg.toolName || '?'}${JSON.stringify(msg.result || msg.input || {})}`;
      els.tools.appendChild(entry);
      break;
    }
    case 'audio_chunk':
    case 'audio': {
      // Braces scope this lexical declaration to the case (avoids a
      // switch-wide `const` binding).
      const bytes = Uint8Array.from(atob(msg.data), c => c.charCodeAt(0));
      audioQueue.push({ bytes, format: msg.format || 'mp3' });
      playNext();
      break;
    }
    case 'speech_interrupted':
      audioQueue = [];
      if (currentSource) currentSource.stop();
      isPlaying = false;
      log(`Speech interrupted: ${msg.reason || '?'}`);
      break;
    case 'response_complete':
      log('Response complete');
      break;
    case 'capture_frame':
      log(`Server requested frame: ${msg.reason}`);
      captureFrame(msg.reason || 'server_request');
      break;
    case 'frame_ack':
      break; // silent
    case 'session_init':
      log(`Session: ${msg.sessionId}`);
      break;
    case 'stream_start':
      resetUI();
      break;
    case 'stream_finish':
      log(`Stream finished: ${msg.finishReason}`);
      break;
    case 'speech_stream_start':
      break;
    case 'speech_stream_end':
      log('Speech done');
      break;
    case 'error':
      log(`ERROR: ${msg.error}`);
      console.error('Server error:', msg.error);
      break;
    case 'transcription_error':
      log(`Transcription error: ${msg.error}`);
      els.transcript.textContent = `Error: ${msg.error}`;
      break;
    default:
      if (msg.type?.includes('stream') || msg.type?.includes('step')) {
        // verbose stream events — log quietly
      } else {
        log(`[${msg.type}]`);
      }
  }
}
// ────────────────────────────────────────────────────────────────
// Event listeners
// ────────────────────────────────────────────────────────────────
// Wire up the UI controls in one place.
const clickHandlers = {
  connectBtn: connect,
  disconnectBtn: disconnect,
  startMediaBtn: startMedia,
  stopMediaBtn: stopAllMedia,
  captureBtn: () => captureFrame('manual'),
  interruptBtn: interrupt,
};
for (const [id, handler] of Object.entries(clickHandlers)) {
  els[id].onclick = handler;
}
// Changing the frame-rate selector restarts the capture timer, but only
// if one is already running (i.e. media is active).
els.frameInterval.onchange = () => {
  if (!frameTimer) return;
  clearInterval(frameTimer);
  const ms = Number(els.frameInterval.value);
  if (ms > 0) frameTimer = setInterval(() => captureFrame('timer'), ms);
};
// Enter in the endpoint box connects.
els.wsEndpoint.addEventListener('keypress', e => {
  if (e.key === 'Enter') connect();
});
</script>
</body>
</html>

View File

@@ -258,7 +258,7 @@
<!-- Connection -->
<div class="card">
<div class="row">
<input type="text" id="endpoint" value="ws://localhost:8080" placeholder="WebSocket endpoint" />
<input type="text" id="endpoint" value="ws://localhost:8081/ws/voice" placeholder="WebSocket endpoint" />
<button id="connectBtn" class="primary">Connect</button>
<button id="disconnectBtn" disabled>Disconnect</button>
</div>

161
example/ws-server-video.ts Normal file
View File

@@ -0,0 +1,161 @@
// ws-server-video.ts
import "dotenv/config";
import { WebSocketServer } from "ws";
import { VideoAgent } from "../src/VideoAgent.new"; // adjust path
import { tool } from "ai";
import { z } from "zod";
import { openai } from "@ai-sdk/openai";
import { mkdirSync, writeFileSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";
// ── Frame saving ────────────────────────────────────────────────────────
const __dirname = typeof import.meta.dirname === "string"
? import.meta.dirname
: dirname(fileURLToPath(import.meta.url));
const FRAMES_DIR = join(__dirname, "frames");
mkdirSync(FRAMES_DIR, { recursive: true });
console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`);
let frameCounter = 0;
/**
 * Persist a received video frame to FRAMES_DIR for debugging/inspection.
 * Filenames encode a monotonically increasing index plus the frame timestamp.
 */
function saveFrame(msg: {
  sequence?: number;
  timestamp?: number;
  triggerReason?: string;
  image: { data: string; format?: string; width?: number; height?: number };
}) {
  const idx = frameCounter++;
  // Normalise the extension; clients report "jpeg" but "jpg" is conventional.
  const ext = msg.image.format === "jpeg" ? "jpg" : (msg.image.format || "webp");
  const ts = new Date(msg.timestamp ?? Date.now())
    .toISOString()
    .replace(/[:.]/g, "-"); // ':' and '.' are unsafe in filenames on some platforms
  const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`;
  const filepath = join(FRAMES_DIR, filename);
  const buf = Buffer.from(msg.image.data, "base64");
  writeFileSync(filepath, buf);
  console.log(
    // BUG FIX: this log line previously printed the literal text
    // "$(unknown)" instead of interpolating the saved filename.
    `[frames] Saved ${filename} (${(buf.length / 1024).toFixed(1)} kB` +
      `${msg.image.width ? `, ${msg.image.width}×${msg.image.height}` : ""}` +
      `, ${msg.triggerReason ?? "unknown"})`
  );
}
// Resolve the listen address from VIDEO_WS_ENDPOINT (e.g. "ws://host:8081"),
// defaulting to localhost:8081 when the variable is unset or has no port.
const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081";
const url = new URL(endpoint);
const port = Number(url.port || 8081);
const host = url.hostname || "localhost";
// ── Tools (same as demo.ts) ────────────────────────────────────────────
// Demo tool: returns a randomised fake weather report for the given location.
const weatherTool = tool({
  description: "Get the weather in a location",
  inputSchema: z.object({
    location: z.string().describe("The location to get the weather for"),
  }),
  // Random temperature in [62, 82] and a random condition string.
  execute: async ({ location }) => ({
    location,
    temperature: 72 + Math.floor(Math.random() * 21) - 10,
    conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][
      Math.floor(Math.random() * 4)
    ],
  }),
});
// Demo tool: reports the server's local time and IANA timezone.
const timeTool = tool({
  description: "Get the current time",
  inputSchema: z.object({}),
  execute: async () => ({
    time: new Date().toLocaleTimeString(),
    timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
  }),
});
// One VideoAgent instance per WebSocket connection; the agent owns the
// socket for the lifetime of the session.
const wss = new WebSocketServer({ port, host });
wss.on("listening", () => {
  console.log(`[video-ws] listening on ${endpoint}`);
  console.log(`[video-ws] Open video-client.html and connect → ${endpoint}`);
});
wss.on("connection", (socket) => {
  console.log("[video-ws] ✓ client connected");
  const agent = new VideoAgent({
    model: openai("gpt-4o"), // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash…
    transcriptionModel: openai.transcription("whisper-1"),
    speechModel: openai.speech("gpt-4o-mini-tts"),
    instructions: `You are a helpful video+voice assistant.
You can SEE what the user is showing via webcam.
Describe what you see when it helps answer the question.
Keep spoken answers concise and natural.`,
    voice: "echo",
    streamingSpeech: {
      minChunkSize: 25,
      maxChunkSize: 140,
      parallelGeneration: true,
      maxParallelRequests: 3,
    },
    tools: { getWeather: weatherTool, getTime: timeTool },
    // Tune these depending on your budget & latency goals
    maxContextFrames: 6, // very important — each frame ≈ 100–400 tokens (rough estimate; varies by model)
    maxFrameInputSize: 2_500_000, // ~2.5 MB
  });
  // ── Logging hooks for agent lifecycle events ──────────────────────────
  // Reuse most of the same event logging you have in ws-server.ts
  agent.on("text", (data: { role: string; text: string }) => {
    console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`);
  });
  agent.on("chunk:text_delta", (data: { id: string; text: string }) => {
    process.stdout.write(data.text || "");
  });
  agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => {
    console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB ${dimensions.width}×${dimensions.height}`);
  });
  agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`));
  // Audio and transcription events
  agent.on("audio_received", ({ size, format }) => {
    console.log(`[video] Audio received: ${size} bytes, format: ${format}`);
  });
  agent.on("transcription", ({ text, language }) => {
    console.log(`[video] Transcription: "${text}" (${language || "unknown"})`);
  });
  // Speech events
  agent.on("speech_start", () => console.log(`[video] Speech started`));
  agent.on("speech_complete", () => console.log(`[video] Speech complete`));
  agent.on("audio_chunk", ({ chunkId, text }) => {
    console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`);
  });
  // Error handling
  agent.on("error", (error: Error) => {
    console.error(`[video] ERROR:`, error);
  });
  agent.on("warning", (warning: string) => {
    console.warn(`[video] WARNING:`, warning);
  });
  // Free the agent's resources once the client goes away.
  agent.on("disconnected", () => {
    agent.destroy();
    console.log("[video-ws] ✗ client disconnected (agent destroyed)");
  });
  // ── Intercept raw messages to save frames to disk ────────────────────
  socket.on("message", (raw) => {
    try {
      const msg = JSON.parse(raw.toString());
      if (msg.type === "video_frame" && msg.image?.data) {
        saveFrame(msg);
      }
    } catch {
      // not JSON — ignore, agent will handle binary etc.
    }
  });
  // The crucial line — same as VoiceAgent: hand the socket to the agent,
  // which takes over message handling from here on.
  agent.handleSocket(socket);
});

View File

@@ -1,14 +1,21 @@
{
"name": "voice-agent-ai-sdk",
"version": "0.1.0",
"version": "1.0.1",
"description": "Voice AI Agent with ai-sdk",
"main": "src/index.ts",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"files": [
"dist",
"README.md",
"LICENSE"
],
"scripts": {
"build": "tsc",
"dev": "tsc -w",
"demo": "tsx example/demo.ts",
"ws:server": "tsx example/ws-server.ts",
"client": "node example/serve-client.js",
"ws:video": "tsx example/ws-server-video.ts",
"prepublishOnly": "pnpm build"
},
"keywords": [
@@ -16,23 +23,38 @@
"websocket",
"ai",
"agent",
"tools"
"tools",
"tts",
"speech",
"ai-sdk",
"streaming"
],
"author": "Bijit Mondal",
"license": "MIT",
"repository": {
"type": "git",
"url": "git+https://github.com/Bijit-Mondal/voiceAgent.git"
},
"bugs": {
"url": "https://github.com/Bijit-Mondal/voiceAgent/issues"
},
"homepage": "https://github.com/Bijit-Mondal/voiceAgent#readme",
"packageManager": "pnpm@10.27.0",
"devDependencies": {
"@ai-sdk/openai": "^3.0.28",
"@types/node": "^25.2.3",
"@types/ws": "^8.18.1",
"tsx": "^4.20.5",
"typescript": "^5.9.3"
"peerDependencies": {
"ai": "^6.0.0"
},
"dependencies": {
"ai": "^6.0.85",
"dotenv": "^17.2.3",
"ws": "^8.19.0",
"zod": "^4.3.6",
"zod-to-json-schema": "^3.25.1"
},
"devDependencies": {
"@ai-sdk/openai": "^3.0.28",
"@types/node": "^25.2.3",
"@types/ws": "^8.18.1",
"ai": "^6.0.85",
"tsx": "^4.20.5",
"typescript": "^5.9.3"
}
}

818
src/VideoAgent.new.ts Normal file
View File

@@ -0,0 +1,818 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import {
streamText,
type LanguageModel,
stepCountIs,
type Tool,
type ModelMessage,
type TranscriptionModel,
type SpeechModel,
} from "ai";
import {
type StreamingSpeechConfig,
type HistoryConfig,
} from "./types";
import {
WebSocketManager,
SpeechManager,
ConversationManager,
TranscriptionManager,
InputQueue,
type QueueItem,
processFullStream,
handleStreamChunk,
} from "./core";
// ── Video-specific types ────────────────────────────────
/**
 * Trigger reasons for frame capture
 */
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
/**
 * Video frame data structure sent to/from the client
 */
interface VideoFrame {
  type: "video_frame";
  // Identifies the client session this frame belongs to.
  sessionId: string;
  // Monotonic ordering key for frames within a session.
  sequence: number;
  // Client-side capture time (epoch ms).
  timestamp: number;
  triggerReason: FrameTriggerReason;
  // Optional reference to the previous frame, when the client supplies one.
  previousFrameRef?: string;
  image: {
    // Base64-encoded image payload.
    data: string;
    // Image format identifier, e.g. "webp" or "jpeg".
    format: string;
    width: number;
    height: number;
  };
}
/**
 * Audio data structure
 */
interface AudioData {
  type: "audio";
  sessionId: string;
  // Base64-encoded audio payload.
  data: string;
  // MIME type / container format of the audio payload.
  format: string;
  sampleRate?: number;
  duration?: number;
  timestamp: number;
}
/**
 * Backend configuration for video processing
 */
interface VideoAgentConfig {
  /** Maximum frames to keep in context buffer for conversation history */
  maxContextFrames: number;
}
/**
 * Frame context for maintaining visual conversation history
 */
interface FrameContext {
  sequence: number;
  timestamp: number;
  triggerReason: FrameTriggerReason;
  // Hash identifying the frame content (used to reference/dedupe frames).
  frameHash: string;
  // Optional model-generated description of the frame.
  description?: string;
}
/** Default maximum frame input size (5 MB) */
const DEFAULT_MAX_FRAME_SIZE = 5 * 1024 * 1024;
/** Default video agent config */
const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = {
  maxContextFrames: 10,
};
// ── Options & queue item ────────────────────────────────
export interface VideoAgentOptions {
/**
* AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
* anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
*/
model: LanguageModel;
transcriptionModel?: TranscriptionModel;
speechModel?: SpeechModel;
instructions?: string;
stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
tools?: Record<string, Tool>;
endpoint?: string;
voice?: string;
speechInstructions?: string;
outputFormat?: string;
streamingSpeech?: Partial<StreamingSpeechConfig>;
history?: Partial<HistoryConfig>;
maxAudioInputSize?: number;
/** Maximum frame input size in bytes (default: 5 MB) */
maxFrameInputSize?: number;
/** Maximum frames to keep in context buffer (default: 10) */
maxContextFrames?: number;
/** Session ID for this video agent instance */
sessionId?: string;
}
/** Shape of items in the video agent's input queue */
interface VideoInputItem extends QueueItem<string> {
text?: string;
frame?: VideoFrame;
}
// ── VideoAgent class ────────────────────────────────────
/**
 * A single-session multimodal (video + voice) agent. Owns one WebSocket, one
 * conversation history, one speech pipeline, and a rolling buffer of frame
 * metadata. Like VoiceAgent, it is one-user-per-instance.
 */
export class VideoAgent extends EventEmitter {
  // Vision-capable chat model (see VideoAgentOptions.model)
  private model: LanguageModel;
  // System prompt passed to streamText on every turn
  private instructions: string;
  // Stop condition forwarded to streamText (default: stepCountIs(5))
  private stopWhen: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  // Default WebSocket URL used by connect() when no URL argument is given
  private endpoint?: string;
  // Tools exposed to the LLM; extended via registerTools()
  private tools: Record<string, Tool> = {};
  private isDestroyed = false;
  private _isProcessing = false;
  // Abort controller for the current LLM stream
  private currentStreamAbortController?: AbortController;
  // ── Managers ─────────────────────────────────────────
  private ws: WebSocketManager;
  private speech: SpeechManager;
  private conversation: ConversationManager;
  private transcription: TranscriptionManager;
  private inputQueue: InputQueue<VideoInputItem>;
  // ── Video-specific state ────────────────────────────
  // Identifier stamped on frames and sent to the client in "session_init"
  private sessionId: string;
  // Monotonic counter for frames created by sendFrame()
  private frameSequence = 0;
  // Timestamp of the most recently accepted frame
  private lastFrameTimestamp = 0;
  // Hash of the most recently accepted frame (used as previousFrameRef)
  private lastFrameHash?: string;
  // Rolling metadata buffer, capped at videoConfig.maxContextFrames
  private frameContextBuffer: FrameContext[] = [];
  // Base64 image data of the latest frame; attached to text-only turns as visual context
  private currentFrameData?: string;
  private videoConfig: VideoAgentConfig;
  // Frames whose decoded byte size exceeds this are rejected
  private maxFrameInputSize: number;
  constructor(options: VideoAgentOptions) {
    super();
    this.model = options.model;
    this.instructions =
      options.instructions ||
      `You are a helpful multimodal AI assistant that can see through the user's camera and hear their voice.
When analyzing images, be concise but informative. Describe what you see when asked.
Keep responses conversational since they will be spoken aloud.
Use tools when needed to provide accurate information.`;
    this.stopWhen = options.stopWhen || stepCountIs(5);
    this.endpoint = options.endpoint;
    this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
    this.sessionId = options.sessionId || this.generateSessionId();
    this.videoConfig = {
      ...DEFAULT_VIDEO_AGENT_CONFIG,
      maxContextFrames:
        options.maxContextFrames ?? DEFAULT_VIDEO_AGENT_CONFIG.maxContextFrames,
    };
    if (options.tools) {
      this.tools = { ...options.tools };
    }
    // ── Initialize managers ─────────────────────────
    this.ws = new WebSocketManager();
    this.speech = new SpeechManager({
      speechModel: options.speechModel,
      voice: options.voice,
      speechInstructions: options.speechInstructions,
      outputFormat: options.outputFormat,
      streamingSpeech: options.streamingSpeech,
    });
    this.conversation = new ConversationManager({
      history: options.history,
    });
    this.transcription = new TranscriptionManager({
      transcriptionModel: options.transcriptionModel,
      maxAudioInputSize: options.maxAudioInputSize,
    });
    this.inputQueue = new InputQueue<VideoInputItem>();
    // ── Wire managers to WebSocket send ─────────────
    // Speech and transcription managers push their own wire messages through
    // the shared socket.
    const sendMsg = (msg: Record<string, unknown>) => this.ws.send(msg);
    this.speech.sendMessage = sendMsg;
    this.transcription.sendMessage = sendMsg;
    // ── Wire input queue processor ──────────────────
    // The queue serializes turns so two LLM streams never run concurrently.
    this.inputQueue.processor = (item) => this.processQueueItem(item);
    // ── Bubble events from managers ─────────────────
    this.bubbleEvents(this.ws, ["connected", "error"]);
    this.bubbleEvents(this.speech, [
      "speech_start",
      "speech_complete",
      "speech_interrupted",
      "speech_chunk_queued",
      "audio_chunk",
      "audio",
      "error",
    ]);
    this.bubbleEvents(this.conversation, [
      "history_cleared",
      "history_trimmed",
    ]);
    this.bubbleEvents(this.transcription, [
      "transcription",
      "audio_received",
      "error",
      "warning",
    ]);
    // ── Handle WebSocket lifecycle ──────────────────
    this.ws.on("disconnected", () => {
      this.cleanupOnDisconnect();
      this.emit("disconnected");
    });
    this.ws.on("message", (message: any) => this.handleMessage(message));
  }
  // ══════════════════════════════════════════════════════
  // Public API
  // ══════════════════════════════════════════════════════
  /** Merge additional tools into the set exposed to the LLM. */
  public registerTools(tools: Record<string, Tool>) {
    this.tools = { ...this.tools, ...tools };
  }
  /** Transcribe raw audio to text via the configured transcription model. */
  public async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    return this.transcription.transcribeAudio(audioData);
  }
  /** Generate speech audio for `text` via the configured speech model. */
  public async generateSpeechFromText(
    text: string,
    abortSignal?: AbortSignal
  ): Promise<Uint8Array> {
    return this.speech.generateSpeechFromText(text, abortSignal);
  }
  /** Interrupt ongoing speech output only (LLM stream keeps running). */
  public interruptSpeech(reason: string = "interrupted"): void {
    this.speech.interruptSpeech(reason);
  }
  /** Abort the current LLM stream AND interrupt speech (barge-in). */
  public interruptCurrentResponse(reason: string = "interrupted"): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.interruptSpeech(reason);
  }
  /** Connect to a WebSocket server (client-side usage). */
  public async connect(url?: string): Promise<void> {
    this.ensureNotDestroyed();
    const wsUrl = url || this.endpoint || "ws://localhost:8080";
    await this.ws.connect(wsUrl);
  }
  /** Attach an already-accepted socket (server-side usage). */
  public handleSocket(socket: WebSocket): void {
    this.ensureNotDestroyed();
    this.ws.handleSocket(socket);
  }
  /**
   * Queue a text turn for processing.
   * @returns the assistant's full response text.
   * @throws if the text is empty or the agent is destroyed.
   */
  public async sendText(text: string): Promise<string> {
    this.ensureNotDestroyed();
    if (!text || !text.trim()) {
      throw new Error("Text input cannot be empty");
    }
    return this.enqueueTextInput(text);
  }
  /** Transcribe base64 audio and queue the resulting text turn. */
  public async sendAudio(audioData: string): Promise<void> {
    this.ensureNotDestroyed();
    await this.handleAudioInput(audioData);
  }
  /** Same as sendAudio but accepts a raw buffer. */
  public async sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void> {
    this.ensureNotDestroyed();
    const base64Audio = Buffer.from(audioBuffer).toString("base64");
    await this.handleAudioInput(base64Audio);
  }
  /**
   * Send a video frame with optional text query for vision analysis
   */
  public async sendFrame(
    frameData: string,
    query?: string,
    options?: { width?: number; height?: number; format?: string }
  ): Promise<string> {
    this.ensureNotDestroyed();
    const frame: VideoFrame = {
      type: "video_frame",
      sessionId: this.sessionId,
      sequence: this.frameSequence++,
      timestamp: Date.now(),
      // captured before handleVideoFrame updates it, so it points at the prior frame
      previousFrameRef: this.lastFrameHash,
      triggerReason: "user_request",
      image: {
        data: frameData,
        format: options?.format || "webp",
        width: options?.width || 640,
        height: options?.height || 480,
      },
    };
    // Update local frame state
    await this.handleVideoFrame(frame);
    if (query) {
      return this.enqueueMultimodalInput(query, frame);
    }
    // Frame-only call: no LLM turn is started, so there is no response text.
    return "";
  }
  /**
   * Request client to capture and send a frame
   */
  public requestFrameCapture(reason: FrameTriggerReason): void {
    this.ws.send({
      type: "capture_frame",
      reason,
      timestamp: Date.now(),
    });
    this.emit("frame_requested", { reason });
  }
  /** Snapshot of the current video config (defensive copy). */
  public getConfig(): VideoAgentConfig {
    return { ...this.videoConfig };
  }
  /** Merge new settings into the video config and emit "config_changed". */
  public updateConfig(config: Partial<VideoAgentConfig>): void {
    this.videoConfig = { ...this.videoConfig, ...config };
    this.emit("config_changed", this.videoConfig);
  }
  // Emits "listening"; no other effect in this class.
  startListening() {
    this.emit("listening");
  }
  // Emits "stopped"; no other effect in this class.
  stopListening() {
    this.emit("stopped");
  }
  /** Clear conversation history AND the visual frame-context buffer. */
  clearHistory() {
    this.conversation.clearHistory();
    this.frameContextBuffer = [];
  }
  getHistory(): ModelMessage[] {
    return this.conversation.getHistory();
  }
  setHistory(history: ModelMessage[]) {
    this.conversation.setHistory(history);
  }
  /** Copy of the frame metadata buffer. */
  getFrameContext(): FrameContext[] {
    return [...this.frameContextBuffer];
  }
  getSessionId(): string {
    return this.sessionId;
  }
  disconnect() {
    this.ws.disconnect();
  }
  /** Permanently tear down the agent; it cannot be reused afterwards. */
  destroy() {
    this.isDestroyed = true;
    this.cleanupOnDisconnect();
    this.ws.disconnect();
    this.conversation.clearHistory();
    this.frameContextBuffer = [];
    this.tools = {};
    this.removeAllListeners();
  }
  // ── Getters ─────────────────────────────────────────
  get connected(): boolean {
    return this.ws.isConnected;
  }
  get processing(): boolean {
    return this._isProcessing;
  }
  get speaking(): boolean {
    return this.speech.isSpeaking;
  }
  get pendingSpeechChunks(): number {
    return this.speech.pendingChunkCount;
  }
  get destroyed(): boolean {
    return this.isDestroyed;
  }
  get currentFrameSequence(): number {
    return this.frameSequence;
  }
  get hasVisualContext(): boolean {
    return !!this.currentFrameData;
  }
  // ══════════════════════════════════════════════════════
  // Private — message handling
  // ══════════════════════════════════════════════════════
  /** Dispatch an incoming WebSocket message by its "type" field. */
  private async handleMessage(message: any): Promise<void> {
    try {
      switch (message.type) {
        case "transcript":
          if (typeof message.text !== "string" || !message.text.trim()) {
            this.emit("warning", "Received empty or invalid transcript message");
            return;
          }
          // New user speech barges in on the current response, and we ask the
          // client for a fresh frame so the answer can use current visuals.
          this.interruptCurrentResponse("user_speaking");
          this.requestFrameCapture("user_request");
          await this.enqueueTextInput(message.text);
          break;
        case "audio":
          if (typeof message.data !== "string" || !message.data) {
            this.emit("warning", "Received empty or invalid audio message");
            return;
          }
          this.interruptCurrentResponse("user_speaking");
          this.requestFrameCapture("user_request");
          try {
            await this.handleAudioInput(message.data, message.format);
          } catch (audioError) {
            this.emit("error", audioError);
          }
          break;
        case "video_frame":
          await this.handleVideoFrame(message);
          break;
        case "interrupt":
          this.interruptCurrentResponse(message.reason || "client_request");
          break;
        case "client_ready":
          this.handleClientReady(message);
          break;
      }
    } catch (err) {
      this.emit("error", err);
    }
  }
  /** Reply to a client handshake with the session id. */
  private handleClientReady(message: any): void {
    this.ws.send({
      type: "session_init",
      sessionId: this.sessionId,
    });
    this.emit("client_ready", message.capabilities);
  }
  // ══════════════════════════════════════════════════════
  // Private — audio
  // ══════════════════════════════════════════════════════
  /** Transcribe base64 audio; enqueue the text as a turn if non-empty. */
  private async handleAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<void> {
    const text = await this.transcription.processAudioInput(base64Audio, format);
    if (text) {
      await this.enqueueTextInput(text);
    }
  }
  // ══════════════════════════════════════════════════════
  // Private — video frames
  // ══════════════════════════════════════════════════════
  /**
   * Validate an incoming frame, update latest-frame state, record metadata in
   * the context buffer, emit "frame_received", and ACK the client.
   * Oversized or empty frames are dropped (warning/error emitted, no throw).
   */
  private async handleVideoFrame(frame: VideoFrame): Promise<void> {
    try {
      if (!frame.image?.data) {
        this.emit("warning", "Received empty or invalid video frame");
        return;
      }
      // Size check uses decoded bytes, not base64 string length.
      const frameSize = Buffer.from(frame.image.data, "base64").length;
      if (frameSize > this.maxFrameInputSize) {
        const sizeMB = (frameSize / (1024 * 1024)).toFixed(1);
        const maxMB = (this.maxFrameInputSize / (1024 * 1024)).toFixed(1);
        this.emit(
          "error",
          new Error(`Frame too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`)
        );
        return;
      }
      const frameHash = this.hashFrame(frame.image.data);
      this.lastFrameTimestamp = frame.timestamp;
      this.lastFrameHash = frameHash;
      this.currentFrameData = frame.image.data;
      this.addFrameToContext({
        sequence: frame.sequence,
        timestamp: frame.timestamp,
        triggerReason: frame.triggerReason,
        frameHash,
      });
      this.emit("frame_received", {
        sequence: frame.sequence,
        timestamp: frame.timestamp,
        triggerReason: frame.triggerReason,
        size: frameSize,
        dimensions: { width: frame.image.width, height: frame.image.height },
      });
      this.ws.send({
        type: "frame_ack",
        sequence: frame.sequence,
        timestamp: Date.now(),
      });
    } catch (error) {
      this.emit("error", error);
    }
  }
  /** Append frame metadata, evicting the oldest entry past the cap (FIFO). */
  private addFrameToContext(context: FrameContext): void {
    this.frameContextBuffer.push(context);
    if (this.frameContextBuffer.length > this.videoConfig.maxContextFrames) {
      this.frameContextBuffer.shift();
    }
  }
  /**
   * Cheap non-cryptographic string hash used only to label frames.
   * NOTE(review): the label embeds `this.frameSequence` (the agent's own
   * counter incremented by sendFrame), not the sequence of the frame being
   * hashed — for frames received from the client these can differ; confirm
   * this is intended.
   */
  private hashFrame(data: string): string {
    let hash = 0;
    for (let i = 0; i < data.length; i++) {
      const char = data.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // coerce to 32-bit integer
    }
    return `frame_${this.frameSequence}_${Math.abs(hash).toString(16)}`;
  }
  /** Random, time-prefixed session id, e.g. "vs_lx3k9a_4f8d2c1e". */
  private generateSessionId(): string {
    const timestamp = Date.now().toString(36);
    const randomPart = Math.random().toString(36).substring(2, 10);
    return `vs_${timestamp}_${randomPart}`;
  }
  // ══════════════════════════════════════════════════════
  // Private — input queue
  // ══════════════════════════════════════════════════════
  /** Queue a text-only turn; resolves with the assistant response. */
  private enqueueTextInput(text: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.inputQueue.enqueue({ text, resolve, reject });
    });
  }
  /** Queue a text + explicit-frame turn; resolves with the assistant response. */
  private enqueueMultimodalInput(text: string, frame: VideoFrame): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.inputQueue.enqueue({ text, frame, resolve, reject });
    });
  }
  /**
   * Route queued items to the correct processor.
   */
  private async processQueueItem(item: VideoInputItem): Promise<string> {
    if (item.frame && item.text) {
      return this.processMultimodalInput(item.text, item.frame);
    } else if (item.text) {
      return this.processUserInput(item.text);
    }
    // Neither text nor frame+text: nothing to do.
    return "";
  }
  // ══════════════════════════════════════════════════════
  // Private — multimodal content building
  // ══════════════════════════════════════════════════════
  /**
   * Build the user-message content parts for a multimodal turn:
   * [context summary?] + [image?] + text. Falls back to the latest received
   * frame when no explicit frameData is given.
   */
  private buildMultimodalContent(
    text: string,
    frameData?: string
  ): Array<{ type: "text"; text: string } | { type: "image"; image: string }> {
    const content: Array<
      { type: "text"; text: string } | { type: "image"; image: string }
    > = [];
    if (this.frameContextBuffer.length > 0) {
      const contextSummary = `[Visual context: ${this.frameContextBuffer.length} frames captured, latest at ${new Date(this.lastFrameTimestamp).toISOString()}]`;
      content.push({ type: "text", text: contextSummary });
    }
    const imageData = frameData || this.currentFrameData;
    if (imageData) {
      content.push({ type: "image", image: imageData });
    }
    content.push({ type: "text", text });
    return content;
  }
  // ══════════════════════════════════════════════════════
  // Private — LLM processing
  // ══════════════════════════════════════════════════════
  /**
   * Shared streamText invocation used by both processUserInput and processMultimodalInput.
   * Streams deltas into the speech pipeline, records the assistant reply in
   * history, then waits for the speech queue to drain before resolving.
   */
  private async runStream(
    messages: ModelMessage[],
    abortSignal: AbortSignal
  ): Promise<string> {
    const result = streamText({
      model: this.model,
      system: this.instructions,
      messages,
      tools: this.tools,
      stopWhen: this.stopWhen,
      abortSignal,
      onChunk: ({ chunk }) => {
        handleStreamChunk(chunk, (event, data) => this.emit(event, data));
      },
      onFinish: async (event) => {
        // Surface every tool result from every step as a "tool_result" event.
        for (const step of event.steps) {
          for (const toolResult of step.toolResults) {
            this.emit("tool_result", {
              name: toolResult.toolName,
              toolCallId: toolResult.toolCallId,
              result: toolResult.output,
            });
          }
        }
      },
      onError: ({ error }) => {
        this.emit("error", error);
      },
    });
    const streamResult = await processFullStream(
      result,
      {
        onTextDelta: (delta) => this.speech.processTextDelta(delta),
        onTextEnd: () => this.speech.flushPendingText(),
        sendMessage: (msg) => this.ws.send(msg),
        emitEvent: (event, data) => this.emit(event, data),
      },
      {
        sessionId: this.sessionId,
        frameContext:
          this.frameContextBuffer.length > 0
            ? {
                frameCount: this.frameContextBuffer.length,
                lastFrameSequence:
                  this.frameContextBuffer[this.frameContextBuffer.length - 1]
                    ?.sequence,
              }
            : undefined,
      }
    );
    // Add assistant response to history
    if (streamResult.fullText) {
      this.conversation.addMessage({
        role: "assistant",
        content: streamResult.fullText,
      });
    }
    // Flush remaining speech & wait for queue
    this.speech.flushPendingText();
    if (this.speech.queueDonePromise) {
      await this.speech.queueDonePromise;
    }
    return streamResult.fullText;
  }
  /**
   * Process text-only input (with optional visual context from latest frame).
   */
  private async processUserInput(text: string): Promise<string> {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    try {
      this.emit("text", { role: "user", text });
      const hasVisual = !!this.currentFrameData;
      let messages: ModelMessage[];
      if (hasVisual) {
        const content = this.buildMultimodalContent(text);
        // History stores a lightweight text placeholder; the actual image is
        // only sent to the model for THIS turn (slice(0, -1) drops the
        // placeholder and substitutes the full multimodal content).
        this.conversation.addMessage({
          role: "user",
          content: [{ type: "text", text: `[Visual context] ${text}` }],
        });
        messages = [
          ...this.conversation.getHistoryRef().slice(0, -1),
          { role: "user", content },
        ];
      } else {
        this.conversation.addMessage({ role: "user", content: text });
        messages = this.conversation.getHistoryRef();
      }
      return await this.runStream(
        messages,
        this.currentStreamAbortController.signal
      );
    } catch (error) {
      // Drop any half-generated speech before propagating.
      this.speech.reset();
      throw error;
    } finally {
      this._isProcessing = false;
      this.currentStreamAbortController = undefined;
    }
  }
  /**
   * Process multimodal input (text + explicit video frame).
   */
  private async processMultimodalInput(
    text: string,
    frame: VideoFrame
  ): Promise<string> {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    try {
      this.emit("text", { role: "user", text, hasImage: true });
      const content = this.buildMultimodalContent(text, frame.image.data);
      // Same placeholder trick as processUserInput: image bytes stay out of
      // the persisted history.
      this.conversation.addMessage({
        role: "user",
        content: [{ type: "text", text: `[Image attached] ${text}` }],
      });
      const messages: ModelMessage[] = [
        ...this.conversation.getHistoryRef().slice(0, -1),
        { role: "user", content },
      ];
      return await this.runStream(
        messages,
        this.currentStreamAbortController.signal
      );
    } catch (error) {
      this.speech.reset();
      throw error;
    } finally {
      this._isProcessing = false;
      this.currentStreamAbortController = undefined;
    }
  }
  // ══════════════════════════════════════════════════════
  // Private — helpers
  // ══════════════════════════════════════════════════════
  /** Guard: throw if destroy() has been called. */
  private ensureNotDestroyed(): void {
    if (this.isDestroyed) {
      throw new Error("VideoAgent has been destroyed and cannot be used");
    }
  }
  /**
   * Abort the stream, reset speech, drop frame state, and fail all queued
   * inputs when the connection drops.
   */
  private cleanupOnDisconnect(): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.reset();
    this._isProcessing = false;
    this.currentFrameData = undefined;
    this.inputQueue.rejectAll(new Error("Connection closed"));
  }
  /** Re-emit the listed events from a child emitter on this agent. */
  private bubbleEvents(source: EventEmitter, events: string[]): void {
    for (const event of events) {
      source.on(event, (...args: any[]) => this.emit(event, ...args));
    }
  }
}
// Export types for external use
export type {
VideoFrame,
AudioData,
VideoAgentConfig,
FrameContext,
FrameTriggerReason,
};
// Re-export shared types
export type { StreamingSpeechConfig, HistoryConfig } from "./types";

1641
src/VideoAgent.ts Normal file

File diff suppressed because it is too large Load Diff

484
src/VoiceAgent.new.ts Normal file
View File

@@ -0,0 +1,484 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import {
streamText,
type LanguageModel,
stepCountIs,
type Tool,
type ModelMessage,
type TranscriptionModel,
type SpeechModel,
} from "ai";
import {
type StreamingSpeechConfig,
type HistoryConfig,
} from "./types";
import {
WebSocketManager,
SpeechManager,
ConversationManager,
TranscriptionManager,
InputQueue,
type QueueItem,
processFullStream,
handleStreamChunk,
} from "./core";
export interface VoiceAgentOptions {
  // AI SDK model used for chat completions via streamText
  model: LanguageModel;
  // Model used for speech-to-text; forwarded to TranscriptionManager
  transcriptionModel?: TranscriptionModel;
  // Model used for text-to-speech; forwarded to SpeechManager
  speechModel?: SpeechModel;
  // System prompt; defaults to "You are a helpful voice assistant."
  instructions?: string;
  // Stop condition for streamText; defaults to stepCountIs(5)
  stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  // Tools exposed to the LLM; can be extended later via registerTools()
  tools?: Record<string, Tool>;
  // Default WebSocket URL for connect(); falls back to ws://localhost:8080
  endpoint?: string;
  // TTS voice identifier; forwarded to SpeechManager
  voice?: string;
  // Extra instructions for the TTS model; forwarded to SpeechManager
  speechInstructions?: string;
  // Audio output format; forwarded to SpeechManager
  outputFormat?: string;
  /** Configuration for streaming speech generation */
  streamingSpeech?: Partial<StreamingSpeechConfig>;
  /** Configuration for conversation history memory limits */
  history?: Partial<HistoryConfig>;
  /** Maximum audio input size in bytes (default: 10 MB) */
  maxAudioInputSize?: number;
}
/** Shape of items in the voice agent's input queue */
interface VoiceInputItem extends QueueItem<string> {
  // Transcribed or direct user text for one turn
  text: string;
}
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export class VoiceAgent extends EventEmitter {
  // Chat model used for streamText
  private model: LanguageModel;
  // System prompt passed to streamText on every turn
  private instructions: string;
  // Stop condition forwarded to streamText (default: stepCountIs(5))
  private stopWhen: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  // Default WebSocket URL used by connect() when no URL argument is given
  private endpoint?: string;
  // Tools exposed to the LLM; extended via registerTools()
  private tools: Record<string, Tool> = {};
  private isDestroyed = false;
  private _isProcessing = false;
  // Abort controller for the current LLM stream
  private currentStreamAbortController?: AbortController;
  // ── Managers ──────────────────────────────────────────
  private ws: WebSocketManager;
  private speech: SpeechManager;
  private conversation: ConversationManager;
  private transcription: TranscriptionManager;
  private inputQueue: InputQueue<VoiceInputItem>;
  constructor(options: VoiceAgentOptions) {
    super();
    this.model = options.model;
    this.instructions =
      options.instructions || "You are a helpful voice assistant.";
    this.stopWhen = options.stopWhen || stepCountIs(5);
    this.endpoint = options.endpoint;
    if (options.tools) {
      this.tools = { ...options.tools };
    }
    // ── Initialize managers ──────────────────────────────
    this.ws = new WebSocketManager();
    this.speech = new SpeechManager({
      speechModel: options.speechModel,
      voice: options.voice,
      speechInstructions: options.speechInstructions,
      outputFormat: options.outputFormat,
      streamingSpeech: options.streamingSpeech,
    });
    this.conversation = new ConversationManager({
      history: options.history,
    });
    this.transcription = new TranscriptionManager({
      transcriptionModel: options.transcriptionModel,
      maxAudioInputSize: options.maxAudioInputSize,
    });
    this.inputQueue = new InputQueue<VoiceInputItem>();
    // ── Wire managers to the WebSocket send function ─────
    // Speech and transcription managers push their own wire messages through
    // the shared socket.
    const sendMsg = (msg: Record<string, unknown>) => this.ws.send(msg);
    this.speech.sendMessage = sendMsg;
    this.transcription.sendMessage = sendMsg;
    // ── Wire the input queue processor ───────────────────
    // The queue serializes turns so two LLM streams never run concurrently.
    this.inputQueue.processor = (item) => this.processUserInput(item.text);
    // ── Bubble events from managers ──────────────────────
    this.bubbleEvents(this.ws, [
      "connected",
      "error",
    ]);
    this.bubbleEvents(this.speech, [
      "speech_start",
      "speech_complete",
      "speech_interrupted",
      "speech_chunk_queued",
      "audio_chunk",
      "audio",
      "error",
    ]);
    this.bubbleEvents(this.conversation, [
      "history_cleared",
      "history_trimmed",
    ]);
    this.bubbleEvents(this.transcription, [
      "transcription",
      "audio_received",
      "error",
      "warning",
    ]);
    // ── Handle WebSocket lifecycle events ────────────────
    this.ws.on("disconnected", () => {
      this.cleanupOnDisconnect();
      this.emit("disconnected");
    });
    this.ws.on("message", (message: any) => this.handleMessage(message));
  }
  // ── Public API ────────────────────────────────────────
  /** Merge additional tools into the set exposed to the LLM. */
  public registerTools(tools: Record<string, Tool>) {
    this.tools = { ...this.tools, ...tools };
  }
  /**
   * Transcribe audio data to text using the configured transcription model.
   */
  public async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    return this.transcription.transcribeAudio(audioData);
  }
  /**
   * Generate speech from text using the configured speech model.
   */
  public async generateSpeechFromText(
    text: string,
    abortSignal?: AbortSignal
  ): Promise<Uint8Array> {
    return this.speech.generateSpeechFromText(text, abortSignal);
  }
  /**
   * Interrupt ongoing speech generation and playback (barge-in support).
   */
  public interruptSpeech(reason: string = "interrupted"): void {
    this.speech.interruptSpeech(reason);
  }
  /**
   * Interrupt both the current LLM stream and ongoing speech.
   */
  public interruptCurrentResponse(reason: string = "interrupted"): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.interruptSpeech(reason);
  }
  /**
   * Connect to a WebSocket server by URL.
   */
  public async connect(url?: string): Promise<void> {
    this.ensureNotDestroyed();
    const wsUrl = url || this.endpoint || "ws://localhost:8080";
    await this.ws.connect(wsUrl);
  }
  /**
   * Attach an existing WebSocket (server-side usage).
   */
  public handleSocket(socket: WebSocket): void {
    this.ensureNotDestroyed();
    this.ws.handleSocket(socket);
  }
  /**
   * Send text input for processing (bypasses transcription).
   * @returns the assistant's full response text.
   * @throws if the text is empty or the agent is destroyed.
   */
  public async sendText(text: string): Promise<string> {
    this.ensureNotDestroyed();
    if (!text || !text.trim()) {
      throw new Error("Text input cannot be empty");
    }
    return this.enqueueInput(text);
  }
  /**
   * Send base64 audio data to be transcribed and processed.
   */
  public async sendAudio(audioData: string): Promise<void> {
    this.ensureNotDestroyed();
    await this.handleAudioInput(audioData);
  }
  /**
   * Send raw audio buffer to be transcribed and processed.
   */
  public async sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void> {
    this.ensureNotDestroyed();
    const base64Audio = Buffer.from(audioBuffer).toString("base64");
    await this.handleAudioInput(base64Audio);
  }
  /**
   * Generate speech for full text at once (non-streaming fallback).
   */
  public async generateAndSendSpeechFull(text: string): Promise<void> {
    return this.speech.generateAndSendSpeechFull(text);
  }
  /** Start listening for voice input (emits "listening"). */
  startListening() {
    this.emit("listening");
  }
  /** Stop listening for voice input (emits "stopped"). */
  stopListening() {
    this.emit("stopped");
  }
  /** Clear conversation history */
  clearHistory() {
    this.conversation.clearHistory();
  }
  /** Get current conversation history */
  getHistory(): ModelMessage[] {
    return this.conversation.getHistory();
  }
  /** Set conversation history (useful for restoring sessions) */
  setHistory(history: ModelMessage[]) {
    this.conversation.setHistory(history);
  }
  /** Disconnect from WebSocket and stop all in-flight work */
  disconnect() {
    this.ws.disconnect();
  }
  /**
   * Permanently destroy the agent, releasing all resources.
   */
  destroy() {
    this.isDestroyed = true;
    this.cleanupOnDisconnect();
    this.ws.disconnect();
    this.conversation.clearHistory();
    this.tools = {};
    this.removeAllListeners();
  }
  // ── Getters ───────────────────────────────────────────
  get connected(): boolean {
    return this.ws.isConnected;
  }
  get processing(): boolean {
    return this._isProcessing;
  }
  get speaking(): boolean {
    return this.speech.isSpeaking;
  }
  get pendingSpeechChunks(): number {
    return this.speech.pendingChunkCount;
  }
  get destroyed(): boolean {
    return this.isDestroyed;
  }
  // ── Private: message handling ─────────────────────────
  /**
   * Dispatch an incoming WebSocket message by its "type" field.
   * Failures are surfaced via the "error" event rather than thrown, so a bad
   * message never tears down the socket handler.
   * Note: unlike the pre-refactor version, no console logging happens here —
   * observability comes from the emitted events.
   */
  private async handleMessage(message: any): Promise<void> {
    try {
      if (message.type === "transcript") {
        if (typeof message.text !== "string" || !message.text.trim()) {
          this.emit("warning", "Received empty or invalid transcript message");
          return;
        }
        // Barge-in: new user speech cancels any in-flight response.
        this.interruptCurrentResponse("user_speaking");
        await this.enqueueInput(message.text);
      } else if (message.type === "audio") {
        if (typeof message.data !== "string" || !message.data) {
          this.emit("warning", "Received empty or invalid audio message");
          return;
        }
        this.interruptCurrentResponse("user_speaking");
        await this.handleAudioInput(message.data, message.format);
      } else if (message.type === "interrupt") {
        this.interruptCurrentResponse(message.reason || "client_request");
      }
    } catch (err) {
      this.emit("error", err);
    }
  }
  // ── Private: audio ────────────────────────────────────
  /** Transcribe base64 audio; enqueue the text as a turn if non-empty. */
  private async handleAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<void> {
    const text = await this.transcription.processAudioInput(
      base64Audio,
      format
    );
    if (text) {
      await this.enqueueInput(text);
    }
  }
  // ── Private: input queue ──────────────────────────────
  /** Queue a text turn; resolves with the assistant response. */
  private enqueueInput(text: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.inputQueue.enqueue({ text, resolve, reject });
    });
  }
  // ── Private: LLM processing ───────────────────────────
  /**
   * Process user input with streaming text generation.
   * Called serially by the input queue. Streams text deltas into the speech
   * pipeline, records the assistant reply in history, then waits for the
   * speech queue to drain before resolving.
   */
  private async processUserInput(text: string): Promise<string> {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    const streamAbortSignal = this.currentStreamAbortController.signal;
    try {
      this.emit("text", { role: "user", text });
      this.conversation.addMessage({ role: "user", content: text });
      const result = streamText({
        model: this.model,
        system: this.instructions,
        messages: this.conversation.getHistoryRef(),
        tools: this.tools,
        stopWhen: this.stopWhen,
        abortSignal: streamAbortSignal,
        onChunk: ({ chunk }) => {
          handleStreamChunk(chunk, (event, data) => this.emit(event, data));
        },
        onFinish: async (event) => {
          // Surface every tool result from every step as a "tool_result" event.
          for (const step of event.steps) {
            for (const toolResult of step.toolResults) {
              this.emit("tool_result", {
                name: toolResult.toolName,
                toolCallId: toolResult.toolCallId,
                result: toolResult.output,
              });
            }
          }
        },
        onError: ({ error }) => {
          this.emit("error", error);
        },
      });
      const streamResult = await processFullStream(result, {
        onTextDelta: (delta) => this.speech.processTextDelta(delta),
        onTextEnd: () => this.speech.flushPendingText(),
        sendMessage: (msg) => this.ws.send(msg),
        emitEvent: (event, data) => this.emit(event, data),
      });
      // Add assistant response to history
      if (streamResult.fullText) {
        this.conversation.addMessage({
          role: "assistant",
          content: streamResult.fullText,
        });
      }
      // Flush any remaining speech
      this.speech.flushPendingText();
      // Wait for all speech chunks to complete
      if (this.speech.queueDonePromise) {
        await this.speech.queueDonePromise;
      }
      return streamResult.fullText;
    } catch (error) {
      // Clean up speech state on error
      this.speech.reset();
      throw error;
    } finally {
      this._isProcessing = false;
      this.currentStreamAbortController = undefined;
    }
  }
  // ── Private: helpers ──────────────────────────────────
  /** Guard: throw if destroy() has been called. */
  private ensureNotDestroyed(): void {
    if (this.isDestroyed) {
      throw new Error("VoiceAgent has been destroyed and cannot be used");
    }
  }
  /**
   * Clean up all in-flight state when the connection drops.
   */
  private cleanupOnDisconnect(): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.reset();
    this._isProcessing = false;
    this.inputQueue.rejectAll(new Error("Connection closed"));
  }
  /**
   * Forward select events from a child emitter to this agent.
   */
  private bubbleEvents(source: EventEmitter, events: string[]): void {
    for (const event of events) {
      source.on(event, (...args: any[]) => this.emit(event, ...args));
    }
  }
}

View File

@@ -11,42 +11,14 @@ import {
type TranscriptionModel,
type SpeechModel,
} from "ai";
/**
* Represents a chunk of text to be converted to speech
*/
interface SpeechChunk {
id: number;
text: string;
audioPromise?: Promise<Uint8Array | null>;
}
/**
* Configuration for streaming speech behavior
*/
interface StreamingSpeechConfig {
/** Minimum characters before generating speech for a chunk */
minChunkSize: number;
/** Maximum characters per chunk (will split at sentence boundary before this) */
maxChunkSize: number;
/** Whether to enable parallel TTS generation */
parallelGeneration: boolean;
/** Maximum number of parallel TTS requests */
maxParallelRequests: number;
}
/**
* Configuration for conversation history memory management
*/
interface HistoryConfig {
/** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
maxMessages: number;
/** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
maxTotalChars: number;
}
/** Default maximum audio input size (10 MB) */
const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
import {
type SpeechChunk,
type StreamingSpeechConfig,
type HistoryConfig,
DEFAULT_STREAMING_SPEECH_CONFIG,
DEFAULT_HISTORY_CONFIG,
DEFAULT_MAX_AUDIO_SIZE,
} from "./types";
export interface VoiceAgentOptions {
model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
@@ -67,6 +39,25 @@ export interface VoiceAgentOptions {
maxAudioInputSize?: number;
}
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export class VoiceAgent extends EventEmitter {
private socket?: WebSocket;
private tools: Record<string, Tool> = {};
@@ -118,7 +109,7 @@ export class VoiceAgent extends EventEmitter {
this.endpoint = options.endpoint;
this.voice = options.voice || "alloy";
this.speechInstructions = options.speechInstructions;
this.outputFormat = options.outputFormat || "mp3";
this.outputFormat = options.outputFormat || "opus";
this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
if (options.tools) {
this.tools = { ...options.tools };
@@ -126,17 +117,13 @@ export class VoiceAgent extends EventEmitter {
// Initialize streaming speech config with defaults
this.streamingSpeechConfig = {
minChunkSize: 50,
maxChunkSize: 200,
parallelGeneration: true,
maxParallelRequests: 3,
...DEFAULT_STREAMING_SPEECH_CONFIG,
...options.streamingSpeech,
};
// Initialize history config with defaults
this.historyConfig = {
maxMessages: 100,
maxTotalChars: 0, // unlimited by default
...DEFAULT_HISTORY_CONFIG,
...options.history,
};
}
@@ -727,6 +714,10 @@ export class VoiceAgent extends EventEmitter {
* Attach an existing WebSocket (server-side usage).
* Use this when a WS server accepts a connection and you want the
* agent to handle messages on that socket.
*
* **Note:** Calling this while a socket is already attached will cleanly
* tear down the previous connection first. Each `VoiceAgent` instance
* supports only one socket at a time — create a new agent per user.
*/
public handleSocket(socket: WebSocket): void {
this.ensureNotDestroyed();

View File

@@ -0,0 +1,122 @@
import { EventEmitter } from "events";
import { type ModelMessage } from "ai";
import { type HistoryConfig, DEFAULT_HISTORY_CONFIG } from "../types";
export interface ConversationManagerOptions {
  history?: Partial<HistoryConfig>;
}

/**
 * Manages conversation history (ModelMessage[]) with configurable
 * limits on message count and total character size.
 *
 * Emits:
 *  - "history_trimmed" ({ removedCount, reason }) when old messages are dropped
 *  - "history_cleared" when the history is reset
 */
export class ConversationManager extends EventEmitter {
  private conversationHistory: ModelMessage[] = [];
  private historyConfig: HistoryConfig;

  constructor(options: ConversationManagerOptions = {}) {
    super();
    this.historyConfig = {
      ...DEFAULT_HISTORY_CONFIG,
      ...options.history,
    };
  }

  /**
   * Add a message to history and trim if needed.
   */
  addMessage(message: ModelMessage): void {
    this.conversationHistory.push(message);
    this.trimHistory();
  }

  /**
   * Get a copy of the current history.
   */
  getHistory(): ModelMessage[] {
    return [...this.conversationHistory];
  }

  /**
   * Get a direct reference to the history array.
   * Use with caution — prefer getHistory() for safety.
   */
  getHistoryRef(): ModelMessage[] {
    return this.conversationHistory;
  }

  /**
   * Replace the entire conversation history.
   */
  setHistory(history: ModelMessage[]): void {
    this.conversationHistory = [...history];
  }

  /**
   * Clear all conversation history.
   */
  clearHistory(): void {
    this.conversationHistory = [];
    this.emit("history_cleared");
  }

  /**
   * Get the number of messages in history.
   */
  get length(): number {
    return this.conversationHistory.length;
  }

  /**
   * Character count of a message's content. Non-string content is
   * measured by its JSON serialization.
   */
  private static messageChars(msg: ModelMessage): number {
    const content =
      typeof msg.content === "string"
        ? msg.content
        : JSON.stringify(msg.content);
    return content.length;
  }

  /**
   * Trim conversation history to stay within configured limits.
   * Removes oldest messages, preferring pairs to preserve user/assistant turns.
   */
  private trimHistory(): void {
    const { maxMessages, maxTotalChars } = this.historyConfig;

    // Trim by message count
    if (maxMessages > 0 && this.conversationHistory.length > maxMessages) {
      const excess = this.conversationHistory.length - maxMessages;
      // Round up to an even number to preserve turn pairs, but never remove
      // every message. (Previously, maxMessages=1 with 2 messages rounded the
      // excess up to 2 and wiped the history — including the message that was
      // just added.)
      const toRemove = Math.min(
        excess % 2 === 0 ? excess : excess + 1,
        this.conversationHistory.length - 1
      );
      this.conversationHistory.splice(0, toRemove);
      this.emit("history_trimmed", {
        removedCount: toRemove,
        reason: "max_messages",
      });
    }

    // Trim by total character count
    if (maxTotalChars > 0) {
      let totalChars = this.conversationHistory.reduce(
        (sum, msg) => sum + ConversationManager.messageChars(msg),
        0
      );
      let removedCount = 0;
      // Always keep at least the last two messages (one turn pair),
      // even if they alone exceed the character budget.
      while (
        totalChars > maxTotalChars &&
        this.conversationHistory.length > 2
      ) {
        const removed = this.conversationHistory.shift();
        if (removed) {
          totalChars -= ConversationManager.messageChars(removed);
          removedCount++;
        }
      }
      if (removedCount > 0) {
        this.emit("history_trimmed", {
          removedCount,
          reason: "max_total_chars",
        });
      }
    }
  }
}

71
src/core/InputQueue.ts Normal file
View File

@@ -0,0 +1,71 @@
/**
 * A generic serial input queue that ensures only one processor runs at a time.
 *
 * @template T The shape of each queued item (must include resolve/reject)
 */
export interface QueueItem<T = string> {
  resolve: (v: T) => void;
  reject: (e: unknown) => void;
}

export class InputQueue<T extends QueueItem<any>> {
  private queue: T[] = [];
  private processing = false;

  /** Callback invoked for each item — must return a resolved value */
  public processor: (item: T) => Promise<any> = async () => "";

  /**
   * Enqueue an item for serial processing. Processing starts
   * immediately if the queue is idle.
   */
  enqueue(item: T): void {
    this.queue.push(item);
    // Fire-and-forget: drain() guards against concurrent runs and routes
    // per-item failures to each item's reject callback, so this promise
    // cannot reject.
    void this.drain();
  }

  /**
   * Reject all pending items (used on disconnect/destroy).
   * Any item currently being processed is not affected.
   */
  rejectAll(reason: Error): void {
    const pending = this.queue;
    this.queue = [];
    for (const item of pending) {
      item.reject(reason);
    }
    // NOTE: deliberately do NOT reset `processing` here. If drain() is
    // mid-await on an in-flight item, clearing the flag would let a later
    // enqueue() start a second, concurrent drain loop and break the serial
    // guarantee. drain()'s finally block clears the flag when it exits.
  }

  /**
   * Number of items waiting in the queue.
   */
  get length(): number {
    return this.queue.length;
  }

  /**
   * Whether the queue is currently processing an item.
   */
  get isProcessing(): boolean {
    return this.processing;
  }

  // ── Private ──────────────────────────────────────────

  /** Serially process queued items until the queue is empty. */
  private async drain(): Promise<void> {
    if (this.processing) return;
    this.processing = true;
    try {
      while (this.queue.length > 0) {
        const item = this.queue.shift()!;
        try {
          const result = await this.processor(item);
          item.resolve(result);
        } catch (error) {
          item.reject(error);
        }
      }
    } finally {
      this.processing = false;
    }
  }
}

453
src/core/SpeechManager.ts Normal file
View File

@@ -0,0 +1,453 @@
import { EventEmitter } from "events";
import {
experimental_generateSpeech as generateSpeech,
type SpeechModel,
} from "ai";
import {
type SpeechChunk,
type StreamingSpeechConfig,
DEFAULT_STREAMING_SPEECH_CONFIG,
} from "../types";
export interface SpeechManagerOptions {
  speechModel?: SpeechModel;
  voice?: string;
  speechInstructions?: string;
  outputFormat?: string;
  streamingSpeech?: Partial<StreamingSpeechConfig>;
}

/**
 * Manages text-to-speech generation, streaming speech chunking,
 * parallel TTS requests, and speech interruption.
 *
 * Text deltas from the LLM are buffered, split into sentence-sized
 * chunks, converted to audio (optionally several chunks in parallel),
 * and sent to the client strictly in queue order.
 *
 * Emits: "speech_start", "speech_complete", "speech_interrupted",
 * "speech_chunk_queued", "audio", "audio_chunk", "error".
 */
export class SpeechManager extends EventEmitter {
  private speechModel?: SpeechModel;
  private voice: string;
  private speechInstructions?: string;
  private outputFormat: string;
  private streamingSpeechConfig: StreamingSpeechConfig;
  // Shared abort controller for all in-flight TTS requests of the
  // current utterance; aborted as a group on interrupt/reset.
  private currentSpeechAbortController?: AbortController;
  // FIFO of chunks awaiting audio generation and delivery.
  private speechChunkQueue: SpeechChunk[] = [];
  private nextChunkId = 0;
  private _isSpeaking = false;
  // Text received from the LLM that does not yet form a complete sentence.
  private pendingTextBuffer = "";
  // Promise-based signal for speech queue completion
  private speechQueueDonePromise?: Promise<void>;
  private speechQueueDoneResolve?: () => void;
  /** Callback to send messages over the WebSocket */
  public sendMessage: (message: Record<string, unknown>) => void = () => { };

  constructor(options: SpeechManagerOptions) {
    super();
    this.speechModel = options.speechModel;
    this.voice = options.voice || "alloy";
    this.speechInstructions = options.speechInstructions;
    this.outputFormat = options.outputFormat || "opus";
    this.streamingSpeechConfig = {
      ...DEFAULT_STREAMING_SPEECH_CONFIG,
      ...options.streamingSpeech,
    };
  }

  /** True while the chunk queue is actively being processed. */
  get isSpeaking(): boolean {
    return this._isSpeaking;
  }

  /** Number of chunks still waiting in the queue. */
  get pendingChunkCount(): number {
    return this.speechChunkQueue.length;
  }

  /** Whether a TTS model was configured; without one, all speech is a no-op. */
  get hasSpeechModel(): boolean {
    return !!this.speechModel;
  }

  /**
   * Returns a promise that resolves when the speech queue is fully drained.
   * Returns undefined if there is nothing queued.
   */
  get queueDonePromise(): Promise<void> | undefined {
    return this.speechQueueDonePromise;
  }

  /**
   * Generate speech from text using the configured speech model.
   *
   * @param text        Text to synthesize.
   * @param abortSignal Optional signal to cancel the TTS request.
   * @returns The generated audio bytes.
   * @throws If no speech model is configured or the TTS request fails.
   */
  async generateSpeechFromText(
    text: string,
    abortSignal?: AbortSignal
  ): Promise<Uint8Array> {
    if (!this.speechModel) {
      throw new Error("Speech model not configured");
    }
    const result = await generateSpeech({
      model: this.speechModel,
      text,
      voice: this.voice,
      instructions: this.speechInstructions,
      outputFormat: this.outputFormat,
      abortSignal,
    });
    return result.audio.uint8Array;
  }

  /**
   * Generate speech for full text at once (non-streaming fallback).
   * Sends a single "audio" message and emits "audio"/"speech_complete".
   * Errors are logged and emitted, not thrown.
   */
  async generateAndSendSpeechFull(text: string): Promise<void> {
    if (!this.speechModel) return;
    try {
      this.emit("speech_start", { text, streaming: false });
      const audioData = await this.generateSpeechFromText(text);
      const base64Audio = Buffer.from(audioData).toString("base64");
      this.sendMessage({
        type: "audio",
        data: base64Audio,
        format: this.outputFormat,
      });
      this.emit("audio", {
        data: base64Audio,
        format: this.outputFormat,
        uint8Array: audioData,
      });
      this.emit("speech_complete", { text, streaming: false });
    } catch (error) {
      console.error("Failed to generate speech:", error);
      this.emit("error", error);
    }
  }

  /**
   * Interrupt ongoing speech generation and playback (barge-in support).
   * No-op when nothing is speaking or queued. Notifies the client via a
   * "speech_interrupted" message so it can stop playback.
   */
  interruptSpeech(reason: string = "interrupted"): void {
    if (!this._isSpeaking && this.speechChunkQueue.length === 0) {
      return;
    }
    // Abort any pending speech generation requests
    if (this.currentSpeechAbortController) {
      this.currentSpeechAbortController.abort();
      this.currentSpeechAbortController = undefined;
    }
    // Clear the speech queue
    this.speechChunkQueue = [];
    this.pendingTextBuffer = "";
    this._isSpeaking = false;
    // Resolve any pending speech-done waiters so callers can finish
    if (this.speechQueueDoneResolve) {
      this.speechQueueDoneResolve();
      this.speechQueueDoneResolve = undefined;
      this.speechQueueDonePromise = undefined;
    }
    // Notify clients to stop audio playback
    this.sendMessage({
      type: "speech_interrupted",
      reason,
    });
    this.emit("speech_interrupted", { reason });
  }

  /**
   * Process a text delta for streaming speech.
   * Call this as text chunks arrive from the LLM. Complete sentences
   * are split off the buffer and queued for TTS immediately.
   */
  processTextDelta(textDelta: string): void {
    if (!this.speechModel) return;
    this.pendingTextBuffer += textDelta;
    const [sentences, remaining] = this.extractSentences(this.pendingTextBuffer);
    this.pendingTextBuffer = remaining;
    for (const sentence of sentences) {
      this.queueSpeechChunk(sentence);
    }
  }

  /**
   * Flush any remaining text in the buffer to speech.
   * Call this when the LLM stream ends.
   */
  flushPendingText(): void {
    if (!this.speechModel || !this.pendingTextBuffer.trim()) return;
    this.queueSpeechChunk(this.pendingTextBuffer);
    this.pendingTextBuffer = "";
  }

  /**
   * Reset all speech state (used on disconnect / cleanup).
   * Like interruptSpeech() but unconditional and without client
   * notification or events.
   */
  reset(): void {
    if (this.currentSpeechAbortController) {
      this.currentSpeechAbortController.abort();
      this.currentSpeechAbortController = undefined;
    }
    this.speechChunkQueue = [];
    this.pendingTextBuffer = "";
    this._isSpeaking = false;
    if (this.speechQueueDoneResolve) {
      this.speechQueueDoneResolve();
      this.speechQueueDoneResolve = undefined;
      this.speechQueueDonePromise = undefined;
    }
  }

  // ── Private helpers ─────────────────────────────────────────

  /**
   * Extract complete sentences from text buffer.
   * Returns [extractedSentences, remainingBuffer].
   *
   * Sentences shorter than minChunkSize are merged forward (or appended to
   * the previous sentence) so TTS requests are not wastefully small. If the
   * leftover buffer exceeds maxChunkSize, it is force-split at the first
   * clause boundary (comma/semicolon/colon) past minChunkSize.
   */
  private extractSentences(text: string): [string[], string] {
    const sentences: string[] = [];
    let remaining = text;
    // Match sentences ending with . ! ? followed by space or end of string
    const sentenceEndPattern = /[.!?]+(?:\s+|$)/g;
    let lastIndex = 0;
    let match;
    while ((match = sentenceEndPattern.exec(text)) !== null) {
      const sentence = text
        .slice(lastIndex, match.index + match[0].length)
        .trim();
      if (sentence.length >= this.streamingSpeechConfig.minChunkSize) {
        sentences.push(sentence);
        lastIndex = match.index + match[0].length;
      } else if (sentences.length > 0) {
        // Append short sentence to previous one
        sentences[sentences.length - 1] += " " + sentence;
        lastIndex = match.index + match[0].length;
      }
      // NOTE(review): when the very first sentence is shorter than
      // minChunkSize, lastIndex is not advanced, so it accumulates with the
      // following text on the next match — presumably intentional merging.
    }
    remaining = text.slice(lastIndex);
    // If remaining text is too long, force split at clause boundaries
    if (remaining.length > this.streamingSpeechConfig.maxChunkSize) {
      const clausePattern = /[,;:]\s+/g;
      let clauseMatch;
      let splitIndex = 0;
      while ((clauseMatch = clausePattern.exec(remaining)) !== null) {
        if (clauseMatch.index >= this.streamingSpeechConfig.minChunkSize) {
          splitIndex = clauseMatch.index + clauseMatch[0].length;
          break;
        }
      }
      if (splitIndex > 0) {
        sentences.push(remaining.slice(0, splitIndex).trim());
        remaining = remaining.slice(splitIndex);
      }
    }
    return [sentences, remaining];
  }

  /**
   * Queue a text chunk for speech generation and start the queue
   * processor if it is idle. May kick off audio generation immediately
   * when parallel generation is enabled and a request slot is free.
   */
  private queueSpeechChunk(text: string): void {
    if (!this.speechModel || !text.trim()) return;
    // Wrap chunk ID to prevent unbounded growth in very long sessions
    if (this.nextChunkId >= Number.MAX_SAFE_INTEGER) {
      this.nextChunkId = 0;
    }
    const chunk: SpeechChunk = {
      id: this.nextChunkId++,
      text: text.trim(),
    };
    // Create the speech-done promise if not already present
    if (!this.speechQueueDonePromise) {
      this.speechQueueDonePromise = new Promise<void>((resolve) => {
        this.speechQueueDoneResolve = resolve;
      });
    }
    // Start generating audio immediately (parallel generation)
    if (this.streamingSpeechConfig.parallelGeneration) {
      const activeRequests = this.speechChunkQueue.filter(
        (c) => c.audioPromise
      ).length;
      if (activeRequests < this.streamingSpeechConfig.maxParallelRequests) {
        chunk.audioPromise = this.generateChunkAudio(chunk);
      }
    }
    this.speechChunkQueue.push(chunk);
    this.emit("speech_chunk_queued", { id: chunk.id, text: chunk.text });
    // Start processing queue if not already
    if (!this._isSpeaking) {
      this.processSpeechQueue();
    }
  }

  /**
   * Generate audio for a single chunk.
   * Returns null (never throws) when aborted or on generation failure;
   * failures are logged and emitted as "error".
   */
  private async generateChunkAudio(
    chunk: SpeechChunk
  ): Promise<Uint8Array | null> {
    // Lazily create the shared abort controller for this utterance.
    if (!this.currentSpeechAbortController) {
      this.currentSpeechAbortController = new AbortController();
    }
    try {
      console.log(
        `Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`
      );
      const audioData = await this.generateSpeechFromText(
        chunk.text,
        this.currentSpeechAbortController.signal
      );
      console.log(
        `Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`
      );
      return audioData;
    } catch (error) {
      if ((error as Error).name === "AbortError") {
        console.log(`Audio generation aborted for chunk ${chunk.id}`);
        return null;
      }
      console.error(
        `Failed to generate audio for chunk ${chunk.id}:`,
        error
      );
      this.emit("error", error);
      return null;
    }
  }

  /**
   * Process the speech queue and send audio chunks in order.
   * Runs until the queue is empty or speech is interrupted; only one
   * instance runs at a time (guarded by _isSpeaking). On exit it
   * resolves queueDonePromise and notifies the client the stream ended.
   */
  private async processSpeechQueue(): Promise<void> {
    if (this._isSpeaking) return;
    this._isSpeaking = true;
    console.log(
      `Starting speech queue processing with ${this.speechChunkQueue.length} chunks`
    );
    this.emit("speech_start", { streaming: true });
    this.sendMessage({ type: "speech_stream_start" });
    try {
      while (this.speechChunkQueue.length > 0) {
        // Peek (don't shift) so an interrupt that clears the queue while we
        // await audio doesn't leave a half-processed chunk behind.
        const chunk = this.speechChunkQueue[0];
        console.log(
          `Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`
        );
        // Ensure audio generation has started
        if (!chunk.audioPromise) {
          chunk.audioPromise = this.generateChunkAudio(chunk);
        }
        // Wait for this chunk's audio
        const audioData = await chunk.audioPromise;
        // Check if we were interrupted while waiting
        if (!this._isSpeaking) {
          console.log(`Speech interrupted during chunk #${chunk.id}`);
          break;
        }
        // Remove from queue after processing
        this.speechChunkQueue.shift();
        if (audioData) {
          const base64Audio = Buffer.from(audioData).toString("base64");
          console.log(
            `Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`
          );
          // Send audio chunk via WebSocket
          this.sendMessage({
            type: "audio_chunk",
            chunkId: chunk.id,
            data: base64Audio,
            format: this.outputFormat,
            text: chunk.text,
          });
          // Emit for local handling
          this.emit("audio_chunk", {
            chunkId: chunk.id,
            data: base64Audio,
            format: this.outputFormat,
            text: chunk.text,
            uint8Array: audioData,
          });
        } else {
          console.log(`No audio data generated for chunk #${chunk.id}`);
        }
        // Start generating next chunks in parallel
        if (this.streamingSpeechConfig.parallelGeneration) {
          const activeRequests = this.speechChunkQueue.filter(
            (c) => c.audioPromise
          ).length;
          const toStart = Math.min(
            this.streamingSpeechConfig.maxParallelRequests - activeRequests,
            this.speechChunkQueue.length
          );
          if (toStart > 0) {
            console.log(
              `Starting parallel generation for ${toStart} more chunks`
            );
            for (let i = 0; i < toStart; i++) {
              const nextChunk = this.speechChunkQueue.find(
                (c) => !c.audioPromise
              );
              if (nextChunk) {
                nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
              }
            }
          }
        }
      }
    } catch (error) {
      console.error("Error in speech queue processing:", error);
      this.emit("error", error);
    } finally {
      this._isSpeaking = false;
      this.currentSpeechAbortController = undefined;
      // Signal that the speech queue is fully drained
      if (this.speechQueueDoneResolve) {
        this.speechQueueDoneResolve();
        this.speechQueueDoneResolve = undefined;
        this.speechQueueDonePromise = undefined;
      }
      console.log(`Speech queue processing complete`);
      this.sendMessage({ type: "speech_stream_end" });
      this.emit("speech_complete", { streaming: true });
    }
  }
}

293
src/core/StreamProcessor.ts Normal file
View File

@@ -0,0 +1,293 @@
import { type streamText } from "ai";
/**
 * Result of processing a full LLM stream.
 * Aggregates everything collected while iterating `fullStream`.
 */
export interface StreamResult {
  /** Concatenation of all text deltas. */
  fullText: string;
  /** Concatenation of all reasoning deltas (empty string if none). */
  fullReasoning: string;
  allToolCalls: Array<{
    toolName: string;
    toolCallId: string;
    input: unknown;
  }>;
  allToolResults: Array<{
    toolName: string;
    toolCallId: string;
    output: unknown;
  }>;
  /** Raw "source" stream parts, in arrival order. */
  allSources: Array<unknown>;
  /** The `file` payloads of "file" stream parts, in arrival order. */
  allFiles: Array<unknown>;
}

export interface StreamProcessorCallbacks {
  /** Called when a text delta arrives (for streaming speech, etc.) */
  onTextDelta?: (text: string) => void;
  /** Called when a text-end part arrives (flush speech, etc.) */
  onTextEnd?: () => void;
  /** Send a WebSocket message */
  sendMessage: (message: Record<string, unknown>) => void;
  /** Emit an event on the agent */
  emitEvent: (event: string, data?: unknown) => void;
}
/**
 * Processes the fullStream from an AI SDK `streamText` call,
 * forwarding events to WebSocket clients and collecting the complete response.
 *
 * This is a standalone function (not a class) because it has no persistent state.
 *
 * @param result              The object returned by `streamText`; only its
 *                            `fullStream` async iterable is consumed.
 * @param callbacks           Sinks for WebSocket messages, agent events, and
 *                            text-delta/text-end hooks (used for speech).
 * @param extraResponseFields Extra fields merged into the final
 *                            "response_complete" message.
 * @returns The aggregated text, reasoning, tool calls/results, sources, files.
 */
export async function processFullStream(
  result: ReturnType<typeof streamText>,
  callbacks: StreamProcessorCallbacks,
  extraResponseFields?: Record<string, unknown>
): Promise<StreamResult> {
  const { onTextDelta, onTextEnd, sendMessage, emitEvent } = callbacks;
  let fullText = "";
  let fullReasoning = "";
  const allToolCalls: StreamResult["allToolCalls"] = [];
  const allToolResults: StreamResult["allToolResults"] = [];
  const allSources: unknown[] = [];
  const allFiles: unknown[] = [];
  for await (const part of result.fullStream) {
    // Each part is forwarded as a snake_case WebSocket message; part types
    // not listed below are silently ignored.
    switch (part.type) {
      // ── Stream lifecycle ──────────────────────────────
      case "start":
        sendMessage({ type: "stream_start" });
        break;
      case "finish":
        // Emit the fully accumulated assistant text once at the end.
        emitEvent("text", { role: "assistant", text: fullText });
        sendMessage({
          type: "stream_finish",
          finishReason: part.finishReason,
          usage: part.totalUsage,
        });
        break;
      case "error":
        emitEvent("error", part.error);
        sendMessage({
          type: "stream_error",
          error: String(part.error),
        });
        break;
      case "abort":
        emitEvent("abort", { reason: part.reason });
        sendMessage({
          type: "stream_abort",
          reason: part.reason,
        });
        break;
      // ── Step lifecycle ────────────────────────────────
      case "start-step":
        sendMessage({
          type: "step_start",
          warnings: part.warnings,
        });
        break;
      case "finish-step":
        sendMessage({
          type: "step_finish",
          finishReason: part.finishReason,
          usage: part.usage,
        });
        break;
      // ── Text streaming ────────────────────────────────
      case "text-start":
        sendMessage({ type: "text_start", id: part.id });
        break;
      case "text-delta":
        fullText += part.text;
        // Hook for streaming speech: feed deltas as they arrive.
        onTextDelta?.(part.text);
        sendMessage({
          type: "text_delta",
          id: part.id,
          text: part.text,
        });
        break;
      case "text-end":
        // Hook for streaming speech: flush any buffered partial sentence.
        onTextEnd?.();
        sendMessage({ type: "text_end", id: part.id });
        break;
      // ── Reasoning streaming ───────────────────────────
      case "reasoning-start":
        sendMessage({ type: "reasoning_start", id: part.id });
        break;
      case "reasoning-delta":
        fullReasoning += part.text;
        sendMessage({
          type: "reasoning_delta",
          id: part.id,
          text: part.text,
        });
        break;
      case "reasoning-end":
        sendMessage({ type: "reasoning_end", id: part.id });
        break;
      // ── Tool input streaming ──────────────────────────
      case "tool-input-start":
        sendMessage({
          type: "tool_input_start",
          id: part.id,
          toolName: part.toolName,
        });
        break;
      case "tool-input-delta":
        sendMessage({
          type: "tool_input_delta",
          id: part.id,
          delta: part.delta,
        });
        break;
      case "tool-input-end":
        sendMessage({ type: "tool_input_end", id: part.id });
        break;
      // ── Tool execution ────────────────────────────────
      case "tool-call":
        allToolCalls.push({
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          input: part.input,
        });
        sendMessage({
          type: "tool_call",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          input: part.input,
        });
        break;
      case "tool-result":
        allToolResults.push({
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          output: part.output,
        });
        sendMessage({
          type: "tool_result",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          result: part.output,
        });
        break;
      case "tool-error":
        sendMessage({
          type: "tool_error",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          error: String(part.error),
        });
        break;
      // ── Sources and files ─────────────────────────────
      case "source":
        allSources.push(part);
        sendMessage({
          type: "source",
          source: part,
        });
        break;
      case "file":
        allFiles.push(part.file);
        sendMessage({
          type: "file",
          file: part.file,
        });
        break;
    }
  }
  // Send the complete response
  sendMessage({
    type: "response_complete",
    text: fullText,
    reasoning: fullReasoning || undefined,
    toolCalls: allToolCalls,
    toolResults: allToolResults,
    sources: allSources.length > 0 ? allSources : undefined,
    files: allFiles.length > 0 ? allFiles : undefined,
    ...extraResponseFields,
  });
  return {
    fullText,
    fullReasoning,
    allToolCalls,
    allToolResults,
    allSources,
    allFiles,
  };
}
/**
 * Handle onChunk callback events and re-emit them as "chunk:*" agent events.
 * Chunk types not listed here are ignored.
 */
export function handleStreamChunk(
  chunk: any,
  emitEvent: (event: string, data?: unknown) => void
): void {
  const kind = chunk.type as string;
  if (kind === "text-delta") {
    emitEvent("chunk:text_delta", { id: chunk.id, text: chunk.text });
  } else if (kind === "reasoning-delta") {
    emitEvent("chunk:reasoning_delta", {
      id: chunk.id,
      text: chunk.text,
    });
  } else if (kind === "tool-call") {
    emitEvent("chunk:tool_call", {
      toolName: chunk.toolName,
      toolCallId: chunk.toolCallId,
      input: chunk.input,
    });
  } else if (kind === "tool-result") {
    emitEvent("chunk:tool_result", {
      toolName: chunk.toolName,
      toolCallId: chunk.toolCallId,
      result: chunk.output,
    });
  } else if (kind === "tool-input-start") {
    emitEvent("chunk:tool_input_start", {
      id: chunk.id,
      toolName: chunk.toolName,
    });
  } else if (kind === "tool-input-delta") {
    emitEvent("chunk:tool_input_delta", {
      id: chunk.id,
      delta: chunk.delta,
    });
  } else if (kind === "source") {
    // The whole chunk is forwarded for sources (it carries the payload).
    emitEvent("chunk:source", chunk);
  }
}

View File

@@ -0,0 +1,142 @@
import { EventEmitter } from "events";
import {
experimental_transcribe as transcribe,
type TranscriptionModel,
} from "ai";
import { DEFAULT_MAX_AUDIO_SIZE } from "../types";
export interface TranscriptionManagerOptions {
  transcriptionModel?: TranscriptionModel;
  maxAudioInputSize?: number;
}

/**
 * Handles audio transcription using the AI SDK transcription model
 * and validation of incoming audio data.
 *
 * Emits: "transcription", "audio_received", "warning", "error".
 */
export class TranscriptionManager extends EventEmitter {
  private transcriptionModel?: TranscriptionModel;
  private maxAudioInputSize: number;

  /** Callback to send messages over the WebSocket */
  public sendMessage: (message: Record<string, unknown>) => void = () => {};

  constructor(options: TranscriptionManagerOptions = {}) {
    super();
    const { transcriptionModel, maxAudioInputSize } = options;
    this.transcriptionModel = transcriptionModel;
    this.maxAudioInputSize = maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
  }

  /** Whether a transcription model was configured. */
  get hasTranscriptionModel(): boolean {
    return this.transcriptionModel != null;
  }

  /**
   * Transcribe audio data to text.
   * Emits "transcription" and echoes the result to the client.
   * @throws if no model is configured or the transcription request fails.
   */
  async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    const model = this.transcriptionModel;
    if (!model) {
      throw new Error("Transcription model not configured");
    }
    console.log(
      `Sending ${audioData.byteLength} bytes to Whisper for transcription`
    );
    try {
      const result = await transcribe({ model, audio: audioData });
      const { text, language } = result;
      console.log(
        `Whisper transcription result: "${text}", language: ${language || "unknown"}`
      );
      this.emit("transcription", { text, language });
      // Send transcription to client for immediate feedback
      this.sendMessage({
        type: "transcription_result",
        text,
        language,
      });
      return text;
    } catch (error) {
      console.error("Whisper transcription failed:", error);
      throw error;
    }
  }

  /**
   * Process incoming base64-encoded audio: validate, decode, transcribe.
   * Returns the transcribed text, or null if invalid / empty.
   */
  async processAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<string | null> {
    if (!this.transcriptionModel) {
      const error = new Error(
        "Transcription model not configured for audio input"
      );
      this.emit("error", error);
      this.sendMessage({ type: "error", error: error.message });
      return null;
    }
    try {
      const decoded = Buffer.from(base64Audio, "base64");
      const byteCount = decoded.length;

      // Validate audio size
      if (byteCount > this.maxAudioInputSize) {
        const sizeMB = (byteCount / (1024 * 1024)).toFixed(1);
        const maxMB = (this.maxAudioInputSize / (1024 * 1024)).toFixed(1);
        this.emit(
          "error",
          new Error(
            `Audio input too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`
          )
        );
        return null;
      }
      if (byteCount === 0) {
        this.emit("warning", "Received empty audio data");
        return null;
      }

      this.emit("audio_received", { size: byteCount, format });
      console.log(
        `Processing audio input: ${byteCount} bytes, format: ${format || "unknown"}`
      );

      const transcribedText = await this.transcribeAudio(decoded);
      console.log(`Transcribed text: "${transcribedText}"`);

      if (transcribedText.trim().length === 0) {
        this.emit("warning", "Transcription returned empty text");
        this.sendMessage({
          type: "transcription_error",
          error: "Whisper returned empty text",
        });
        return null;
      }
      return transcribedText;
    } catch (error) {
      console.error("Failed to process audio input:", error);
      this.emit("error", error);
      this.sendMessage({
        type: "transcription_error",
        error: `Transcription failed: ${(error as Error).message || String(error)}`,
      });
      return null;
    }
  }
}

View File

@@ -0,0 +1,133 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
/**
 * Manages a single WebSocket connection lifecycle.
 * Handles connecting, attaching existing sockets, sending messages,
 * and clean disconnection.
 *
 * Emits: "connected", "disconnected", "message" (parsed JSON), "error".
 */
export class WebSocketManager extends EventEmitter {
  private socket?: WebSocket;
  private _isConnected = false;

  /** True while the managed socket is considered open. */
  get isConnected(): boolean {
    return this._isConnected;
  }

  /** The underlying socket, if one is attached. */
  get currentSocket(): WebSocket | undefined {
    return this.socket;
  }

  /**
   * Connect to a WebSocket server by URL.
   * Resolves once the socket opens; rejects on the first socket error.
   */
  connect(url: string): Promise<void> {
    // Clean up any existing connection first
    if (this.socket) {
      this.disconnect();
    }
    return new Promise<void>((resolve, reject) => {
      try {
        const ws = new WebSocket(url);
        this.socket = ws;
        this.attachListeners();
        ws.once("open", () => {
          this._isConnected = true;
          this.emit("connected");
          resolve();
        });
        ws.once("error", reject);
      } catch (error) {
        reject(error);
      }
    });
  }

  /**
   * Attach an existing WebSocket (server-side usage).
   */
  handleSocket(socket: WebSocket): void {
    // Clean up any existing connection first
    if (this.socket) {
      this.disconnect();
    }
    this.socket = socket;
    this._isConnected = true;
    this.attachListeners();
    this.emit("connected");
  }

  /**
   * Send a JSON message via WebSocket if connected.
   * Gracefully handles send failures (e.g., socket closing mid-send).
   */
  send(message: Record<string, unknown>): void {
    const ws = this.socket;
    if (!ws || !this._isConnected) return;
    try {
      if (ws.readyState !== WebSocket.OPEN) {
        console.warn(`Cannot send message, socket state: ${ws.readyState}`);
        return;
      }
      ws.send(JSON.stringify(message));
    } catch (error) {
      // Socket may have closed between the readyState check and send()
      console.error("Failed to send WebSocket message:", error);
      this.emit("error", error);
    }
  }

  /**
   * Disconnect and clean up the current socket.
   */
  disconnect(): void {
    const ws = this.socket;
    if (!ws) return;
    try {
      ws.removeAllListeners();
      const closable =
        ws.readyState === WebSocket.OPEN ||
        ws.readyState === WebSocket.CONNECTING;
      if (closable) {
        ws.close();
      }
    } catch {
      // Ignore close errors — socket may already be dead
    }
    this.socket = undefined;
    this._isConnected = false;
  }

  /**
   * Attach internal event listeners on the current socket.
   */
  private attachListeners(): void {
    const ws = this.socket;
    if (!ws) return;
    ws.on("message", (data) => {
      try {
        this.emit("message", JSON.parse(data.toString()));
      } catch (err) {
        console.error("Failed to parse WebSocket message:", err);
        this.emit("error", err);
      }
    });
    ws.on("close", () => {
      this._isConnected = false;
      this.emit("disconnected");
    });
    ws.on("error", (error) => {
      console.error("WebSocket error:", error);
      this.emit("error", error);
    });
  }
}

17
src/core/index.ts Normal file
View File

@@ -0,0 +1,17 @@
// Barrel file for the core layer: re-exports each core component together
// with its public option/result types so consumers can import everything
// from a single path.
export { WebSocketManager } from "./WebSocketManager";
export { SpeechManager, type SpeechManagerOptions } from "./SpeechManager";
export {
ConversationManager,
type ConversationManagerOptions,
} from "./ConversationManager";
export {
TranscriptionManager,
type TranscriptionManagerOptions,
} from "./TranscriptionManager";
// Stream processing is exposed as free functions rather than a class.
export {
processFullStream,
handleStreamChunk,
type StreamResult,
type StreamProcessorCallbacks,
} from "./StreamProcessor";
export { InputQueue, type QueueItem } from "./InputQueue";

View File

@@ -1 +1,22 @@
// Public package entry point.
// Agents
// NOTE(review): VoiceAgent is re-exported from the new-architecture module
// ("./VoiceAgent.new"). The legacy re-export from "./VoiceAgent" was removed —
// keeping both lines exports the duplicate symbols VoiceAgent and
// VoiceAgentOptions, which is a compile error.
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent.new";
export {
  VideoAgent,
  type VideoAgentOptions,
  type VideoFrame,
  type AudioData,
  type VideoAgentConfig,
  type FrameContext,
  type FrameTriggerReason,
} from "./VideoAgent.new";
// Shared types
export {
  type SpeechChunk,
  type StreamingSpeechConfig,
  type HistoryConfig,
  type StopWhenCondition,
  DEFAULT_STREAMING_SPEECH_CONFIG,
  DEFAULT_HISTORY_CONFIG,
  DEFAULT_MAX_AUDIO_SIZE,
} from "./types";

60
src/types.ts Normal file
View File

@@ -0,0 +1,60 @@
import type { streamText } from "ai";
/**
 * Represents a chunk of text to be converted to speech
 */
export interface SpeechChunk {
  /** Numeric identifier for this chunk — presumably a sequence number within the response; confirm in SpeechManager. */
  id: number;
  /** The text content to synthesize. */
  text: string;
  /** In-flight TTS result for this chunk; resolves to audio bytes, or null (null presumably signals generation failure — confirm against SpeechManager). */
  audioPromise?: Promise<Uint8Array | null>;
}
/**
 * Configuration for streaming speech behavior — how LLM output text is
 * chunked and how aggressively TTS is generated for those chunks.
 */
export interface StreamingSpeechConfig {
  /** Minimum characters before generating speech for a chunk */
  minChunkSize: number;
  /** Maximum characters per chunk (will split at sentence boundary before this) */
  maxChunkSize: number;
  /** Whether to enable parallel TTS generation */
  parallelGeneration: boolean;
  /** Maximum number of parallel TTS requests (only relevant when parallelGeneration is true) */
  maxParallelRequests: number;
}
/**
 * Configuration for conversation history memory management.
 * Both limits trim the oldest messages first when exceeded.
 */
export interface HistoryConfig {
  /** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
  maxMessages: number;
  /** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
  maxTotalChars: number;
}
/**
 * Baseline streaming-speech tuning: chunks between 50 and 200 characters,
 * with TTS generated in parallel, at most three requests at a time.
 */
export const DEFAULT_STREAMING_SPEECH_CONFIG: StreamingSpeechConfig = {
  minChunkSize: 50,
  maxChunkSize: 200,
  maxParallelRequests: 3,
  parallelGeneration: true,
};
/**
 * Baseline history limits: retain at most 100 messages, with no cap on
 * total character count.
 */
export const DEFAULT_HISTORY_CONFIG: HistoryConfig = {
  maxTotalChars: 0, // 0 = no character cap
  maxMessages: 100,
};
/** Upper bound on the size of a single audio input payload: 10 MB. */
export const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
/**
 * Stop condition accepted by the AI SDK's `streamText` call. Derived from
 * the `stopWhen` option of its first argument so the alias automatically
 * tracks whatever the installed `ai` package version declares.
 */
export type StopWhenCondition = NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;