Refactor VoiceAgent: extract types and default configurations into a separate types.ts file; remove the unused StreamBuffer file

2026-03-02 10:36:37 +00:00 · 2026-02-19 16:01:25 +05:30
parent ce10d521f3
commit ac505c4ed9
23 changed files with 3570 additions and 82 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ node_modules
 .marscode

 # dist
+
+HOW_*.md
--- a/dist/VideoAgent.d.ts
+++ b/dist/VideoAgent.d.ts
@@ -0,0 +1,339 @@
+import { WebSocket } from "ws";
+import { EventEmitter } from "events";
+import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
+import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
+/**
+ * Trigger reasons for frame capture
+ */
+type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
+/**
+ * Video frame data structure sent to/from the client
+ */
+interface VideoFrame {
+    type: "video_frame";
+    sessionId: string;
+    sequence: number;
+    timestamp: number;
+    triggerReason: FrameTriggerReason;
+    previousFrameRef?: string;
+    image: {
+        data: string;
+        format: string;
+        width: number;
+        height: number;
+    };
+}
+/**
+ * Audio data structure
+ */
+interface AudioData {
+    type: "audio";
+    sessionId: string;
+    data: string;
+    format: string;
+    sampleRate?: number;
+    duration?: number;
+    timestamp: number;
+}
+/**
+ * Backend configuration for video processing
+ */
+interface VideoAgentConfig {
+    /** Maximum frames to keep in context buffer for conversation history */
+    maxContextFrames: number;
+}
+/**
+ * Frame context for maintaining visual conversation history
+ */
+interface FrameContext {
+    sequence: number;
+    timestamp: number;
+    triggerReason: FrameTriggerReason;
+    frameHash: string;
+    description?: string;
+}
+export interface VideoAgentOptions {
+    model: LanguageModel;
+    transcriptionModel?: TranscriptionModel;
+    speechModel?: SpeechModel;
+    instructions?: string;
+    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
+    tools?: Record<string, Tool>;
+    endpoint?: string;
+    voice?: string;
+    speechInstructions?: string;
+    outputFormat?: string;
+    /** Configuration for streaming speech generation */
+    streamingSpeech?: Partial<StreamingSpeechConfig>;
+    /** Configuration for conversation history memory limits */
+    history?: Partial<HistoryConfig>;
+    /** Maximum audio input size in bytes (default: 10 MB) */
+    maxAudioInputSize?: number;
+    /** Maximum frame input size in bytes (default: 5 MB) */
+    maxFrameInputSize?: number;
+    /** Maximum frames to keep in context buffer (default: 10) */
+    maxContextFrames?: number;
+    /** Session ID for this video agent instance */
+    sessionId?: string;
+}
+export declare class VideoAgent extends EventEmitter {
+    private socket?;
+    private tools;
+    private model;
+    private transcriptionModel?;
+    private speechModel?;
+    private instructions;
+    private stopWhen;
+    private endpoint?;
+    private isConnected;
+    private conversationHistory;
+    private voice;
+    private speechInstructions?;
+    private outputFormat;
+    private isProcessing;
+    private isDestroyed;
+    private sessionId;
+    private frameSequence;
+    private lastFrameTimestamp;
+    private lastFrameHash?;
+    private frameContextBuffer;
+    private currentFrameData?;
+    private videoConfig;
+    private inputQueue;
+    private processingQueue;
+    private currentStreamAbortController?;
+    private historyConfig;
+    private maxAudioInputSize;
+    private maxFrameInputSize;
+    private streamingSpeechConfig;
+    private currentSpeechAbortController?;
+    private speechChunkQueue;
+    private nextChunkId;
+    private isSpeaking;
+    private pendingTextBuffer;
+    private speechQueueDonePromise?;
+    private speechQueueDoneResolve?;
+    constructor(options: VideoAgentOptions);
+    /**
+     * Generate a unique session ID
+     */
+    private generateSessionId;
+    /**
+     * Simple hash function for frame comparison
+     */
+    private hashFrame;
+    /**
+     * Ensure the agent has not been destroyed. Throws if it has.
+     */
+    private ensureNotDestroyed;
+    /**
+     * Get current video agent configuration
+     */
+    getConfig(): VideoAgentConfig;
+    /**
+     * Update video agent configuration
+     */
+    updateConfig(config: Partial<VideoAgentConfig>): void;
+    private setupListeners;
+    /**
+     * Handle client ready signal
+     */
+    private handleClientReady;
+    /**
+     * Handle incoming video frame
+     */
+    private handleVideoFrame;
+    /**
+     * Add frame to context buffer
+     */
+    private addFrameToContext;
+    /**
+     * Request client to capture and send a frame
+     */
+    requestFrameCapture(reason: FrameTriggerReason): void;
+    /**
+     * Clean up all in-flight state when the connection drops.
+     */
+    private cleanupOnDisconnect;
+    registerTools(tools: Record<string, Tool>): void;
+    /**
+     * Transcribe audio data to text using the configured transcription model
+     */
+    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
+    /**
+     * Generate speech from text using the configured speech model
+     */
+    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
+    /**
+     * Interrupt ongoing speech generation and playback
+     */
+    interruptSpeech(reason?: string): void;
+    /**
+     * Interrupt both the current LLM stream and ongoing speech
+     */
+    interruptCurrentResponse(reason?: string): void;
+    /**
+     * Extract complete sentences from text buffer
+     */
+    private extractSentences;
+    /**
+     * Trim conversation history to stay within configured limits
+     */
+    private trimHistory;
+    /**
+     * Queue a text chunk for speech generation
+     */
+    private queueSpeechChunk;
+    /**
+     * Generate audio for a single chunk
+     */
+    private generateChunkAudio;
+    /**
+     * Process the speech queue and send audio chunks in order
+     */
+    private processSpeechQueue;
+    /**
+     * Process text delta for streaming speech
+     */
+    private processTextForStreamingSpeech;
+    /**
+     * Flush any remaining text in the buffer to speech
+     */
+    private flushStreamingSpeech;
+    /**
+     * Process incoming audio data: transcribe and generate response
+     */
+    private processAudioInput;
+    connect(url?: string): Promise<void>;
+    /**
+     * Attach an existing WebSocket (server-side usage)
+     */
+    handleSocket(socket: WebSocket): void;
+    /**
+     * Send text input for processing (bypasses transcription)
+     */
+    sendText(text: string): Promise<string>;
+    /**
+     * Send audio data to be transcribed and processed
+     */
+    sendAudio(audioData: string): Promise<void>;
+    /**
+     * Send raw audio buffer to be transcribed and processed
+     */
+    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
+    /**
+     * Send a video frame with optional text query for vision analysis
+     */
+    sendFrame(frameData: string, query?: string, options?: {
+        width?: number;
+        height?: number;
+        format?: string;
+    }): Promise<string>;
+    /**
+     * Enqueue a text input for serial processing
+     */
+    private enqueueTextInput;
+    /**
+     * Enqueue a multimodal input (text + frame) for serial processing
+     */
+    private enqueueMultimodalInput;
+    /**
+     * Drain the input queue, processing one request at a time
+     */
+    private drainInputQueue;
+    /**
+     * Build the message content array for multimodal input
+     */
+    private buildMultimodalContent;
+    /**
+     * Process multimodal input (text + video frame)
+     */
+    private processMultimodalInput;
+    /**
+     * Process user input with streaming text generation
+     */
+    private processUserInput;
+    /**
+     * Handle individual stream chunks
+     */
+    private handleStreamChunk;
+    /**
+     * Process the full stream result and return the response text
+     */
+    private processStreamResult;
+    /**
+     * Send a message via WebSocket if connected
+     */
+    private sendWebSocketMessage;
+    /**
+     * Start listening for voice/video input
+     */
+    startListening(): void;
+    /**
+     * Stop listening for voice/video input
+     */
+    stopListening(): void;
+    /**
+     * Clear conversation history
+     */
+    clearHistory(): void;
+    /**
+     * Get current conversation history
+     */
+    getHistory(): ModelMessage[];
+    /**
+     * Set conversation history
+     */
+    setHistory(history: ModelMessage[]): void;
+    /**
+     * Get frame context buffer
+     */
+    getFrameContext(): FrameContext[];
+    /**
+     * Get session ID
+     */
+    getSessionId(): string;
+    /**
+     * Internal helper to close and clean up the current socket
+     */
+    private disconnectSocket;
+    /**
+     * Disconnect from WebSocket and stop all in-flight work
+     */
+    disconnect(): void;
+    /**
+     * Permanently destroy the agent, releasing all resources
+     */
+    destroy(): void;
+    /**
+     * Check if agent is connected to WebSocket
+     */
+    get connected(): boolean;
+    /**
+     * Check if agent is currently processing a request
+     */
+    get processing(): boolean;
+    /**
+     * Check if agent is currently speaking
+     */
+    get speaking(): boolean;
+    /**
+     * Get the number of pending speech chunks in the queue
+     */
+    get pendingSpeechChunks(): number;
+    /**
+     * Check if agent has been permanently destroyed
+     */
+    get destroyed(): boolean;
+    /**
+     * Get current frame sequence number
+     */
+    get currentFrameSequence(): number;
+    /**
+     * Check if there is visual context available
+     */
+    get hasVisualContext(): boolean;
+}
+export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
+export type { StreamingSpeechConfig, HistoryConfig } from "./types";
+//# sourceMappingURL=VideoAgent.d.ts.map
--- a/dist/VideoAgent.d.ts.map
+++ b/dist/VideoAgent.d.ts.map
--- a/dist/VideoAgent.js
+++ b/dist/VideoAgent.js
--- a/dist/VideoAgent.js.map
+++ b/dist/VideoAgent.js.map
--- a/dist/VoiceAgent.d.ts
+++ b/dist/VoiceAgent.d.ts
@@ -1,28 +1,7 @@
 import { WebSocket } from "ws";
 import { EventEmitter } from "events";
 import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
-/**
- * Configuration for streaming speech behavior
- */
-interface StreamingSpeechConfig {
-    /** Minimum characters before generating speech for a chunk */
-    minChunkSize: number;
-    /** Maximum characters per chunk (will split at sentence boundary before this) */
-    maxChunkSize: number;
-    /** Whether to enable parallel TTS generation */
-    parallelGeneration: boolean;
-    /** Maximum number of parallel TTS requests */
-    maxParallelRequests: number;
-}
-/**
- * Configuration for conversation history memory management
- */
-interface HistoryConfig {
-    /** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
-    maxMessages: number;
-    /** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
-    maxTotalChars: number;
-}
+import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
 export interface VoiceAgentOptions {
    model: LanguageModel;
    transcriptionModel?: TranscriptionModel;
@@ -238,5 +217,4 @@ export declare class VoiceAgent extends EventEmitter {
     */
    get destroyed(): boolean;
 }
-export {};
 //# sourceMappingURL=VoiceAgent.d.ts.map
--- a/dist/VoiceAgent.d.ts.map
+++ b/dist/VoiceAgent.d.ts.map
@@ -1 +1 @@
-{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AAWZ;;GAEG;AACH,UAAU,qBAAqB;IAC7B,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8CAA8C;IAC9C,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,UAAU,aAAa;IACrB,yHAAyH;IACzH,WAAW,EAAE,MAAM,CAAC;IACpB,6HAA6H;IAC7H,aAAa,EAAE,MAAM,CAAC;CACvB;AAKD,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAA
C,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IAkCtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;OAIG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}
+{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAInB,MAAM,SAAS,CAAC;AAEjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IA8BtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC
,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;OAIG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}
--- a/dist/VoiceAgent.js
+++ b/dist/VoiceAgent.js
@@ -4,8 +4,7 @@ exports.VoiceAgent = void 0;
 const ws_1 = require("ws");
 const events_1 = require("events");
 const ai_1 = require("ai");
-/** Default maximum audio input size (10 MB) */
-const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
+const types_1 = require("./types");
 class VoiceAgent extends events_1.EventEmitter {
    socket;
    tools = {};
@@ -52,22 +51,18 @@ class VoiceAgent extends events_1.EventEmitter {
        this.voice = options.voice || "alloy";
        this.speechInstructions = options.speechInstructions;
        this.outputFormat = options.outputFormat || "mp3";
-        this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
+        this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
        if (options.tools) {
            this.tools = { ...options.tools };
        }
        // Initialize streaming speech config with defaults
        this.streamingSpeechConfig = {
-            minChunkSize: 50,
-            maxChunkSize: 200,
-            parallelGeneration: true,
-            maxParallelRequests: 3,
+            ...types_1.DEFAULT_STREAMING_SPEECH_CONFIG,
            ...options.streamingSpeech,
        };
        // Initialize history config with defaults
        this.historyConfig = {
-            maxMessages: 100,
-            maxTotalChars: 0, // unlimited by default
+            ...types_1.DEFAULT_HISTORY_CONFIG,
            ...options.history,
        };
    }
--- a/dist/VoiceAgent.js.map
+++ b/dist/VoiceAgent.js.map
--- a/dist/index.d.ts
+++ b/dist/index.d.ts
@@ -1,2 +1,4 @@
 export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
+export { VideoAgent, type VideoAgentOptions, type VideoFrame, type AudioData, type VideoAgentConfig, type FrameContext, type FrameTriggerReason, } from "./VideoAgent";
+export { type SpeechChunk, type StreamingSpeechConfig, type HistoryConfig, type StopWhenCondition, DEFAULT_STREAMING_SPEECH_CONFIG, DEFAULT_HISTORY_CONFIG, DEFAULT_MAX_AUDIO_SIZE, } from "./types";
 //# sourceMappingURL=index.d.ts.map
--- a/dist/index.d.ts.map
+++ b/dist/index.d.ts.map
@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,cAAc,CAAC"}
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAClE,OAAO,EACH,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,KAAK,kBAAkB,GAC1B,MAAM,cAAc,CAAC;AAGtB,OAAO,EACH,KAAK,WAAW,EAChB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,+BAA+B,EAC/B,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,SAAS,CAAC"}
--- a/dist/index.js
+++ b/dist/index.js
@@ -1,6 +1,14 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.VoiceAgent = void 0;
+exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = exports.VideoAgent = exports.VoiceAgent = void 0;
+// Agents
 var VoiceAgent_1 = require("./VoiceAgent");
 Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_1.VoiceAgent; } });
+var VideoAgent_1 = require("./VideoAgent");
+Object.defineProperty(exports, "VideoAgent", { enumerable: true, get: function () { return VideoAgent_1.VideoAgent; } });
+// Shared types
+var types_1 = require("./types");
+Object.defineProperty(exports, "DEFAULT_STREAMING_SPEECH_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_STREAMING_SPEECH_CONFIG; } });
+Object.defineProperty(exports, "DEFAULT_HISTORY_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_HISTORY_CONFIG; } });
+Object.defineProperty(exports, "DEFAULT_MAX_AUDIO_SIZE", { enumerable: true, get: function () { return types_1.DEFAULT_MAX_AUDIO_SIZE; } });
 //# sourceMappingURL=index.js.map
--- a/dist/index.js.map
+++ b/dist/index.js.map
@@ -1 +1 @@
-{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,2CAAkE;AAAzD,wGAAA,UAAU,OAAA"}
+{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,SAAS;AACT,2CAAkE;AAAzD,wGAAA,UAAU,OAAA;AACnB,2CAQsB;AAPlB,wGAAA,UAAU,OAAA;AASd,eAAe;AACf,iCAQiB;AAHb,wHAAA,+BAA+B,OAAA;AAC/B,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA"}
--- a/dist/types.d.ts
+++ b/dist/types.d.ts
@@ -0,0 +1,46 @@
+import type { streamText } from "ai";
+/**
+ * Represents a chunk of text to be converted to speech
+ */
+export interface SpeechChunk {
+    id: number;
+    text: string;
+    audioPromise?: Promise<Uint8Array | null>;
+}
+/**
+ * Configuration for streaming speech behavior
+ */
+export interface StreamingSpeechConfig {
+    /** Minimum characters before generating speech for a chunk */
+    minChunkSize: number;
+    /** Maximum characters per chunk (will split at sentence boundary before this) */
+    maxChunkSize: number;
+    /** Whether to enable parallel TTS generation */
+    parallelGeneration: boolean;
+    /** Maximum number of parallel TTS requests */
+    maxParallelRequests: number;
+}
+/**
+ * Configuration for conversation history memory management
+ */
+export interface HistoryConfig {
+    /** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
+    maxMessages: number;
+    /** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
+    maxTotalChars: number;
+}
+/**
+ * Default streaming speech configuration
+ */
+export declare const DEFAULT_STREAMING_SPEECH_CONFIG: StreamingSpeechConfig;
+/**
+ * Default history configuration
+ */
+export declare const DEFAULT_HISTORY_CONFIG: HistoryConfig;
+/** Default maximum audio input size (10 MB) */
+export declare const DEFAULT_MAX_AUDIO_SIZE: number;
+/**
+ * Default stop condition type from streamText
+ */
+export type StopWhenCondition = NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
+//# sourceMappingURL=types.d.ts.map
--- a/dist/types.d.ts.map
+++ b/dist/types.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAErC;;GAEG;AACH,MAAM,WAAW,WAAW;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IAClC,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8CAA8C;IAC9C,mBAAmB,EAAE,MAAM,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC1B,yHAAyH;IACzH,WAAW,EAAE,MAAM,CAAC;IACpB,6HAA6H;IAC7H,aAAa,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,eAAO,MAAM,+BAA+B,EAAE,qBAK7C,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,sBAAsB,EAAE,aAGpC,CAAC;AAEF,+CAA+C;AAC/C,eAAO,MAAM,sBAAsB,QAAmB,CAAC;AAEvD;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC"}
--- a/dist/types.js
+++ b/dist/types.js
@@ -0,0 +1,22 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = void 0;
+/**
+ * Default streaming speech configuration
+ */
+exports.DEFAULT_STREAMING_SPEECH_CONFIG = {
+    minChunkSize: 50,
+    maxChunkSize: 200,
+    parallelGeneration: true,
+    maxParallelRequests: 3,
+};
+/**
+ * Default history configuration
+ */
+exports.DEFAULT_HISTORY_CONFIG = {
+    maxMessages: 100,
+    maxTotalChars: 0, // unlimited by default
+};
+/** Default maximum audio input size (10 MB) */
+exports.DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
+//# sourceMappingURL=types.js.map
--- a/dist/types.js.map
+++ b/dist/types.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";;;AAmCA;;GAEG;AACU,QAAA,+BAA+B,GAA0B;IAClE,YAAY,EAAE,EAAE;IAChB,YAAY,EAAE,GAAG;IACjB,kBAAkB,EAAE,IAAI;IACxB,mBAAmB,EAAE,CAAC;CACzB,CAAC;AAEF;;GAEG;AACU,QAAA,sBAAsB,GAAkB;IACjD,WAAW,EAAE,GAAG;IAChB,aAAa,EAAE,CAAC,EAAE,uBAAuB;CAC5C,CAAC;AAEF,+CAA+C;AAClC,QAAA,sBAAsB,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC"}
--- a/example/voice-client.html
+++ b/example/voice-client.html
@@ -258,7 +258,7 @@
    <!-- Connection -->
    <div class="card">
        <div class="row">
-            <input type="text" id="endpoint" value="ws://localhost:8080" placeholder="WebSocket endpoint" />
+            <input type="text" id="endpoint" value="ws://localhost:8081/ws/voice" placeholder="WebSocket endpoint" />
            <button id="connectBtn" class="primary">Connect</button>
            <button id="disconnectBtn" disabled>Disconnect</button>
        </div>
--- a/src/VideoAgent.ts
+++ b/src/VideoAgent.ts
--- a/src/VoiceAgent.ts
+++ b/src/VoiceAgent.ts
@@ -11,42 +11,14 @@ import {
  type TranscriptionModel,
  type SpeechModel,
 } from "ai";
-
-/**
- * Represents a chunk of text to be converted to speech
- */
-interface SpeechChunk {
-  id: number;
-  text: string;
-  audioPromise?: Promise<Uint8Array | null>;
-}
-
-/**
- * Configuration for streaming speech behavior
- */
-interface StreamingSpeechConfig {
-  /** Minimum characters before generating speech for a chunk */
-  minChunkSize: number;
-  /** Maximum characters per chunk (will split at sentence boundary before this) */
-  maxChunkSize: number;
-  /** Whether to enable parallel TTS generation */
-  parallelGeneration: boolean;
-  /** Maximum number of parallel TTS requests */
-  maxParallelRequests: number;
-}
-
-/**
- * Configuration for conversation history memory management
- */
-interface HistoryConfig {
-  /** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
-  maxMessages: number;
-  /** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
-  maxTotalChars: number;
-}
-
-/** Default maximum audio input size (10 MB) */
-const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
+import {
+  type SpeechChunk,
+  type StreamingSpeechConfig,
+  type HistoryConfig,
+  DEFAULT_STREAMING_SPEECH_CONFIG,
+  DEFAULT_HISTORY_CONFIG,
+  DEFAULT_MAX_AUDIO_SIZE,
+} from "./types";

 export interface VoiceAgentOptions {
  model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
@@ -126,17 +98,13 @@ export class VoiceAgent extends EventEmitter {

    // Initialize streaming speech config with defaults
    this.streamingSpeechConfig = {
-      minChunkSize: 50,
-      maxChunkSize: 200,
-      parallelGeneration: true,
-      maxParallelRequests: 3,
+      ...DEFAULT_STREAMING_SPEECH_CONFIG,
      ...options.streamingSpeech,
    };

    // Initialize history config with defaults
    this.historyConfig = {
-      maxMessages: 100,
-      maxTotalChars: 0, // unlimited by default
+      ...DEFAULT_HISTORY_CONFIG,
      ...options.history,
    };
  }
--- a/src/index.ts
+++ b/src/index.ts
@@ -1 +1,22 @@
+// Agents
 export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
+export {
+    VideoAgent,
+    type VideoAgentOptions,
+    type VideoFrame,
+    type AudioData,
+    type VideoAgentConfig,
+    type FrameContext,
+    type FrameTriggerReason,
+} from "./VideoAgent";
+
+// Shared types
+export {
+    type SpeechChunk,
+    type StreamingSpeechConfig,
+    type HistoryConfig,
+    type StopWhenCondition,
+    DEFAULT_STREAMING_SPEECH_CONFIG,
+    DEFAULT_HISTORY_CONFIG,
+    DEFAULT_MAX_AUDIO_SIZE,
+} from "./types";
--- a/src/types.ts
+++ b/src/types.ts
@@ -0,0 +1,60 @@
+import type { streamText } from "ai";
+
+/**
+ * Represents a chunk of text to be converted to speech
+ */
+export interface SpeechChunk {
+    id: number;
+    text: string;
+    audioPromise?: Promise<Uint8Array | null>;
+}
+
+/**
+ * Configuration for streaming speech behavior
+ */
+export interface StreamingSpeechConfig {
+    /** Minimum characters before generating speech for a chunk */
+    minChunkSize: number;
+    /** Maximum characters per chunk (will split at sentence boundary before this) */
+    maxChunkSize: number;
+    /** Whether to enable parallel TTS generation */
+    parallelGeneration: boolean;
+    /** Maximum number of parallel TTS requests */
+    maxParallelRequests: number;
+}
+
+/**
+ * Configuration for conversation history memory management
+ */
+export interface HistoryConfig {
+    /** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
+    maxMessages: number;
+    /** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
+    maxTotalChars: number;
+}
+
+/**
+ * Default streaming speech configuration
+ */
+export const DEFAULT_STREAMING_SPEECH_CONFIG: StreamingSpeechConfig = {
+    minChunkSize: 50,
+    maxChunkSize: 200,
+    parallelGeneration: true,
+    maxParallelRequests: 3,
+};
+
+/**
+ * Default history configuration
+ */
+export const DEFAULT_HISTORY_CONFIG: HistoryConfig = {
+    maxMessages: 100,
+    maxTotalChars: 0, // unlimited by default
+};
+
+/** Default maximum audio input size in bytes (10 MB) */
+export const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
+
+/**
+ * Type of the `stopWhen` option accepted by `streamText`, with null/undefined excluded
+ */
+export type StopWhenCondition = NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
--- a/src/utils/StreamBuffer.ts
+++ b/src/utils/StreamBuffer.ts
				`@@ -0,0 +1 @@`
				{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAErC;;GAEG;AACH,MAAM,WAAW,WAAW;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IAClC,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8CAA8C;IAC9C,mBAAmB,EAAE,MAAM,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC1B,yHAAyH;IACzH,WAAW,EAAE,MAAM,CAAC;IACpB,6HAA6H;IAC7H,aAAa,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,eAAO,MAAM,+BAA+B,EAAE,qBAK7C,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,sBAAsB,EAAE,aAGpC,CAAC;AAEF,+CAA+C;AAC/C,eAAO,MAAM,sBAAsB,QAAmB,CAAC;AAEvD;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC"}
				`@@ -0,0 +1 @@`
				`{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";;;AAmCA;;GAEG;AACU,QAAA,+BAA+B,GAA0B;IAClE,YAAY,EAAE,EAAE;IAChB,YAAY,EAAE,GAAG;IACjB,kBAAkB,EAAE,IAAI;IACxB,mBAAmB,EAAE,CAAC;CACzB,CAAC;AAEF;;GAEG;AACU,QAAA,sBAAsB,GAAkB;IACjD,WAAW,EAAE,GAAG;IAChB,aAAa,EAAE,CAAC,EAAE,uBAAuB;CAC5C,CAAC;AAEF,+CAA+C;AAClC,QAAA,sBAAsB,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC"}`