Refactor VoiceAgent: Extract types and default configurations into separate types.ts file; remove unused StreamBuffer file

2026-03-02 18:36:39 +00:00 · 2026-02-19 16:01:25 +05:30
parent ce10d521f3
commit ac505c4ed9
23 changed files with 3570 additions and 82 deletions
--- a/dist/VideoAgent.d.ts
+++ b/dist/VideoAgent.d.ts
@@ -0,0 +1,339 @@
+import { WebSocket } from "ws";
+import { EventEmitter } from "events";
+import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
+import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
+/**
+ * Trigger reasons for frame capture
+ */
+type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
+/**
+ * Video frame data structure sent to/from the client
+ */
+interface VideoFrame {
+    type: "video_frame";
+    sessionId: string;
+    sequence: number;
+    timestamp: number;
+    triggerReason: FrameTriggerReason;
+    previousFrameRef?: string;
+    image: {
+        data: string;
+        format: string;
+        width: number;
+        height: number;
+    };
+}
+/**
+ * Audio data structure
+ */
+interface AudioData {
+    type: "audio";
+    sessionId: string;
+    data: string;
+    format: string;
+    sampleRate?: number;
+    duration?: number;
+    timestamp: number;
+}
+/**
+ * Backend configuration for video processing
+ */
+interface VideoAgentConfig {
+    /** Maximum frames to keep in context buffer for conversation history */
+    maxContextFrames: number;
+}
+/**
+ * Frame context for maintaining visual conversation history
+ */
+interface FrameContext {
+    sequence: number;
+    timestamp: number;
+    triggerReason: FrameTriggerReason;
+    frameHash: string;
+    description?: string;
+}
+export interface VideoAgentOptions {
+    model: LanguageModel;
+    transcriptionModel?: TranscriptionModel;
+    speechModel?: SpeechModel;
+    instructions?: string;
+    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
+    tools?: Record<string, Tool>;
+    endpoint?: string;
+    voice?: string;
+    speechInstructions?: string;
+    outputFormat?: string;
+    /** Configuration for streaming speech generation */
+    streamingSpeech?: Partial<StreamingSpeechConfig>;
+    /** Configuration for conversation history memory limits */
+    history?: Partial<HistoryConfig>;
+    /** Maximum audio input size in bytes (default: 10 MB) */
+    maxAudioInputSize?: number;
+    /** Maximum frame input size in bytes (default: 5 MB) */
+    maxFrameInputSize?: number;
+    /** Maximum frames to keep in context buffer (default: 10) */
+    maxContextFrames?: number;
+    /** Session ID for this video agent instance */
+    sessionId?: string;
+}
+export declare class VideoAgent extends EventEmitter {
+    private socket?;
+    private tools;
+    private model;
+    private transcriptionModel?;
+    private speechModel?;
+    private instructions;
+    private stopWhen;
+    private endpoint?;
+    private isConnected;
+    private conversationHistory;
+    private voice;
+    private speechInstructions?;
+    private outputFormat;
+    private isProcessing;
+    private isDestroyed;
+    private sessionId;
+    private frameSequence;
+    private lastFrameTimestamp;
+    private lastFrameHash?;
+    private frameContextBuffer;
+    private currentFrameData?;
+    private videoConfig;
+    private inputQueue;
+    private processingQueue;
+    private currentStreamAbortController?;
+    private historyConfig;
+    private maxAudioInputSize;
+    private maxFrameInputSize;
+    private streamingSpeechConfig;
+    private currentSpeechAbortController?;
+    private speechChunkQueue;
+    private nextChunkId;
+    private isSpeaking;
+    private pendingTextBuffer;
+    private speechQueueDonePromise?;
+    private speechQueueDoneResolve?;
+    constructor(options: VideoAgentOptions);
+    /**
+     * Generate a unique session ID
+     */
+    private generateSessionId;
+    /**
+     * Simple hash function for frame comparison
+     */
+    private hashFrame;
+    /**
+     * Ensure the agent has not been destroyed. Throws if it has.
+     */
+    private ensureNotDestroyed;
+    /**
+     * Get current video agent configuration
+     */
+    getConfig(): VideoAgentConfig;
+    /**
+     * Update video agent configuration
+     */
+    updateConfig(config: Partial<VideoAgentConfig>): void;
+    private setupListeners;
+    /**
+     * Handle client ready signal
+     */
+    private handleClientReady;
+    /**
+     * Handle incoming video frame
+     */
+    private handleVideoFrame;
+    /**
+     * Add frame to context buffer
+     */
+    private addFrameToContext;
+    /**
+     * Request client to capture and send a frame
+     */
+    requestFrameCapture(reason: FrameTriggerReason): void;
+    /**
+     * Clean up all in-flight state when the connection drops.
+     */
+    private cleanupOnDisconnect;
+    registerTools(tools: Record<string, Tool>): void;
+    /**
+     * Transcribe audio data to text using the configured transcription model
+     */
+    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
+    /**
+     * Generate speech from text using the configured speech model
+     */
+    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
+    /**
+     * Interrupt ongoing speech generation and playback
+     */
+    interruptSpeech(reason?: string): void;
+    /**
+     * Interrupt both the current LLM stream and ongoing speech
+     */
+    interruptCurrentResponse(reason?: string): void;
+    /**
+     * Extract complete sentences from text buffer
+     */
+    private extractSentences;
+    /**
+     * Trim conversation history to stay within configured limits
+     */
+    private trimHistory;
+    /**
+     * Queue a text chunk for speech generation
+     */
+    private queueSpeechChunk;
+    /**
+     * Generate audio for a single chunk
+     */
+    private generateChunkAudio;
+    /**
+     * Process the speech queue and send audio chunks in order
+     */
+    private processSpeechQueue;
+    /**
+     * Process text delta for streaming speech
+     */
+    private processTextForStreamingSpeech;
+    /**
+     * Flush any remaining text in the buffer to speech
+     */
+    private flushStreamingSpeech;
+    /**
+     * Process incoming audio data: transcribe and generate response
+     */
+    private processAudioInput;
+    connect(url?: string): Promise<void>;
+    /**
+     * Attach an existing WebSocket (server-side usage)
+     */
+    handleSocket(socket: WebSocket): void;
+    /**
+     * Send text input for processing (bypasses transcription)
+     */
+    sendText(text: string): Promise<string>;
+    /**
+     * Send audio data to be transcribed and processed
+     */
+    sendAudio(audioData: string): Promise<void>;
+    /**
+     * Send raw audio buffer to be transcribed and processed
+     */
+    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
+    /**
+     * Send a video frame with optional text query for vision analysis
+     */
+    sendFrame(frameData: string, query?: string, options?: {
+        width?: number;
+        height?: number;
+        format?: string;
+    }): Promise<string>;
+    /**
+     * Enqueue a text input for serial processing
+     */
+    private enqueueTextInput;
+    /**
+     * Enqueue a multimodal input (text + frame) for serial processing
+     */
+    private enqueueMultimodalInput;
+    /**
+     * Drain the input queue, processing one request at a time
+     */
+    private drainInputQueue;
+    /**
+     * Build the message content array for multimodal input
+     */
+    private buildMultimodalContent;
+    /**
+     * Process multimodal input (text + video frame)
+     */
+    private processMultimodalInput;
+    /**
+     * Process user input with streaming text generation
+     */
+    private processUserInput;
+    /**
+     * Handle individual stream chunks
+     */
+    private handleStreamChunk;
+    /**
+     * Process the full stream result and return the response text
+     */
+    private processStreamResult;
+    /**
+     * Send a message via WebSocket if connected
+     */
+    private sendWebSocketMessage;
+    /**
+     * Start listening for voice/video input
+     */
+    startListening(): void;
+    /**
+     * Stop listening for voice/video input
+     */
+    stopListening(): void;
+    /**
+     * Clear conversation history
+     */
+    clearHistory(): void;
+    /**
+     * Get current conversation history
+     */
+    getHistory(): ModelMessage[];
+    /**
+     * Set conversation history
+     */
+    setHistory(history: ModelMessage[]): void;
+    /**
+     * Get frame context buffer
+     */
+    getFrameContext(): FrameContext[];
+    /**
+     * Get session ID
+     */
+    getSessionId(): string;
+    /**
+     * Internal helper to close and clean up the current socket
+     */
+    private disconnectSocket;
+    /**
+     * Disconnect from WebSocket and stop all in-flight work
+     */
+    disconnect(): void;
+    /**
+     * Permanently destroy the agent, releasing all resources
+     */
+    destroy(): void;
+    /**
+     * Check if agent is connected to WebSocket
+     */
+    get connected(): boolean;
+    /**
+     * Check if agent is currently processing a request
+     */
+    get processing(): boolean;
+    /**
+     * Check if agent is currently speaking
+     */
+    get speaking(): boolean;
+    /**
+     * Get the number of pending speech chunks in the queue
+     */
+    get pendingSpeechChunks(): number;
+    /**
+     * Check if agent has been permanently destroyed
+     */
+    get destroyed(): boolean;
+    /**
+     * Get current frame sequence number
+     */
+    get currentFrameSequence(): number;
+    /**
+     * Check if there is visual context available
+     */
+    get hasVisualContext(): boolean;
+}
+export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
+export type { StreamingSpeechConfig, HistoryConfig } from "./types";
+//# sourceMappingURL=VideoAgent.d.ts.map