feat: Introduce new core components for conversation and speech management

- Added ConversationManager for managing conversation history with configurable limits. - Implemented InputQueue for serial processing of input items. - Created SpeechManager for handling text-to-speech generation and streaming. - Developed StreamProcessor for processing LLM streams and forwarding events. - Added TranscriptionManager for audio transcription using AI SDK. - Introduced WebSocketManager for managing WebSocket connections and messaging. - Updated VoiceAgent to support new architecture and improved socket handling. - Refactored index files to export new core components.
2026-03-02 18:36:39 +00:00 · 2026-02-23 16:15:49 +05:30
parent 4dd30b89c0
commit 5e7eb469ae
71 changed files with 5175 additions and 19 deletions
--- a/dist/VideoAgent.new.d.ts
+++ b/dist/VideoAgent.new.d.ts
@@ -0,0 +1,175 @@
+import { WebSocket } from "ws";
+import { EventEmitter } from "events";
+import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
+import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
+/**
+ * Trigger reasons for frame capture
+ */
+type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
+/**
+ * Video frame data structure sent to/from the client
+ */
+interface VideoFrame {
+    type: "video_frame";
+    sessionId: string;
+    sequence: number;
+    timestamp: number;
+    triggerReason: FrameTriggerReason;
+    previousFrameRef?: string;
+    image: {
+        data: string;
+        format: string;
+        width: number;
+        height: number;
+    };
+}
+/**
+ * Audio data structure
+ */
+interface AudioData {
+    type: "audio";
+    sessionId: string;
+    data: string;
+    format: string;
+    sampleRate?: number;
+    duration?: number;
+    timestamp: number;
+}
+/**
+ * Backend configuration for video processing
+ */
+interface VideoAgentConfig {
+    /** Maximum frames to keep in context buffer for conversation history */
+    maxContextFrames: number;
+}
+/**
+ * Frame context for maintaining visual conversation history
+ */
+interface FrameContext {
+    sequence: number;
+    timestamp: number;
+    triggerReason: FrameTriggerReason;
+    frameHash: string;
+    description?: string;
+}
+export interface VideoAgentOptions {
+    /**
+     * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
+     * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
+     */
+    model: LanguageModel;
+    transcriptionModel?: TranscriptionModel;
+    speechModel?: SpeechModel;
+    instructions?: string;
+    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
+    tools?: Record<string, Tool>;
+    endpoint?: string;
+    voice?: string;
+    speechInstructions?: string;
+    outputFormat?: string;
+    streamingSpeech?: Partial<StreamingSpeechConfig>;
+    history?: Partial<HistoryConfig>;
+    maxAudioInputSize?: number;
+    /** Maximum frame input size in bytes (default: 5 MB) */
+    maxFrameInputSize?: number;
+    /** Maximum frames to keep in context buffer (default: 10) */
+    maxContextFrames?: number;
+    /** Session ID for this video agent instance */
+    sessionId?: string;
+}
+export declare class VideoAgent extends EventEmitter {
+    private model;
+    private instructions;
+    private stopWhen;
+    private endpoint?;
+    private tools;
+    private isDestroyed;
+    private _isProcessing;
+    private currentStreamAbortController?;
+    private ws;
+    private speech;
+    private conversation;
+    private transcription;
+    private inputQueue;
+    private sessionId;
+    private frameSequence;
+    private lastFrameTimestamp;
+    private lastFrameHash?;
+    private frameContextBuffer;
+    private currentFrameData?;
+    private videoConfig;
+    private maxFrameInputSize;
+    constructor(options: VideoAgentOptions);
+    registerTools(tools: Record<string, Tool>): void;
+    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
+    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
+    interruptSpeech(reason?: string): void;
+    interruptCurrentResponse(reason?: string): void;
+    connect(url?: string): Promise<void>;
+    handleSocket(socket: WebSocket): void;
+    sendText(text: string): Promise<string>;
+    sendAudio(audioData: string): Promise<void>;
+    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
+    /**
+     * Send a video frame with optional text query for vision analysis
+     */
+    sendFrame(frameData: string, query?: string, options?: {
+        width?: number;
+        height?: number;
+        format?: string;
+    }): Promise<string>;
+    /**
+     * Request client to capture and send a frame
+     */
+    requestFrameCapture(reason: FrameTriggerReason): void;
+    getConfig(): VideoAgentConfig;
+    updateConfig(config: Partial<VideoAgentConfig>): void;
+    startListening(): void;
+    stopListening(): void;
+    clearHistory(): void;
+    getHistory(): ModelMessage[];
+    setHistory(history: ModelMessage[]): void;
+    getFrameContext(): FrameContext[];
+    getSessionId(): string;
+    disconnect(): void;
+    destroy(): void;
+    get connected(): boolean;
+    get processing(): boolean;
+    get speaking(): boolean;
+    get pendingSpeechChunks(): number;
+    get destroyed(): boolean;
+    get currentFrameSequence(): number;
+    get hasVisualContext(): boolean;
+    private handleMessage;
+    private handleClientReady;
+    private handleAudioInput;
+    private handleVideoFrame;
+    private addFrameToContext;
+    private hashFrame;
+    private generateSessionId;
+    private enqueueTextInput;
+    private enqueueMultimodalInput;
+    /**
+     * Route queued items to the correct processor.
+     */
+    private processQueueItem;
+    private buildMultimodalContent;
+    /**
+     * Shared streamText invocation used by both processUserInput and processMultimodalInput.
+     */
+    private runStream;
+    /**
+     * Process text-only input (with optional visual context from latest frame).
+     */
+    private processUserInput;
+    /**
+     * Process multimodal input (text + explicit video frame).
+     */
+    private processMultimodalInput;
+    private ensureNotDestroyed;
+    private cleanupOnDisconnect;
+    private bubbleEvents;
+}
+export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
+export type { StreamingSpeechConfig, HistoryConfig } from "./types";
+//# sourceMappingURL=VideoAgent.new.d.ts.map