import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
/**
 * Trigger reasons for frame capture
 */
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
/**
 * Video frame data structure sent to/from the client
 */
interface VideoFrame {
    type: "video_frame";
    sessionId: string;
    sequence: number;
    timestamp: number;
    triggerReason: FrameTriggerReason;
    previousFrameRef?: string;
    image: {
        data: string;
        format: string;
        width: number;
        height: number;
    };
}
/**
 * Audio data structure
 */
interface AudioData {
    type: "audio";
    sessionId: string;
    data: string;
    format: string;
    sampleRate?: number;
    duration?: number;
    timestamp: number;
}
/**
 * Backend configuration for video processing
 */
interface VideoAgentConfig {
    /** Maximum frames to keep in context buffer for conversation history */
    maxContextFrames: number;
}
/**
 * Frame context for maintaining visual conversation history
 */
interface FrameContext {
    sequence: number;
    timestamp: number;
    triggerReason: FrameTriggerReason;
    frameHash: string;
    description?: string;
}
export interface VideoAgentOptions {
    /**
     * AI SDK model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
     * anthropic('claude-3-5-sonnet-latest'), google('gemini-1.5-pro')) to process video frames.
     */
    model: LanguageModel;
    transcriptionModel?: TranscriptionModel;
    speechModel?: SpeechModel;
    instructions?: string;
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    tools?: Record<string, Tool>;
    endpoint?: string;
    voice?: string;
    speechInstructions?: string;
    outputFormat?: string;
    /** Configuration for streaming speech generation */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Configuration for conversation history memory limits */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes (default: 10 MB) */
    maxAudioInputSize?: number;
    /** Maximum frame input size in bytes (default: 5 MB) */
    maxFrameInputSize?: number;
    /** Maximum frames to keep in context buffer (default: 10) */
    maxContextFrames?: number;
    /** Session ID for this video agent instance */
    sessionId?: string;
}
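/**
 * Example: constructing an agent with vision, transcription, and speech models.
 * This is a minimal sketch, not part of the declared API: the `@ai-sdk/openai`
 * provider import and the specific model IDs are assumptions for illustration.
 *
 * @example
 * ```ts
 * import { openai } from "@ai-sdk/openai";
 * import { VideoAgent } from "./VideoAgent";
 *
 * const agent = new VideoAgent({
 *   model: openai("gpt-4o"),                          // vision-enabled chat model
 *   transcriptionModel: openai.transcription("whisper-1"),
 *   speechModel: openai.speech("tts-1"),
 *   instructions: "Describe what you see concisely.",
 *   maxContextFrames: 10,                             // documented default
 * });
 * ```
 */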
export declare class VideoAgent extends EventEmitter {
    private socket?;
    private tools;
    private model;
    private transcriptionModel?;
    private speechModel?;
    private instructions;
    private stopWhen;
    private endpoint?;
    private isConnected;
    private conversationHistory;
    private voice;
    private speechInstructions?;
    private outputFormat;
    private isProcessing;
    private isDestroyed;
    private sessionId;
    private frameSequence;
    private lastFrameTimestamp;
    private lastFrameHash?;
    private frameContextBuffer;
    private currentFrameData?;
    private videoConfig;
    private inputQueue;
    private processingQueue;
    private currentStreamAbortController?;
    private historyConfig;
    private maxAudioInputSize;
    private maxFrameInputSize;
    private streamingSpeechConfig;
    private currentSpeechAbortController?;
    private speechChunkQueue;
    private nextChunkId;
    private isSpeaking;
    private pendingTextBuffer;
    private speechQueueDonePromise?;
    private speechQueueDoneResolve?;
    constructor(options: VideoAgentOptions);
    /**
     * Generate a unique session ID
     */
    private generateSessionId;
    /**
     * Simple hash function for frame comparison
     */
    private hashFrame;
    /**
     * Ensure the agent has not been destroyed. Throws if it has.
     */
    private ensureNotDestroyed;
    /**
     * Get current video agent configuration
     */
    getConfig(): VideoAgentConfig;
    /**
     * Update video agent configuration
     */
    updateConfig(config: Partial<VideoAgentConfig>): void;
    private setupListeners;
    /**
     * Handle client ready signal
     */
    private handleClientReady;
    /**
     * Handle incoming video frame
     */
    private handleVideoFrame;
    /**
     * Add frame to context buffer
     */
    private addFrameToContext;
    /**
     * Request client to capture and send a frame
     */
    requestFrameCapture(reason: FrameTriggerReason): void;
    /**
     * Clean up all in-flight state when the connection drops.
     */
    private cleanupOnDisconnect;
    registerTools(tools: Record<string, Tool>): void;
    /**
     * Transcribe audio data to text using the configured transcription model
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /**
     * Generate speech from text using the configured speech model
     */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<void>;
    /**
     * Interrupt ongoing speech generation and playback
     */
    interruptSpeech(reason?: string): void;
    /**
     * Interrupt both the current LLM stream and ongoing speech
     */
    interruptCurrentResponse(reason?: string): void;
    /**
     * Extract complete sentences from text buffer
     */
    private extractSentences;
    /**
     * Trim conversation history to stay within configured limits
     */
    private trimHistory;
    /**
     * Queue a text chunk for speech generation
     */
    private queueSpeechChunk;
    /**
     * Generate audio for a single chunk
     */
    private generateChunkAudio;
    /**
     * Process the speech queue and send audio chunks in order
     */
    private processSpeechQueue;
    /**
     * Process text delta for streaming speech
     */
    private processTextForStreamingSpeech;
    /**
     * Flush any remaining text in the buffer to speech
     */
    private flushStreamingSpeech;
    /**
     * Process incoming audio data: transcribe and generate response
     */
    private processAudioInput;
    connect(url?: string): Promise<void>;
    /**
     * Attach an existing WebSocket (server-side usage)
     */
    handleSocket(socket: WebSocket): void;
    /**
     * Send text input for processing (bypasses transcription)
     */
    sendText(text: string): Promise<void>;
    /**
     * Send audio data to be transcribed and processed
     */
    sendAudio(audioData: string): Promise<void>;
    /**
     * Send raw audio buffer to be transcribed and processed
     */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Send a video frame with optional text query for vision analysis
     */
    sendFrame(frameData: string, query?: string, options?: {
        width?: number;
        height?: number;
        format?: string;
    }): Promise<void>;
    /**
     * Enqueue a text input for serial processing
     */
    private enqueueTextInput;
    /**
     * Enqueue a multimodal input (text + frame) for serial processing
     */
    private enqueueMultimodalInput;
    /**
     * Drain the input queue, processing one request at a time
     */
    private drainInputQueue;
    /**
     * Build the message content array for multimodal input
     */
    private buildMultimodalContent;
    /**
     * Process multimodal input (text + video frame)
     */
    private processMultimodalInput;
    /**
     * Process user input with streaming text generation
     */
    private processUserInput;
    /**
     * Handle individual stream chunks
     */
    private handleStreamChunk;
    /**
     * Process the full stream result and return the response text
     */
    private processStreamResult;
    /**
     * Send a message via WebSocket if connected
     */
    private sendWebSocketMessage;
    /**
     * Start listening for voice/video input
     */
    startListening(): void;
    /**
     * Stop listening for voice/video input
     */
    stopListening(): void;
    /**
     * Clear conversation history
     */
    clearHistory(): void;
    /**
     * Get current conversation history
     */
    getHistory(): ModelMessage[];
    /**
     * Set conversation history
     */
    setHistory(history: ModelMessage[]): void;
    /**
     * Get frame context buffer
     */
    getFrameContext(): FrameContext[];
    /**
     * Get session ID
     */
    getSessionId(): string;
    /**
     * Internal helper to close and clean up the current socket
     */
    private disconnectSocket;
    /**
     * Disconnect from WebSocket and stop all in-flight work
     */
    disconnect(): void;
    /**
     * Permanently destroy the agent, releasing all resources
     */
    destroy(): void;
    /**
     * Check if agent is connected to WebSocket
     */
    get connected(): boolean;
    /**
     * Check if agent is currently processing a request
     */
    get processing(): boolean;
    /**
     * Check if agent is currently speaking
     */
    get speaking(): boolean;
    /**
     * Get the number of pending speech chunks in the queue
     */
    get pendingSpeechChunks(): number;
    /**
     * Check if agent has been permanently destroyed
     */
    get destroyed(): boolean;
    /**
     * Get current frame sequence number
     */
    get currentFrameSequence(): number;
    /**
     * Check if there is visual context available
     */
    get hasVisualContext(): boolean;
}
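/*
 * Client-side usage sketch, using only the methods declared above. This is
 * illustrative: it assumes a reachable WebSocket endpoint and a base64-encoded
 * JPEG frame already held in `frameBase64`.
 *
 *   await agent.connect("ws://localhost:8080");
 *   await agent.sendText("Hello!");
 *   await agent.sendFrame(frameBase64, "What changed since the last frame?", {
 *     width: 1280,
 *     height: 720,
 *     format: "jpeg",
 *   });
 *   agent.requestFrameCapture("user_request");   // ask the client for a fresh frame
 *   console.log(agent.getSessionId(), agent.hasVisualContext);
 */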
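/*
 * Server-side usage sketch (a minimal example assuming the `ws` package's
 * WebSocketServer; `model` is any vision-enabled LanguageModel as above).
 * Instead of connect(), the agent attaches to sockets accepted by your own
 * server via handleSocket():
 *
 *   import { WebSocketServer } from "ws";
 *
 *   const wss = new WebSocketServer({ port: 8080 });
 *   wss.on("connection", (socket) => {
 *     const agent = new VideoAgent({ model });
 *     agent.handleSocket(socket);
 *     socket.on("close", () => agent.destroy());  // release resources per connection
 *   });
 */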
export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
export type { StreamingSpeechConfig, HistoryConfig } from "./types";
//# sourceMappingURL=VideoAgent.d.ts.map