import { WebSocket } from "ws"; import { EventEmitter } from "events"; import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai"; /** * Configuration for streaming speech behavior */ interface StreamingSpeechConfig { /** Minimum characters before generating speech for a chunk */ minChunkSize: number; /** Maximum characters per chunk (will split at sentence boundary before this) */ maxChunkSize: number; /** Whether to enable parallel TTS generation */ parallelGeneration: boolean; /** Maximum number of parallel TTS requests */ maxParallelRequests: number; } /** * Configuration for conversation history memory management */ interface HistoryConfig { /** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */ maxMessages: number; /** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */ maxTotalChars: number; } export interface VoiceAgentOptions { model: LanguageModel; transcriptionModel?: TranscriptionModel; speechModel?: SpeechModel; instructions?: string; stopWhen?: NonNullable[0]["stopWhen"]>; tools?: Record; endpoint?: string; voice?: string; speechInstructions?: string; outputFormat?: string; /** Configuration for streaming speech generation */ streamingSpeech?: Partial; /** Configuration for conversation history memory limits */ history?: Partial; /** Maximum audio input size in bytes (default: 10 MB) */ maxAudioInputSize?: number; } export declare class VoiceAgent extends EventEmitter { private socket?; private tools; private model; private transcriptionModel?; private speechModel?; private instructions; private stopWhen; private endpoint?; private isConnected; private conversationHistory; private voice; private speechInstructions?; private outputFormat; private isProcessing; private isDestroyed; private inputQueue; private processingQueue; private currentStreamAbortController?; private historyConfig; private maxAudioInputSize; private streamingSpeechConfig; private currentSpeechAbortController?; private speechChunkQueue; private nextChunkId; private isSpeaking; private pendingTextBuffer; private speechQueueDonePromise?; private speechQueueDoneResolve?; constructor(options: VoiceAgentOptions); /** * Ensure the agent has not been destroyed. Throws if it has. */ private ensureNotDestroyed; private setupListeners; /** * Clean up all in-flight state when the connection drops. */ private cleanupOnDisconnect; registerTools(tools: Record): void; /** * Transcribe audio data to text using the configured transcription model */ transcribeAudio(audioData: Buffer | Uint8Array): Promise; /** * Generate speech from text using the configured speech model * @param abortSignal Optional signal to cancel the speech generation */ generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise; /** * Interrupt ongoing speech generation and playback (barge-in support). * This only interrupts TTS — the LLM stream is left running. */ interruptSpeech(reason?: string): void; /** * Interrupt both the current LLM stream and ongoing speech. * Use this for barge-in scenarios where the entire response should be cancelled. */ interruptCurrentResponse(reason?: string): void; /** * Extract complete sentences from text buffer * Returns [extractedSentences, remainingBuffer] */ private extractSentences; /** * Trim conversation history to stay within configured limits. * Removes oldest messages (always in pairs to preserve user/assistant turns). */ private trimHistory; /** * Queue a text chunk for speech generation */ private queueSpeechChunk; /** * Generate audio for a single chunk */ private generateChunkAudio; /** * Process the speech queue and send audio chunks in order */ private processSpeechQueue; /** * Process text delta for streaming speech. * Call this as text chunks arrive from LLM. */ private processTextForStreamingSpeech; /** * Flush any remaining text in the buffer to speech * Call this when stream ends */ private flushStreamingSpeech; /** * Process incoming audio data: transcribe and generate response */ private processAudioInput; connect(url?: string): Promise; /** * Attach an existing WebSocket (server-side usage). * Use this when a WS server accepts a connection and you want the * agent to handle messages on that socket. */ handleSocket(socket: WebSocket): void; /** * Send text input for processing (bypasses transcription). * Requests are queued and processed serially to prevent race conditions. */ sendText(text: string): Promise; /** * Send audio data to be transcribed and processed * @param audioData Base64 encoded audio data */ sendAudio(audioData: string): Promise; /** * Send raw audio buffer to be transcribed and processed */ sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise; /** * Enqueue a text input for serial processing. * This ensures only one processUserInput runs at a time, preventing * race conditions on conversationHistory, fullText accumulation, etc. */ private enqueueInput; /** * Drain the input queue, processing one request at a time. */ private drainInputQueue; /** * Process user input with streaming text generation. * Handles the full pipeline: text -> LLM (streaming) -> TTS -> WebSocket. * * This method is designed to be called serially via drainInputQueue(). */ private processUserInput; /** * Generate speech for full text at once (non-streaming fallback) * Useful when you want to bypass streaming speech for short responses */ generateAndSendSpeechFull(text: string): Promise; /** * Send a message via WebSocket if connected. * Gracefully handles send failures (e.g., socket closing mid-send). */ private sendWebSocketMessage; /** * Start listening for voice input */ startListening(): void; /** * Stop listening for voice input */ stopListening(): void; /** * Clear conversation history */ clearHistory(): void; /** * Get current conversation history */ getHistory(): ModelMessage[]; /** * Set conversation history (useful for restoring sessions) */ setHistory(history: ModelMessage[]): void; /** * Internal helper to close and clean up the current socket. */ private disconnectSocket; /** * Disconnect from WebSocket and stop all in-flight work. */ disconnect(): void; /** * Permanently destroy the agent, releasing all resources. * After calling this, the agent cannot be reused. */ destroy(): void; /** * Check if agent is connected to WebSocket */ get connected(): boolean; /** * Check if agent is currently processing a request */ get processing(): boolean; /** * Check if agent is currently speaking (generating/playing audio) */ get speaking(): boolean; /** * Get the number of pending speech chunks in the queue */ get pendingSpeechChunks(): number; /** * Check if agent has been permanently destroyed */ get destroyed(): boolean; } export {}; //# sourceMappingURL=VoiceAgent.d.ts.map