Files
VoiceAgent/dist/VoiceAgent.new.d.ts
Bijit Mondal 5e7eb469ae feat: Introduce new core components for conversation and speech management
- Added ConversationManager for managing conversation history with configurable limits.
- Implemented InputQueue for serial processing of input items.
- Created SpeechManager for handling text-to-speech generation and streaming.
- Developed StreamProcessor for processing LLM streams and forwarding events.
- Added TranscriptionManager for audio transcription using AI SDK.
- Introduced WebSocketManager for managing WebSocket connections and messaging.
- Updated VoiceAgent to support new architecture and improved socket handling.
- Refactored index files to export new core components.
2026-02-23 16:15:49 +05:30

137 lines
4.7 KiB
TypeScript

import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
/**
* Options accepted by the `VoiceAgent` constructor.
*
* Only `model` is required; every other field is optional.
*/
export interface VoiceAgentOptions {
/**
* Language model (required).
* NOTE(review): presumably the model that drives the streaming text generation — confirm against the implementation.
*/
model: LanguageModel;
/** Transcription model; `VoiceAgent.transcribeAudio` is documented as using the configured transcription model. */
transcriptionModel?: TranscriptionModel;
/** Speech model; `VoiceAgent.generateSpeechFromText` is documented as using the configured speech model. */
speechModel?: SpeechModel;
/** NOTE(review): presumably the system prompt given to the language model — confirm against the implementation. */
instructions?: string;
/** Same type as the `stopWhen` option of the AI SDK's `streamText`, with `null`/`undefined` excluded. */
stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
/**
* Tools keyed by string.
* NOTE(review): presumably the key is the tool name and this is the initial tool set — confirm.
*/
tools?: Record<string, Tool>;
/** NOTE(review): presumably the default WebSocket URL used when `connect()` is called without a URL — confirm. */
endpoint?: string;
/** NOTE(review): presumably the voice identifier forwarded to the speech model — confirm. */
voice?: string;
/** NOTE(review): presumably instructions forwarded to the speech model — confirm. */
speechInstructions?: string;
/** NOTE(review): presumably the audio format of generated speech; accepted values are not visible here — confirm. */
outputFormat?: string;
/** Configuration for streaming speech generation */
streamingSpeech?: Partial<StreamingSpeechConfig>;
/** Configuration for conversation history memory limits */
history?: Partial<HistoryConfig>;
/** Maximum audio input size in bytes (default: 10 MB) */
maxAudioInputSize?: number;
}
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export declare class VoiceAgent extends EventEmitter {
// Private members are emitted without types in a declaration file; the notes
// below are inferred from names and option fields and need confirming
// against the implementation.
/** NOTE(review): presumably holds `VoiceAgentOptions.model` — confirm. */
private model;
/** NOTE(review): presumably holds `VoiceAgentOptions.instructions` — confirm. */
private instructions;
/** NOTE(review): presumably holds `VoiceAgentOptions.stopWhen` — confirm. */
private stopWhen;
/** NOTE(review): presumably holds `VoiceAgentOptions.endpoint`; declared optional. */
private endpoint?;
/** NOTE(review): presumably the registered tool set (see `registerTools`) — confirm. */
private tools;
/** NOTE(review): presumably the flag behind the `destroyed` getter — confirm. */
private isDestroyed;
/** NOTE(review): presumably the flag behind the `processing` getter — confirm. */
private _isProcessing;
/** NOTE(review): presumably aborts the in-flight LLM stream (see `interruptCurrentResponse`); declared optional. */
private currentStreamAbortController?;
/** NOTE(review): presumably the WebSocket connection helper — confirm. */
private ws;
/** NOTE(review): presumably the speech generation helper — confirm. */
private speech;
/** NOTE(review): presumably the conversation history helper — confirm. */
private conversation;
/** NOTE(review): presumably the audio transcription helper — confirm. */
private transcription;
/** NOTE(review): presumably the serial input queue mentioned on `processUserInput` — confirm. */
private inputQueue;
/** Create an agent from the given options; only `options.model` is required by the type. */
constructor(options: VoiceAgentOptions);
/**
* Register tools, keyed by string.
* NOTE(review): whether this merges with or replaces existing tools is not visible here — confirm.
*/
registerTools(tools: Record<string, Tool>): void;
/**
* Transcribe audio data to text using the configured transcription model.
*/
transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
/**
* Generate speech from text using the configured speech model.
*/
generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
/**
* Interrupt ongoing speech generation and playback (barge-in support).
*/
interruptSpeech(reason?: string): void;
/**
* Interrupt both the current LLM stream and ongoing speech.
*/
interruptCurrentResponse(reason?: string): void;
/**
* Connect to a WebSocket server by URL.
*/
// NOTE(review): `url` is optional; presumably falls back to the `endpoint` option — confirm.
connect(url?: string): Promise<void>;
/**
* Attach an existing WebSocket (server-side usage).
*/
handleSocket(socket: WebSocket): void;
/**
* Send text input for processing (bypasses transcription).
*/
// NOTE(review): resolves to a string, presumably the generated response text — confirm.
sendText(text: string): Promise<string>;
/**
* Send base64 audio data to be transcribed and processed.
*/
sendAudio(audioData: string): Promise<void>;
/**
* Send raw audio buffer to be transcribed and processed.
*/
sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
/**
* Generate speech for full text at once (non-streaming fallback).
*/
generateAndSendSpeechFull(text: string): Promise<void>;
/** Start listening for voice input */
startListening(): void;
/** Stop listening for voice input */
stopListening(): void;
/** Clear conversation history */
clearHistory(): void;
/** Get current conversation history */
getHistory(): ModelMessage[];
/** Set conversation history (useful for restoring sessions) */
setHistory(history: ModelMessage[]): void;
/** Disconnect from WebSocket and stop all in-flight work */
disconnect(): void;
/**
* Permanently destroy the agent, releasing all resources.
*/
destroy(): void;
/** NOTE(review): presumably true while a WebSocket is attached and open — confirm. */
get connected(): boolean;
/** NOTE(review): presumably true while an input is being processed — confirm. */
get processing(): boolean;
/** NOTE(review): presumably true while speech is being generated or sent — confirm. */
get speaking(): boolean;
/** NOTE(review): presumably the number of speech chunks still queued — confirm. */
get pendingSpeechChunks(): number;
/** NOTE(review): presumably true once `destroy()` has been called — confirm. */
get destroyed(): boolean;
/** NOTE(review): presumably dispatches incoming WebSocket messages — confirm. */
private handleMessage;
/** NOTE(review): presumably handles incoming audio before transcription — confirm. */
private handleAudioInput;
/** NOTE(review): presumably adds an input to the serial input queue — confirm. */
private enqueueInput;
/**
* Process user input with streaming text generation.
* Called serially by the input queue.
*/
private processUserInput;
/** NOTE(review): presumably throws or rejects when the agent has been destroyed — confirm. */
private ensureNotDestroyed;
/**
* Clean up all in-flight state when the connection drops.
*/
private cleanupOnDisconnect;
/**
* Forward select events from a child emitter to this agent.
*/
private bubbleEvents;
}
//# sourceMappingURL=VoiceAgent.new.d.ts.map