mirror of
https://github.com/Bijit-Mondal/VoiceAgent.git
synced 2026-03-02 18:36:39 +00:00
feat: Introduce new core components for conversation and speech management
- Added ConversationManager for managing conversation history with configurable limits.
- Implemented InputQueue for serial processing of input items.
- Created SpeechManager for handling text-to-speech generation and streaming.
- Developed StreamProcessor for processing LLM streams and forwarding events.
- Added TranscriptionManager for audio transcription using the AI SDK.
- Introduced WebSocketManager for managing WebSocket connections and messaging.
- Updated VoiceAgent to support the new architecture and improved socket handling.
- Refactored index files to export the new core components.
This commit is contained in:
379
dist/VoiceAgent.new.js
vendored
Normal file
379
dist/VoiceAgent.new.js
vendored
Normal file
@@ -0,0 +1,379 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.VoiceAgent = void 0;
|
||||
const events_1 = require("events");
|
||||
const ai_1 = require("ai");
|
||||
const core_1 = require("./core");
|
||||
/**
|
||||
* A single-session voice agent that manages one WebSocket connection at a time.
|
||||
*
|
||||
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
|
||||
* input queue, speech state, and WebSocket. It is designed for **one user per
|
||||
* instance**. To support multiple concurrent users, create a separate
|
||||
* `VoiceAgent` for each connection:
|
||||
*
|
||||
* ```ts
|
||||
* wss.on("connection", (socket) => {
|
||||
* const agent = new VoiceAgent({ model, ... });
|
||||
* agent.handleSocket(socket);
|
||||
* agent.on("disconnected", () => agent.destroy());
|
||||
* });
|
||||
* ```
|
||||
*
|
||||
* Sharing a single instance across multiple users will cause conversation
|
||||
* history cross-contamination, interleaved audio, and unpredictable behavior.
|
||||
*/
|
||||
class VoiceAgent extends events_1.EventEmitter {
    // LLM model handle passed to `ai.streamText` (see constructor).
    model;
    // System prompt for the LLM; defaults to a generic voice-assistant prompt.
    instructions;
    // Stop condition for multi-step tool use; defaults to stepCountIs(5).
    stopWhen;
    // Optional default WebSocket URL used by connect() when none is given.
    endpoint;
    // Tool map passed to streamText; extended via registerTools().
    tools = {};
    // Once true, all public entry points guarded by ensureNotDestroyed() throw.
    isDestroyed = false;
    // True while processUserInput() is running (exposed via `processing`).
    _isProcessing = false;
    // Abort controller for the current LLM stream
    currentStreamAbortController;
    // ── Managers ──────────────────────────────────────────
    // All five managers come from ./core; their exact contracts live there.
    ws;
    speech;
    conversation;
    transcription;
    inputQueue;
    /**
     * Build the agent: copy options, construct the core managers, wire their
     * send functions and the input-queue processor, and re-emit their events
     * on this instance.
     *
     * @param {object} options - model, instructions, stopWhen, endpoint, tools,
     *   speech options (speechModel, voice, speechInstructions, outputFormat,
     *   streamingSpeech), history, transcription options (transcriptionModel,
     *   maxAudioInputSize).
     */
    constructor(options) {
        super();
        this.model = options.model;
        this.instructions =
            options.instructions || "You are a helpful voice assistant.";
        this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
        this.endpoint = options.endpoint;
        if (options.tools) {
            // Shallow copy so registerTools() never mutates the caller's object.
            this.tools = { ...options.tools };
        }
        // ── Initialize managers ──────────────────────────────
        this.ws = new core_1.WebSocketManager();
        this.speech = new core_1.SpeechManager({
            speechModel: options.speechModel,
            voice: options.voice,
            speechInstructions: options.speechInstructions,
            outputFormat: options.outputFormat,
            streamingSpeech: options.streamingSpeech,
        });
        this.conversation = new core_1.ConversationManager({
            history: options.history,
        });
        this.transcription = new core_1.TranscriptionManager({
            transcriptionModel: options.transcriptionModel,
            maxAudioInputSize: options.maxAudioInputSize,
        });
        this.inputQueue = new core_1.InputQueue();
        // ── Wire managers to the WebSocket send function ─────
        // Speech and transcription push their outbound messages through
        // the single WebSocket owned by this agent.
        const sendMsg = (msg) => this.ws.send(msg);
        this.speech.sendMessage = sendMsg;
        this.transcription.sendMessage = sendMsg;
        // ── Wire the input queue processor ───────────────────
        // InputQueue invokes this for each item; processUserInput runs the
        // full LLM + speech pipeline (serial processing — see enqueueInput).
        this.inputQueue.processor = (item) => this.processUserInput(item.text);
        // ── Bubble events from managers ──────────────────────
        this.bubbleEvents(this.ws, [
            "connected",
            "error",
        ]);
        this.bubbleEvents(this.speech, [
            "speech_start",
            "speech_complete",
            "speech_interrupted",
            "speech_chunk_queued",
            "audio_chunk",
            "audio",
            "error",
        ]);
        this.bubbleEvents(this.conversation, [
            "history_cleared",
            "history_trimmed",
        ]);
        this.bubbleEvents(this.transcription, [
            "transcription",
            "audio_received",
            "error",
            "warning",
        ]);
        // ── Handle WebSocket lifecycle events ────────────────
        this.ws.on("disconnected", () => {
            // Tear down in-flight work first so "disconnected" observers see
            // a quiescent agent.
            this.cleanupOnDisconnect();
            this.emit("disconnected");
        });
        this.ws.on("message", (message) => this.handleMessage(message));
    }
    // ── Public API ────────────────────────────────────────
    /**
     * Register additional tools for the LLM. Keys in `tools` override any
     * previously registered tool with the same name.
     */
    registerTools(tools) {
        this.tools = { ...this.tools, ...tools };
    }
    /**
     * Transcribe audio data to text using the configured transcription model.
     */
    async transcribeAudio(audioData) {
        return this.transcription.transcribeAudio(audioData);
    }
    /**
     * Generate speech from text using the configured speech model.
     */
    async generateSpeechFromText(text, abortSignal) {
        return this.speech.generateSpeechFromText(text, abortSignal);
    }
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     */
    interruptSpeech(reason = "interrupted") {
        this.speech.interruptSpeech(reason);
    }
    /**
     * Interrupt both the current LLM stream and ongoing speech.
     */
    interruptCurrentResponse(reason = "interrupted") {
        if (this.currentStreamAbortController) {
            this.currentStreamAbortController.abort();
            this.currentStreamAbortController = undefined;
        }
        this.speech.interruptSpeech(reason);
    }
    /**
     * Connect to a WebSocket server by URL.
     * Falls back to the constructor `endpoint`, then to ws://localhost:8080.
     */
    async connect(url) {
        this.ensureNotDestroyed();
        const wsUrl = url || this.endpoint || "ws://localhost:8080";
        await this.ws.connect(wsUrl);
    }
    /**
     * Attach an existing WebSocket (server-side usage).
     */
    handleSocket(socket) {
        this.ensureNotDestroyed();
        this.ws.handleSocket(socket);
    }
    /**
     * Send text input for processing (bypasses transcription).
     * Resolves when the queued input has been fully processed.
     * @throws {Error} if `text` is empty or whitespace-only.
     */
    async sendText(text) {
        this.ensureNotDestroyed();
        if (!text || !text.trim()) {
            throw new Error("Text input cannot be empty");
        }
        return this.enqueueInput(text);
    }
    /**
     * Send base64 audio data to be transcribed and processed.
     */
    async sendAudio(audioData) {
        this.ensureNotDestroyed();
        await this.handleAudioInput(audioData);
    }
    /**
     * Send raw audio buffer to be transcribed and processed.
     */
    async sendAudioBuffer(audioBuffer) {
        this.ensureNotDestroyed();
        const base64Audio = Buffer.from(audioBuffer).toString("base64");
        await this.handleAudioInput(base64Audio);
    }
    /**
     * Generate speech for full text at once (non-streaming fallback).
     */
    async generateAndSendSpeechFull(text) {
        return this.speech.generateAndSendSpeechFull(text);
    }
    /** Start listening for voice input */
    // NOTE(review): only logs and emits "listening" — actual capture is
    // presumably driven by the client; confirm intended semantics.
    startListening() {
        console.log("Starting voice agent...");
        this.emit("listening");
    }
    /** Stop listening for voice input */
    stopListening() {
        console.log("Stopping voice agent...");
        this.emit("stopped");
    }
    /** Clear conversation history */
    clearHistory() {
        this.conversation.clearHistory();
    }
    /** Get current conversation history */
    getHistory() {
        return this.conversation.getHistory();
    }
    /** Set conversation history (useful for restoring sessions) */
    setHistory(history) {
        this.conversation.setHistory(history);
    }
    /** Disconnect from WebSocket and stop all in-flight work */
    // The "disconnected" handler wired in the constructor performs the
    // in-flight cleanup once the socket actually closes.
    disconnect() {
        this.ws.disconnect();
    }
    /**
     * Permanently destroy the agent, releasing all resources.
     * After this call every guarded public method throws.
     */
    destroy() {
        this.isDestroyed = true;
        this.cleanupOnDisconnect();
        this.ws.disconnect();
        this.conversation.clearHistory();
        this.tools = {};
        this.removeAllListeners();
    }
    // ── Getters ───────────────────────────────────────────
    /** True while the WebSocket is connected. */
    get connected() {
        return this.ws.isConnected;
    }
    /** True while a user input is being processed by the LLM pipeline. */
    get processing() {
        return this._isProcessing;
    }
    /** True while speech output is being generated/played. */
    get speaking() {
        return this.speech.isSpeaking;
    }
    /** Number of speech chunks still queued for synthesis/delivery. */
    get pendingSpeechChunks() {
        return this.speech.pendingChunkCount;
    }
    /** True once destroy() has been called. */
    get destroyed() {
        return this.isDestroyed;
    }
    // ── Private: message handling ─────────────────────────
    /**
     * Dispatch an inbound WebSocket message by `type`:
     * - "transcript": barge-in (interrupt current response), then enqueue text.
     * - "audio":      barge-in, then transcribe-and-enqueue the base64 payload.
     * - "interrupt":  abort the current LLM stream and speech.
     * Unknown types are silently ignored; errors are logged and emitted.
     */
    async handleMessage(message) {
        try {
            console.log(`Received WebSocket message of type: ${message.type}`);
            if (message.type === "transcript") {
                if (typeof message.text !== "string" || !message.text.trim()) {
                    this.emit("warning", "Received empty or invalid transcript message");
                    return;
                }
                this.interruptCurrentResponse("user_speaking");
                console.log(`Processing transcript: "${message.text}"`);
                await this.enqueueInput(message.text);
            }
            else if (message.type === "audio") {
                if (typeof message.data !== "string" || !message.data) {
                    this.emit("warning", "Received empty or invalid audio message");
                    return;
                }
                this.interruptCurrentResponse("user_speaking");
                // NOTE(review): length/1000 counts base64 characters, not decoded
                // bytes, so the logged "KB" figure is approximate.
                console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`);
                await this.handleAudioInput(message.data, message.format);
            }
            else if (message.type === "interrupt") {
                console.log(`Received interrupt request: ${message.reason || "client_request"}`);
                this.interruptCurrentResponse(message.reason || "client_request");
            }
        }
        catch (err) {
            console.error("Failed to process message:", err);
            this.emit("error", err);
        }
    }
    // ── Private: audio ────────────────────────────────────
    /**
     * Transcribe base64 audio via the transcription manager, then enqueue the
     * resulting text. A falsy transcription result is dropped silently.
     */
    async handleAudioInput(base64Audio, format) {
        const text = await this.transcription.processAudioInput(base64Audio, format);
        if (text) {
            await this.enqueueInput(text);
        }
    }
    // ── Private: input queue ──────────────────────────────
    /**
     * Queue a text input and return a promise settled by the InputQueue when
     * the item has been processed (or rejected — see rejectAll on disconnect).
     */
    enqueueInput(text) {
        return new Promise((resolve, reject) => {
            this.inputQueue.enqueue({ text, resolve, reject });
        });
    }
    // ── Private: LLM processing ───────────────────────────
    /**
     * Process user input with streaming text generation.
     * Called serially by the input queue.
     */
    async processUserInput(text) {
        this._isProcessing = true;
        // Fresh controller per turn; interruptCurrentResponse()/disconnect
        // abort it for barge-in.
        this.currentStreamAbortController = new AbortController();
        const streamAbortSignal = this.currentStreamAbortController.signal;
        try {
            this.emit("text", { role: "user", text });
            this.conversation.addMessage({ role: "user", content: text });
            const result = (0, ai_1.streamText)({
                model: this.model,
                system: this.instructions,
                // NOTE(review): getHistoryRef presumably returns the live
                // history array (vs getHistory) — confirm in ConversationManager.
                messages: this.conversation.getHistoryRef(),
                tools: this.tools,
                stopWhen: this.stopWhen,
                abortSignal: streamAbortSignal,
                onChunk: ({ chunk }) => {
                    // handleStreamChunk maps raw AI SDK chunks to agent events.
                    (0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
                },
                onFinish: async (event) => {
                    // Surface every tool result from every step of the run.
                    for (const step of event.steps) {
                        for (const toolResult of step.toolResults) {
                            this.emit("tool_result", {
                                name: toolResult.toolName,
                                toolCallId: toolResult.toolCallId,
                                result: toolResult.output,
                            });
                        }
                    }
                },
                onError: ({ error }) => {
                    console.error("Stream error:", error);
                    this.emit("error", error);
                },
            });
            // Drive the full stream: text deltas feed incremental speech,
            // and events/messages are forwarded to the client.
            const streamResult = await (0, core_1.processFullStream)(result, {
                onTextDelta: (delta) => this.speech.processTextDelta(delta),
                onTextEnd: () => this.speech.flushPendingText(),
                sendMessage: (msg) => this.ws.send(msg),
                emitEvent: (event, data) => this.emit(event, data),
            });
            // Add assistant response to history
            if (streamResult.fullText) {
                this.conversation.addMessage({
                    role: "assistant",
                    content: streamResult.fullText,
                });
            }
            // Flush any remaining speech
            this.speech.flushPendingText();
            // Wait for all speech chunks to complete
            if (this.speech.queueDonePromise) {
                await this.speech.queueDonePromise;
            }
            return streamResult.fullText;
        }
        catch (error) {
            // Clean up speech state on error
            this.speech.reset();
            throw error;
        }
        finally {
            this._isProcessing = false;
            this.currentStreamAbortController = undefined;
        }
    }
    // ── Private: helpers ──────────────────────────────────
    /** Guard: throw if destroy() has already been called. */
    ensureNotDestroyed() {
        if (this.isDestroyed) {
            throw new Error("VoiceAgent has been destroyed and cannot be used");
        }
    }
    /**
     * Clean up all in-flight state when the connection drops.
     */
    cleanupOnDisconnect() {
        if (this.currentStreamAbortController) {
            this.currentStreamAbortController.abort();
            this.currentStreamAbortController = undefined;
        }
        this.speech.reset();
        this._isProcessing = false;
        // Settle every pending sendText()/enqueueInput() promise so callers
        // are not left hanging after the socket closes.
        this.inputQueue.rejectAll(new Error("Connection closed"));
    }
    /**
     * Forward select events from a child emitter to this agent.
     */
    bubbleEvents(source, events) {
        for (const event of events) {
            source.on(event, (...args) => this.emit(event, ...args));
        }
    }
}
|
||||
exports.VoiceAgent = VoiceAgent;
|
||||
//# sourceMappingURL=VoiceAgent.new.js.map
|
||||
Reference in New Issue
Block a user