diff --git a/example/demo.ts b/example/demo.ts index 032fb64..0b36731 100644 --- a/example/demo.ts +++ b/example/demo.ts @@ -3,56 +3,160 @@ import { VoiceAgent } from "../src"; import { tool } from "ai"; import { z } from "zod"; import { openai } from "@ai-sdk/openai"; +import { writeFile } from "fs/promises"; // 1. Define Tools using standard AI SDK const weatherTool = tool({ - description: 'Get the weather in a location', + description: "Get the weather in a location", inputSchema: z.object({ - location: z.string().describe('The location to get the weather for'), + location: z.string().describe("The location to get the weather for"), }), execute: async ({ location }) => ({ location, temperature: 72 + Math.floor(Math.random() * 21) - 10, + conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][Math.floor(Math.random() * 4)], }), }); -// 2. Initialize Agent +const timeTool = tool({ + description: "Get the current time", + inputSchema: z.object({}), + execute: async () => ({ + time: new Date().toLocaleTimeString(), + timezone: Intl.DateTimeFormat().resolvedOptions().timeZone, + }), +}); + +// 2. Initialize Agent with full voice support const agent = new VoiceAgent({ - model: openai('gpt-4o'), - instructions: "You are a helpful voice assistant. Use tools when needed.", + // Chat model for text generation + model: openai("gpt-4o"), + // Transcription model for speech-to-text + transcriptionModel: openai.transcription("whisper-1"), + // Speech model for text-to-speech + speechModel: openai.speech("gpt-4o-mini-tts"), + // System instructions + instructions: `You are a helpful voice assistant. +Keep responses concise and conversational since they will be spoken aloud. +Use tools when needed to provide accurate information.`, + // TTS voice configuration + voice: "alloy", // Options: alloy, echo, fable, onyx, nova, shimmer + speechInstructions: "Speak in a friendly, natural conversational tone.", + outputFormat: "mp3", + // WebSocket endpoint endpoint: process.env.VOICE_WS_ENDPOINT, + // Tools tools: { - getWeather: weatherTool, // Pass the AI SDK tool directly + getWeather: weatherTool, + getTime: timeTool, }, }); // 3. Handle Events -agent.on("connected", () => console.log("Connected to WebSocket")); -// Handle incoming audio from AI (play this to user) -agent.on("audio", (base64Audio: string) => { - // process.stdout.write(Buffer.from(base64Audio, 'base64')); +// Connection events +agent.on("connected", () => console.log("✓ Connected to WebSocket")); +agent.on("disconnected", () => console.log("✗ Disconnected from WebSocket")); + +// Transcription events (when audio is converted to text) +agent.on("transcription", ({ text, language }: { text: string; language?: string }) => { + console.log(`[Transcription] (${language || "unknown"}): ${text}`); }); -// Logs -agent.on("text", (msg: { role: string; text: string }) => console.log(`${msg.role}: ${msg.text}`)); -agent.on("tool_start", ({ name }: { name: string }) => console.log(`[System] Calling ${name}...`)); +// Text events (user input and assistant responses) +agent.on("text", (msg: { role: string; text: string }) => { + const prefix = msg.role === "user" ? "👤 User" : "🤖 Assistant"; + console.log(`${prefix}: ${msg.text}`); +}); -// 4. Start (wrap in async function since we can't use top-level await) +// Streaming text delta events (real-time text chunks) +agent.on("text_delta", ({ text }: { text: string }) => { + process.stdout.write(text); +}); + +// Tool events +agent.on("tool_start", ({ name, input }: { name: string; input?: unknown }) => { + console.log(`\n[Tool] Calling ${name}...`, input ? JSON.stringify(input) : ""); +}); + +agent.on("tool_result", ({ name, result }: { name: string; result: unknown }) => { + console.log(`[Tool] ${name} result:`, JSON.stringify(result)); +}); + +// Speech events +agent.on("speech_start", ({ text }: { text: string }) => { + console.log(`[TTS] Generating speech for: "${text.substring(0, 50)}..."`); +}); + +agent.on("speech_complete", () => { + console.log("[TTS] Speech generation complete"); +}); + +// Audio events (when TTS audio is generated) +agent.on("audio", async (audio: { data: string; format: string; uint8Array: Uint8Array }) => { + console.log(`[Audio] Received ${audio.format} audio (${audio.uint8Array.length} bytes)`); + // Optionally save to file for testing + await writeFile(`output.${audio.format}`, Buffer.from(audio.uint8Array)); +}); + +// Error handling +agent.on("error", (error: Error) => { + console.error("[Error]", error.message); +}); + +// 4. Main execution (async () => { - try { - // For now: text-only sanity check, no voice pipeline required. - await agent.sendText("What is the weather in Berlin?"); + console.log("\n=== Voice Agent Demo ==="); + console.log("Testing text-only mode (no WebSocket required)\n"); - // Optional: connect only when an endpoint is provided. + try { + // Test 1: Simple text query with streaming + console.log("--- Test 1: Weather Query ---"); + const response1 = await agent.sendText("What is the weather in Berlin?"); + console.log("\n"); + + // Test 2: Multi-turn conversation + console.log("--- Test 2: Follow-up Question ---"); + const response2 = await agent.sendText("What about Tokyo?"); + console.log("\n"); + + // Test 3: Time query + console.log("--- Test 3: Time Query ---"); + const response3 = await agent.sendText("What time is it?"); + console.log("\n"); + + // Show conversation history + console.log("--- Conversation History ---"); + const history = agent.getHistory(); + console.log(`Total messages: ${history.length}`); + + // Optional: Connect to WebSocket for real-time voice if (process.env.VOICE_WS_ENDPOINT) { + console.log("\n--- Connecting to WebSocket ---"); await agent.connect(process.env.VOICE_WS_ENDPOINT); - console.log("Agent connected successfully"); + console.log("Agent connected. Listening for audio input..."); + + // Keep the process running to receive WebSocket messages + // In a real app, you would stream microphone audio here } } catch (error) { console.error("Agent run failed:", error); + process.exit(1); } })(); -// 5. Simulate sending audio (in a real app, stream microphone data here) -// agent.sendAudio("Base64EncodedPCM16AudioData..."); \ No newline at end of file +// Example: How to send audio in a real application +// --------------------------------------------- +// import { readFile } from "fs/promises"; +// +// // Option 1: Send base64 encoded audio +// const audioBase64 = (await readFile("recording.mp3")).toString("base64"); +// await agent.sendAudio(audioBase64); +// +// // Option 2: Send raw audio buffer +// const audioBuffer = await readFile("recording.mp3"); +// await agent.sendAudioBuffer(audioBuffer); +// +// // Option 3: Transcribe audio directly +// const transcribedText = await agent.transcribeAudio(audioBuffer); +// console.log("Transcribed:", transcribedText); \ No newline at end of file diff --git a/example/ws-server.ts b/example/ws-server.ts index 1984b0c..aea5bd6 100644 --- a/example/ws-server.ts +++ b/example/ws-server.ts @@ -1,22 +1,73 @@ import "dotenv/config"; -import { WebSocketServer } from "ws"; +import { WebSocketServer, WebSocket } from "ws"; +import { readFile } from "fs/promises"; +import { existsSync } from "fs"; const endpoint = process.env.VOICE_WS_ENDPOINT || "ws://localhost:8080"; const url = new URL(endpoint); const port = Number(url.port || 8080); const host = url.hostname || "localhost"; +// Message types for type safety +interface BaseMessage { + type: string; +} + +interface TextDeltaMessage extends BaseMessage { + type: "text_delta"; + text: string; +} + +interface ToolCallMessage extends BaseMessage { + type: "tool_call"; + toolName: string; + toolCallId: string; + input: unknown; +} + +interface ToolResultMessage extends BaseMessage { + type: "tool_result"; + toolName: string; + toolCallId: string; + result: unknown; +} + +interface AudioMessage extends BaseMessage { + type: "audio"; + data: string; // base64 encoded + format: string; +} + +interface ResponseCompleteMessage extends BaseMessage { + type: "response_complete"; + text: string; + toolCalls: Array<{ toolName: string; toolCallId: string; input: unknown }>; + toolResults: Array<{ toolName: string; toolCallId: string; output: unknown }>; +} + +type AgentMessage = + | TextDeltaMessage + | ToolCallMessage + | ToolResultMessage + | AudioMessage + | ResponseCompleteMessage; + const wss = new WebSocketServer({ port, host }); wss.on("listening", () => { - console.log(`[ws-server] listening on ${endpoint}`); + console.log(`[ws-server] 🚀 listening on ${endpoint}`); + console.log("[ws-server] Waiting for connections...\n"); }); -wss.on("connection", (socket) => { - console.log("[ws-server] client connected"); +wss.on("connection", (socket: WebSocket) => { + console.log("[ws-server] ✓ client connected"); + + let streamingText = ""; + let audioChunks: Buffer[] = []; // Send a sample transcript to test text pipeline end-to-end. setTimeout(() => { + console.log("[ws-server] -> Sending test transcript..."); socket.send( JSON.stringify({ type: "transcript", @@ -25,19 +76,94 @@ wss.on("connection", (socket) => { ); }, 500); - socket.on("message", (data) => { + socket.on("message", async (data) => { try { - const msg = JSON.parse(data.toString()) as { - type?: string; - text?: string; - }; - console.log("[ws-server] <-", msg); + const msg = JSON.parse(data.toString()) as AgentMessage; + + switch (msg.type) { + case "text_delta": + // Real-time streaming text from the agent + streamingText += msg.text; + process.stdout.write(msg.text); + break; + + case "tool_call": + console.log(`\n[ws-server] 🛠️ Tool call: ${msg.toolName}`); + console.log(` Input: ${JSON.stringify(msg.input)}`); + break; + + case "tool_result": + console.log(`[ws-server] 🛠️ Tool result: ${msg.toolName}`); + console.log(` Result: ${JSON.stringify(msg.result)}`); + break; + + case "audio": + // Handle audio response from TTS + const audioBuffer = Buffer.from(msg.data, "base64"); + audioChunks.push(audioBuffer); + console.log( + `[ws-server] 🔊 Received audio: ${audioBuffer.length} bytes (${msg.format})`, + ); + + // Optionally save audio to file for testing + // await writeFile(`output_${Date.now()}.${msg.format}`, audioBuffer); + break; + + case "response_complete": + console.log("\n[ws-server] ✅ Response complete"); + console.log(` Text length: ${msg.text.length}`); + console.log(` Tool calls: ${msg.toolCalls.length}`); + console.log(` Tool results: ${msg.toolResults.length}`); + + // Reset for next response + streamingText = ""; + audioChunks = []; + break; + + default: + console.log("[ws-server] <- Unknown message:", msg); + } } catch { - console.log("[ws-server] <- raw", data.toString()); + console.log("[ws-server] <- raw", data.toString().substring(0, 100)); } }); socket.on("close", () => { - console.log("[ws-server] client disconnected"); + console.log("[ws-server] ✗ client disconnected\n"); + }); + + socket.on("error", (error) => { + console.error("[ws-server] Error:", error.message); }); }); + +// Graceful shutdown +process.on("SIGINT", () => { + console.log("\n[ws-server] Shutting down..."); + wss.close(() => { + console.log("[ws-server] Server closed"); + process.exit(0); + }); +}); + +// Helper function to simulate sending audio to the agent +async function simulateAudioInput(socket: WebSocket, audioPath: string) { + if (!existsSync(audioPath)) { + console.log(`[ws-server] Audio file not found: ${audioPath}`); + return; + } + + const audioBuffer = await readFile(audioPath); + const base64Audio = audioBuffer.toString("base64"); + + console.log(`[ws-server] -> Sending audio: ${audioPath} (${audioBuffer.length} bytes)`); + socket.send( + JSON.stringify({ + type: "audio", + data: base64Audio, + }), + ); +} + +// Export for use as a module +export { wss, simulateAudioInput }; diff --git a/output.mp3 b/output.mp3 new file mode 100644 index 0000000..f1fe13f Binary files /dev/null and b/output.mp3 differ diff --git a/src/VoiceAgent.ts b/src/VoiceAgent.ts index e0ec61b..2147969 100644 --- a/src/VoiceAgent.ts +++ b/src/VoiceAgent.ts @@ -1,31 +1,58 @@ import { WebSocket } from "ws"; import { EventEmitter } from "events"; -import { generateText, LanguageModel, stepCountIs, type Tool } from "ai"; +import { + streamText, + LanguageModel, + stepCountIs, + type Tool, + type ModelMessage, + experimental_transcribe as transcribe, + experimental_generateSpeech as generateSpeech, + type TranscriptionModel, + type SpeechModel, +} from "ai"; export interface VoiceAgentOptions { - model: LanguageModel; /// AI SDK Model (e.g., openai('gpt-4o')) + model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o')) + transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1')) + speechModel?: SpeechModel; // AI SDK Speech Model (e.g., openai.speech('gpt-4o-mini-tts')) instructions?: string; - stopWhen?: NonNullable[0]["stopWhen"]>; + stopWhen?: NonNullable[0]["stopWhen"]>; tools?: Record; endpoint?: string; + voice?: string; // Voice for TTS (e.g., 'alloy', 'echo', 'shimmer') + speechInstructions?: string; // Instructions for TTS voice style + outputFormat?: string; // Audio output format (e.g., 'mp3', 'opus', 'wav') } export class VoiceAgent extends EventEmitter { private socket?: WebSocket; private tools: Record = {}; private model: LanguageModel; + private transcriptionModel?: TranscriptionModel; + private speechModel?: SpeechModel; private instructions: string; - private stopWhen: NonNullable[0]["stopWhen"]>; + private stopWhen: NonNullable[0]["stopWhen"]>; private endpoint?: string; private isConnected = false; + private conversationHistory: ModelMessage[] = []; + private voice: string; + private speechInstructions?: string; + private outputFormat: string; + private isProcessing = false; constructor(options: VoiceAgentOptions) { super(); this.model = options.model; + this.transcriptionModel = options.transcriptionModel; + this.speechModel = options.speechModel; this.instructions = options.instructions || "You are a helpful voice assistant."; this.stopWhen = options.stopWhen || stepCountIs(5); this.endpoint = options.endpoint; + this.voice = options.voice || "alloy"; + this.speechInstructions = options.speechInstructions; + this.outputFormat = options.outputFormat || "mp3"; if (options.tools) { this.tools = { ...options.tools }; } @@ -38,16 +65,17 @@ export class VoiceAgent extends EventEmitter { try { const message = JSON.parse(data.toString()); - // Example: Handle transcribed text from the client/STT + // Handle transcribed text from the client/STT if (message.type === "transcript") { await this.processUserInput(message.text); } - // Handle audio data + // Handle raw audio data that needs transcription if (message.type === "audio") { - this.emit("audio", message.data); + await this.processAudioInput(message.data); } } catch (err) { console.error("Failed to process message:", err); + this.emit("error", err); } }); @@ -56,12 +84,81 @@ export class VoiceAgent extends EventEmitter { this.isConnected = false; this.emit("disconnected"); }); + + this.socket.on("error", (error) => { + console.error("WebSocket error:", error); + this.emit("error", error); + }); } public registerTools(tools: Record) { this.tools = { ...this.tools, ...tools }; } + /** + * Transcribe audio data to text using the configured transcription model + */ + public async transcribeAudio(audioData: Buffer | Uint8Array): Promise { + if (!this.transcriptionModel) { + throw new Error("Transcription model not configured"); + } + + const result = await transcribe({ + model: this.transcriptionModel, + audio: audioData, + }); + + this.emit("transcription", { + text: result.text, + language: result.language, + }); + + return result.text; + } + + /** + * Generate speech from text using the configured speech model + */ + public async generateSpeechFromText(text: string): Promise { + if (!this.speechModel) { + throw new Error("Speech model not configured"); + } + + const result = await generateSpeech({ + model: this.speechModel, + text, + voice: this.voice, + instructions: this.speechInstructions, + outputFormat: this.outputFormat, + }); + + return result.audio.uint8Array; + } + + /** + * Process incoming audio data: transcribe and generate response + */ + private async processAudioInput(base64Audio: string): Promise { + if (!this.transcriptionModel) { + this.emit("error", new Error("Transcription model not configured for audio input")); + return; + } + + try { + const audioBuffer = Buffer.from(base64Audio, "base64"); + this.emit("audio_received", { size: audioBuffer.length }); + + const transcribedText = await this.transcribeAudio(audioBuffer); + + if (transcribedText.trim()) { + await this.processUserInput(transcribedText); + } + } catch (error) { + console.error("Failed to process audio input:", error); + this.emit("error", error); + } + } + public async connect(url?: string): Promise { return new Promise((resolve, reject) => { try { @@ -85,52 +182,426 @@ export class VoiceAgent extends EventEmitter { }); } - public async sendText(text: string): Promise { - await this.processUserInput(text); + /** + * Send text input for processing (bypasses transcription) + */ + public async sendText(text: string): Promise { + return this.processUserInput(text); } - public sendAudio(audioData: string): void { - if (this.socket && this.isConnected) { - this.socket.send(JSON.stringify({ + /** + * Send audio data to be transcribed and processed + * @param audioData Base64 encoded audio data + */ + public async sendAudio(audioData: string): Promise { + await this.processAudioInput(audioData); + } + + /** + * Send raw audio buffer to be transcribed and processed + */ + public async sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise { + const base64Audio = Buffer.from(audioBuffer).toString("base64"); + await this.processAudioInput(base64Audio); + } + + /** + * Process user input with streaming text generation + * Handles the full pipeline: text -> LLM (streaming) -> TTS -> WebSocket + */ + private async processUserInput(text: string): Promise { + if (this.isProcessing) { + this.emit("warning", "Already processing a request, queuing..."); + } + this.isProcessing = true; + + try { + // Emit text event for incoming user input + this.emit("text", { role: "user", text }); + + // Add user message to conversation history + this.conversationHistory.push({ role: "user", content: text }); + + // Use streamText for streaming responses with tool support + const result = streamText({ + model: this.model, + system: this.instructions, + messages: this.conversationHistory, + tools: this.tools, + stopWhen: this.stopWhen, + onChunk: ({ chunk }) => { + // Emit streaming chunks for real-time updates + // Note: onChunk only receives a subset of stream events + switch (chunk.type) { + case "text-delta": + this.emit("chunk:text_delta", { id: chunk.id, text: chunk.text }); + break; + + case "reasoning-delta": + this.emit("chunk:reasoning_delta", { id: chunk.id, text: chunk.text }); + break; + + case "tool-call": + this.emit("chunk:tool_call", { + toolName: chunk.toolName, + toolCallId: chunk.toolCallId, + input: chunk.input, + }); + break; + + case "tool-result": + this.emit("chunk:tool_result", { + toolName: chunk.toolName, + toolCallId: chunk.toolCallId, + result: chunk.output, + }); + break; + + case "tool-input-start": + this.emit("chunk:tool_input_start", { + id: chunk.id, + toolName: chunk.toolName, + }); + break; + + case "tool-input-delta": + this.emit("chunk:tool_input_delta", { + id: chunk.id, + delta: chunk.delta, + }); + break; + + case "source": + this.emit("chunk:source", chunk); + break; + } + }, + onFinish: async (event) => { + // Process steps for tool results + for (const step of event.steps) { + for (const toolResult of step.toolResults) { + this.emit("tool_result", { + name: toolResult.toolName, + toolCallId: toolResult.toolCallId, + result: toolResult.output, + }); + } + } + }, + onError: ({ error }) => { + console.error("Stream error:", error); + this.emit("error", error); + }, + }); + + // Collect the full response text and reasoning + let fullText = ""; + let fullReasoning = ""; + const allToolCalls: Array<{ + toolName: string; + toolCallId: string; + input: unknown; + }> = []; + const allToolResults: Array<{ + toolName: string; + toolCallId: string; + output: unknown; + }> = []; + const allSources: Array = []; + const allFiles: Array = []; + + // Process the full stream + for await (const part of result.fullStream) { + switch (part.type) { + // Stream lifecycle + case "start": + this.sendWebSocketMessage({ type: "stream_start" }); + break; + + case "finish": + this.emit("text", { role: "assistant", text: fullText }); + this.sendWebSocketMessage({ + type: "stream_finish", + finishReason: part.finishReason, + usage: part.totalUsage, + }); + break; + + case "error": + this.emit("error", part.error); + this.sendWebSocketMessage({ + type: "stream_error", + error: String(part.error), + }); + break; + + case "abort": + this.emit("abort", { reason: part.reason }); + this.sendWebSocketMessage({ + type: "stream_abort", + reason: part.reason, + }); + break; + + // Step lifecycle + case "start-step": + this.sendWebSocketMessage({ + type: "step_start", + warnings: part.warnings, + }); + break; + + case "finish-step": + this.sendWebSocketMessage({ + type: "step_finish", + finishReason: part.finishReason, + usage: part.usage, + }); + break; + + // Text streaming + case "text-start": + this.sendWebSocketMessage({ type: "text_start", id: part.id }); + break; + + case "text-delta": + fullText += part.text; + this.sendWebSocketMessage({ + type: "text_delta", + id: part.id, + text: part.text, + }); + break; + + case "text-end": + this.sendWebSocketMessage({ type: "text_end", id: part.id }); + break; + + // Reasoning streaming (for models that support it) + case "reasoning-start": + this.sendWebSocketMessage({ type: "reasoning_start", id: part.id }); + break; + + case "reasoning-delta": + fullReasoning += part.text; + this.sendWebSocketMessage({ + type: "reasoning_delta", + id: part.id, + text: part.text, + }); + break; + + case "reasoning-end": + this.sendWebSocketMessage({ type: "reasoning_end", id: part.id }); + break; + + // Tool input streaming + case "tool-input-start": + this.sendWebSocketMessage({ + type: "tool_input_start", + id: part.id, + toolName: part.toolName, + }); + break; + + case "tool-input-delta": + this.sendWebSocketMessage({ + type: "tool_input_delta", + id: part.id, + delta: part.delta, + }); + break; + + case "tool-input-end": + this.sendWebSocketMessage({ type: "tool_input_end", id: part.id }); + break; + + // Tool execution + case "tool-call": + allToolCalls.push({ + toolName: part.toolName, + toolCallId: part.toolCallId, + input: part.input, + }); + this.sendWebSocketMessage({ + type: "tool_call", + toolName: part.toolName, + toolCallId: part.toolCallId, + input: part.input, + }); + break; + + case "tool-result": + allToolResults.push({ + toolName: part.toolName, + toolCallId: part.toolCallId, + output: part.output, + }); + this.sendWebSocketMessage({ + type: "tool_result", + toolName: part.toolName, + toolCallId: part.toolCallId, + result: part.output, + }); + break; + + case "tool-error": + this.sendWebSocketMessage({ + type: "tool_error", + toolName: part.toolName, + toolCallId: part.toolCallId, + error: String(part.error), + }); + break; + + // Sources and files + case "source": + allSources.push(part); + this.sendWebSocketMessage({ + type: "source", + source: part, + }); + break; + + case "file": + allFiles.push(part.file); + this.sendWebSocketMessage({ + type: "file", + file: part.file, + }); + break; + } + } + + // Add assistant response to conversation history + if (fullText) { + this.conversationHistory.push({ role: "assistant", content: fullText }); + } + + // Generate speech from the response if speech model is configured + if (this.speechModel && fullText) { + await this.generateAndSendSpeech(fullText); + } + + // Send the complete response + this.sendWebSocketMessage({ + type: "response_complete", + text: fullText, + reasoning: fullReasoning || undefined, + toolCalls: allToolCalls, + toolResults: allToolResults, + sources: allSources.length > 0 ? allSources : undefined, + files: allFiles.length > 0 ? allFiles : undefined, + }); + + return fullText; + } finally { + this.isProcessing = false; + } + } + + /** + * Generate speech and send audio via WebSocket + */ + private async generateAndSendSpeech(text: string): Promise { + if (!this.speechModel) return; + + try { + this.emit("speech_start", { text }); + + const audioData = await this.generateSpeechFromText(text); + const base64Audio = Buffer.from(audioData).toString("base64"); + + // Send audio via WebSocket + this.sendWebSocketMessage({ type: "audio", - data: audioData - })); + data: base64Audio, + format: this.outputFormat, + }); + + // Also emit for local handling + this.emit("audio", { + data: base64Audio, + format: this.outputFormat, + uint8Array: audioData, + }); + + this.emit("speech_complete", { text }); + } catch (error) { + console.error("Failed to generate speech:", error); + this.emit("error", error); } } - private async processUserInput(text: string) { - // Emit text event for incoming user input - this.emit("text", { role: "user", text }); - - const result = await generateText({ - model: this.model, - system: this.instructions, - prompt: text, - tools: this.tools, - stopWhen: this.stopWhen, - }); - - for (const toolCall of result.toolCalls ?? []) { - this.emit("tool_start", { name: toolCall.toolName }); - } - - // Emit text event for assistant response - this.emit("text", { role: "assistant", text: result.text }); - - // Send the response back (either text to be TTSed or tool results) + /** + * Send a message via WebSocket if connected + */ + private sendWebSocketMessage(message: Record): void { if (this.socket && this.isConnected) { - this.socket.send( - JSON.stringify({ - type: "response", - text: result.text, - toolCalls: result.toolCalls, - toolResults: result.toolResults, - }), - ); + this.socket.send(JSON.stringify(message)); } } + /** + * Start listening for voice input + */ startListening() { console.log("Starting voice agent..."); + this.emit("listening"); + } + + /** + * Stop listening for voice input + */ + stopListening() { + console.log("Stopping voice agent..."); + this.emit("stopped"); + } + + /** + * Clear conversation history + */ + clearHistory() { + this.conversationHistory = []; + this.emit("history_cleared"); + } + + /** + * Get current conversation history + */ + getHistory(): ModelMessage[] { + return [...this.conversationHistory]; + } + + /** + * Set conversation history (useful for restoring sessions) + */ + setHistory(history: ModelMessage[]) { + this.conversationHistory = [...history]; + } + + /** + * Disconnect from WebSocket + */ + disconnect() { + if (this.socket) { + this.socket.close(); + this.socket = undefined; + this.isConnected = false; + } + } + + /** + * Check if agent is connected to WebSocket + */ + get connected(): boolean { + return this.isConnected; + } + + /** + * Check if agent is currently processing a request + */ + get processing(): boolean { + return this.isProcessing; } } diff --git a/src/index.ts b/src/index.ts index e4c28c3..03a07b4 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1 +1 @@ -export { VoiceAgent } from "./VoiceAgent"; +export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";