Files
VoiceAgent/example/ws-server-2.ts
Bijit Mondal 6510232655 feat: implement WebSocket server with VoiceAgent for real-time voice interaction
- Added a new WebSocket server implementation in `ws-server-2.ts` that utilizes the `VoiceAgent` for handling voice interactions.
- Integrated weather and time tools using the `ai` library for enhanced responses.
- Refactored existing `ws-server.ts` to streamline the connection handling and event logging.
- Enhanced `VoiceAgent` to support streaming speech generation with improved chunk handling and interruption capabilities.
- Introduced new event listeners for better logging and handling of speech-related events.
- Added graceful shutdown handling for the WebSocket server.
2026-02-13 17:33:22 +05:30

121 lines
4.4 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import "dotenv/config";
import { WebSocketServer } from "ws";
import { VoiceAgent } from "../src";
import { tool } from "ai";
import { z } from "zod";
import { openai } from "@ai-sdk/openai";
/**
 * Work out which host and port the server should bind to for a WebSocket
 * endpoint URL such as "ws://localhost:8080".
 *
 * @param rawEndpoint - Absolute endpoint URL; `new URL` throws if it cannot be parsed.
 * @returns The numeric port (8080 when the URL names none) and the bind host
 *          ("localhost" when the URL has no hostname).
 */
function resolveListenAddress(rawEndpoint: string): { port: number; host: string } {
  const parsed = new URL(rawEndpoint);

  // The URL parser blanks out a port equal to the scheme default (80 for ws:,
  // 443 for wss:), so "ws://0.0.0.0:80" reports port "". Recover a port that
  // was written explicitly in the authority part before using the 8080 fallback.
  const explicitPort = /^[a-z][a-z0-9+.-]*:\/\/[^/?#]*:(\d+)(?:[/?#]|$)/i.exec(rawEndpoint)?.[1];
  const port = Number(parsed.port || explicitPort || 8080);

  // IPv6 hostnames come back bracketed ("[::1]"); a bind address must be bare.
  const host = parsed.hostname.replace(/^\[(.*)\]$/, "$1") || "localhost";

  return { port, host };
}

// Endpoint comes from the environment, defaulting to a local dev address.
const endpoint = process.env.VOICE_WS_ENDPOINT || "ws://localhost:8080";
// Parsed form of the endpoint, kept as a top-level binding alongside port/host.
const url = new URL(endpoint);
const { port, host } = resolveListenAddress(endpoint);
// ── Tools (same as demo.ts) ────────────────────────────────────────────
/**
 * Demo weather tool. It does not call any weather service: it echoes the
 * requested location with a random temperature in the 62–82 range and one of
 * four canned condition strings.
 */
const weatherTool = tool({
  description: "Get the weather in a location",
  inputSchema: z.object({
    location: z.string().describe("The location to get the weather for"),
  }),
  execute: async ({ location }) => {
    // Temperature is drawn first, then the index into the conditions list.
    const temperature = 72 + Math.floor(Math.random() * 21) - 10;
    const conditionChoices = ["sunny", "cloudy", "rainy", "partly cloudy"];
    const conditions = conditionChoices[Math.floor(Math.random() * 4)];
    return { location, temperature, conditions };
  },
});
/**
 * Demo clock tool. Takes no input and returns the server's current local time
 * string together with the time zone resolved by `Intl.DateTimeFormat`.
 */
const timeTool = tool({
  description: "Get the current time",
  inputSchema: z.object({}),
  execute: async () => {
    const now = new Date();
    const { timeZone } = Intl.DateTimeFormat().resolvedOptions();
    return {
      time: now.toLocaleTimeString(),
      timezone: timeZone,
    };
  },
});
// ── WebSocket server ───────────────────────────────────────────────────
// Start a standalone WebSocket server on the configured host and port.
const wss = new WebSocketServer({ host, port });

/** Logs that the server is up and waiting for clients. */
function announceListening(): void {
  console.log(`[ws-server] listening on ${endpoint}`);
  console.log("[ws-server] Waiting for connections...\n");
}

wss.on("listening", announceListening);
// For every accepted client: build a dedicated VoiceAgent, mirror its events
// to the server console, then give it the socket.
wss.on("connection", (clientSocket) => {
  console.log("[ws-server] ✓ client connected");

  // One VoiceAgent instance per connection; nothing is shared between clients here.
  const voiceAgent = new VoiceAgent({
    model: openai("gpt-4o"),
    transcriptionModel: openai.transcription("whisper-1"),
    speechModel: openai.speech("gpt-4o-mini-tts"),
    // NOTE: the continuation lines of this template literal are intentionally
    // flush-left; any indentation would become part of the prompt text.
    instructions: `You are a helpful voice assistant.
Keep responses concise and conversational since they will be spoken aloud.
Use tools when needed to provide accurate information.`,
    voice: "alloy",
    speechInstructions: "Speak in a friendly, natural conversational tone.",
    outputFormat: "mp3",
    streamingSpeech: {
      minChunkSize: 40,
      maxChunkSize: 180,
      parallelGeneration: true,
      maxParallelRequests: 2,
    },
    tools: {
      getWeather: weatherTool,
      getTime: timeTool,
    },
  });

  // ── Conversation text ──
  voiceAgent.on("text", (message: { role: string; text: string }) => {
    let speaker: string;
    if (message.role === "user") {
      speaker = "👤 User";
    } else {
      speaker = "🤖 Assistant";
    }
    console.log(`[ws-server] ${speaker}: ${message.text}`);
  });
  voiceAgent.on("chunk:text_delta", (delta: { text: string }) => {
    // Written without a newline so streamed deltas appear as continuous text.
    process.stdout.write(delta.text);
  });

  // ── Tool activity ──
  voiceAgent.on("chunk:tool_call", (call: { toolName: string }) => {
    console.log(`\n[ws-server] 🛠️ Tool call: ${call.toolName}`);
  });
  voiceAgent.on("tool_result", (outcome: { name: string; result: unknown }) => {
    console.log(`[ws-server] 🛠️ Tool result (${outcome.name}):`, JSON.stringify(outcome.result));
  });

  // ── Speech lifecycle ──
  voiceAgent.on("speech_start", () => {
    console.log("[ws-server] 🔊 Speech started");
  });
  voiceAgent.on("speech_complete", () => {
    console.log("[ws-server] 🔊 Speech complete");
  });
  voiceAgent.on("speech_interrupted", (info: { reason: string }) => {
    console.log(`[ws-server] ⏸️ Speech interrupted: ${info.reason}`);
  });
  voiceAgent.on("audio_chunk", (chunk: { chunkId: number; format: string; uint8Array: Uint8Array }) => {
    console.log(`[ws-server] 🔊 Audio chunk #${chunk.chunkId}: ${chunk.uint8Array.length} bytes (${chunk.format})`);
  });

  // ── Errors and teardown ──
  voiceAgent.on("error", (err: Error) => {
    console.error("[ws-server] ❌ Error:", err.message);
  });
  voiceAgent.on("disconnected", () => {
    console.log("[ws-server] ✗ client disconnected\n");
  });

  // Hand the accepted socket to the agent — this is the key line.
  // The agent will listen for "transcript", "audio", "interrupt" messages
  // and send back "text_delta", "audio_chunk", "response_complete", etc.
  voiceAgent.handleSocket(clientSocket);
});
// Graceful shutdown on Ctrl-C.
//
// wss.close() only stops the server from accepting new connections; it does
// not close sockets that are already open, and its callback does not fire
// until they are gone. Without closing clients explicitly, a single connected
// client would keep the process from ever reaching process.exit(0).
process.on("SIGINT", () => {
  console.log("\n[ws-server] Shutting down...");

  // How long clients get to finish the close handshake before being dropped.
  const forceCloseAfterMs = 2000;

  // Ask every connected client to close (1001 = "going away").
  for (const client of wss.clients) {
    client.close(1001, "Server shutting down");
  }

  // Hard-drop any client that has not completed the handshake in time, so the
  // close callback below is guaranteed to run.
  const forceCloseTimer = setTimeout(() => {
    for (const client of wss.clients) {
      client.terminate();
    }
  }, forceCloseAfterMs);

  wss.close(() => {
    clearTimeout(forceCloseTimer);
    console.log("[ws-server] Server closed");
    process.exit(0);
  });
});

// Exported so other modules can reach the running server instance.
export { wss };