From 6510232655eb00523fee83c4d2c7bd33d0c411d0 Mon Sep 17 00:00:00 2001
From: Bijit Mondal <bijitmondal3011@rediffmail.com>
Date: Fri, 13 Feb 2026 17:16:12 +0530
Subject: [PATCH] feat: implement WebSocket server with VoiceAgent for
 real-time voice interaction

- Added a new WebSocket server implementation in `ws-server-2.ts` that utilizes the `VoiceAgent` for handling voice interactions.
- Integrated weather and time tools using the `ai` library for enhanced responses.
- Refactored existing `ws-server.ts` to streamline the connection handling and event logging.
- Enhanced `VoiceAgent` to support streaming speech generation with improved chunk handling and interruption capabilities.
- Introduced new event listeners for better logging and handling of speech-related events.
- Added graceful shutdown handling for the WebSocket server.
---
 README.md                 |   54 +-
 example/demo.ts           |   74 ++-
 example/voice-client.html | 1088 +++++++++++++++++++++++++++++++++++++
 example/ws-server-2.ts    |  120 ++++
 example/ws-server.ts      |  243 ++++-----
 src/VoiceAgent.ts         |  344 +++++++++++-
 6 files changed, 1749 insertions(+), 174 deletions(-)
 create mode 100644 example/voice-client.html
 create mode 100644 example/ws-server-2.ts

diff --git a/README.md b/README.md
index 526fc8f..736762d 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,14 @@
 # voice-agent-ai-sdk
 
-Minimal voice/text agent SDK built on AI SDK with optional WebSocket transport.
+Streaming voice/text agent SDK built on AI SDK with optional WebSocket transport.
 
 ## Current status
 
-- Text flow works via `sendText()` (no WebSocket required).
-- WebSocket flow works when `connect()` is used with a running WS endpoint.
-- Voice streaming is not implemented yet.
+- Streaming text generation is implemented via `streamText`.
+- Tool calling is supported in-stream.
+- Speech synthesis is implemented with chunked streaming TTS.
+- Audio transcription is supported (when `transcriptionModel` is configured).
+- WebSocket protocol events are emitted for stream, tool, and speech lifecycle.
 
 ## Prerequisites
 
@@ -25,13 +27,35 @@ Minimal voice/text agent SDK built on AI SDK with optional WebSocket transport.
    OPENAI_API_KEY=your_openai_api_key
    VOICE_WS_ENDPOINT=ws://localhost:8080
 
+`VOICE_WS_ENDPOINT` is optional for text-only usage.
+
+## VoiceAgent configuration
+
+The agent accepts:
+
+- `model` (required): chat model
+- `transcriptionModel` (optional): STT model
+- `speechModel` (optional): TTS model
+- `instructions` (optional): system prompt
+- `stopWhen` (optional): stopping condition
+- `tools` (optional): AI SDK tools map
+- `endpoint` (optional): WebSocket endpoint
+- `voice` (optional): TTS voice, default `alloy`
+- `speechInstructions` (optional): style instructions for TTS
+- `outputFormat` (optional): audio format, default `mp3`
+- `streamingSpeech` (optional):
+   - `minChunkSize`
+   - `maxChunkSize`
+   - `parallelGeneration`
+   - `maxParallelRequests`
+
 ## Run (text-only check)
 
-This validates model + tool calls without requiring WebSocket:
+This validates LLM + tool + streaming speech without requiring WebSocket:
 
 pnpm demo
 
-Expected logs include `text` events and optional `tool_start`.
+Expected logs include `text`, `chunk:text_delta`, tool events, and speech chunk events.
 
 ## Run (WebSocket check)
 
@@ -45,7 +69,22 @@ Expected logs include `text` events and optional `tool_start`.
 
 The demo will:
 - run `sendText()` first (text-only sanity check), then
-- connect to `VOICE_WS_ENDPOINT` if provided.
+- connect to `VOICE_WS_ENDPOINT` if provided,
+- emit streaming protocol messages (`text_delta`, `tool_call`, `audio_chunk`, `response_complete`, etc.).
+
+## Browser voice client (HTML)
+
+A simple browser client is available at [example/voice-client.html](example/voice-client.html).
+
+What it does:
+- captures microphone speech using the Web Speech API (browser speech-to-text); a server-side Whisper input mode is also selectable
+- sends transcript to the agent via WebSocket (`type: "transcript"`)
+- receives streaming `audio_chunk` messages and plays them in order
+
+How to use:
+1. Start your agent server/WebSocket endpoint.
+2. Open [example/voice-client.html](example/voice-client.html) in a browser (Chrome/Edge recommended).
+3. Connect to `ws://localhost:8080` (or your endpoint), then click **Start Mic**.
 
 ## Scripts
 
@@ -58,3 +97,4 @@ The demo will:
 
 - If `VOICE_WS_ENDPOINT` is empty, WebSocket connect is skipped.
 - The sample WS server sends a mock `transcript` message for end-to-end testing.
+- Streaming TTS uses chunk queueing and supports interruption (`interrupt`).
diff --git a/example/demo.ts b/example/demo.ts
index 0b36731..87ada97 100644
--- a/example/demo.ts
+++ b/example/demo.ts
@@ -43,6 +43,13 @@ Use tools when needed to provide accurate information.`,
     voice: "alloy", // Options: alloy, echo, fable, onyx, nova, shimmer
     speechInstructions: "Speak in a friendly, natural conversational tone.",
     outputFormat: "mp3",
+    // Streaming speech tuning
+    streamingSpeech: {
+        minChunkSize: 40,
+        maxChunkSize: 180,
+        parallelGeneration: true,
+        maxParallelRequests: 2,
+    },
     // WebSocket endpoint
     endpoint: process.env.VOICE_WS_ENDPOINT,
     // Tools
@@ -70,13 +77,13 @@ agent.on("text", (msg: { role: string; text: string }) => {
 });
 
 // Streaming text delta events (real-time text chunks)
-agent.on("text_delta", ({ text }: { text: string }) => {
+agent.on("chunk:text_delta", ({ text }: { text: string }) => {
     process.stdout.write(text);
 });
 
-// Tool events
-agent.on("tool_start", ({ name, input }: { name: string; input?: unknown }) => {
-    console.log(`\n[Tool] Calling ${name}...`, input ? JSON.stringify(input) : "");
+// Tool events (stream-level)
+agent.on("chunk:tool_call", ({ toolName, input }: { toolName: string; input: unknown }) => {
+    console.log(`\n[Tool] Calling ${toolName}...`, input ? JSON.stringify(input) : "");
 });
 
 agent.on("tool_result", ({ name, result }: { name: string; result: unknown }) => {
@@ -84,19 +91,51 @@ agent.on("tool_result", ({ name, result }: { name: string; result: unknown }) =>
 });
 
 // Speech events
-agent.on("speech_start", ({ text }: { text: string }) => {
-    console.log(`[TTS] Generating speech for: "${text.substring(0, 50)}..."`);
+agent.on("speech_start", ({ streaming }: { streaming: boolean }) => {
+    console.log(`[TTS] Speech started (streaming=${streaming})`);
 });
 
 agent.on("speech_complete", () => {
     console.log("[TTS] Speech generation complete");
 });
 
-// Audio events (when TTS audio is generated)
+agent.on("speech_chunk_queued", ({ id, text }: { id: number; text: string }) => {
+    console.log(`[TTS] Queued chunk #${id}: ${text.substring(0, 40)}...`);
+});
+
+// Streaming audio chunk events: each chunk is saved to its own file; a failed write is logged here instead of escaping the handler as a rejected promise
+agent.on(
+    "audio_chunk",
+    async ({ chunkId, format, uint8Array }: { chunkId: number; format: string; uint8Array: Uint8Array }) => {
+        console.log(`[Audio] Chunk #${chunkId} (${uint8Array.length} bytes, ${format})`);
+        await writeFile(`output_chunk_${chunkId}.${format}`, Buffer.from(uint8Array)).catch((err: unknown) => console.error(`[Audio] Failed to write chunk #${chunkId}:`, err));
+    },
+);
+
+// Full audio event (non-streaming fallback via generateAndSendSpeechFull); a failed file write is logged, not rethrown
 agent.on("audio", async (audio: { data: string; format: string; uint8Array: Uint8Array }) => {
-    console.log(`[Audio] Received ${audio.format} audio (${audio.uint8Array.length} bytes)`);
-    // Optionally save to file for testing
-    await writeFile(`output.${audio.format}`, Buffer.from(audio.uint8Array));
+    console.log(`[Audio] Full response audio (${audio.uint8Array.length} bytes, ${audio.format})`);
+    await writeFile(`output_full.${audio.format}`, Buffer.from(audio.uint8Array)).catch((err: unknown) => console.error("[Audio] Failed to write full audio:", err));
+});
+
+// Speech interruption (barge-in)
+agent.on("speech_interrupted", ({ reason }: { reason: string }) => {
+    console.log(`[TTS] Speech interrupted: ${reason}`);
+});
+
+// Transcription event (when server-side Whisper is used)
+agent.on("transcription", ({ text, language }: { text: string; language?: string }) => {
+    console.log(`[STT] Transcription (${language || "unknown"}): ${text}`);
+});
+
+// Audio received event
+agent.on("audio_received", ({ size }: { size: number }) => {
+    console.log(`[Audio] Received ${(size / 1024).toFixed(1)} KB of audio input`);
+});
+
+// Warning events
+agent.on("warning", (msg: string) => {
+    console.warn(`[Warning] ${msg}`);
 });
 
 // Error handling
@@ -111,20 +150,9 @@ agent.on("error", (error: Error) => {
 
     try {
         // Test 1: Simple text query with streaming
-        console.log("--- Test 1: Weather Query ---");
-        const response1 = await agent.sendText("What is the weather in Berlin?");
+        console.log("--- Test 1: Text Query ---");
+        await agent.sendText("What's the weather in San Francisco?");
         console.log("\n");
-
-        // Test 2: Multi-turn conversation
-        console.log("--- Test 2: Follow-up Question ---");
-        const response2 = await agent.sendText("What about Tokyo?");
-        console.log("\n");
-
-        // Test 3: Time query
-        console.log("--- Test 3: Time Query ---");
-        const response3 = await agent.sendText("What time is it?");
-        console.log("\n");
-
         // Show conversation history
         console.log("--- Conversation History ---");
         const history = agent.getHistory();
diff --git a/example/voice-client.html b/example/voice-client.html
new file mode 100644
index 0000000..27b67d4
--- /dev/null
+++ b/example/voice-client.html
@@ -0,0 +1,1088 @@
+<!doctype html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Voice Agent Web Client</title>
+    <style>
+        *,
+        *::before,
+        *::after {
+            box-sizing: border-box;
+        }
+
+        body {
+            font-family: system-ui, -apple-system, Segoe UI, Roboto, sans-serif;
+            max-width: 880px;
+            margin: 24px auto;
+            padding: 0 16px;
+            color: #1a1a1a;
+            background: #f8f9fa;
+        }
+
+        h1 {
+            margin-bottom: 4px;
+        }
+
+        .subtitle {
+            color: #666;
+            font-size: 13px;
+            margin-bottom: 20px;
+        }
+
+        .card {
+            background: #fff;
+            border: 1px solid #e0e0e0;
+            border-radius: 10px;
+            padding: 16px;
+            margin-bottom: 14px;
+        }
+
+        .row {
+            display: flex;
+            gap: 8px;
+            margin-bottom: 12px;
+            align-items: center;
+            flex-wrap: wrap;
+        }
+
+        input[type="text"],
+        button,
+        select {
+            padding: 10px 14px;
+            font-size: 14px;
+            border-radius: 6px;
+            border: 1px solid #ccc;
+        }
+
+        input[type="text"] {
+            flex: 1;
+            min-width: 200px;
+        }
+
+        button {
+            cursor: pointer;
+            background: #fff;
+            transition: background 0.15s, border-color 0.15s;
+            white-space: nowrap;
+        }
+
+        button:hover:not(:disabled) {
+            background: #f0f0f0;
+        }
+
+        button:disabled {
+            opacity: 0.45;
+            cursor: default;
+        }
+
+        button.primary {
+            background: #2563eb;
+            color: #fff;
+            border-color: #2563eb;
+        }
+
+        button.primary:hover:not(:disabled) {
+            background: #1d4ed8;
+        }
+
+        button.danger {
+            background: #dc2626;
+            color: #fff;
+            border-color: #dc2626;
+        }
+
+        button.danger:hover:not(:disabled) {
+            background: #b91c1c;
+        }
+
+        @keyframes pulse {
+
+            0%,
+            100% {
+                opacity: 1;
+            }
+
+            50% {
+                opacity: 0.7;
+            }
+        }
+
+        select {
+            background: #fff;
+        }
+
+        #status {
+            font-weight: 600;
+            margin: 0 0 6px;
+            font-size: 14px;
+        }
+
+        .status-dot {
+            display: inline-block;
+            width: 10px;
+            height: 10px;
+            border-radius: 50%;
+            margin-right: 6px;
+            vertical-align: middle;
+        }
+
+        .status-dot.disconnected {
+            background: #9ca3af;
+        }
+
+        .status-dot.connected {
+            background: #22c55e;
+        }
+
+        .status-dot.listening {
+            background: #f59e0b;
+            animation: pulse 1s infinite;
+        }
+
+        .status-dot.speaking {
+            background: #3b82f6;
+            animation: pulse 0.8s infinite;
+        }
+
+        .panel {
+            border: 1px solid #e5e7eb;
+            border-radius: 8px;
+            padding: 12px;
+            min-height: 48px;
+            margin-bottom: 12px;
+            background: #fafafa;
+        }
+
+        .panel.transcript {
+            border-left: 3px solid #2563eb;
+        }
+
+        .panel.assistant {
+            border-left: 3px solid #22c55e;
+        }
+
+        .panel.reasoning {
+            border-left: 3px solid #f59e0b;
+            font-style: italic;
+            color: #666;
+            font-size: 13px;
+        }
+
+        .panel.tools {
+            border-left: 3px solid #8b5cf6;
+            font-size: 13px;
+        }
+
+        #log {
+            white-space: pre-wrap;
+            background: #1a1a2e;
+            color: #c8d6e5;
+            max-height: 280px;
+            overflow: auto;
+            font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace;
+            font-size: 12px;
+            border-radius: 8px;
+            padding: 12px;
+        }
+
+        h3 {
+            margin: 14px 0 6px;
+            font-size: 15px;
+            color: #374151;
+        }
+
+        label {
+            font-size: 13px;
+            color: #555;
+            margin-right: 4px;
+        }
+
+        .text-input-row {
+            display: flex;
+            gap: 8px;
+        }
+
+        .text-input-row input {
+            flex: 1;
+        }
+
+        .badge {
+            display: inline-block;
+            font-size: 11px;
+            padding: 2px 8px;
+            border-radius: 10px;
+            font-weight: 600;
+            margin-left: 6px;
+            vertical-align: middle;
+        }
+
+        .badge.streaming {
+            background: #dbeafe;
+            color: #2563eb;
+        }
+
+        .badge.idle {
+            background: #f3f4f6;
+            color: #6b7280;
+        }
+
+        .audio-viz {
+            height: 4px;
+            background: #e5e7eb;
+            border-radius: 2px;
+            margin: 8px 0;
+            overflow: hidden;
+        }
+
+        .audio-viz-bar {
+            height: 100%;
+            background: #3b82f6;
+            border-radius: 2px;
+            width: 0%;
+            transition: width 0.15s;
+        }
+
+        .hidden {
+            display: none !important;
+        }
+    </style>
+</head>
+
+<body>
+    <h1>🎙️ Voice Agent Web Client</h1>
+    <p class="subtitle">Real-time voice I/O with streaming speech generation. Supports browser STT or server-side
+        Whisper transcription.</p>
+
+    <!-- Connection -->
+    <div class="card">
+        <div class="row">
+            <input type="text" id="endpoint" value="ws://localhost:8080" placeholder="WebSocket endpoint" />
+            <button id="connectBtn" class="primary">Connect</button>
+            <button id="disconnectBtn" disabled>Disconnect</button>
+        </div>
+        <div id="status"><span class="status-dot disconnected"></span>Disconnected</div>
+    </div>
+
+    <!-- Input Controls -->
+    <div class="card">
+        <div class="row">
+            <label for="inputMode">Input mode:</label>
+            <select id="inputMode">
+                <option value="browser-stt">Browser Speech Recognition</option>
+                <option value="server-whisper">Server-side Whisper</option>
+            </select>
+        </div>
+        <div class="row">
+            <button id="startMicBtn" disabled>🎤 Start Mic</button>
+            <button id="stopMicBtn" disabled>⏹ Stop Mic</button>
+            <button id="interruptBtn" class="danger" disabled>✋ Interrupt</button>
+        </div>
+        <div class="audio-viz" id="audioViz">
+            <div class="audio-viz-bar" id="audioVizBar"></div>
+        </div>
+        <div class="text-input-row">
+            <input type="text" id="textInput" placeholder="Or type a message and press Enter..." disabled />
+            <button id="sendTextBtn" class="primary" disabled>Send</button>
+        </div>
+    </div>
+
+    <!-- Output Panels -->
+    <h3>👤 You said</h3>
+    <div class="panel transcript" id="transcript">&mdash;</div>
+
+    <h3>🤖 Assistant <span class="badge idle" id="streamBadge">idle</span></h3>
+    <div class="panel assistant" id="assistant"></div>
+
+    <div id="reasoningSection" class="hidden">
+        <h3>💭 Reasoning</h3>
+        <div class="panel reasoning" id="reasoning"></div>
+    </div>
+
+    <div id="toolsSection" class="hidden">
+        <h3>🛠️ Tools</h3>
+        <div class="panel tools" id="tools"></div>
+    </div>
+
+    <h3>📋 Logs</h3>
+    <div id="log"></div>
+
+    <script>
+        // ── Elements ────────────────────────────────────────────────────────
+        const endpointEl = document.getElementById("endpoint");
+        const connectBtn = document.getElementById("connectBtn");
+        const disconnectBtn = document.getElementById("disconnectBtn");
+        const inputModeEl = document.getElementById("inputMode");
+        const startMicBtn = document.getElementById("startMicBtn");
+        const stopMicBtn = document.getElementById("stopMicBtn");
+        const interruptBtn = document.getElementById("interruptBtn");
+        const textInput = document.getElementById("textInput");
+        const sendTextBtn = document.getElementById("sendTextBtn");
+        const statusEl = document.getElementById("status");
+        const transcriptEl = document.getElementById("transcript");
+        const assistantEl = document.getElementById("assistant");
+        const reasoningSection = document.getElementById("reasoningSection");
+        const reasoningEl = document.getElementById("reasoning");
+        const toolsSection = document.getElementById("toolsSection");
+        const toolsEl = document.getElementById("tools");
+        const logEl = document.getElementById("log");
+        const streamBadge = document.getElementById("streamBadge");
+        const audioVizBar = document.getElementById("audioVizBar");
+
+        // ── State ───────────────────────────────────────────────────────────
+        let ws = null; // connection handle used through ws.send; null when there is none
+        let connected = false; // toggled through setConnectedUI
+
+        // Browser STT state
+        let recognition = null; // created once by initBrowserRecognition
+        let micShouldRun = false; // true while the user wants browser STT running; onend restarts recognition while set
+        let micRestartTimer = null; // setTimeout handle for the delayed recognition restart
+
+        // Server Whisper recording state
+        let mediaStream = null;          // mic stream stays open while listening
+        let mediaRecorder = null;        // created per speech segment
+        let audioChunks = [];
+        let whisperListening = false;    // mic is open and VAD is running
+        let whisperSegmentActive = false; // currently capturing a speech segment
+
+        // VAD (Voice Activity Detection) config
+        const VAD_SPEECH_THRESHOLD = 12;   // getCurrentRMS level (mean analyser byte magnitude, 0-255) above this = speech detected
+        const VAD_SILENCE_TIMEOUT = 1500;  // ms of silence before auto-sending segment
+        const VAD_MIN_SEGMENT_MS = 300;    // ignore segments shorter than this
+        const VAD_POLL_INTERVAL = 60;      // ms between VAD checks
+        let vadSilenceTimer = null; // timeout handle, cleared in stopVADPolling
+        let vadPollTimer = null; // setInterval handle that drives vadCheck
+        let segmentStartTime = 0;
+
+        // Audio playback state
+        let audioContext = null; // created lazily by getAudioContext
+        let audioQueue = []; // pending { bytes, format } chunks, consumed front-first by playNextAudioChunk
+        let isPlaying = false; // true while playNextAudioChunk is handling a chunk
+        let currentAudioSource = null; // buffer source currently playing (Web Audio path)
+        let currentAudioElement = null; // Audio element currently playing (fallback path)
+
+        // Analyser (shared between viz and VAD)
+        let analyserNode = null;
+        let analyserSource = null; // media-stream source node feeding analyserNode
+        let vizAnimFrame = null; // requestAnimationFrame id of the level-bar loop
+
+        const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; // undefined when the browser exposes neither
+
+        // ── Helpers ─────────────────────────────────────────────────────────
+        function log(msg) { // Appends one timestamped line to the on-page log panel.
+            const ts = new Date().toLocaleTimeString("en-US", { hour12: false });
+            logEl.textContent += `[${ts}] ${msg}\n`;
+            logEl.scrollTop = logEl.scrollHeight; // keep the newest entry scrolled into view
+        }
+
+        function setStatus(text, state) { // state becomes the status-dot CSS class (disconnected, connected, listening, speaking).
+            statusEl.innerHTML = `<span class="status-dot ${state}"></span>${text}`; // written as HTML, so pass only trusted text
+        }
+
+        function setBadge(text, cls) { // Updates the badge next to the Assistant heading.
+            streamBadge.textContent = text;
+            streamBadge.className = `badge ${cls}`; // replaces any previous modifier class (the stylesheet defines streaming and idle)
+        }
+
+        function setConnectedUI(on) { // Records the connection state and mirrors it onto the controls.
+            connected = on;
+            connectBtn.disabled = on;
+            disconnectBtn.disabled = !on;
+            startMicBtn.disabled = !on;
+            stopMicBtn.disabled = true; // always disabled here; the mic start paths enable it
+            interruptBtn.disabled = !on;
+            textInput.disabled = !on;
+            sendTextBtn.disabled = !on;
+        }
+
+        function getInputMode() { // Returns the selected option value of the input-mode select.
+            return inputModeEl.value; // browser-stt or server-whisper, per the option list in the markup
+        }
+
+        function isAssistantSpeaking() { // True while a chunk is being played or more chunks are waiting.
+            return isPlaying || audioQueue.length > 0;
+        }
+
+        // ── Auto barge-in: interrupt assistant when user starts talking ─────
+        function autoBargeIn() {
+            if (!isAssistantSpeaking()) return; // nothing playing or queued, so nothing to interrupt
+            stopAudioPlayback(); // silence local playback and drop queued chunks first
+            if (ws && connected) {
+                ws.send(JSON.stringify({ type: "interrupt", reason: "user_speaking" })); // notify the server of the interruption
+                log("⚡ Auto-interrupt: user started speaking");
+            }
+        }
+
+        // ── AudioContext playback ───────────────────────────────────────────
+        function getAudioContext() { // Returns the shared AudioContext, creating it on first use.
+            if (!audioContext) {
+                audioContext = new (window.AudioContext || window.webkitAudioContext)(); // prefixed constructor as fallback
+            }
+            if (audioContext.state === "suspended") {
+                audioContext.resume(); // fire-and-forget: the returned promise is not awaited
+            }
+            return audioContext;
+        }
+
+        async function playNextAudioChunk() {
+            if (isPlaying || audioQueue.length === 0) return;
+            isPlaying = true;
+            setStatus("Playing audio...", "speaking");
+
+            const { bytes, format } = audioQueue.shift();
+
+            try {
+                const ctx = getAudioContext();
+                const arrayBuffer = bytes.buffer.slice(
+                    bytes.byteOffset,
+                    bytes.byteOffset + bytes.byteLength
+                );
+
+                try {
+                    const audioBuffer = await ctx.decodeAudioData(arrayBuffer.slice(0));
+                    await new Promise((resolve) => {
+                        const source = ctx.createBufferSource();
+                        source.buffer = audioBuffer;
+                        source.connect(ctx.destination);
+                        currentAudioSource = source;
+                        source.onended = resolve;
+                        source.start(0);
+                    });
+                    currentAudioSource = null;
+                } catch (_decodeErr) {
+                    const mime = format === "mp3" ? "audio/mpeg" : `audio/${format}`;
+                    const blob = new Blob([bytes], { type: mime });
+                    const url = URL.createObjectURL(blob);
+                    const audio = new Audio(url);
+                    currentAudioElement = audio;
+                    await audio.play();
+                    await new Promise((resolve) => {
+                        audio.onended = resolve;
+                        audio.onerror = resolve;
+                    });
+                    currentAudioElement = null;
+                    URL.revokeObjectURL(url);
+                }
+            } catch (err) {
+                log(`Audio play error: ${err?.message || err}`);
+            } finally {
+                isPlaying = false;
+                if (audioQueue.length > 0) {
+                    playNextAudioChunk();
+                } else if (connected) {
+                    setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected", whisperListening || micShouldRun ? "listening" : "connected");
+                }
+            }
+        }
+
+        function stopAudioPlayback() { // Stops whatever is playing and discards every queued chunk.
+            if (currentAudioSource) {
+                try { currentAudioSource.stop(); } catch (_) { } // errors from stop() are ignored on purpose
+                currentAudioSource = null;
+            }
+            if (currentAudioElement) {
+                try { currentAudioElement.pause(); currentAudioElement.src = ""; } catch (_) { } // pause, then drop the source
+                currentAudioElement = null;
+            }
+            audioQueue = []; // drop chunks that have not started yet
+            isPlaying = false; // lets the next incoming chunk start playback
+        }
+
+        function decodeBase64ToBytes(base64) { // Converts a base64 string into a Uint8Array of the decoded bytes.
+            const binary = atob(base64); // one character per decoded byte
+            const len = binary.length;
+            const bytes = new Uint8Array(len);
+            for (let i = 0; i < len; i++) bytes[i] = binary.charCodeAt(i);
+            return bytes;
+        }
+
+        // ── Audio level from analyser: mean frequency-bin magnitude, not a true RMS ──
+        function getCurrentRMS() {
+            if (!analyserNode) return 0; // no analyser attached, report silence
+            const data = new Uint8Array(analyserNode.frequencyBinCount);
+            analyserNode.getByteFrequencyData(data); // fills data with one byte per frequency bin
+            let sum = 0;
+            for (let i = 0; i < data.length; i++) sum += data[i];
+            return sum / data.length; // arithmetic mean of the bins; this is the value compared with VAD_SPEECH_THRESHOLD
+        }
+
+        // ── Shared analyser setup ───────────────────────────────────────────
+        function setupAnalyser(stream) {
+            try {
+                const ctx = getAudioContext();
+                analyserSource = ctx.createMediaStreamSource(stream);
+                analyserNode = ctx.createAnalyser();
+                analyserNode.fftSize = 256;
+                analyserSource.connect(analyserNode);
+            } catch (_) { }
+        }
+
+        function teardownAnalyser() { // Disconnects the source node and drops both analyser references.
+            if (analyserSource) {
+                try { analyserSource.disconnect(); } catch (_) { } // disconnect errors are ignored
+                analyserSource = null;
+            }
+            analyserNode = null; // getCurrentRMS returns 0 from now on
+        }
+
+        // ── Audio level visualization ───────────────────────────────────────
+        function startViz() { // Starts a requestAnimationFrame loop that maps the analyser level onto the bar width.
+            function update() {
+                if (!whisperListening && !micShouldRun) { // neither mic mode is active: reset the bar and end the loop
+                    audioVizBar.style.width = "0%";
+                    return;
+                }
+                const rms = getCurrentRMS();
+                const pct = Math.min(100, (rms / 128) * 100); // a level of 128 or more fills the bar
+                audioVizBar.style.width = `${pct}%`;
+                vizAnimFrame = requestAnimationFrame(update); // schedule the next frame and remember its id for stopViz
+            }
+            update();
+        }
+
+        function stopViz() { // Cancels the pending animation frame, if any, and empties the bar.
+            if (vizAnimFrame) { cancelAnimationFrame(vizAnimFrame); vizAnimFrame = null; }
+            audioVizBar.style.width = "0%";
+        }
+
+        // ══════════════════════════════════════════════════════════════════════
+        // ── Browser Speech Recognition (STT) with auto barge-in ─────────────
+        // ══════════════════════════════════════════════════════════════════════
+        function initBrowserRecognition() { // Creates the recognition object once and attaches its event handlers.
+            if (recognition || !SpeechRecognition) return; // already initialised, or the API is unavailable
+
+            recognition = new SpeechRecognition();
+            recognition.lang = "en-US";
+            recognition.interimResults = true; // interim results are what trigger the early barge-in in onresult
+            recognition.continuous = true;
+            recognition.maxAlternatives = 1;
+
+            recognition.onstart = () => {
+                setStatus("Listening (browser STT)...", "listening");
+                startMicBtn.disabled = true;
+                stopMicBtn.disabled = false;
+                log("Browser STT started");
+            };
+
+            recognition.onresult = (event) => {
+                for (let i = event.resultIndex; i < event.results.length; i++) { // start at the index reported by the event
+                    const text = event.results[i][0].transcript.trim(); // first alternative only
+                    if (!text) continue;
+
+                    if (event.results[i].isFinal) {
+                        // Auto barge-in on final transcript
+                        autoBargeIn();
+
+                        transcriptEl.textContent = text;
+                        resetOutputPanels();
+
+                        if (ws && connected) {
+                            ws.send(JSON.stringify({ type: "transcript", text })); // only final transcripts are sent
+                            log(`→ Sent transcript: ${text}`);
+                        }
+                    } else {
+                        // Auto barge-in as soon as interim speech is detected
+                        autoBargeIn();
+                        transcriptEl.textContent = text + " …"; // interim text is shown locally and not sent
+                    }
+                }
+            };
+
+            recognition.onerror = (event) => {
+                log(`Browser STT error: ${event.error}`);
+                if (["not-allowed", "service-not-allowed", "audio-capture"].includes(event.error)) { // for these errors, stop the auto-restart
+                    micShouldRun = false;
+                    stopMicBtn.disabled = true;
+                    startMicBtn.disabled = !connected;
+                    setStatus("Mic permission error", "disconnected");
+                }
+            };
+
+            recognition.onend = () => {
+                if (micShouldRun && connected) { // recognition ended while the mic should still run: restart it
+                    setStatus("Listening (restarting)...", "listening");
+                    micRestartTimer = setTimeout(() => {
+                        try { recognition.start(); } catch { } // a throwing start() is ignored
+                    }, 250);
+                    return;
+                }
+                stopMicBtn.disabled = true;
+                startMicBtn.disabled = !connected;
+                if (connected) setStatus("Connected", "connected");
+                log("Browser STT stopped");
+            };
+        }
+
+        function startBrowserSTT() { // Starts browser speech recognition, or alerts when the API is missing.
+            if (!SpeechRecognition) {
+                alert("Web Speech API not supported. Use Chrome/Edge or switch to Server Whisper mode.");
+                return;
+            }
+            initBrowserRecognition(); // no-op after the first call
+            micShouldRun = true; // enables the auto-restart in recognition.onend
+            stopMicBtn.disabled = false;
+            startMicBtn.disabled = true;
+            try { recognition.start(); } catch { } // a throwing start() is ignored
+        }
+
+        function stopBrowserSTT() { // Stops recognition and prevents any further auto-restart.
+            micShouldRun = false; // cleared first so onend does not schedule a restart
+            if (micRestartTimer) clearTimeout(micRestartTimer); // cancel a restart that is already scheduled
+            if (recognition) recognition.stop();
+        }
+
+        // ══════════════════════════════════════════════════════════════════════
+        // ── Server-side Whisper with VAD auto-segmentation & auto barge-in ──
+        // ══════════════════════════════════════════════════════════════════════
+
+        /**
+         * Opens the mic and starts VAD polling.
+         * The mic stays open until user clicks Stop.
+         * Speech segments are detected/sent automatically.
+         */
+        async function startWhisperListening() {
+            try {
+                mediaStream = await navigator.mediaDevices.getUserMedia({ // asks for microphone access with the constraints below
+                    audio: {
+                        channelCount: 1,
+                        sampleRate: 16000,
+                        echoCancellation: true,
+                        noiseSuppression: true,
+                    }
+                });
+            } catch (err) {
+                log(`Mic permission failed: ${err?.message || err}`);
+                setStatus("Mic permission denied", "disconnected");
+                return; // nothing was started, so there is nothing to undo
+            }
+
+            whisperListening = true; // set before startViz and startVADPolling; their loops bail out while it is false
+            startMicBtn.disabled = true;
+            stopMicBtn.disabled = false;
+
+            setupAnalyser(mediaStream);
+            startViz();
+            startVADPolling();
+
+            setStatus("Listening (Whisper VAD)...", "listening");
+            log("Whisper VAD listening started — speak and it will auto-detect");
+        }
+
+        function stopWhisperListening() {
+            whisperListening = false;
+
+            // Stop VAD
+            stopVADPolling();
+
+            // If a segment is active, finish and send it
+            if (whisperSegmentActive) {
+                finishWhisperSegment();
+            }
+
+            // Release mic
+            if (mediaStream) {
+                mediaStream.getTracks().forEach(t => t.stop());
+                mediaStream = null;
+            }
+
+            teardownAnalyser();
+            stopViz();
+
+            startMicBtn.disabled = !connected;
+            stopMicBtn.disabled = true;
+            if (connected) setStatus("Connected", "connected");
+            log("Whisper VAD listening stopped");
+        }
+
+        /** (Re)start VAD polling: clears existing timers, then runs vadCheck every VAD_POLL_INTERVAL ms. */
+        function startVADPolling() {
+            stopVADPolling(); // clear timers left over from a previous start
+            vadPollTimer = setInterval(vadCheck, VAD_POLL_INTERVAL);
+        }
+
+        function stopVADPolling() { // cancels the poll interval and any pending silence timeout
+            if (vadPollTimer) { clearInterval(vadPollTimer); vadPollTimer = null; }
+            if (vadSilenceTimer) { clearTimeout(vadSilenceTimer); vadSilenceTimer = null; }
+        }
+
+        /** Core VAD logic — runs every VAD_POLL_INTERVAL ms */
+        function vadCheck() {
+            if (!whisperListening) return;
+
+            const rms = getCurrentRMS();
+            const isSpeech = rms > VAD_SPEECH_THRESHOLD;
+
+            if (isSpeech) {
+                // ── Speech detected ──
+                // Auto barge-in: if assistant is speaking, interrupt it
+                autoBargeIn();
+
+                if (!whisperSegmentActive) {
+                    // Start a new recording segment
+                    beginWhisperSegment();
+                }
+
+                // Reset silence timer — user is still talking
+                if (vadSilenceTimer) {
+                    clearTimeout(vadSilenceTimer);
+                    vadSilenceTimer = null;
+                }
+            } else if (whisperSegmentActive && !vadSilenceTimer) {
+                // ── Silence while recording → start countdown ──
+                vadSilenceTimer = setTimeout(() => {
+                    vadSilenceTimer = null;
+                    if (whisperSegmentActive) {
+                        finishWhisperSegment();
+                    }
+                }, VAD_SILENCE_TIMEOUT);
+            }
+        }
+
+        /** Begin recording a new speech segment */
+        function beginWhisperSegment() {
+            if (!mediaStream || whisperSegmentActive) return;
+
+            audioChunks = [];
+            whisperSegmentActive = true;
+            segmentStartTime = Date.now();
+
+            const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
+                ? "audio/webm;codecs=opus"
+                : MediaRecorder.isTypeSupported("audio/mp4")
+                    ? "audio/mp4"
+                    : "audio/webm";
+
+            mediaRecorder = new MediaRecorder(mediaStream, { mimeType });
+
+            mediaRecorder.ondataavailable = (event) => {
+                if (event.data.size > 0) audioChunks.push(event.data);
+            };
+
+            mediaRecorder.start(200);
+            setStatus("Speaking... (Whisper VAD)", "listening");
+            log("🎙️ Speech detected — recording segment");
+        }
+
+        /** Stop the current segment, encode, and send to server */
+        function finishWhisperSegment() {
+            if (!whisperSegmentActive || !mediaRecorder) return;
+
+            whisperSegmentActive = false;
+            const duration = Date.now() - segmentStartTime;
+
+            // Stop will trigger ondataavailable one last time then we process
+            mediaRecorder.onstop = async () => {
+                if (audioChunks.length === 0) return;
+
+                // Ignore very short segments (clicks, pops)
+                if (duration < VAD_MIN_SEGMENT_MS) {
+                    log(`Ignored short segment (${duration}ms)`);
+                    audioChunks = [];
+                    return;
+                }
+
+                const blob = new Blob(audioChunks, { type: mediaRecorder.mimeType });
+                audioChunks = [];
+
+                const arrayBuffer = await blob.arrayBuffer();
+                const uint8 = new Uint8Array(arrayBuffer);
+
+                // Base64 encode in chunks to avoid stack overflow
+                let binary = "";
+                const chunkSize = 8192;
+                for (let i = 0; i < uint8.length; i += chunkSize) {
+                    const slice = uint8.subarray(i, Math.min(i + chunkSize, uint8.length));
+                    binary += String.fromCharCode.apply(null, slice);
+                }
+                const base64 = btoa(binary);
+
+                resetOutputPanels();
+
+                if (ws && connected) {
+                    ws.send(JSON.stringify({ type: "audio", data: base64 }));
+                    log(`→ Sent audio segment (${(uint8.length / 1024).toFixed(1)} KB, ${duration}ms) for Whisper`);
+                    transcriptEl.textContent = "🎙️ Transcribing audio...";
+                }
+            };
+
+            try {
+                mediaRecorder.stop();
+            } catch (_) { }
+
+            if (whisperListening) {
+                setStatus("Listening (Whisper VAD)...", "listening");
+            }
+        }
+
+        // ── Unified Mic Controls ────────────────────────────────────────────
+        async function startMic() {
+            const mode = getInputMode();
+            if (mode === "browser-stt") {
+                startBrowserSTT();
+            } else {
+                await startWhisperListening();
+            }
+        }
+
+        function stopMic() {
+            const mode = getInputMode();
+            if (mode === "browser-stt") {
+                stopBrowserSTT();
+            } else {
+                stopWhisperListening();
+            }
+        }
+
+        function resetOutputPanels() {
+            assistantEl.textContent = "";
+            reasoningEl.textContent = "";
+            reasoningSection.classList.add("hidden");
+            toolsEl.innerHTML = "";
+            toolsSection.classList.add("hidden");
+            setBadge("idle", "idle");
+        }
+
+        // ── Text Input ──────────────────────────────────────────────────────
+        function sendTextMessage() {
+            const text = textInput.value.trim();
+            if (!text || !ws || !connected) return;
+
+            autoBargeIn(); // interrupt if assistant is speaking
+            transcriptEl.textContent = text;
+            resetOutputPanels();
+
+            ws.send(JSON.stringify({ type: "transcript", text }));
+            log(`→ Sent text: ${text}`);
+            textInput.value = "";
+        }
+
+        // ── Server Message Handler ──────────────────────────────────────────
+        function handleServerMessage(msg) {
+            switch (msg.type) {
+                // ── Stream lifecycle ────────────────────
+                case "stream_start":
+                    assistantEl.textContent = "";
+                    setBadge("streaming", "streaming");
+                    log("⏳ Stream started");
+                    break;
+
+                case "stream_finish":
+                    log(`✓ Stream finished (reason: ${msg.finishReason || "unknown"})`);
+                    break;
+
+                case "stream_error":
+                    log(`❌ Stream error: ${msg.error || "unknown"}`);
+                    setBadge("error", "idle");
+                    break;
+
+                case "stream_abort":
+                    log(`⚠️ Stream aborted: ${msg.reason || "unknown"}`);
+                    setBadge("aborted", "idle");
+                    break;
+
+                // ── Step lifecycle ──────────────────────
+                case "step_start":
+                    log("  → Step start");
+                    break;
+
+                case "step_finish":
+                    log(`  → Step finish (${msg.finishReason || ""})`);
+                    break;
+
+                // ── Text streaming ─────────────────────
+                case "text_start":
+                    break;
+
+                case "text_delta":
+                    assistantEl.textContent += msg.text || "";
+                    break;
+
+                case "text_end":
+                    break;
+
+                // ── Reasoning streaming ────────────────
+                case "reasoning_start":
+                    reasoningSection.classList.remove("hidden");
+                    reasoningEl.textContent = "";
+                    break;
+
+                case "reasoning_delta":
+                    reasoningSection.classList.remove("hidden");
+                    reasoningEl.textContent += msg.text || "";
+                    break;
+
+                case "reasoning_end":
+                    break;
+
+                // ── Tool streaming ─────────────────────
+                case "tool_input_start":
+                    toolsSection.classList.remove("hidden");
+                    toolsEl.innerHTML += `<div><strong>🛠️ ${msg.toolName || ""}</strong> `;
+                    break;
+
+                case "tool_input_delta":
+                    break;
+
+                case "tool_input_end":
+                    toolsEl.innerHTML += `</div>`;
+                    break;
+
+                case "tool_call":
+                    toolsSection.classList.remove("hidden");
+                    toolsEl.innerHTML += `<div>📞 <strong>${msg.toolName}</strong>(${JSON.stringify(msg.input || {})})</div>`;
+                    log(`🛠️ Tool call: ${msg.toolName}`);
+                    break;
+
+                case "tool_result":
+                    toolsSection.classList.remove("hidden");
+                    toolsEl.innerHTML += `<div>✅ ${msg.toolName}: ${JSON.stringify(msg.result || {})}</div>`;
+                    log(`🛠️ Tool result: ${msg.toolName}`);
+                    break;
+
+                case "tool_error":
+                    toolsSection.classList.remove("hidden");
+                    toolsEl.innerHTML += `<div>❌ ${msg.toolName}: ${msg.error}</div>`;
+                    log(`🛠️ Tool error: ${msg.toolName} — ${msg.error}`);
+                    break;
+
+                // ── Speech streaming audio ─────────────
+                case "speech_stream_start":
+                    log("🔊 Speech stream started");
+                    setBadge("speaking", "streaming");
+                    break;
+
+                case "speech_stream_end":
+                    log("🔊 Speech stream ended");
+                    if (audioQueue.length === 0 && !isPlaying) {
+                        setBadge("idle", "idle");
+                    }
+                    break;
+
+                case "audio_chunk": {
+                    const bytes = decodeBase64ToBytes(msg.data);
+                    audioQueue.push({ bytes, format: msg.format || "mp3" });
+                    log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "mp3"})`);
+                    playNextAudioChunk();
+                    break;
+                }
+
+                // Full audio (non-streaming fallback)
+                case "audio": {
+                    const bytes = decodeBase64ToBytes(msg.data);
+                    audioQueue.push({ bytes, format: msg.format || "mp3" });
+                    log(`🔊 Full audio (${bytes.length} bytes)`);
+                    playNextAudioChunk();
+                    break;
+                }
+
+                // ── Speech interruption (barge-in) ─────
+                case "speech_interrupted":
+                    stopAudioPlayback();
+                    log(`⏸️ Speech interrupted: ${msg.reason || "unknown"}`);
+                    setBadge("interrupted", "idle");
+                    if (connected) {
+                        setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected",
+                                  whisperListening || micShouldRun ? "listening" : "connected");
+                    }
+                    break;
+
+                // ── Response complete ──────────────────
+                case "response_complete":
+                    setBadge("done", "idle");
+                    log(`✅ Response complete (${(msg.text || "").length} chars)`);
+                    break;
+
+                // ── Sources / files ────────────────────
+                case "source":
+                    log(`📎 Source: ${JSON.stringify(msg.source || {})}`);
+                    break;
+
+                case "file":
+                    log(`📄 File received`);
+                    break;
+
+                default:
+                    break;
+            }
+        }
+
+        // ── WebSocket Connection ────────────────────────────────────────────
+        function connect() {
+            const endpoint = endpointEl.value.trim();
+            if (!endpoint) return;
+
+            setStatus("Connecting...", "disconnected");
+            ws = new WebSocket(endpoint);
+
+            ws.onopen = () => {
+                setStatus("Connected", "connected");
+                setConnectedUI(true);
+                log(`✓ Connected to ${endpoint}`);
+            };
+
+            ws.onclose = () => {
+                setStatus("Disconnected", "disconnected");
+                setConnectedUI(false);
+                stopMic();
+                stopAudioPlayback();
+                log("✗ Disconnected");
+            };
+
+            ws.onerror = () => {
+                log("❌ WebSocket error");
+            };
+
+            ws.onmessage = (event) => {
+                try {
+                    const msg = JSON.parse(event.data);
+                    handleServerMessage(msg);
+                } catch {
+                    log("Received non-JSON message");
+                }
+            };
+        }
+
+        function disconnect() {
+            stopMic();
+            stopAudioPlayback();
+            if (ws) {
+                ws.close();
+                ws = null;
+            }
+        }
+
+        function interrupt() { // manual interrupt: stop local playback, then notify the server
+            if (!ws || !connected) return;
+            stopAudioPlayback();
+            ws.send(JSON.stringify({ type: "interrupt", reason: "user_clicked_interrupt" }));
+            log("→ Sent interrupt");
+        }
+
+        // ── Event Listeners ─────────────────────────────────────────────────
+        connectBtn.addEventListener("click", connect);
+        disconnectBtn.addEventListener("click", disconnect);
+        startMicBtn.addEventListener("click", startMic);
+        stopMicBtn.addEventListener("click", stopMic);
+        interruptBtn.addEventListener("click", interrupt);
+        sendTextBtn.addEventListener("click", sendTextMessage);
+
+        textInput.addEventListener("keydown", (e) => {
+            if (e.key === "Enter" && !e.shiftKey) {
+                e.preventDefault();
+                sendTextMessage();
+            }
+        });
+
+        // Warn about unsupported features
+        if (!SpeechRecognition) {
+            log("⚠️ Web Speech API unavailable — use Server Whisper mode or Chrome/Edge.");
+            inputModeEl.value = "server-whisper";
+        }
+        if (!window.isSecureContext && location.hostname !== "localhost") {
+            log("⚠️ Mic may fail on non-secure origins. Use HTTPS or localhost.");
+        }
+    </script>
+</body>
+
+</html>
\ No newline at end of file
diff --git a/example/ws-server-2.ts b/example/ws-server-2.ts
new file mode 100644
index 0000000..9948a85
--- /dev/null
+++ b/example/ws-server-2.ts
@@ -0,0 +1,120 @@
+import "dotenv/config";
+import { WebSocketServer } from "ws";
+import { VoiceAgent } from "../src";
+import { tool } from "ai";
+import { z } from "zod";
+import { openai } from "@ai-sdk/openai";
+
+const endpoint = process.env.VOICE_WS_ENDPOINT || "ws://localhost:8080"; // env override, else local default
+const url = new URL(endpoint); // throws at startup if the endpoint is not a valid URL
+const port = Number(url.port || 8080); // url.port is "" when no explicit port is given
+const host = url.hostname || "localhost";
+
+// ── Tools (same as demo.ts) ────────────────────────────────────────────
+// Mock weather lookup: random integer temperature in 62–82 and a random condition.
+const weatherTool = tool({
+    description: "Get the weather in a location",
+    inputSchema: z.object({
+        location: z.string().describe("The location to get the weather for"),
+    }),
+    execute: async ({ location }) => {
+        const temperature = 62 + Math.floor(Math.random() * 21);
+        const options = ["sunny", "cloudy", "rainy", "partly cloudy"];
+        const conditions = options[Math.floor(Math.random() * options.length)];
+        return { location, temperature, conditions };
+    },
+});
+
+const timeTool = tool({ // reports the server's local time and resolved time zone
+    description: "Get the current time",
+    inputSchema: z.object({}),
+    execute: async () => {
+        const { timeZone } = Intl.DateTimeFormat().resolvedOptions();
+        return { time: new Date().toLocaleTimeString(), timezone: timeZone };
+    },
+});
+
+// ── WebSocket server ───────────────────────────────────────────────────
+const wss = new WebSocketServer({ port, host }); // port/host are parsed from `endpoint`
+
+wss.on("listening", () => { // logs once the server is accepting connections
+    console.log(`[ws-server] listening on ${endpoint}`);
+    console.log("[ws-server] Waiting for connections...\n");
+});
+
+wss.on("connection", (socket) => {
+    console.log("[ws-server] ✓ client connected");
+
+    // Each accepted connection gets its own VoiceAgent instance.
+    const agent = new VoiceAgent({
+        model: openai("gpt-4o"),
+        transcriptionModel: openai.transcription("whisper-1"),
+        speechModel: openai.speech("gpt-4o-mini-tts"),
+        instructions: `You are a helpful voice assistant.
+Keep responses concise and conversational since they will be spoken aloud.
+Use tools when needed to provide accurate information.`,
+        voice: "alloy",
+        speechInstructions: "Speak in a friendly, natural conversational tone.",
+        outputFormat: "mp3",
+        streamingSpeech: {
+            minChunkSize: 40,
+            maxChunkSize: 180,
+            parallelGeneration: true,
+            maxParallelRequests: 2,
+        },
+        tools: {
+            getWeather: weatherTool,
+            getTime: timeTool,
+        },
+    });
+
+    // Log-only listeners: every agent event below is just echoed to the server console.
+    agent.on("text", (msg: { role: string; text: string }) => {
+        const prefix = msg.role === "user" ? "👤 User" : "🤖 Assistant";
+        console.log(`[ws-server] ${prefix}: ${msg.text}`);
+    });
+
+    agent.on("chunk:text_delta", ({ text }: { text: string }) => {
+        process.stdout.write(text); // no newline, so deltas print as one continuous line
+    });
+
+    agent.on("chunk:tool_call", ({ toolName }: { toolName: string }) => {
+        console.log(`\n[ws-server] 🛠️  Tool call: ${toolName}`);
+    });
+
+    agent.on("tool_result", ({ name, result }: { name: string; result: unknown }) => { // NOTE(review): reads `name` here but `toolName` on chunk:tool_call — confirm against VoiceAgent
+        console.log(`[ws-server] 🛠️  Tool result (${name}):`, JSON.stringify(result));
+    });
+
+    agent.on("speech_start", () => console.log("[ws-server] 🔊 Speech started"));
+    agent.on("speech_complete", () => console.log("[ws-server] 🔊 Speech complete"));
+    agent.on("speech_interrupted", ({ reason }: { reason: string }) =>
+        console.log(`[ws-server] ⏸️  Speech interrupted: ${reason}`),
+    );
+
+    agent.on("audio_chunk", ({ chunkId, format, uint8Array }: { chunkId: number; format: string; uint8Array: Uint8Array }) => {
+        console.log(`[ws-server] 🔊 Audio chunk #${chunkId}: ${uint8Array.length} bytes (${format})`);
+    });
+
+    agent.on("error", (err: Error) => console.error("[ws-server] ❌ Error:", err.message));
+
+    agent.on("disconnected", () => {
+        console.log("[ws-server] ✗ client disconnected\n");
+    });
+
+    // Hand the accepted socket to the agent – this is the key line.
+    // From here the agent handles incoming "transcript", "audio" and "interrupt" messages
+    // and replies with "text_delta", "audio_chunk", "response_complete", etc.
+    agent.handleSocket(socket);
+});
+
+// Graceful shutdown on SIGINT (Ctrl+C).
+process.on("SIGINT", () => {
+    console.log("\n[ws-server] Shutting down...");
+    // ws >= 8: wss.close() no longer closes live client sockets, so close them
+    // explicitly; otherwise the close callback waits while a client is connected.
+    wss.clients.forEach((client) => client.close(1001, "Server shutting down"));
+    wss.close(() => { console.log("[ws-server] Server closed"); process.exit(0); });
+});
+
+export { wss };
diff --git a/example/ws-server.ts b/example/ws-server.ts
index aea5bd6..f648c61 100644
--- a/example/ws-server.ts
+++ b/example/ws-server.ts
@@ -1,140 +1,131 @@
 import "dotenv/config";
-import { WebSocketServer, WebSocket } from "ws";
-import { readFile } from "fs/promises";
-import { existsSync } from "fs";
+import { WebSocketServer } from "ws";
+import { VoiceAgent } from "../src";
+import { tool } from "ai";
+import { z } from "zod";
+import { openai } from "@ai-sdk/openai";
 
 const endpoint = process.env.VOICE_WS_ENDPOINT || "ws://localhost:8080";
 const url = new URL(endpoint);
 const port = Number(url.port || 8080);
 const host = url.hostname || "localhost";
 
-// Message types for type safety
-interface BaseMessage {
-    type: string;
-}
+// ── Tools (same as demo.ts) ────────────────────────────────────────────
+const weatherTool = tool({
+    description: "Get the weather in a location",
+    inputSchema: z.object({
+        location: z.string().describe("The location to get the weather for"),
+    }),
+    execute: async ({ location }) => ({
+        location,
+        temperature: 72 + Math.floor(Math.random() * 21) - 10,
+        conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][
+            Math.floor(Math.random() * 4)
+        ],
+    }),
+});
 
-interface TextDeltaMessage extends BaseMessage {
-    type: "text_delta";
-    text: string;
-}
-
-interface ToolCallMessage extends BaseMessage {
-    type: "tool_call";
-    toolName: string;
-    toolCallId: string;
-    input: unknown;
-}
-
-interface ToolResultMessage extends BaseMessage {
-    type: "tool_result";
-    toolName: string;
-    toolCallId: string;
-    result: unknown;
-}
-
-interface AudioMessage extends BaseMessage {
-    type: "audio";
-    data: string; // base64 encoded
-    format: string;
-}
-
-interface ResponseCompleteMessage extends BaseMessage {
-    type: "response_complete";
-    text: string;
-    toolCalls: Array<{ toolName: string; toolCallId: string; input: unknown }>;
-    toolResults: Array<{ toolName: string; toolCallId: string; output: unknown }>;
-}
-
-type AgentMessage =
-    | TextDeltaMessage
-    | ToolCallMessage
-    | ToolResultMessage
-    | AudioMessage
-    | ResponseCompleteMessage;
+const timeTool = tool({
+    description: "Get the current time",
+    inputSchema: z.object({}),
+    execute: async () => ({
+        time: new Date().toLocaleTimeString(),
+        timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
+    }),
+});
 
+// ── WebSocket server ───────────────────────────────────────────────────
 const wss = new WebSocketServer({ port, host });
 
 wss.on("listening", () => {
-    console.log(`[ws-server] 🚀 listening on ${endpoint}`);
+    console.log(`[ws-server] listening on ${endpoint}`);
     console.log("[ws-server] Waiting for connections...\n");
 });
 
-wss.on("connection", (socket: WebSocket) => {
+wss.on("connection", (socket) => {
     console.log("[ws-server] ✓ client connected");
 
-    let streamingText = "";
-    let audioChunks: Buffer[] = [];
-
-    // Send a sample transcript to test text pipeline end-to-end.
-    setTimeout(() => {
-        console.log("[ws-server] -> Sending test transcript...");
-        socket.send(
-            JSON.stringify({
-                type: "transcript",
-                text: "What is the weather in Berlin?",
-            }),
-        );
-    }, 500);
-
-    socket.on("message", async (data) => {
-        try {
-            const msg = JSON.parse(data.toString()) as AgentMessage;
-
-            switch (msg.type) {
-                case "text_delta":
-                    // Real-time streaming text from the agent
-                    streamingText += msg.text;
-                    process.stdout.write(msg.text);
-                    break;
-
-                case "tool_call":
-                    console.log(`\n[ws-server] 🛠️ Tool call: ${msg.toolName}`);
-                    console.log(`           Input: ${JSON.stringify(msg.input)}`);
-                    break;
-
-                case "tool_result":
-                    console.log(`[ws-server] 🛠️ Tool result: ${msg.toolName}`);
-                    console.log(`           Result: ${JSON.stringify(msg.result)}`);
-                    break;
-
-                case "audio":
-                    // Handle audio response from TTS
-                    const audioBuffer = Buffer.from(msg.data, "base64");
-                    audioChunks.push(audioBuffer);
-                    console.log(
-                        `[ws-server] 🔊 Received audio: ${audioBuffer.length} bytes (${msg.format})`,
-                    );
-
-                    // Optionally save audio to file for testing
-                    // await writeFile(`output_${Date.now()}.${msg.format}`, audioBuffer);
-                    break;
-
-                case "response_complete":
-                    console.log("\n[ws-server] ✅ Response complete");
-                    console.log(`           Text length: ${msg.text.length}`);
-                    console.log(`           Tool calls: ${msg.toolCalls.length}`);
-                    console.log(`           Tool results: ${msg.toolResults.length}`);
-
-                    // Reset for next response
-                    streamingText = "";
-                    audioChunks = [];
-                    break;
-
-                default:
-                    console.log("[ws-server] <- Unknown message:", msg);
-            }
-        } catch {
-            console.log("[ws-server] <- raw", data.toString().substring(0, 100));
-        }
+    // Create a fresh VoiceAgent per connection
+    const agent = new VoiceAgent({
+        model: openai("gpt-4o"),
+        transcriptionModel: openai.transcription("whisper-1"),
+        speechModel: openai.speech("gpt-4o-mini-tts"),
+        instructions: `You are a helpful voice assistant.
+Keep responses concise and conversational since they will be spoken aloud.
+Use tools when needed to provide accurate information.`,
+        voice: "alloy",
+        speechInstructions: "Speak in a friendly, natural conversational tone.",
+        outputFormat: "mp3",
+        streamingSpeech: {
+            minChunkSize: 40,
+            maxChunkSize: 180,
+            parallelGeneration: true,
+            maxParallelRequests: 2,
+        },
+        tools: {
+            getWeather: weatherTool,
+            getTime: timeTool,
+        },
     });
 
-    socket.on("close", () => {
+    // Wire agent events to server logs
+    agent.on("text", (msg: { role: string; text: string }) => {
+        const prefix = msg.role === "user" ? "👤 User" : "🤖 Assistant";
+        console.log(`[ws-server] ${prefix}: ${msg.text}`);
+    });
+
+    agent.on("chunk:text_delta", ({ text }: { text: string }) => {
+        process.stdout.write(text);
+    });
+
+    agent.on("chunk:tool_call", ({ toolName }: { toolName: string }) => {
+        console.log(`\n[ws-server] 🛠️  Tool call: ${toolName}`);
+    });
+
+    agent.on("tool_result", ({ name, result }: { name: string; result: unknown }) => {
+        console.log(`[ws-server] 🛠️  Tool result (${name}):`, JSON.stringify(result));
+    });
+
+    agent.on("speech_start", () => console.log("[ws-server] 🔊 Speech started"));
+    agent.on("speech_complete", () => console.log("[ws-server] 🔊 Speech complete"));
+    agent.on("speech_interrupted", ({ reason }: { reason: string }) =>
+        console.log(`[ws-server] ⏸️  Speech interrupted: ${reason}`),
+    );
+
+    agent.on("audio_chunk", ({ chunkId, format, uint8Array }: { chunkId: number; format: string; uint8Array: Uint8Array }) => {
+        console.log(`[ws-server] 🔊 Audio chunk #${chunkId}: ${uint8Array.length} bytes (${format})`);
+    });
+
+    agent.on("transcription", ({ text, language }: { text: string; language?: string }) => {
+        console.log(`[ws-server] 📝 Transcription (${language || "unknown"}): ${text}`);
+    });
+
+    agent.on("audio_received", ({ size }: { size: number }) => {
+        console.log(`[ws-server] 🎤 Audio received: ${(size / 1024).toFixed(1)} KB`);
+    });
+
+    agent.on("chunk:reasoning_delta", ({ text }: { text: string }) => {
+        process.stdout.write(text);
+    });
+
+    agent.on("warning", (msg: string) => {
+        console.log(`[ws-server] ⚠️  Warning: ${msg}`);
+    });
+
+    agent.on("speech_chunk_queued", ({ id, text }: { id: number; text: string }) => {
+        console.log(`[ws-server] 🔊 Queued speech chunk #${id}: ${text.substring(0, 50)}...`);
+    });
+
+    agent.on("error", (err: Error) => console.error("[ws-server] ❌ Error:", err.message));
+
+    agent.on("disconnected", () => {
         console.log("[ws-server] ✗ client disconnected\n");
     });
 
-    socket.on("error", (error) => {
-        console.error("[ws-server] Error:", error.message);
-    });
+    // Hand the accepted socket to the agent – this is the key line.
+    // The agent will listen for "transcript", "audio", "interrupt" messages
+    // and send back "text_delta", "audio_chunk", "response_complete", etc.
+    agent.handleSocket(socket);
 });
 
 // Graceful shutdown
@@ -146,24 +137,4 @@ process.on("SIGINT", () => {
     });
 });
 
-// Helper function to simulate sending audio to the agent
-async function simulateAudioInput(socket: WebSocket, audioPath: string) {
-    if (!existsSync(audioPath)) {
-        console.log(`[ws-server] Audio file not found: ${audioPath}`);
-        return;
-    }
-
-    const audioBuffer = await readFile(audioPath);
-    const base64Audio = audioBuffer.toString("base64");
-
-    console.log(`[ws-server] -> Sending audio: ${audioPath} (${audioBuffer.length} bytes)`);
-    socket.send(
-        JSON.stringify({
-            type: "audio",
-            data: base64Audio,
-        }),
-    );
-}
-
-// Export for use as a module
-export { wss, simulateAudioInput };
+export { wss };
diff --git a/src/VoiceAgent.ts b/src/VoiceAgent.ts
index 2147969..b447e66 100644
--- a/src/VoiceAgent.ts
+++ b/src/VoiceAgent.ts
@@ -12,6 +12,29 @@ import {
   type SpeechModel,
 } from "ai";
 
+/**
+ * Represents a chunk of text to be converted to speech
+ */
+interface SpeechChunk {
+  id: number; // NOTE(review): presumably allocated from VoiceAgent.nextChunkId — confirm
+  text: string; // the text to synthesize for this chunk
+  audioPromise?: Promise<Uint8Array | null>; // NOTE(review): presumably the pending TTS result, null when no audio was produced — confirm
+}
+
+/**
+ * Configuration for streaming speech behavior (passed partially via VoiceAgentOptions.streamingSpeech)
+ */
+interface StreamingSpeechConfig {
+  /** Minimum characters before generating speech for a chunk (VoiceAgent default: 50) */
+  minChunkSize: number;
+  /** Maximum characters per chunk; will split at sentence boundary before this (VoiceAgent default: 200) */
+  maxChunkSize: number;
+  /** Whether to enable parallel TTS generation (VoiceAgent default: true) */
+  parallelGeneration: boolean;
+  /** Maximum number of parallel TTS requests (VoiceAgent default: 3) */
+  maxParallelRequests: number;
+}
+
 export interface VoiceAgentOptions {
   model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
   transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1'))
@@ -23,6 +46,8 @@ export interface VoiceAgentOptions {
   voice?: string; // Voice for TTS (e.g., 'alloy', 'echo', 'shimmer')
   speechInstructions?: string; // Instructions for TTS voice style
   outputFormat?: string; // Audio output format (e.g., 'mp3', 'opus', 'wav')
+  /** Configuration for streaming speech generation */
+  streamingSpeech?: Partial<StreamingSpeechConfig>;
 }
 
 export class VoiceAgent extends EventEmitter {
@@ -41,6 +66,14 @@ export class VoiceAgent extends EventEmitter {
   private outputFormat: string;
   private isProcessing = false;
 
+  // Streaming speech state
+  private streamingSpeechConfig: StreamingSpeechConfig;
+  private currentSpeechAbortController?: AbortController;
+  private speechChunkQueue: SpeechChunk[] = [];
+  private nextChunkId = 0;
+  private isSpeaking = false;
+  private pendingTextBuffer = "";
+
   constructor(options: VoiceAgentOptions) {
     super();
     this.model = options.model;
@@ -56,6 +89,15 @@ export class VoiceAgent extends EventEmitter {
     if (options.tools) {
       this.tools = { ...options.tools };
     }
+
+    // Initialize streaming speech config with defaults
+    this.streamingSpeechConfig = {
+      minChunkSize: 50,
+      maxChunkSize: 200,
+      parallelGeneration: true,
+      maxParallelRequests: 3,
+      ...options.streamingSpeech,
+    };
   }
 
   private setupListeners() {
@@ -67,12 +109,24 @@ export class VoiceAgent extends EventEmitter {
 
         // Handle transcribed text from the client/STT
         if (message.type === "transcript") {
+          // Interrupt ongoing speech when user starts speaking (barge-in)
+          if (this.isSpeaking) {
+            this.interruptSpeech("user_speaking");
+          }
           await this.processUserInput(message.text);
         }
         // Handle raw audio data that needs transcription
         if (message.type === "audio") {
+          // Interrupt ongoing speech when user starts speaking (barge-in)
+          if (this.isSpeaking) {
+            this.interruptSpeech("user_speaking");
+          }
           await this.processAudioInput(message.data);
         }
+        // Handle explicit interrupt request from client
+        if (message.type === "interrupt") {
+          this.interruptSpeech(message.reason || "client_request");
+        }
       } catch (err) {
         console.error("Failed to process message:", err);
         this.emit("error", err);
@@ -118,8 +172,12 @@ export class VoiceAgent extends EventEmitter {
 
   /**
    * Generate speech from text using the configured speech model
+   * @param abortSignal Optional signal to cancel the speech generation
    */
-  public async generateSpeechFromText(text: string): Promise<Uint8Array> {
+  public async generateSpeechFromText(
+    text: string,
+    abortSignal?: AbortSignal
+  ): Promise<Uint8Array> {
     if (!this.speechModel) {
       throw new Error("Speech model not configured");
     }
@@ -130,11 +188,246 @@ export class VoiceAgent extends EventEmitter {
       voice: this.voice,
       instructions: this.speechInstructions,
       outputFormat: this.outputFormat,
+      abortSignal,
     });
 
     return result.audio.uint8Array;
   }
 
+  /**
+   * Barge-in: abort pending TTS, drop queued and buffered text, notify the client. No-op when idle.
+   */
+  public interruptSpeech(reason: string = "interrupted"): void {
+    if (!this.isSpeaking && this.speechChunkQueue.length === 0) {
+      return;
+    }
+
+    // Abort any pending speech generation requests
+    if (this.currentSpeechAbortController) {
+      this.currentSpeechAbortController.abort();
+      this.currentSpeechAbortController = undefined;
+    }
+
+    // Drop queued chunks and buffered text (NOTE(review): text streaming itself is not stopped here — confirm)
+    this.speechChunkQueue = [];
+    this.pendingTextBuffer = "";
+    this.isSpeaking = false;
+
+    // Notify clients to stop audio playback
+    this.sendWebSocketMessage({
+      type: "speech_interrupted",
+      reason,
+    });
+
+    this.emit("speech_interrupted", { reason });
+  }
+
+  /**
+   * Extract complete sentences from a text buffer that may still be growing.
+   * Sentences shorter than minChunkSize are merged with a neighbouring one.
+   * Returns [extractedSentences, remainingBuffer]
+   */
+  private extractSentences(text: string): [string[], string] {
+    const sentences: string[] = [];
+
+    // A sentence ends at . ! ? followed by whitespace. End of buffer does not
+    // count: mid-stream it can fall inside "3.14"; the caller flushes the tail.
+    const sentenceEndPattern = /[.!?]+\s+/g;
+    let lastIndex = 0;
+    let match;
+
+    while ((match = sentenceEndPattern.exec(text)) !== null) {
+      const sentence = text.slice(lastIndex, match.index + match[0].length).trim();
+      if (sentence.length >= this.streamingSpeechConfig.minChunkSize) {
+        sentences.push(sentence);
+        lastIndex = match.index + match[0].length;
+      } else if (sentences.length > 0) {
+        // Append short sentence to previous one
+        sentences[sentences.length - 1] += " " + sentence;
+        lastIndex = match.index + match[0].length;
+      }
+    }
+
+    let remaining = text.slice(lastIndex);
+
+    // If remaining text is too long, force split at the first clause boundary past minChunkSize
+    if (remaining.length > this.streamingSpeechConfig.maxChunkSize) {
+      const clausePattern = /[,;:]\s+/g;
+      let clauseMatch;
+      let splitIndex = 0;
+
+      while ((clauseMatch = clausePattern.exec(remaining)) !== null) {
+        if (clauseMatch.index >= this.streamingSpeechConfig.minChunkSize) {
+          splitIndex = clauseMatch.index + clauseMatch[0].length;
+          break;
+        }
+      }
+
+      if (splitIndex > 0) {
+        sentences.push(remaining.slice(0, splitIndex).trim());
+        remaining = remaining.slice(splitIndex);
+      }
+    }
+
+    return [sentences, remaining];
+  }
+
+  /**
+   * Queue a text chunk for speech generation and start draining the queue if idle
+   */
+  private queueSpeechChunk(text: string): void {
+    if (!this.speechModel || !text.trim()) return;
+
+    const chunk: SpeechChunk = {
+      id: this.nextChunkId++,
+      text: text.trim(),
+    };
+
+    // Start generating audio immediately (parallel generation)
+    if (this.streamingSpeechConfig.parallelGeneration) {
+      const activeRequests = this.speechChunkQueue.filter(c => c.audioPromise).length;
+
+      if (activeRequests < this.streamingSpeechConfig.maxParallelRequests) {
+        chunk.audioPromise = this.generateChunkAudio(chunk);
+      }
+    }
+
+    this.speechChunkQueue.push(chunk);
+    this.emit("speech_chunk_queued", { id: chunk.id, text: chunk.text });
+
+    // Start draining if idle. Nothing awaits this promise, so handle rejection here
+    if (!this.isSpeaking) {
+      this.processSpeechQueue().catch(err => console.error("Speech queue failed:", err));
+    }
+  }
+
+  /**
+   * Generate audio for one chunk; resolves to null if aborted or if synthesis fails (failure is logged and emitted)
+   */
+  private async generateChunkAudio(chunk: SpeechChunk): Promise<Uint8Array | null> {
+    if (!this.currentSpeechAbortController) {
+      this.currentSpeechAbortController = new AbortController();
+    }
+
+    try {
+      const audioData = await this.generateSpeechFromText(
+        chunk.text,
+        this.currentSpeechAbortController.signal
+      );
+      return audioData;
+    } catch (error) {
+      if ((error as Error).name === "AbortError") {
+        return null; // Cancelled, don't report as error
+      }
+      console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
+      this.emit("error", error);
+      return null;
+    }
+  }
+
+  /**
+   * Drain the speech queue in order: await each chunk's audio, send it over the
+   * WebSocket and emit it locally. Stops early once interruptSpeech() has run.
+   */
+  private async processSpeechQueue(): Promise<void> {
+    if (this.isSpeaking) return;
+    this.isSpeaking = true;
+    // interruptSpeech() swaps in a new queue array, so array identity tells this
+    // run apart from a newer run started while it was still awaiting audio.
+    const queue = this.speechChunkQueue;
+
+    this.emit("speech_start", { streaming: true });
+    this.sendWebSocketMessage({ type: "speech_stream_start" });
+
+    try {
+      while (this.speechChunkQueue.length > 0) {
+        const chunk = this.speechChunkQueue[0];
+        // Ensure audio generation has started
+        if (!chunk.audioPromise) {
+          chunk.audioPromise = this.generateChunkAudio(chunk);
+        }
+        // Wait for this chunk's audio
+        const audioData = await chunk.audioPromise;
+
+        // Stop if interrupted while waiting. isSpeaking alone is not enough:
+        // a newer run may already have set it back to true.
+        if (!this.isSpeaking || this.speechChunkQueue !== queue) break;
+
+        // Remove from queue after processing
+        this.speechChunkQueue.shift();
+
+        if (audioData) {
+          const base64Audio = Buffer.from(audioData).toString("base64");
+          // Send audio chunk via WebSocket
+          this.sendWebSocketMessage({
+            type: "audio_chunk",
+            chunkId: chunk.id,
+            data: base64Audio,
+            format: this.outputFormat,
+            text: chunk.text,
+          });
+          // Emit for local handling
+          this.emit("audio_chunk", {
+            chunkId: chunk.id,
+            data: base64Audio,
+            format: this.outputFormat,
+            text: chunk.text,
+            uint8Array: audioData,
+          });
+        }
+
+        // Start generating next chunks in parallel, up to the configured limit
+        if (this.streamingSpeechConfig.parallelGeneration) {
+          let started = this.speechChunkQueue.filter(c => c.audioPromise).length;
+          for (const next of this.speechChunkQueue) {
+            if (started >= this.streamingSpeechConfig.maxParallelRequests) break;
+            if (!next.audioPromise) {
+              next.audioPromise = this.generateChunkAudio(next);
+              started++;
+            }
+          }
+        }
+      }
+    } finally {
+      // Tear down only if no newer run has taken over after an interrupt;
+      // otherwise we would clobber its state and end its stream early.
+      if (this.speechChunkQueue === queue || !this.isSpeaking) {
+        this.isSpeaking = false;
+        this.currentSpeechAbortController = undefined;
+        this.sendWebSocketMessage({ type: "speech_stream_end" });
+        this.emit("speech_complete", { streaming: true });
+      }
+    }
+  }
+
+  /**
+   * Buffer an incoming text delta and queue any complete sentences for speech
+   * Call this as text chunks arrive from LLM; no-op without a speech model
+   */
+  private processTextForStreamingSpeech(textDelta: string): void {
+    if (!this.speechModel) return;
+
+    this.pendingTextBuffer += textDelta;
+
+    const [sentences, remaining] = this.extractSentences(this.pendingTextBuffer);
+    this.pendingTextBuffer = remaining;
+
+    for (const sentence of sentences) {
+      this.queueSpeechChunk(sentence);
+    }
+  }
+
+  /**
+   * Queue whatever text is still buffered as a final speech chunk and clear it.
+   * Call this when stream ends; no-op without a speech model or buffered text.
+   */
+  private flushStreamingSpeech(): void {
+    if (!this.speechModel || !this.pendingTextBuffer.trim()) return;
+
+    this.queueSpeechChunk(this.pendingTextBuffer);
+    this.pendingTextBuffer = "";
+  }
+
   /**
    * Process incoming audio data: transcribe and generate response
    */
@@ -182,6 +475,18 @@ export class VoiceAgent extends EventEmitter {
     });
   }
 
+  /**
+   * Attach an existing WebSocket (server-side usage), e.g. one accepted by a
+   * WS server, so the agent handles messages on it. Emits "connected".
+   * NOTE(review): isConnected is set immediately — assumes the socket is open.
+   */
+  public handleSocket(socket: WebSocket): void {
+    this.socket = socket;
+    this.isConnected = true;
+    this.setupListeners();
+    this.emit("connected");
+  }
+
   /**
    * Send text input for processing (bypasses transcription)
    */
@@ -366,6 +671,8 @@ export class VoiceAgent extends EventEmitter {
 
           case "text-delta":
             fullText += part.text;
+            // Process text for streaming speech as it arrives
+            this.processTextForStreamingSpeech(part.text);
             this.sendWebSocketMessage({
               type: "text_delta",
               id: part.id,
@@ -374,6 +681,8 @@ export class VoiceAgent extends EventEmitter {
             break;
 
           case "text-end":
+            // Flush any remaining text to speech when text stream ends
+            this.flushStreamingSpeech();
             this.sendWebSocketMessage({ type: "text_end", id: part.id });
             break;
 
@@ -478,9 +787,13 @@ export class VoiceAgent extends EventEmitter {
         this.conversationHistory.push({ role: "assistant", content: fullText });
       }
 
-      // Generate speech from the response if speech model is configured
-      if (this.speechModel && fullText) {
-        await this.generateAndSendSpeech(fullText);
+      // Ensure any remaining speech is flushed (in case text-end wasn't triggered)
+      this.flushStreamingSpeech();
+
+      // Wait for all speech chunks to complete before signaling response complete
+      // This ensures audio playback can finish
+      while (this.speechChunkQueue.length > 0 || this.isSpeaking) {
+        await new Promise(resolve => setTimeout(resolve, 100));
       }
 
       // Send the complete response
@@ -501,13 +814,14 @@ export class VoiceAgent extends EventEmitter {
   }
 
   /**
-   * Generate speech and send audio via WebSocket
+   * Generate speech for full text at once (non-streaming fallback)
+   * Useful when you want to bypass streaming speech for short responses
    */
-  private async generateAndSendSpeech(text: string): Promise<void> {
+  public async generateAndSendSpeechFull(text: string): Promise<void> {
     if (!this.speechModel) return;
 
     try {
-      this.emit("speech_start", { text });
+      this.emit("speech_start", { text, streaming: false });
 
       const audioData = await this.generateSpeechFromText(text);
       const base64Audio = Buffer.from(audioData).toString("base64");
@@ -526,7 +840,7 @@ export class VoiceAgent extends EventEmitter {
         uint8Array: audioData,
       });
 
-      this.emit("speech_complete", { text });
+      this.emit("speech_complete", { text, streaming: false });
     } catch (error) {
       console.error("Failed to generate speech:", error);
       this.emit("error", error);
@@ -604,4 +918,18 @@ export class VoiceAgent extends EventEmitter {
   get processing(): boolean {
     return this.isProcessing;
   }
+
+  /**
+   * True while the speech queue is being drained and sent; does not track client-side playback
+   */
+  get speaking(): boolean {
+    return this.isSpeaking;
+  }
+
+  /**
+   * Number of chunks still in the speech queue, including the one currently awaited
+   */
+  get pendingSpeechChunks(): number {
+    return this.speechChunkQueue.length;
+  }
 }