From 637d57fb41907cfbba232fd8b0f07f4b65cd2d27 Mon Sep 17 00:00:00 2001 From: Bijit Mondal Date: Sat, 14 Feb 2026 14:20:07 +0530 Subject: [PATCH] feat: add serve-client and start-test-environment scripts, enhance voice-client with debugging info --- example/serve-client.js | 29 ++++++ example/start-test-environment.sh | 26 +++++ example/voice-client.html | 167 ++++++++++++++++++++++++------ example/ws-server.ts | 55 +++++++++- package.json | 3 +- src/VoiceAgent.ts | 103 +++++++++++++++--- 6 files changed, 330 insertions(+), 53 deletions(-) create mode 100644 example/serve-client.js create mode 100755 example/start-test-environment.sh diff --git a/example/serve-client.js b/example/serve-client.js new file mode 100644 index 0000000..c8b1651 --- /dev/null +++ b/example/serve-client.js @@ -0,0 +1,29 @@ +const http = require('http'); +const fs = require('fs'); +const path = require('path'); + +const PORT = 3000; + +// Create a simple HTTP server to serve the voice client HTML +const server = http.createServer((req, res) => { + if (req.url === '/' || req.url === '/index.html') { + const htmlPath = path.join(__dirname, 'voice-client.html'); + fs.readFile(htmlPath, (err, data) => { + if (err) { + res.writeHead(500); + res.end('Error loading voice-client.html'); + return; + } + res.writeHead(200, {'Content-Type': 'text/html'}); + res.end(data); + }); + } else { + res.writeHead(404); + res.end('Not found'); + } +}); + +server.listen(PORT, () => { + console.log(`Voice client available at: http://localhost:${PORT}`); + console.log(`Make sure to also start the WebSocket server with: npm run ws:server`); +}); \ No newline at end of file diff --git a/example/start-test-environment.sh b/example/start-test-environment.sh new file mode 100755 index 0000000..5ed0c57 --- /dev/null +++ b/example/start-test-environment.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Kill any previously running servers +echo "Cleaning up any existing processes..." +pkill -f "node.*example/serve-client.js" || true +pkill -f "tsx.*example/ws-server.ts" || true + +echo "Starting WebSocket server..." +npm run ws:server & +WS_SERVER_PID=$! + +# Sleep to ensure WebSocket server has time to start +sleep 2 + +echo "Starting web client server..." +npm run client & +CLIENT_SERVER_PID=$! + +echo "✅ Test environment started!" +echo "📱 Open http://localhost:3000 in your browser" +echo "" +echo "Press Ctrl+C to shut down both servers" + +# Wait for user to Ctrl+C +trap "kill $WS_SERVER_PID $CLIENT_SERVER_PID; echo 'Servers stopped'; exit" INT +wait \ No newline at end of file diff --git a/example/voice-client.html b/example/voice-client.html index 703b790..a5673b9 100644 --- a/example/voice-client.html +++ b/example/voice-client.html @@ -263,6 +263,8 @@
        <div id="status" class="status disconnected">Disconnected</div>
+       <div class="debug-hint">Debug: Check browser console (F12) for detailed logs</div>
+
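For reference, the hunks below settle on a small JSON protocol over the WebSocket. A sketch of the message shapes, with field names taken directly from the hunks themselves (the interface names are illustrative only, not part of the codebase):

    // Client → server: one recorded speech segment for Whisper.
    interface AudioMessage {
      type: "audio";
      data: string;       // base64-encoded recording
      format: string;     // MediaRecorder mimeType, e.g. "audio/webm;codecs=opus"
      sampleRate: number; // 16000, matching the getUserMedia constraint
      duration: number;   // segment length in ms
    }

    // Server → client: transcription feedback introduced by this patch.
    interface TranscriptionResult { type: "transcription_result"; text: string; language?: string; }
    interface TranscriptionError  { type: "transcription_error"; error: string; }

    // Server → client: one TTS chunk (default format is now "mp3").
    interface AudioChunkMessage { type: "audio_chunk"; chunkId?: number; data: string; format: string; }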
@@ -440,6 +442,7 @@ setStatus("Playing audio...", "speaking"); const { bytes, format } = audioQueue.shift(); + console.log(`Playing audio chunk: ${bytes.length} bytes, format: ${format}`); try { const ctx = getAudioContext(); @@ -449,7 +452,10 @@ ); try { + console.log(`Attempting to decode audio with WebAudio API...`); const audioBuffer = await ctx.decodeAudioData(arrayBuffer.slice(0)); + console.log(`Decoded audio successfully: ${audioBuffer.duration.toFixed(2)}s, ${audioBuffer.numberOfChannels} channels, ${audioBuffer.sampleRate}Hz`); + await new Promise((resolve) => { const source = ctx.createBufferSource(); source.buffer = audioBuffer; @@ -457,29 +463,49 @@ currentAudioSource = source; source.onended = resolve; source.start(0); + console.log(`Audio playback started`); }); + console.log(`Audio playback completed`); currentAudioSource = null; - } catch (_decodeErr) { + } catch (decodeErr) { + console.warn(`WebAudio decode failed, falling back to Audio element:`, decodeErr); const mime = getMimeTypeForFormat(format); + console.log(`Using MIME type: ${mime}`); + const blob = new Blob([bytes], { type: mime }); const url = URL.createObjectURL(blob); const audio = new Audio(url); + + audio.onerror = (e) => console.error(`Audio element error:`, e); + audio.oncanplaythrough = () => console.log(`Audio ready to play: ${audio.duration.toFixed(2)}s`); + currentAudioElement = audio; await audio.play(); + console.log(`Audio element playback started`); + await new Promise((resolve) => { - audio.onended = resolve; - audio.onerror = resolve; + audio.onended = () => { + console.log(`Audio element playback completed`); + resolve(); + }; + audio.onerror = (e) => { + console.error(`Audio element playback failed:`, e); + resolve(); + }; }); currentAudioElement = null; URL.revokeObjectURL(url); } } catch (err) { + console.error(`Audio playback error:`, err); log(`Audio play error: ${err?.message || err}`); } finally { isPlaying = false; if (audioQueue.length > 0) { + console.log(`${audioQueue.length} more audio chunks in queue, continuing playback`); playNextAudioChunk(); } else if (connected) { + console.log(`Audio queue empty, returning to ${whisperListening || micShouldRun ? 'listening' : 'connected'} state`); setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected", whisperListening || micShouldRun ? 
"listening" : "connected"); } } @@ -521,6 +547,7 @@ case "webm": return "audio/webm"; default: + console.log(`Unknown audio format: ${format}, defaulting to mpeg`); return `audio/${format || "mpeg"}`; } } @@ -543,7 +570,10 @@ analyserNode = ctx.createAnalyser(); analyserNode.fftSize = 256; analyserSource.connect(analyserNode); - } catch (_) { } + console.log('Audio analyser setup complete'); + } catch (err) { + console.error('Audio analyser setup failed:', err); + } } function teardownAnalyser() { @@ -671,15 +701,26 @@ */ async function startWhisperListening() { try { + console.log("Starting Whisper VAD listening"); mediaStream = await navigator.mediaDevices.getUserMedia({ audio: { channelCount: 1, sampleRate: 16000, echoCancellation: true, noiseSuppression: true, + autoGainControl: true } }); + + // Log the actual constraints we got + const tracks = mediaStream.getAudioTracks(); + if (tracks.length > 0) { + const settings = tracks[0].getSettings(); + console.log('Audio track settings:', settings); + log(`🎵 Audio: ${settings.sampleRate}Hz, ${settings.channelCount}ch`); + } } catch (err) { + console.error("Mic permission error:", err); log(`Mic permission failed: ${err?.message || err}`); setStatus("Mic permission denied", "disconnected"); return; @@ -808,13 +849,45 @@ whisperSegmentActive = true; segmentStartTime = Date.now(); - const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus") - ? "audio/webm;codecs=opus" - : MediaRecorder.isTypeSupported("audio/mp4") - ? "audio/mp4" - : "audio/webm"; + // Try to choose the best format for Whisper compatibility + let mimeType = ''; + const supportedTypes = [ + "audio/webm;codecs=opus", // Best compatibility with Whisper + "audio/webm", + "audio/ogg;codecs=opus", + "audio/mp4", + "audio/wav" + ]; - mediaRecorder = new MediaRecorder(mediaStream, { mimeType }); + for (const type of supportedTypes) { + if (MediaRecorder.isTypeSupported(type)) { + mimeType = type; + break; + } + } + + if (!mimeType) { + console.warn("No preferred MIME types supported, using default"); + mimeType = ''; // Let the browser choose + } + + console.log(`Using MediaRecorder with MIME type: ${mimeType || 'browser default'}`); + + // Create recorder with bitrate suitable for speech + const recorderOptions = { + mimeType: mimeType, + audioBitsPerSecond: 128000 // 128kbps is good for speech + }; + + try { + mediaRecorder = new MediaRecorder(mediaStream, recorderOptions); + console.log(`MediaRecorder created with mimeType: ${mediaRecorder.mimeType}`); + } catch (err) { + console.error("MediaRecorder creation failed:", err); + // Fallback to default options + mediaRecorder = new MediaRecorder(mediaStream); + console.log(`Fallback MediaRecorder created with mimeType: ${mediaRecorder.mimeType}`); + } mediaRecorder.ondataavailable = (event) => { if (event.data.size > 0) audioChunks.push(event.data); @@ -843,27 +916,45 @@ return; } - const blob = new Blob(audioChunks, { type: mediaRecorder.mimeType }); - audioChunks = []; + try { + const blob = new Blob(audioChunks, { type: mediaRecorder.mimeType }); + audioChunks = []; - const arrayBuffer = await blob.arrayBuffer(); - const uint8 = new Uint8Array(arrayBuffer); + const arrayBuffer = await blob.arrayBuffer(); + const uint8 = new Uint8Array(arrayBuffer); - // Base64 encode in chunks to avoid stack overflow - let binary = ""; - const chunkSize = 8192; - for (let i = 0; i < uint8.length; i += chunkSize) { - const slice = uint8.subarray(i, Math.min(i + chunkSize, uint8.length)); - binary += String.fromCharCode.apply(null, 
slice); - } - const base64 = btoa(binary); + // Base64 encode in chunks to avoid stack overflow + let binary = ""; + const chunkSize = 8192; + for (let i = 0; i < uint8.length; i += chunkSize) { + const slice = uint8.subarray(i, Math.min(i + chunkSize, uint8.length)); + binary += String.fromCharCode.apply(null, slice); + } + const base64 = btoa(binary); - resetOutputPanels(); + // Log audio size for debugging + console.log(`Prepared audio segment: ${(uint8.length / 1024).toFixed(1)}KB, duration: ${duration}ms, mime: ${mediaRecorder.mimeType}`); - if (ws && connected) { - ws.send(JSON.stringify({ type: "audio", data: base64 })); - log(`→ Sent audio segment (${(uint8.length / 1024).toFixed(1)} KB, ${duration}ms) for Whisper`); - transcriptEl.textContent = "🎙️ Transcribing audio..."; + resetOutputPanels(); + + if (ws && connected) { + // Add format information to help server decode the audio + const message = { + type: "audio", + data: base64, + format: mediaRecorder.mimeType, + sampleRate: 16000, // Match the constraint we requested + duration: duration + }; + + console.log(`Sending audio to server: ${(base64.length / 1000).toFixed(1)}KB, format: ${mediaRecorder.mimeType}`); + ws.send(JSON.stringify(message)); + log(`→ Sent audio segment (${(uint8.length / 1024).toFixed(1)} KB, ${duration}ms) for Whisper`); + transcriptEl.textContent = "🎙️ Transcribing audio..."; + } + } catch (err) { + console.error("Error processing audio segment:", err); + log(`❌ Error processing audio: ${err.message || err}`); } }; @@ -921,6 +1012,18 @@ // ── Server Message Handler ────────────────────────────────────────── function handleServerMessage(msg) { switch (msg.type) { + // ── Transcription feedback ──────────────── + case "transcription_result": + console.log(`Received transcription: "${msg.text}", language: ${msg.language || 'unknown'}`); + transcriptEl.textContent = msg.text; + log(`🎙️ Transcription: ${msg.text}`); + break; + + case "transcription_error": + console.error(`Transcription error: ${msg.error}`); + transcriptEl.textContent = `⚠️ ${msg.error}`; + log(`❌ Transcription error: ${msg.error}`); + break; // ── Stream lifecycle ──────────────────── case "stream_start": assistantEl.textContent = ""; @@ -1022,8 +1125,9 @@ case "audio_chunk": { const bytes = decodeBase64ToBytes(msg.data); - audioQueue.push({ bytes, format: msg.format || "opus" }); - log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "opus"})`); + audioQueue.push({ bytes, format: msg.format || "mp3" }); + log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "mp3"})`); + console.log(`Received audio chunk #${msg.chunkId ?? 
"?"}: ${bytes.length} bytes, format: ${msg.format || "mp3"}`); playNextAudioChunk(); break; } @@ -1096,9 +1200,12 @@ ws.onmessage = (event) => { try { + console.log(`← Received WebSocket message: ${event.data.length} bytes`); const msg = JSON.parse(event.data); + console.log('Message parsed:', msg.type); handleServerMessage(msg); - } catch { + } catch (err) { + console.error('Parse error:', err); log("Received non-JSON message"); } }; diff --git a/example/ws-server.ts b/example/ws-server.ts index a362eaf..dae8784 100644 --- a/example/ws-server.ts +++ b/example/ws-server.ts @@ -40,6 +40,7 @@ const wss = new WebSocketServer({ port, host }); wss.on("listening", () => { console.log(`[ws-server] listening on ${endpoint}`); console.log("[ws-server] Waiting for connections...\n"); + console.log(`[ws-server] 🌐 Open voice-client.html in your browser and connect to ${endpoint}`); }); wss.on("connection", (socket) => { @@ -55,12 +56,12 @@ Keep responses concise and conversational since they will be spoken aloud. Use tools when needed to provide accurate information.`, voice: "alloy", speechInstructions: "Speak in a friendly, natural conversational tone.", - outputFormat: "opus", + outputFormat: "mp3", // Using mp3 for better browser compatibility streamingSpeech: { - minChunkSize: 40, - maxChunkSize: 180, - parallelGeneration: true, - maxParallelRequests: 2, + minChunkSize: 20, // Smaller chunks for faster streaming + maxChunkSize: 150, // Not too large to ensure timely audio delivery + parallelGeneration: true, // Generate audio chunks in parallel + maxParallelRequests: 3, // Allow up to 3 concurrent TTS requests }, tools: { getWeather: weatherTool, @@ -96,6 +97,50 @@ Use tools when needed to provide accurate information.`, console.log(`[ws-server] 🔊 Audio chunk #${chunkId}: ${uint8Array.length} bytes (${format})`); }); + // Log raw WebSocket messages for debugging + const originalSend = socket.send.bind(socket); + + // Define a wrapper function to log messages before sending + function loggedSend(data: any): void { + let dataSize = 'unknown size'; + if (typeof data === 'string') { + dataSize = `${data.length} chars`; + } else if (data instanceof Buffer || data instanceof ArrayBuffer) { + dataSize = `${Buffer.byteLength(data)} bytes`; + } else if (data instanceof Uint8Array) { + dataSize = `${data.byteLength} bytes`; + } + console.log(`[ws-server] → Sending WebSocket data (${dataSize})`); + } + + // Create a proxy for the send method that logs but preserves original signatures + socket.send = function (data: any, optionsOrCallback?: any, callback?: (err?: Error) => void): void { + loggedSend(data); + + if (typeof optionsOrCallback === 'function') { + // Handle the (data, callback) signature + return originalSend(data, optionsOrCallback); + } else if (optionsOrCallback) { + // Handle the (data, options, callback) signature + return originalSend(data, optionsOrCallback, callback); + } else { + // Handle the (data) signature + return originalSend(data); + } + }; + + socket.on('message', (data: any) => { + let dataSize = 'unknown size'; + if (data instanceof Buffer) { + dataSize = `${data.length} bytes`; + } else if (data instanceof ArrayBuffer) { + dataSize = `${data.byteLength} bytes`; + } else if (typeof data === 'string') { + dataSize = `${data.length} chars`; + } + console.log(`[ws-server] ← Received WebSocket data (${dataSize})`); + }); + agent.on("transcription", ({ text, language }: { text: string; language?: string }) => { console.log(`[ws-server] 📝 Transcription (${language || "unknown"}): 
${text}`); }); diff --git a/package.json b/package.json index 2bfe99e..a194126 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,8 @@ "build": "tsc", "dev": "tsc -w", "demo": "tsx example/demo.ts", - "ws:server": "tsx example/ws-server-2.ts", + "ws:server": "tsx example/ws-server.ts", + "client": "node example/serve-client.js", "prepublishOnly": "pnpm build" }, "keywords": [ diff --git a/src/VoiceAgent.ts b/src/VoiceAgent.ts index 003a8e3..18f741a 100644 --- a/src/VoiceAgent.ts +++ b/src/VoiceAgent.ts @@ -156,6 +156,7 @@ export class VoiceAgent extends EventEmitter { this.socket.on("message", async (data) => { try { const message = JSON.parse(data.toString()); + console.log(`Received WebSocket message of type: ${message.type}`); // Handle transcribed text from the client/STT if (message.type === "transcript") { @@ -165,6 +166,7 @@ export class VoiceAgent extends EventEmitter { } // Interrupt ongoing speech when user starts speaking (barge-in) this.interruptCurrentResponse("user_speaking"); + console.log(`Processing transcript: "${message.text}"`); await this.enqueueInput(message.text); } // Handle raw audio data that needs transcription @@ -175,10 +177,12 @@ export class VoiceAgent extends EventEmitter { } // Interrupt ongoing speech when user starts speaking (barge-in) this.interruptCurrentResponse("user_speaking"); - await this.processAudioInput(message.data); + console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || 'unknown'}`); + await this.processAudioInput(message.data, message.format); } // Handle explicit interrupt request from client else if (message.type === "interrupt") { + console.log(`Received interrupt request: ${message.reason || "client_request"}`); this.interruptCurrentResponse(message.reason || "client_request"); } } catch (err) { @@ -246,17 +250,38 @@ export class VoiceAgent extends EventEmitter { throw new Error("Transcription model not configured"); } - const result = await transcribe({ - model: this.transcriptionModel, - audio: audioData, - }); + console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`); - this.emit("transcription", { - text: result.text, - language: result.language, - }); + try { + // Note: The AI SDK transcribe function only accepts these parameters + // We can't directly pass language or temperature to it + const result = await transcribe({ + model: this.transcriptionModel, + audio: audioData, + // If we need to pass additional options to OpenAI Whisper, + // we would need to do it via providerOptions if supported + }); - return result.text; + console.log(`Whisper transcription result: "${result.text}", language: ${result.language || 'unknown'}`); + + this.emit("transcription", { + text: result.text, + language: result.language, + }); + + // Also send transcription to client for immediate feedback + this.sendWebSocketMessage({ + type: "transcription_result", + text: result.text, + language: result.language, + }); + + return result.text; + } catch (error) { + console.error("Whisper transcription failed:", error); + // Re-throw to be handled by the caller + throw error; + } } /** @@ -472,13 +497,16 @@ export class VoiceAgent extends EventEmitter { } try { + console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? '...' 
: ''}"`); const audioData = await this.generateSpeechFromText( chunk.text, this.currentSpeechAbortController.signal ); + console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`); return audioData; } catch (error) { if ((error as Error).name === "AbortError") { + console.log(`Audio generation aborted for chunk ${chunk.id}`); return null; // Cancelled, don't report as error } console.error(`Failed to generate audio for chunk ${chunk.id}:`, error); @@ -494,6 +522,7 @@ export class VoiceAgent extends EventEmitter { if (this.isSpeaking) return; this.isSpeaking = true; + console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`); this.emit("speech_start", { streaming: true }); this.sendWebSocketMessage({ type: "speech_stream_start" }); @@ -501,6 +530,8 @@ export class VoiceAgent extends EventEmitter { while (this.speechChunkQueue.length > 0) { const chunk = this.speechChunkQueue[0]; + console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`); + // Ensure audio generation has started if (!chunk.audioPromise) { chunk.audioPromise = this.generateChunkAudio(chunk); @@ -510,13 +541,17 @@ export class VoiceAgent extends EventEmitter { const audioData = await chunk.audioPromise; // Check if we were interrupted while waiting - if (!this.isSpeaking) break; + if (!this.isSpeaking) { + console.log(`Speech interrupted during chunk #${chunk.id}`); + break; + } // Remove from queue after processing this.speechChunkQueue.shift(); if (audioData) { const base64Audio = Buffer.from(audioData).toString("base64"); + console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`); // Send audio chunk via WebSocket this.sendWebSocketMessage({ @@ -535,6 +570,8 @@ export class VoiceAgent extends EventEmitter { text: chunk.text, uint8Array: audioData, }); + } else { + console.log(`No audio data generated for chunk #${chunk.id}`); } // Start generating next chunks in parallel @@ -545,14 +582,20 @@ export class VoiceAgent extends EventEmitter { this.speechChunkQueue.length ); - for (let i = 0; i < toStart; i++) { - const nextChunk = this.speechChunkQueue.find(c => !c.audioPromise); - if (nextChunk) { - nextChunk.audioPromise = this.generateChunkAudio(nextChunk); + if (toStart > 0) { + console.log(`Starting parallel generation for ${toStart} more chunks`); + for (let i = 0; i < toStart; i++) { + const nextChunk = this.speechChunkQueue.find(c => !c.audioPromise); + if (nextChunk) { + nextChunk.audioPromise = this.generateChunkAudio(nextChunk); + } } } } } + } catch (error) { + console.error("Error in speech queue processing:", error); + this.emit("error", error); } finally { this.isSpeaking = false; this.currentSpeechAbortController = undefined; @@ -564,6 +607,7 @@ export class VoiceAgent extends EventEmitter { this.speechQueueDonePromise = undefined; } + console.log(`Speech queue processing complete`); this.sendWebSocketMessage({ type: "speech_stream_end" }); this.emit("speech_complete", { streaming: true }); } @@ -600,7 +644,7 @@ export class VoiceAgent extends EventEmitter { /** * Process incoming audio data: transcribe and generate response */ - private async processAudioInput(base64Audio: string): Promise { + private async processAudioInput(base64Audio: string, format?: string): Promise { if (!this.transcriptionModel) { this.emit("error", new Error("Transcription model not configured for audio input")); return; @@ -624,16 +668,28 @@ export class VoiceAgent extends EventEmitter { return; } - 
this.emit("audio_received", { size: audioBuffer.length }); + this.emit("audio_received", { size: audioBuffer.length, format }); + console.log(`Processing audio input: ${audioBuffer.length} bytes, format: ${format || 'unknown'}`); const transcribedText = await this.transcribeAudio(audioBuffer); + console.log(`Transcribed text: "${transcribedText}"`); if (transcribedText.trim()) { await this.enqueueInput(transcribedText); + } else { + this.emit("warning", "Transcription returned empty text"); + this.sendWebSocketMessage({ + type: "transcription_error", + error: "Whisper returned empty text" + }); } } catch (error) { console.error("Failed to process audio input:", error); this.emit("error", error); + this.sendWebSocketMessage({ + type: "transcription_error", + error: `Transcription failed: ${(error as Error).message || String(error)}` + }); } } @@ -1110,7 +1166,20 @@ export class VoiceAgent extends EventEmitter { try { if (this.socket.readyState === WebSocket.OPEN) { + // Skip logging huge audio data for better readability + if (message.type === "audio_chunk" || message.type === "audio") { + const { data, ...rest } = message as any; + console.log(`Sending WebSocket message: ${message.type}`, + data ? `(${(data.length / 1000).toFixed(1)}KB audio data)` : "", + rest + ); + } else { + console.log(`Sending WebSocket message: ${message.type}`); + } + this.socket.send(JSON.stringify(message)); + } else { + console.warn(`Cannot send message, socket state: ${this.socket.readyState}`); } } catch (error) { // Socket may have closed between the readyState check and send()