mirror of
https://github.com/Bijit-Mondal/VoiceAgent.git
synced 2026-03-02 18:36:39 +00:00
feat: add serve-client and start-test-environment scripts, enhance voice-client with debugging info
This commit is contained in:
29
example/serve-client.js
Normal file
29
example/serve-client.js
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
const http = require('http');
|
||||||
|
const fs = require('fs');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
const PORT = 3000;

/**
 * Minimal static HTTP server for the voice client demo.
 *
 * Serves `voice-client.html` (located next to this script) for `/` and
 * `/index.html`; every other path gets a 404. The WebSocket backend is a
 * separate process and is not started here.
 */
const server = http.createServer((req, res) => {
  // Compare only the path component so that "/?foo=bar" still serves the page
  // instead of falling through to the 404 branch.
  const pathname = req.url.split('?', 1)[0];

  if (pathname !== '/' && pathname !== '/index.html') {
    res.writeHead(404, { 'Content-Type': 'text/plain; charset=utf-8' });
    res.end('Not found');
    return;
  }

  const htmlPath = path.join(__dirname, 'voice-client.html');
  fs.readFile(htmlPath, (err, data) => {
    if (err) {
      // Surface the underlying cause on the server side; the browser only
      // receives a generic message.
      console.error(`Failed to read ${htmlPath}:`, err);
      res.writeHead(500, { 'Content-Type': 'text/plain; charset=utf-8' });
      res.end('Error loading voice-client.html');
      return;
    }
    res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
    res.end(data);
  });
});

// Without an 'error' listener a failed bind (e.g. port already in use) is
// thrown as an uncaught exception; report it clearly and exit non-zero.
server.on('error', (err) => {
  if (err.code === 'EADDRINUSE') {
    console.error(`Port ${PORT} is already in use; stop the other process and try again.`);
  } else {
    console.error('HTTP server error:', err);
  }
  process.exit(1);
});

server.listen(PORT, () => {
  console.log(`Voice client available at: http://localhost:${PORT}`);
  console.log(`Make sure to also start the WebSocket server with: npm run ws:server`);
});
|
||||||
26
example/start-test-environment.sh
Executable file
26
example/start-test-environment.sh
Executable file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/bin/bash
#
# Starts the WebSocket server and the static client server in the background
# for manual testing, then blocks until the user presses Ctrl+C, at which
# point both are shut down.

WS_SERVER_PID=""
CLIENT_SERVER_PID=""

# Kill demo server processes by command-line pattern. Used both to clear
# leftovers from a previous run and, on shutdown, to catch the node/tsx
# children that may outlive the `npm run` wrapper processes.
kill_demo_servers() {
  pkill -f "node.*example/serve-client.js" || true
  pkill -f "tsx.*example/ws-server.ts" || true
}

# Stop whatever this script has started so far.
stop_servers() {
  if [ -n "$WS_SERVER_PID" ]; then
    kill "$WS_SERVER_PID" 2>/dev/null
  fi
  if [ -n "$CLIENT_SERVER_PID" ]; then
    kill "$CLIENT_SERVER_PID" 2>/dev/null
  fi
  kill_demo_servers
}

# Signal handler: shut everything down and exit.
on_interrupt() {
  # Drop the traps first so a second signal cannot re-enter the handler.
  trap - INT TERM
  stop_servers
  echo 'Servers stopped'
  exit
}

# Install the handler BEFORE launching anything, so Ctrl+C during startup
# (e.g. during the sleep below) does not leave orphaned servers behind.
trap on_interrupt INT TERM

# Kill any previously running servers
echo "Cleaning up any existing processes..."
kill_demo_servers

echo "Starting WebSocket server..."
npm run ws:server &
WS_SERVER_PID=$!

# Sleep to ensure WebSocket server has time to start
sleep 2

# If the npm wrapper has already exited, the server failed to start; do not
# go on to report a working environment.
if ! kill -0 "$WS_SERVER_PID" 2>/dev/null; then
  echo "WebSocket server failed to start" >&2
  stop_servers
  exit 1
fi

echo "Starting web client server..."
npm run client &
CLIENT_SERVER_PID=$!

echo "✅ Test environment started!"
echo "📱 Open http://localhost:3000 in your browser"
echo ""
echo "Press Ctrl+C to shut down both servers"

# Wait for user to Ctrl+C
wait
|
||||||
@@ -263,6 +263,8 @@
|
|||||||
<button id="disconnectBtn" disabled>Disconnect</button>
|
<button id="disconnectBtn" disabled>Disconnect</button>
|
||||||
</div>
|
</div>
|
||||||
<div id="status"><span class="status-dot disconnected"></span>Disconnected</div>
|
<div id="status"><span class="status-dot disconnected"></span>Disconnected</div>
|
||||||
|
<div style="font-size: 13px; color: #666; margin-top: 4px;">Debug: Check browser console (F12) for detailed logs
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Input Controls -->
|
<!-- Input Controls -->
|
||||||
@@ -440,6 +442,7 @@
|
|||||||
setStatus("Playing audio...", "speaking");
|
setStatus("Playing audio...", "speaking");
|
||||||
|
|
||||||
const { bytes, format } = audioQueue.shift();
|
const { bytes, format } = audioQueue.shift();
|
||||||
|
console.log(`Playing audio chunk: ${bytes.length} bytes, format: ${format}`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const ctx = getAudioContext();
|
const ctx = getAudioContext();
|
||||||
@@ -449,7 +452,10 @@
|
|||||||
);
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
console.log(`Attempting to decode audio with WebAudio API...`);
|
||||||
const audioBuffer = await ctx.decodeAudioData(arrayBuffer.slice(0));
|
const audioBuffer = await ctx.decodeAudioData(arrayBuffer.slice(0));
|
||||||
|
console.log(`Decoded audio successfully: ${audioBuffer.duration.toFixed(2)}s, ${audioBuffer.numberOfChannels} channels, ${audioBuffer.sampleRate}Hz`);
|
||||||
|
|
||||||
await new Promise((resolve) => {
|
await new Promise((resolve) => {
|
||||||
const source = ctx.createBufferSource();
|
const source = ctx.createBufferSource();
|
||||||
source.buffer = audioBuffer;
|
source.buffer = audioBuffer;
|
||||||
@@ -457,29 +463,49 @@
|
|||||||
currentAudioSource = source;
|
currentAudioSource = source;
|
||||||
source.onended = resolve;
|
source.onended = resolve;
|
||||||
source.start(0);
|
source.start(0);
|
||||||
|
console.log(`Audio playback started`);
|
||||||
});
|
});
|
||||||
|
console.log(`Audio playback completed`);
|
||||||
currentAudioSource = null;
|
currentAudioSource = null;
|
||||||
} catch (_decodeErr) {
|
} catch (decodeErr) {
|
||||||
|
console.warn(`WebAudio decode failed, falling back to Audio element:`, decodeErr);
|
||||||
const mime = getMimeTypeForFormat(format);
|
const mime = getMimeTypeForFormat(format);
|
||||||
|
console.log(`Using MIME type: ${mime}`);
|
||||||
|
|
||||||
const blob = new Blob([bytes], { type: mime });
|
const blob = new Blob([bytes], { type: mime });
|
||||||
const url = URL.createObjectURL(blob);
|
const url = URL.createObjectURL(blob);
|
||||||
const audio = new Audio(url);
|
const audio = new Audio(url);
|
||||||
|
|
||||||
|
audio.onerror = (e) => console.error(`Audio element error:`, e);
|
||||||
|
audio.oncanplaythrough = () => console.log(`Audio ready to play: ${audio.duration.toFixed(2)}s`);
|
||||||
|
|
||||||
currentAudioElement = audio;
|
currentAudioElement = audio;
|
||||||
await audio.play();
|
await audio.play();
|
||||||
|
console.log(`Audio element playback started`);
|
||||||
|
|
||||||
await new Promise((resolve) => {
|
await new Promise((resolve) => {
|
||||||
audio.onended = resolve;
|
audio.onended = () => {
|
||||||
audio.onerror = resolve;
|
console.log(`Audio element playback completed`);
|
||||||
|
resolve();
|
||||||
|
};
|
||||||
|
audio.onerror = (e) => {
|
||||||
|
console.error(`Audio element playback failed:`, e);
|
||||||
|
resolve();
|
||||||
|
};
|
||||||
});
|
});
|
||||||
currentAudioElement = null;
|
currentAudioElement = null;
|
||||||
URL.revokeObjectURL(url);
|
URL.revokeObjectURL(url);
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
console.error(`Audio playback error:`, err);
|
||||||
log(`Audio play error: ${err?.message || err}`);
|
log(`Audio play error: ${err?.message || err}`);
|
||||||
} finally {
|
} finally {
|
||||||
isPlaying = false;
|
isPlaying = false;
|
||||||
if (audioQueue.length > 0) {
|
if (audioQueue.length > 0) {
|
||||||
|
console.log(`${audioQueue.length} more audio chunks in queue, continuing playback`);
|
||||||
playNextAudioChunk();
|
playNextAudioChunk();
|
||||||
} else if (connected) {
|
} else if (connected) {
|
||||||
|
console.log(`Audio queue empty, returning to ${whisperListening || micShouldRun ? 'listening' : 'connected'} state`);
|
||||||
setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected", whisperListening || micShouldRun ? "listening" : "connected");
|
setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected", whisperListening || micShouldRun ? "listening" : "connected");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -521,6 +547,7 @@
|
|||||||
case "webm":
|
case "webm":
|
||||||
return "audio/webm";
|
return "audio/webm";
|
||||||
default:
|
default:
|
||||||
|
console.log(`Unknown audio format: ${format}, defaulting to mpeg`);
|
||||||
return `audio/${format || "mpeg"}`;
|
return `audio/${format || "mpeg"}`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -543,7 +570,10 @@
|
|||||||
analyserNode = ctx.createAnalyser();
|
analyserNode = ctx.createAnalyser();
|
||||||
analyserNode.fftSize = 256;
|
analyserNode.fftSize = 256;
|
||||||
analyserSource.connect(analyserNode);
|
analyserSource.connect(analyserNode);
|
||||||
} catch (_) { }
|
console.log('Audio analyser setup complete');
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Audio analyser setup failed:', err);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function teardownAnalyser() {
|
function teardownAnalyser() {
|
||||||
@@ -671,15 +701,26 @@
|
|||||||
*/
|
*/
|
||||||
async function startWhisperListening() {
|
async function startWhisperListening() {
|
||||||
try {
|
try {
|
||||||
|
console.log("Starting Whisper VAD listening");
|
||||||
mediaStream = await navigator.mediaDevices.getUserMedia({
|
mediaStream = await navigator.mediaDevices.getUserMedia({
|
||||||
audio: {
|
audio: {
|
||||||
channelCount: 1,
|
channelCount: 1,
|
||||||
sampleRate: 16000,
|
sampleRate: 16000,
|
||||||
echoCancellation: true,
|
echoCancellation: true,
|
||||||
noiseSuppression: true,
|
noiseSuppression: true,
|
||||||
|
autoGainControl: true
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Log the actual constraints we got
|
||||||
|
const tracks = mediaStream.getAudioTracks();
|
||||||
|
if (tracks.length > 0) {
|
||||||
|
const settings = tracks[0].getSettings();
|
||||||
|
console.log('Audio track settings:', settings);
|
||||||
|
log(`🎵 Audio: ${settings.sampleRate}Hz, ${settings.channelCount}ch`);
|
||||||
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
console.error("Mic permission error:", err);
|
||||||
log(`Mic permission failed: ${err?.message || err}`);
|
log(`Mic permission failed: ${err?.message || err}`);
|
||||||
setStatus("Mic permission denied", "disconnected");
|
setStatus("Mic permission denied", "disconnected");
|
||||||
return;
|
return;
|
||||||
@@ -808,13 +849,45 @@
|
|||||||
whisperSegmentActive = true;
|
whisperSegmentActive = true;
|
||||||
segmentStartTime = Date.now();
|
segmentStartTime = Date.now();
|
||||||
|
|
||||||
const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
|
// Try to choose the best format for Whisper compatibility
|
||||||
? "audio/webm;codecs=opus"
|
let mimeType = '';
|
||||||
: MediaRecorder.isTypeSupported("audio/mp4")
|
const supportedTypes = [
|
||||||
? "audio/mp4"
|
"audio/webm;codecs=opus", // Best compatibility with Whisper
|
||||||
: "audio/webm";
|
"audio/webm",
|
||||||
|
"audio/ogg;codecs=opus",
|
||||||
|
"audio/mp4",
|
||||||
|
"audio/wav"
|
||||||
|
];
|
||||||
|
|
||||||
mediaRecorder = new MediaRecorder(mediaStream, { mimeType });
|
for (const type of supportedTypes) {
|
||||||
|
if (MediaRecorder.isTypeSupported(type)) {
|
||||||
|
mimeType = type;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!mimeType) {
|
||||||
|
console.warn("No preferred MIME types supported, using default");
|
||||||
|
mimeType = ''; // Let the browser choose
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`Using MediaRecorder with MIME type: ${mimeType || 'browser default'}`);
|
||||||
|
|
||||||
|
// Create recorder with bitrate suitable for speech
|
||||||
|
const recorderOptions = {
|
||||||
|
mimeType: mimeType,
|
||||||
|
audioBitsPerSecond: 128000 // 128kbps is good for speech
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
mediaRecorder = new MediaRecorder(mediaStream, recorderOptions);
|
||||||
|
console.log(`MediaRecorder created with mimeType: ${mediaRecorder.mimeType}`);
|
||||||
|
} catch (err) {
|
||||||
|
console.error("MediaRecorder creation failed:", err);
|
||||||
|
// Fallback to default options
|
||||||
|
mediaRecorder = new MediaRecorder(mediaStream);
|
||||||
|
console.log(`Fallback MediaRecorder created with mimeType: ${mediaRecorder.mimeType}`);
|
||||||
|
}
|
||||||
|
|
||||||
mediaRecorder.ondataavailable = (event) => {
|
mediaRecorder.ondataavailable = (event) => {
|
||||||
if (event.data.size > 0) audioChunks.push(event.data);
|
if (event.data.size > 0) audioChunks.push(event.data);
|
||||||
@@ -843,27 +916,45 @@
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const blob = new Blob(audioChunks, { type: mediaRecorder.mimeType });
|
try {
|
||||||
audioChunks = [];
|
const blob = new Blob(audioChunks, { type: mediaRecorder.mimeType });
|
||||||
|
audioChunks = [];
|
||||||
|
|
||||||
const arrayBuffer = await blob.arrayBuffer();
|
const arrayBuffer = await blob.arrayBuffer();
|
||||||
const uint8 = new Uint8Array(arrayBuffer);
|
const uint8 = new Uint8Array(arrayBuffer);
|
||||||
|
|
||||||
// Base64 encode in chunks to avoid stack overflow
|
// Base64 encode in chunks to avoid stack overflow
|
||||||
let binary = "";
|
let binary = "";
|
||||||
const chunkSize = 8192;
|
const chunkSize = 8192;
|
||||||
for (let i = 0; i < uint8.length; i += chunkSize) {
|
for (let i = 0; i < uint8.length; i += chunkSize) {
|
||||||
const slice = uint8.subarray(i, Math.min(i + chunkSize, uint8.length));
|
const slice = uint8.subarray(i, Math.min(i + chunkSize, uint8.length));
|
||||||
binary += String.fromCharCode.apply(null, slice);
|
binary += String.fromCharCode.apply(null, slice);
|
||||||
}
|
}
|
||||||
const base64 = btoa(binary);
|
const base64 = btoa(binary);
|
||||||
|
|
||||||
resetOutputPanels();
|
// Log audio size for debugging
|
||||||
|
console.log(`Prepared audio segment: ${(uint8.length / 1024).toFixed(1)}KB, duration: ${duration}ms, mime: ${mediaRecorder.mimeType}`);
|
||||||
|
|
||||||
if (ws && connected) {
|
resetOutputPanels();
|
||||||
ws.send(JSON.stringify({ type: "audio", data: base64 }));
|
|
||||||
log(`→ Sent audio segment (${(uint8.length / 1024).toFixed(1)} KB, ${duration}ms) for Whisper`);
|
if (ws && connected) {
|
||||||
transcriptEl.textContent = "🎙️ Transcribing audio...";
|
// Add format information to help server decode the audio
|
||||||
|
const message = {
|
||||||
|
type: "audio",
|
||||||
|
data: base64,
|
||||||
|
format: mediaRecorder.mimeType,
|
||||||
|
sampleRate: 16000, // Match the constraint we requested
|
||||||
|
duration: duration
|
||||||
|
};
|
||||||
|
|
||||||
|
console.log(`Sending audio to server: ${(base64.length / 1000).toFixed(1)}KB, format: ${mediaRecorder.mimeType}`);
|
||||||
|
ws.send(JSON.stringify(message));
|
||||||
|
log(`→ Sent audio segment (${(uint8.length / 1024).toFixed(1)} KB, ${duration}ms) for Whisper`);
|
||||||
|
transcriptEl.textContent = "🎙️ Transcribing audio...";
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error("Error processing audio segment:", err);
|
||||||
|
log(`❌ Error processing audio: ${err.message || err}`);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -921,6 +1012,18 @@
|
|||||||
// ── Server Message Handler ──────────────────────────────────────────
|
// ── Server Message Handler ──────────────────────────────────────────
|
||||||
function handleServerMessage(msg) {
|
function handleServerMessage(msg) {
|
||||||
switch (msg.type) {
|
switch (msg.type) {
|
||||||
|
// ── Transcription feedback ────────────────
|
||||||
|
case "transcription_result":
|
||||||
|
console.log(`Received transcription: "${msg.text}", language: ${msg.language || 'unknown'}`);
|
||||||
|
transcriptEl.textContent = msg.text;
|
||||||
|
log(`🎙️ Transcription: ${msg.text}`);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "transcription_error":
|
||||||
|
console.error(`Transcription error: ${msg.error}`);
|
||||||
|
transcriptEl.textContent = `⚠️ ${msg.error}`;
|
||||||
|
log(`❌ Transcription error: ${msg.error}`);
|
||||||
|
break;
|
||||||
// ── Stream lifecycle ────────────────────
|
// ── Stream lifecycle ────────────────────
|
||||||
case "stream_start":
|
case "stream_start":
|
||||||
assistantEl.textContent = "";
|
assistantEl.textContent = "";
|
||||||
@@ -1022,8 +1125,9 @@
|
|||||||
|
|
||||||
case "audio_chunk": {
|
case "audio_chunk": {
|
||||||
const bytes = decodeBase64ToBytes(msg.data);
|
const bytes = decodeBase64ToBytes(msg.data);
|
||||||
audioQueue.push({ bytes, format: msg.format || "opus" });
|
audioQueue.push({ bytes, format: msg.format || "mp3" });
|
||||||
log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "opus"})`);
|
log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "mp3"})`);
|
||||||
|
console.log(`Received audio chunk #${msg.chunkId ?? "?"}: ${bytes.length} bytes, format: ${msg.format || "mp3"}`);
|
||||||
playNextAudioChunk();
|
playNextAudioChunk();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -1096,9 +1200,12 @@
|
|||||||
|
|
||||||
ws.onmessage = (event) => {
|
ws.onmessage = (event) => {
|
||||||
try {
|
try {
|
||||||
|
console.log(`← Received WebSocket message: ${event.data.length} bytes`);
|
||||||
const msg = JSON.parse(event.data);
|
const msg = JSON.parse(event.data);
|
||||||
|
console.log('Message parsed:', msg.type);
|
||||||
handleServerMessage(msg);
|
handleServerMessage(msg);
|
||||||
} catch {
|
} catch (err) {
|
||||||
|
console.error('Parse error:', err);
|
||||||
log("Received non-JSON message");
|
log("Received non-JSON message");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ const wss = new WebSocketServer({ port, host });
|
|||||||
wss.on("listening", () => {
|
wss.on("listening", () => {
|
||||||
console.log(`[ws-server] listening on ${endpoint}`);
|
console.log(`[ws-server] listening on ${endpoint}`);
|
||||||
console.log("[ws-server] Waiting for connections...\n");
|
console.log("[ws-server] Waiting for connections...\n");
|
||||||
|
console.log(`[ws-server] 🌐 Open voice-client.html in your browser and connect to ${endpoint}`);
|
||||||
});
|
});
|
||||||
|
|
||||||
wss.on("connection", (socket) => {
|
wss.on("connection", (socket) => {
|
||||||
@@ -55,12 +56,12 @@ Keep responses concise and conversational since they will be spoken aloud.
|
|||||||
Use tools when needed to provide accurate information.`,
|
Use tools when needed to provide accurate information.`,
|
||||||
voice: "alloy",
|
voice: "alloy",
|
||||||
speechInstructions: "Speak in a friendly, natural conversational tone.",
|
speechInstructions: "Speak in a friendly, natural conversational tone.",
|
||||||
outputFormat: "opus",
|
outputFormat: "mp3", // Using mp3 for better browser compatibility
|
||||||
streamingSpeech: {
|
streamingSpeech: {
|
||||||
minChunkSize: 40,
|
minChunkSize: 20, // Smaller chunks for faster streaming
|
||||||
maxChunkSize: 180,
|
maxChunkSize: 150, // Not too large to ensure timely audio delivery
|
||||||
parallelGeneration: true,
|
parallelGeneration: true, // Generate audio chunks in parallel
|
||||||
maxParallelRequests: 2,
|
maxParallelRequests: 3, // Allow up to 3 concurrent TTS requests
|
||||||
},
|
},
|
||||||
tools: {
|
tools: {
|
||||||
getWeather: weatherTool,
|
getWeather: weatherTool,
|
||||||
@@ -96,6 +97,50 @@ Use tools when needed to provide accurate information.`,
|
|||||||
console.log(`[ws-server] 🔊 Audio chunk #${chunkId}: ${uint8Array.length} bytes (${format})`);
|
console.log(`[ws-server] 🔊 Audio chunk #${chunkId}: ${uint8Array.length} bytes (${format})`);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Log raw WebSocket messages for debugging
|
||||||
|
const originalSend = socket.send.bind(socket);
|
||||||
|
|
||||||
|
// Define a wrapper function to log messages before sending
|
||||||
|
function loggedSend(data: any): void {
|
||||||
|
let dataSize = 'unknown size';
|
||||||
|
if (typeof data === 'string') {
|
||||||
|
dataSize = `${data.length} chars`;
|
||||||
|
} else if (data instanceof Buffer || data instanceof ArrayBuffer) {
|
||||||
|
dataSize = `${Buffer.byteLength(data)} bytes`;
|
||||||
|
} else if (data instanceof Uint8Array) {
|
||||||
|
dataSize = `${data.byteLength} bytes`;
|
||||||
|
}
|
||||||
|
console.log(`[ws-server] → Sending WebSocket data (${dataSize})`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a proxy for the send method that logs but preserves original signatures
|
||||||
|
socket.send = function (data: any, optionsOrCallback?: any, callback?: (err?: Error) => void): void {
|
||||||
|
loggedSend(data);
|
||||||
|
|
||||||
|
if (typeof optionsOrCallback === 'function') {
|
||||||
|
// Handle the (data, callback) signature
|
||||||
|
return originalSend(data, optionsOrCallback);
|
||||||
|
} else if (optionsOrCallback) {
|
||||||
|
// Handle the (data, options, callback) signature
|
||||||
|
return originalSend(data, optionsOrCallback, callback);
|
||||||
|
} else {
|
||||||
|
// Handle the (data) signature
|
||||||
|
return originalSend(data);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
socket.on('message', (data: any) => {
|
||||||
|
let dataSize = 'unknown size';
|
||||||
|
if (data instanceof Buffer) {
|
||||||
|
dataSize = `${data.length} bytes`;
|
||||||
|
} else if (data instanceof ArrayBuffer) {
|
||||||
|
dataSize = `${data.byteLength} bytes`;
|
||||||
|
} else if (typeof data === 'string') {
|
||||||
|
dataSize = `${data.length} chars`;
|
||||||
|
}
|
||||||
|
console.log(`[ws-server] ← Received WebSocket data (${dataSize})`);
|
||||||
|
});
|
||||||
|
|
||||||
agent.on("transcription", ({ text, language }: { text: string; language?: string }) => {
|
agent.on("transcription", ({ text, language }: { text: string; language?: string }) => {
|
||||||
console.log(`[ws-server] 📝 Transcription (${language || "unknown"}): ${text}`);
|
console.log(`[ws-server] 📝 Transcription (${language || "unknown"}): ${text}`);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -7,7 +7,8 @@
|
|||||||
"build": "tsc",
|
"build": "tsc",
|
||||||
"dev": "tsc -w",
|
"dev": "tsc -w",
|
||||||
"demo": "tsx example/demo.ts",
|
"demo": "tsx example/demo.ts",
|
||||||
"ws:server": "tsx example/ws-server-2.ts",
|
"ws:server": "tsx example/ws-server.ts",
|
||||||
|
"client": "node example/serve-client.js",
|
||||||
"prepublishOnly": "pnpm build"
|
"prepublishOnly": "pnpm build"
|
||||||
},
|
},
|
||||||
"keywords": [
|
"keywords": [
|
||||||
|
|||||||
@@ -156,6 +156,7 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
this.socket.on("message", async (data) => {
|
this.socket.on("message", async (data) => {
|
||||||
try {
|
try {
|
||||||
const message = JSON.parse(data.toString());
|
const message = JSON.parse(data.toString());
|
||||||
|
console.log(`Received WebSocket message of type: ${message.type}`);
|
||||||
|
|
||||||
// Handle transcribed text from the client/STT
|
// Handle transcribed text from the client/STT
|
||||||
if (message.type === "transcript") {
|
if (message.type === "transcript") {
|
||||||
@@ -165,6 +166,7 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
}
|
}
|
||||||
// Interrupt ongoing speech when user starts speaking (barge-in)
|
// Interrupt ongoing speech when user starts speaking (barge-in)
|
||||||
this.interruptCurrentResponse("user_speaking");
|
this.interruptCurrentResponse("user_speaking");
|
||||||
|
console.log(`Processing transcript: "${message.text}"`);
|
||||||
await this.enqueueInput(message.text);
|
await this.enqueueInput(message.text);
|
||||||
}
|
}
|
||||||
// Handle raw audio data that needs transcription
|
// Handle raw audio data that needs transcription
|
||||||
@@ -175,10 +177,12 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
}
|
}
|
||||||
// Interrupt ongoing speech when user starts speaking (barge-in)
|
// Interrupt ongoing speech when user starts speaking (barge-in)
|
||||||
this.interruptCurrentResponse("user_speaking");
|
this.interruptCurrentResponse("user_speaking");
|
||||||
await this.processAudioInput(message.data);
|
console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || 'unknown'}`);
|
||||||
|
await this.processAudioInput(message.data, message.format);
|
||||||
}
|
}
|
||||||
// Handle explicit interrupt request from client
|
// Handle explicit interrupt request from client
|
||||||
else if (message.type === "interrupt") {
|
else if (message.type === "interrupt") {
|
||||||
|
console.log(`Received interrupt request: ${message.reason || "client_request"}`);
|
||||||
this.interruptCurrentResponse(message.reason || "client_request");
|
this.interruptCurrentResponse(message.reason || "client_request");
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@@ -246,17 +250,38 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
throw new Error("Transcription model not configured");
|
throw new Error("Transcription model not configured");
|
||||||
}
|
}
|
||||||
|
|
||||||
const result = await transcribe({
|
console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
|
||||||
model: this.transcriptionModel,
|
|
||||||
audio: audioData,
|
|
||||||
});
|
|
||||||
|
|
||||||
this.emit("transcription", {
|
try {
|
||||||
text: result.text,
|
// Note: The AI SDK transcribe function only accepts these parameters
|
||||||
language: result.language,
|
// We can't directly pass language or temperature to it
|
||||||
});
|
const result = await transcribe({
|
||||||
|
model: this.transcriptionModel,
|
||||||
|
audio: audioData,
|
||||||
|
// If we need to pass additional options to OpenAI Whisper,
|
||||||
|
// we would need to do it via providerOptions if supported
|
||||||
|
});
|
||||||
|
|
||||||
return result.text;
|
console.log(`Whisper transcription result: "${result.text}", language: ${result.language || 'unknown'}`);
|
||||||
|
|
||||||
|
this.emit("transcription", {
|
||||||
|
text: result.text,
|
||||||
|
language: result.language,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Also send transcription to client for immediate feedback
|
||||||
|
this.sendWebSocketMessage({
|
||||||
|
type: "transcription_result",
|
||||||
|
text: result.text,
|
||||||
|
language: result.language,
|
||||||
|
});
|
||||||
|
|
||||||
|
return result.text;
|
||||||
|
} catch (error) {
|
||||||
|
console.error("Whisper transcription failed:", error);
|
||||||
|
// Re-throw to be handled by the caller
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -472,13 +497,16 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? '...' : ''}"`);
|
||||||
const audioData = await this.generateSpeechFromText(
|
const audioData = await this.generateSpeechFromText(
|
||||||
chunk.text,
|
chunk.text,
|
||||||
this.currentSpeechAbortController.signal
|
this.currentSpeechAbortController.signal
|
||||||
);
|
);
|
||||||
|
console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
|
||||||
return audioData;
|
return audioData;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if ((error as Error).name === "AbortError") {
|
if ((error as Error).name === "AbortError") {
|
||||||
|
console.log(`Audio generation aborted for chunk ${chunk.id}`);
|
||||||
return null; // Cancelled, don't report as error
|
return null; // Cancelled, don't report as error
|
||||||
}
|
}
|
||||||
console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
|
console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
|
||||||
@@ -494,6 +522,7 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
if (this.isSpeaking) return;
|
if (this.isSpeaking) return;
|
||||||
this.isSpeaking = true;
|
this.isSpeaking = true;
|
||||||
|
|
||||||
|
console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
|
||||||
this.emit("speech_start", { streaming: true });
|
this.emit("speech_start", { streaming: true });
|
||||||
this.sendWebSocketMessage({ type: "speech_stream_start" });
|
this.sendWebSocketMessage({ type: "speech_stream_start" });
|
||||||
|
|
||||||
@@ -501,6 +530,8 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
while (this.speechChunkQueue.length > 0) {
|
while (this.speechChunkQueue.length > 0) {
|
||||||
const chunk = this.speechChunkQueue[0];
|
const chunk = this.speechChunkQueue[0];
|
||||||
|
|
||||||
|
console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`);
|
||||||
|
|
||||||
// Ensure audio generation has started
|
// Ensure audio generation has started
|
||||||
if (!chunk.audioPromise) {
|
if (!chunk.audioPromise) {
|
||||||
chunk.audioPromise = this.generateChunkAudio(chunk);
|
chunk.audioPromise = this.generateChunkAudio(chunk);
|
||||||
@@ -510,13 +541,17 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
const audioData = await chunk.audioPromise;
|
const audioData = await chunk.audioPromise;
|
||||||
|
|
||||||
// Check if we were interrupted while waiting
|
// Check if we were interrupted while waiting
|
||||||
if (!this.isSpeaking) break;
|
if (!this.isSpeaking) {
|
||||||
|
console.log(`Speech interrupted during chunk #${chunk.id}`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// Remove from queue after processing
|
// Remove from queue after processing
|
||||||
this.speechChunkQueue.shift();
|
this.speechChunkQueue.shift();
|
||||||
|
|
||||||
if (audioData) {
|
if (audioData) {
|
||||||
const base64Audio = Buffer.from(audioData).toString("base64");
|
const base64Audio = Buffer.from(audioData).toString("base64");
|
||||||
|
console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`);
|
||||||
|
|
||||||
// Send audio chunk via WebSocket
|
// Send audio chunk via WebSocket
|
||||||
this.sendWebSocketMessage({
|
this.sendWebSocketMessage({
|
||||||
@@ -535,6 +570,8 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
text: chunk.text,
|
text: chunk.text,
|
||||||
uint8Array: audioData,
|
uint8Array: audioData,
|
||||||
});
|
});
|
||||||
|
} else {
|
||||||
|
console.log(`No audio data generated for chunk #${chunk.id}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start generating next chunks in parallel
|
// Start generating next chunks in parallel
|
||||||
@@ -545,14 +582,20 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
this.speechChunkQueue.length
|
this.speechChunkQueue.length
|
||||||
);
|
);
|
||||||
|
|
||||||
for (let i = 0; i < toStart; i++) {
|
if (toStart > 0) {
|
||||||
const nextChunk = this.speechChunkQueue.find(c => !c.audioPromise);
|
console.log(`Starting parallel generation for ${toStart} more chunks`);
|
||||||
if (nextChunk) {
|
for (let i = 0; i < toStart; i++) {
|
||||||
nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
|
const nextChunk = this.speechChunkQueue.find(c => !c.audioPromise);
|
||||||
|
if (nextChunk) {
|
||||||
|
nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error("Error in speech queue processing:", error);
|
||||||
|
this.emit("error", error);
|
||||||
} finally {
|
} finally {
|
||||||
this.isSpeaking = false;
|
this.isSpeaking = false;
|
||||||
this.currentSpeechAbortController = undefined;
|
this.currentSpeechAbortController = undefined;
|
||||||
@@ -564,6 +607,7 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
this.speechQueueDonePromise = undefined;
|
this.speechQueueDonePromise = undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log(`Speech queue processing complete`);
|
||||||
this.sendWebSocketMessage({ type: "speech_stream_end" });
|
this.sendWebSocketMessage({ type: "speech_stream_end" });
|
||||||
this.emit("speech_complete", { streaming: true });
|
this.emit("speech_complete", { streaming: true });
|
||||||
}
|
}
|
||||||
@@ -600,7 +644,7 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
/**
|
/**
|
||||||
* Process incoming audio data: transcribe and generate response
|
* Process incoming audio data: transcribe and generate response
|
||||||
*/
|
*/
|
||||||
private async processAudioInput(base64Audio: string): Promise<void> {
|
private async processAudioInput(base64Audio: string, format?: string): Promise<void> {
|
||||||
if (!this.transcriptionModel) {
|
if (!this.transcriptionModel) {
|
||||||
this.emit("error", new Error("Transcription model not configured for audio input"));
|
this.emit("error", new Error("Transcription model not configured for audio input"));
|
||||||
return;
|
return;
|
||||||
@@ -624,16 +668,28 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.emit("audio_received", { size: audioBuffer.length });
|
this.emit("audio_received", { size: audioBuffer.length, format });
|
||||||
|
console.log(`Processing audio input: ${audioBuffer.length} bytes, format: ${format || 'unknown'}`);
|
||||||
|
|
||||||
const transcribedText = await this.transcribeAudio(audioBuffer);
|
const transcribedText = await this.transcribeAudio(audioBuffer);
|
||||||
|
console.log(`Transcribed text: "${transcribedText}"`);
|
||||||
|
|
||||||
if (transcribedText.trim()) {
|
if (transcribedText.trim()) {
|
||||||
await this.enqueueInput(transcribedText);
|
await this.enqueueInput(transcribedText);
|
||||||
|
} else {
|
||||||
|
this.emit("warning", "Transcription returned empty text");
|
||||||
|
this.sendWebSocketMessage({
|
||||||
|
type: "transcription_error",
|
||||||
|
error: "Whisper returned empty text"
|
||||||
|
});
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Failed to process audio input:", error);
|
console.error("Failed to process audio input:", error);
|
||||||
this.emit("error", error);
|
this.emit("error", error);
|
||||||
|
this.sendWebSocketMessage({
|
||||||
|
type: "transcription_error",
|
||||||
|
error: `Transcription failed: ${(error as Error).message || String(error)}`
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1110,7 +1166,20 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
if (this.socket.readyState === WebSocket.OPEN) {
|
if (this.socket.readyState === WebSocket.OPEN) {
|
||||||
|
// Skip logging huge audio data for better readability
|
||||||
|
if (message.type === "audio_chunk" || message.type === "audio") {
|
||||||
|
const { data, ...rest } = message as any;
|
||||||
|
console.log(`Sending WebSocket message: ${message.type}`,
|
||||||
|
data ? `(${(data.length / 1000).toFixed(1)}KB audio data)` : "",
|
||||||
|
rest
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
console.log(`Sending WebSocket message: ${message.type}`);
|
||||||
|
}
|
||||||
|
|
||||||
this.socket.send(JSON.stringify(message));
|
this.socket.send(JSON.stringify(message));
|
||||||
|
} else {
|
||||||
|
console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// Socket may have closed between the readyState check and send()
|
// Socket may have closed between the readyState check and send()
|
||||||
|
|||||||
Reference in New Issue
Block a user