diff --git a/example/voice-client.html b/example/voice-client.html index 27b67d4..703b790 100644 --- a/example/voice-client.html +++ b/example/voice-client.html @@ -347,13 +347,22 @@ let whisperSegmentActive = false; // currently capturing a speech segment // VAD (Voice Activity Detection) config - const VAD_SPEECH_THRESHOLD = 12; // RMS above this = speech detected - const VAD_SILENCE_TIMEOUT = 1500; // ms of silence before auto-sending segment - const VAD_MIN_SEGMENT_MS = 300; // ignore segments shorter than this - const VAD_POLL_INTERVAL = 60; // ms between VAD checks + // Tuned to avoid triggering on light background noise. + const VAD_BASE_THRESHOLD = 18; // absolute floor for speech gate + const VAD_NOISE_MULTIPLIER = 2.2; // speech gate = noiseFloor * multiplier + const VAD_SILENCE_HYSTERESIS = 0.65; // lower gate while already in speech + const VAD_SPEECH_START_FRAMES = 4; // consecutive speech frames to start (~240ms) + const VAD_SILENCE_TIMEOUT = 900; // ms of silence before auto-sending segment + const VAD_MIN_SEGMENT_MS = 700; // ignore short noise bursts + const VAD_POLL_INTERVAL = 60; // ms between VAD checks + const VAD_CALIBRATION_MS = 1200; // initial noise floor calibration let vadSilenceTimer = null; let vadPollTimer = null; let segmentStartTime = 0; + let vadSmoothedRms = 0; + let vadNoiseFloor = 10; + let vadCalibratingUntil = 0; + let vadSpeechFrames = 0; // Audio playback state let audioContext = null; @@ -451,7 +460,7 @@ }); currentAudioSource = null; } catch (_decodeErr) { - const mime = format === "mp3" ? 
"audio/mpeg" : `audio/${format}`; + const mime = getMimeTypeForFormat(format); const blob = new Blob([bytes], { type: mime }); const url = URL.createObjectURL(blob); const audio = new Audio(url); @@ -497,6 +506,25 @@ return bytes; } + function getMimeTypeForFormat(format) { + switch ((format || "").toLowerCase()) { + case "opus": + return "audio/ogg; codecs=opus"; + case "ogg": + return "audio/ogg"; + case "wav": + return "audio/wav"; + case "mp3": + return "audio/mpeg"; + case "aac": + return "audio/aac"; + case "webm": + return "audio/webm"; + default: + return `audio/${format || "mpeg"}`; + } + } + // ── RMS audio level from analyser ─────────────────────────────────── function getCurrentRMS() { if (!analyserNode) return 0; @@ -658,6 +686,10 @@ } whisperListening = true; + vadSmoothedRms = 0; + vadNoiseFloor = 10; + vadSpeechFrames = 0; + vadCalibratingUntil = Date.now() + VAD_CALIBRATION_MS; startMicBtn.disabled = true; stopMicBtn.disabled = false; @@ -666,7 +698,7 @@ startVADPolling(); setStatus("Listening (Whisper VAD)...", "listening"); - log("Whisper VAD listening started — speak and it will auto-detect"); + log("Whisper VAD listening started — calibrating noise floor..."); } function stopWhisperListening() { @@ -688,6 +720,7 @@ teardownAnalyser(); stopViz(); + vadSpeechFrames = 0; startMicBtn.disabled = !connected; stopMicBtn.disabled = true; @@ -711,16 +744,37 @@ if (!whisperListening) return; const rms = getCurrentRMS(); - const isSpeech = rms > VAD_SPEECH_THRESHOLD; + // Smooth audio level so tiny spikes don't trigger segments + vadSmoothedRms = vadSmoothedRms === 0 ? rms : (vadSmoothedRms * 0.8) + (rms * 0.2); + + // Adaptive noise-floor calibration / tracking + // During startup: quick calibration + // After startup: slow drift only when not in a speech segment + if (Date.now() < vadCalibratingUntil) { + vadNoiseFloor = vadNoiseFloor === 0 ? 
vadSmoothedRms : (vadNoiseFloor * 0.85) + (vadSmoothedRms * 0.15); + return; + } + + if (!whisperSegmentActive) { + vadNoiseFloor = (vadNoiseFloor * 0.97) + (vadSmoothedRms * 0.03); + } + + const speechThreshold = Math.max(VAD_BASE_THRESHOLD, vadNoiseFloor * VAD_NOISE_MULTIPLIER); + const silenceThreshold = speechThreshold * VAD_SILENCE_HYSTERESIS; + const isSpeech = whisperSegmentActive + ? vadSmoothedRms > silenceThreshold + : vadSmoothedRms > speechThreshold; if (isSpeech) { - // ── Speech detected ── - // Auto barge-in: if assistant is speaking, interrupt it - autoBargeIn(); - if (!whisperSegmentActive) { - // Start a new recording segment - beginWhisperSegment(); + // Debounce: require N consecutive speech frames before start + vadSpeechFrames += 1; + if (!whisperSegmentActive && vadSpeechFrames >= VAD_SPEECH_START_FRAMES) { + // Auto barge-in only once we are confident speech is real + autoBargeIn(); + // Start a new recording segment + beginWhisperSegment(); + vadSpeechFrames = 0; } // Reset silence timer — user is still talking @@ -728,7 +782,14 @@ clearTimeout(vadSilenceTimer); vadSilenceTimer = null; } - } else if (whisperSegmentActive && !vadSilenceTimer) { + } else { + // Decay speech frames when signal drops, prevents start on random bursts + if (!whisperSegmentActive) { + vadSpeechFrames = Math.max(0, vadSpeechFrames - 1); + } + } + + if (!isSpeech && whisperSegmentActive && !vadSilenceTimer) { // ── Silence while recording → start countdown ── vadSilenceTimer = setTimeout(() => { vadSilenceTimer = null; @@ -961,8 +1022,8 @@ case "audio_chunk": { const bytes = decodeBase64ToBytes(msg.data); - audioQueue.push({ bytes, format: msg.format || "mp3" }); - log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "mp3"})`); + audioQueue.push({ bytes, format: msg.format || "opus" }); + log(`🔊 Audio chunk #${msg.chunkId ?? 
"?"} (${bytes.length} bytes, ${msg.format || "opus"})`); playNextAudioChunk(); break; } @@ -970,7 +1031,7 @@ // Full audio (non-streaming fallback) case "audio": { const bytes = decodeBase64ToBytes(msg.data); - audioQueue.push({ bytes, format: msg.format || "mp3" }); + audioQueue.push({ bytes, format: msg.format || "opus" }); log(`🔊 Full audio (${bytes.length} bytes)`); playNextAudioChunk(); break; @@ -983,7 +1044,7 @@ setBadge("interrupted", "idle"); if (connected) { setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected", - whisperListening || micShouldRun ? "listening" : "connected"); + whisperListening || micShouldRun ? "listening" : "connected"); } break; diff --git a/example/ws-server.ts b/example/ws-server.ts index f648c61..a8537f8 100644 --- a/example/ws-server.ts +++ b/example/ws-server.ts @@ -55,7 +55,7 @@ Keep responses concise and conversational since they will be spoken aloud. Use tools when needed to provide accurate information.`, voice: "alloy", speechInstructions: "Speak in a friendly, natural conversational tone.", - outputFormat: "mp3", + outputFormat: "opus", streamingSpeech: { minChunkSize: 40, maxChunkSize: 180, diff --git a/output_chunk_0.mp3 b/output_chunk_0.mp3 new file mode 100644 index 0000000..79dc113 Binary files /dev/null and b/output_chunk_0.mp3 differ