mirror of
https://github.com/Bijit-Mondal/VoiceAgent.git
synced 2026-03-02 18:36:39 +00:00
voice agent works
This commit is contained in:
@@ -347,13 +347,22 @@
|
|||||||
let whisperSegmentActive = false; // currently capturing a speech segment
|
let whisperSegmentActive = false; // currently capturing a speech segment
|
||||||
|
|
||||||
// VAD (Voice Activity Detection) config
|
// VAD (Voice Activity Detection) config
|
||||||
const VAD_SPEECH_THRESHOLD = 12; // RMS above this = speech detected
|
// Tuned to avoid triggering on light background noise.
|
||||||
const VAD_SILENCE_TIMEOUT = 1500; // ms of silence before auto-sending segment
|
const VAD_BASE_THRESHOLD = 18; // absolute floor for speech gate
|
||||||
const VAD_MIN_SEGMENT_MS = 300; // ignore segments shorter than this
|
const VAD_NOISE_MULTIPLIER = 2.2; // speech gate = noiseFloor * multiplier
|
||||||
const VAD_POLL_INTERVAL = 60; // ms between VAD checks
|
const VAD_SILENCE_HYSTERESIS = 0.65; // lower gate while already in speech
|
||||||
|
const VAD_SPEECH_START_FRAMES = 4; // consecutive speech frames to start (~240ms)
|
||||||
|
const VAD_SILENCE_TIMEOUT = 900; // ms of silence before auto-sending segment
|
||||||
|
const VAD_MIN_SEGMENT_MS = 700; // ignore short noise bursts
|
||||||
|
const VAD_POLL_INTERVAL = 60; // ms between VAD checks
|
||||||
|
const VAD_CALIBRATION_MS = 1200; // initial noise floor calibration
|
||||||
let vadSilenceTimer = null;
|
let vadSilenceTimer = null;
|
||||||
let vadPollTimer = null;
|
let vadPollTimer = null;
|
||||||
let segmentStartTime = 0;
|
let segmentStartTime = 0;
|
||||||
|
let vadSmoothedRms = 0;
|
||||||
|
let vadNoiseFloor = 10;
|
||||||
|
let vadCalibratingUntil = 0;
|
||||||
|
let vadSpeechFrames = 0;
|
||||||
|
|
||||||
// Audio playback state
|
// Audio playback state
|
||||||
let audioContext = null;
|
let audioContext = null;
|
||||||
@@ -451,7 +460,7 @@
|
|||||||
});
|
});
|
||||||
currentAudioSource = null;
|
currentAudioSource = null;
|
||||||
} catch (_decodeErr) {
|
} catch (_decodeErr) {
|
||||||
const mime = format === "mp3" ? "audio/mpeg" : `audio/${format}`;
|
const mime = getMimeTypeForFormat(format);
|
||||||
const blob = new Blob([bytes], { type: mime });
|
const blob = new Blob([bytes], { type: mime });
|
||||||
const url = URL.createObjectURL(blob);
|
const url = URL.createObjectURL(blob);
|
||||||
const audio = new Audio(url);
|
const audio = new Audio(url);
|
||||||
@@ -497,6 +506,25 @@
|
|||||||
return bytes;
|
return bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Map a short audio format name (e.g. "opus", "mp3") to a MIME type string.
 *
 * The name is normalized once (coerced to string, trimmed, lowercased) and
 * that same normalized value is used for both the known-format lookup and the
 * generic fallback, so "MP3", " mp3 " and "mp3" all resolve identically.
 *
 * @param {string} [format] - Short format name; falsy or blank values fall
 *   back to MPEG audio.
 * @returns {string} MIME type for the format; unknown names are returned as
 *   `audio/<normalized name>`.
 */
function getMimeTypeForFormat(format) {
  // `format || ""` keeps every falsy input (undefined, null, "", 0) on the
  // MPEG fallback; String() guards against non-string truthy values that
  // would otherwise throw on .toLowerCase().
  const normalized = String(format || "").trim().toLowerCase();

  switch (normalized) {
    case "opus":
      // Opus is labelled as the Opus codec inside an Ogg container.
      return "audio/ogg; codecs=opus";
    case "ogg":
      return "audio/ogg";
    case "wav":
      return "audio/wav";
    case "mp3":
      // "mp3" is not a valid MIME subtype; the registered type is audio/mpeg.
      return "audio/mpeg";
    case "aac":
      return "audio/aac";
    case "webm":
      return "audio/webm";
    default:
      // Use the normalized name so the fallback is consistent with the cases
      // above (e.g. "FLAC" -> "audio/flac"); blank input defaults to MPEG.
      return `audio/${normalized || "mpeg"}`;
  }
}
|
||||||
|
|
||||||
// ── RMS audio level from analyser ───────────────────────────────────
|
// ── RMS audio level from analyser ───────────────────────────────────
|
||||||
function getCurrentRMS() {
|
function getCurrentRMS() {
|
||||||
if (!analyserNode) return 0;
|
if (!analyserNode) return 0;
|
||||||
@@ -658,6 +686,10 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
whisperListening = true;
|
whisperListening = true;
|
||||||
|
vadSmoothedRms = 0;
|
||||||
|
vadNoiseFloor = 10;
|
||||||
|
vadSpeechFrames = 0;
|
||||||
|
vadCalibratingUntil = Date.now() + VAD_CALIBRATION_MS;
|
||||||
startMicBtn.disabled = true;
|
startMicBtn.disabled = true;
|
||||||
stopMicBtn.disabled = false;
|
stopMicBtn.disabled = false;
|
||||||
|
|
||||||
@@ -666,7 +698,7 @@
|
|||||||
startVADPolling();
|
startVADPolling();
|
||||||
|
|
||||||
setStatus("Listening (Whisper VAD)...", "listening");
|
setStatus("Listening (Whisper VAD)...", "listening");
|
||||||
log("Whisper VAD listening started — speak and it will auto-detect");
|
log("Whisper VAD listening started — calibrating noise floor...");
|
||||||
}
|
}
|
||||||
|
|
||||||
function stopWhisperListening() {
|
function stopWhisperListening() {
|
||||||
@@ -688,6 +720,7 @@
|
|||||||
|
|
||||||
teardownAnalyser();
|
teardownAnalyser();
|
||||||
stopViz();
|
stopViz();
|
||||||
|
vadSpeechFrames = 0;
|
||||||
|
|
||||||
startMicBtn.disabled = !connected;
|
startMicBtn.disabled = !connected;
|
||||||
stopMicBtn.disabled = true;
|
stopMicBtn.disabled = true;
|
||||||
@@ -711,16 +744,37 @@
|
|||||||
if (!whisperListening) return;
|
if (!whisperListening) return;
|
||||||
|
|
||||||
const rms = getCurrentRMS();
|
const rms = getCurrentRMS();
|
||||||
const isSpeech = rms > VAD_SPEECH_THRESHOLD;
|
// Smooth audio level so tiny spikes don't trigger segments
|
||||||
|
vadSmoothedRms = vadSmoothedRms === 0 ? rms : (vadSmoothedRms * 0.8) + (rms * 0.2);
|
||||||
|
|
||||||
|
// Adaptive noise-floor calibration / tracking
|
||||||
|
// During startup: quick calibration
|
||||||
|
// After startup: slow drift only when not in a speech segment
|
||||||
|
if (Date.now() < vadCalibratingUntil) {
|
||||||
|
vadNoiseFloor = vadNoiseFloor === 0 ? vadSmoothedRms : (vadNoiseFloor * 0.85) + (vadSmoothedRms * 0.15);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!whisperSegmentActive) {
|
||||||
|
vadNoiseFloor = (vadNoiseFloor * 0.97) + (vadSmoothedRms * 0.03);
|
||||||
|
}
|
||||||
|
|
||||||
|
const speechThreshold = Math.max(VAD_BASE_THRESHOLD, vadNoiseFloor * VAD_NOISE_MULTIPLIER);
|
||||||
|
const silenceThreshold = speechThreshold * VAD_SILENCE_HYSTERESIS;
|
||||||
|
const isSpeech = whisperSegmentActive
|
||||||
|
? vadSmoothedRms > silenceThreshold
|
||||||
|
: vadSmoothedRms > speechThreshold;
|
||||||
|
|
||||||
if (isSpeech) {
|
if (isSpeech) {
|
||||||
// ── Speech detected ──
|
|
||||||
// Auto barge-in: if assistant is speaking, interrupt it
|
|
||||||
autoBargeIn();
|
|
||||||
|
|
||||||
if (!whisperSegmentActive) {
|
if (!whisperSegmentActive) {
|
||||||
// Start a new recording segment
|
// Debounce: require N consecutive speech frames before start
|
||||||
beginWhisperSegment();
|
vadSpeechFrames += 1;
|
||||||
|
if (vadSpeechFrames >= VAD_SPEECH_START_FRAMES) {
|
||||||
|
// Auto barge-in only once we are confident speech is real
|
||||||
|
autoBargeIn();
|
||||||
|
beginWhisperSegment();
|
||||||
|
vadSpeechFrames = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset silence timer — user is still talking
|
// Reset silence timer — user is still talking
|
||||||
@@ -728,7 +782,14 @@
|
|||||||
clearTimeout(vadSilenceTimer);
|
clearTimeout(vadSilenceTimer);
|
||||||
vadSilenceTimer = null;
|
vadSilenceTimer = null;
|
||||||
}
|
}
|
||||||
} else if (whisperSegmentActive && !vadSilenceTimer) {
|
} else {
|
||||||
|
// Decay speech frames when signal drops, prevents start on random bursts
|
||||||
|
if (!whisperSegmentActive) {
|
||||||
|
vadSpeechFrames = Math.max(0, vadSpeechFrames - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isSpeech && whisperSegmentActive && !vadSilenceTimer) {
|
||||||
// ── Silence while recording → start countdown ──
|
// ── Silence while recording → start countdown ──
|
||||||
vadSilenceTimer = setTimeout(() => {
|
vadSilenceTimer = setTimeout(() => {
|
||||||
vadSilenceTimer = null;
|
vadSilenceTimer = null;
|
||||||
@@ -961,8 +1022,8 @@
|
|||||||
|
|
||||||
case "audio_chunk": {
|
case "audio_chunk": {
|
||||||
const bytes = decodeBase64ToBytes(msg.data);
|
const bytes = decodeBase64ToBytes(msg.data);
|
||||||
audioQueue.push({ bytes, format: msg.format || "mp3" });
|
audioQueue.push({ bytes, format: msg.format || "opus" });
|
||||||
log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "mp3"})`);
|
log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "opus"})`);
|
||||||
playNextAudioChunk();
|
playNextAudioChunk();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -970,7 +1031,7 @@
|
|||||||
// Full audio (non-streaming fallback)
|
// Full audio (non-streaming fallback)
|
||||||
case "audio": {
|
case "audio": {
|
||||||
const bytes = decodeBase64ToBytes(msg.data);
|
const bytes = decodeBase64ToBytes(msg.data);
|
||||||
audioQueue.push({ bytes, format: msg.format || "mp3" });
|
audioQueue.push({ bytes, format: msg.format || "opus" });
|
||||||
log(`🔊 Full audio (${bytes.length} bytes)`);
|
log(`🔊 Full audio (${bytes.length} bytes)`);
|
||||||
playNextAudioChunk();
|
playNextAudioChunk();
|
||||||
break;
|
break;
|
||||||
@@ -983,7 +1044,7 @@
|
|||||||
setBadge("interrupted", "idle");
|
setBadge("interrupted", "idle");
|
||||||
if (connected) {
|
if (connected) {
|
||||||
setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected",
|
setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected",
|
||||||
whisperListening || micShouldRun ? "listening" : "connected");
|
whisperListening || micShouldRun ? "listening" : "connected");
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ Keep responses concise and conversational since they will be spoken aloud.
|
|||||||
Use tools when needed to provide accurate information.`,
|
Use tools when needed to provide accurate information.`,
|
||||||
voice: "alloy",
|
voice: "alloy",
|
||||||
speechInstructions: "Speak in a friendly, natural conversational tone.",
|
speechInstructions: "Speak in a friendly, natural conversational tone.",
|
||||||
outputFormat: "mp3",
|
outputFormat: "opus",
|
||||||
streamingSpeech: {
|
streamingSpeech: {
|
||||||
minChunkSize: 40,
|
minChunkSize: 40,
|
||||||
maxChunkSize: 180,
|
maxChunkSize: 180,
|
||||||
|
|||||||
BIN
output_chunk_0.mp3
Normal file
BIN
output_chunk_0.mp3
Normal file
Binary file not shown.
Reference in New Issue
Block a user