diff --git a/example/voice-client.html b/example/voice-client.html index 27b67d4..703b790 100644 --- a/example/voice-client.html +++ b/example/voice-client.html @@ -347,13 +347,22 @@ let whisperSegmentActive = false; // currently capturing a speech segment // VAD (Voice Activity Detection) config - const VAD_SPEECH_THRESHOLD = 12; // RMS above this = speech detected - const VAD_SILENCE_TIMEOUT = 1500; // ms of silence before auto-sending segment - const VAD_MIN_SEGMENT_MS = 300; // ignore segments shorter than this - const VAD_POLL_INTERVAL = 60; // ms between VAD checks + // Tuned to avoid triggering on light background noise. + const VAD_BASE_THRESHOLD = 18; // absolute floor for speech gate + const VAD_NOISE_MULTIPLIER = 2.2; // speech gate = noiseFloor * multiplier + const VAD_SILENCE_HYSTERESIS = 0.65; // lower gate while already in speech + const VAD_SPEECH_START_FRAMES = 4; // consecutive speech frames to start (~240ms) + const VAD_SILENCE_TIMEOUT = 900; // ms of silence before auto-sending segment + const VAD_MIN_SEGMENT_MS = 700; // ignore short noise bursts + const VAD_POLL_INTERVAL = 60; // ms between VAD checks + const VAD_CALIBRATION_MS = 1200; // initial noise floor calibration let vadSilenceTimer = null; let vadPollTimer = null; let segmentStartTime = 0; + let vadSmoothedRms = 0; + let vadNoiseFloor = 10; + let vadCalibratingUntil = 0; + let vadSpeechFrames = 0; // Audio playback state let audioContext = null; @@ -451,7 +460,7 @@ }); currentAudioSource = null; } catch (_decodeErr) { - const mime = format === "mp3" ? 
"audio/mpeg" : `audio/${format}`; + const mime = getMimeTypeForFormat(format); const blob = new Blob([bytes], { type: mime }); const url = URL.createObjectURL(blob); const audio = new Audio(url); @@ -497,6 +506,25 @@ return bytes; } + function getMimeTypeForFormat(format) { + switch ((format || "").toLowerCase()) { + case "opus": + return "audio/ogg; codecs=opus"; + case "ogg": + return "audio/ogg"; + case "wav": + return "audio/wav"; + case "mp3": + return "audio/mpeg"; + case "aac": + return "audio/aac"; + case "webm": + return "audio/webm"; + default: + return `audio/${format || "mpeg"}`; + } + } + // ── RMS audio level from analyser ─────────────────────────────────── function getCurrentRMS() { if (!analyserNode) return 0; @@ -658,6 +686,10 @@ } whisperListening = true; + vadSmoothedRms = 0; + vadNoiseFloor = 10; + vadSpeechFrames = 0; + vadCalibratingUntil = Date.now() + VAD_CALIBRATION_MS; startMicBtn.disabled = true; stopMicBtn.disabled = false; @@ -666,7 +698,7 @@ startVADPolling(); setStatus("Listening (Whisper VAD)...", "listening"); - log("Whisper VAD listening started — speak and it will auto-detect"); + log("Whisper VAD listening started — calibrating noise floor..."); } function stopWhisperListening() { @@ -688,6 +720,7 @@ teardownAnalyser(); stopViz(); + vadSpeechFrames = 0; startMicBtn.disabled = !connected; stopMicBtn.disabled = true; @@ -711,16 +744,37 @@ if (!whisperListening) return; const rms = getCurrentRMS(); - const isSpeech = rms > VAD_SPEECH_THRESHOLD; + // Smooth audio level so tiny spikes don't trigger segments + vadSmoothedRms = vadSmoothedRms === 0 ? rms : (vadSmoothedRms * 0.8) + (rms * 0.2); + + // Adaptive noise-floor calibration / tracking + // During startup: quick calibration + // After startup: slow drift only when not in a speech segment + if (Date.now() < vadCalibratingUntil) { + vadNoiseFloor = vadNoiseFloor === 0 ? 
vadSmoothedRms : (vadNoiseFloor * 0.85) + (vadSmoothedRms * 0.15); + return; + } + + if (!whisperSegmentActive) { + vadNoiseFloor = (vadNoiseFloor * 0.97) + (vadSmoothedRms * 0.03); + } + + const speechThreshold = Math.max(VAD_BASE_THRESHOLD, vadNoiseFloor * VAD_NOISE_MULTIPLIER); + const silenceThreshold = speechThreshold * VAD_SILENCE_HYSTERESIS; + const isSpeech = whisperSegmentActive + ? vadSmoothedRms > silenceThreshold + : vadSmoothedRms > speechThreshold; if (isSpeech) { - // ── Speech detected ── - // Auto barge-in: if assistant is speaking, interrupt it - autoBargeIn(); - if (!whisperSegmentActive) { - // Start a new recording segment - beginWhisperSegment(); + // Debounce: require N consecutive speech frames before start + vadSpeechFrames += 1; + if (!whisperSegmentActive && vadSpeechFrames >= VAD_SPEECH_START_FRAMES) { + // Auto barge-in only once we are confident speech is real + autoBargeIn(); + // Start a new recording segment + beginWhisperSegment(); + vadSpeechFrames = 0; } // Reset silence timer — user is still talking @@ -728,7 +782,14 @@ clearTimeout(vadSilenceTimer); vadSilenceTimer = null; } - } else if (whisperSegmentActive && !vadSilenceTimer) { + } else { + // Decay speech frames when signal drops, prevents start on random bursts + if (!whisperSegmentActive) { + vadSpeechFrames = Math.max(0, vadSpeechFrames - 1); + } + } + + if (!isSpeech && whisperSegmentActive && !vadSilenceTimer) { // ── Silence while recording → start countdown ── vadSilenceTimer = setTimeout(() => { vadSilenceTimer = null; @@ -961,8 +1022,8 @@ case "audio_chunk": { const bytes = decodeBase64ToBytes(msg.data); - audioQueue.push({ bytes, format: msg.format || "mp3" }); - log(`🔊 Audio chunk #${msg.chunkId ?? "?"} (${bytes.length} bytes, ${msg.format || "mp3"})`); + audioQueue.push({ bytes, format: msg.format || "opus" }); + log(`🔊 Audio chunk #${msg.chunkId ?? 
"?"} (${bytes.length} bytes, ${msg.format || "opus"})`); playNextAudioChunk(); break; } @@ -970,7 +1031,7 @@ // Full audio (non-streaming fallback) case "audio": { const bytes = decodeBase64ToBytes(msg.data); - audioQueue.push({ bytes, format: msg.format || "mp3" }); + audioQueue.push({ bytes, format: msg.format || "opus" }); log(`🔊 Full audio (${bytes.length} bytes)`); playNextAudioChunk(); break; @@ -983,7 +1044,7 @@ setBadge("interrupted", "idle"); if (connected) { setStatus(whisperListening || micShouldRun ? "Listening..." : "Connected", - whisperListening || micShouldRun ? "listening" : "connected"); + whisperListening || micShouldRun ? "listening" : "connected"); } break; diff --git a/example/ws-server.ts b/example/ws-server.ts index f648c61..a8537f8 100644 --- a/example/ws-server.ts +++ b/example/ws-server.ts @@ -55,7 +55,7 @@ Keep responses concise and conversational since they will be spoken aloud. Use tools when needed to provide accurate information.`, voice: "alloy", speechInstructions: "Speak in a friendly, natural conversational tone.", - outputFormat: "mp3", + outputFormat: "opus", streamingSpeech: { minChunkSize: 40, maxChunkSize: 180, diff --git a/output_chunk_0.mp3 b/output_chunk_0.mp3 new file mode 100644 index 0000000..79dc113 Binary files /dev/null and b/output_chunk_0.mp3 differ