mirror of https://github.com/Bijit-Mondal/VoiceAgent.git

feat(example): video streaming
BIN example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp (new file, 10 KiB, binary not shown)
BIN example/frames/frame_00001_2026-02-19T13-10-04-584Z.webp (new file, 11 KiB, binary not shown)
BIN example/frames/frame_00002_2026-02-19T13-10-06-446Z.webp (new file, 11 KiB, binary not shown)
BIN example/frames/frame_00003_2026-02-19T13-10-14-543Z.webp (new file, 10 KiB, binary not shown)
BIN example/frames/frame_00004_2026-02-19T13-10-24-540Z.webp (new file, 11 KiB, binary not shown)
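(These five .webp files are sample frames captured during a test session; their names follow the frame_<index>_<timestamp> pattern produced by saveFrame() in example/ws-server-video.ts below.)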
example/serve-client.js
@@ -2,12 +2,12 @@ const http = require('http');
 const fs = require('fs');
 const path = require('path');
 
-const PORT = 3000;
+const PORT = 3102;
 
 // Create a simple HTTP server to serve the voice client HTML
 const server = http.createServer((req, res) => {
   if (req.url === '/' || req.url === '/index.html') {
-    const htmlPath = path.join(__dirname, 'voice-client.html');
+    const htmlPath = path.join(__dirname, 'video-client.html');
     fs.readFile(htmlPath, (err, data) => {
       if (err) {
         res.writeHead(500);
998 example/video-client.html (new file)
@@ -0,0 +1,998 @@
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Video + Voice Agent Client</title>
  <style>
    body {
      font-family: system-ui, sans-serif;
      max-width: 1000px;
      margin: 20px auto;
      padding: 0 16px;
      background: #f9fafb;
      color: #111827;
    }

    h1 {
      margin-bottom: 8px;
    }

    .subtitle {
      color: #6b7280;
      font-size: 0.95rem;
      margin-bottom: 24px;
    }

    .card {
      background: white;
      border: 1px solid #e5e7eb;
      border-radius: 12px;
      padding: 20px;
      margin-bottom: 20px;
      box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
    }

    .row {
      display: flex;
      flex-wrap: wrap;
      gap: 12px;
      align-items: center;
      margin-bottom: 16px;
    }

    video {
      width: 100%;
      max-width: 520px;
      border-radius: 10px;
      background: #000;
      aspect-ratio: 4 / 3;
    }

    button {
      padding: 10px 16px;
      border-radius: 8px;
      border: 1px solid #d1d5db;
      background: white;
      cursor: pointer;
      font-weight: 500;
    }

    button.primary {
      background: #2563eb;
      color: white;
      border-color: #2563eb;
    }

    button.danger {
      background: #dc2626;
      color: white;
      border-color: #dc2626;
    }

    button:disabled {
      opacity: 0.5;
      cursor: not-allowed;
    }

    .status {
      font-weight: 600;
      margin: 8px 0;
      font-size: 0.95rem;
    }

    .dot {
      display: inline-block;
      width: 10px;
      height: 10px;
      border-radius: 50%;
      margin-right: 8px;
    }

    .dot.disconnected {
      background: #9ca3af;
    }

    .dot.connected {
      background: #22c55e;
    }

    .dot.listening {
      background: #f59e0b;
      animation: pulse 1.5s infinite;
    }

    .dot.speaking {
      background: #3b82f6;
      animation: pulse 1.2s infinite;
    }

    @keyframes pulse {

      0%,
      100% {
        opacity: 1
      }

      50% {
        opacity: 0.6
      }
    }

    #transcript,
    #assistant,
    #reasoning,
    #tools {
      min-height: 48px;
      padding: 12px;
      border-radius: 8px;
      background: #f3f4f6;
      border-left: 4px solid #9ca3af;
      margin-bottom: 16px;
      white-space: pre-wrap;
    }

    #transcript {
      border-left-color: #2563eb;
    }

    #assistant {
      border-left-color: #22c55e;
    }

    #reasoning {
      border-left-color: #f59e0b;
      font-style: italic;
      color: #4b5563;
    }

    #tools {
      border-left-color: #8b5cf6;
      font-size: 0.9rem;
    }

    #log {
      background: #0f172a;
      color: #e2e8f0;
      font-family: 'SF Mono', monospace;
      font-size: 0.82rem;
      padding: 12px;
      border-radius: 8px;
      max-height: 240px;
      overflow-y: auto;
      white-space: pre-wrap;
    }

    .hidden {
      display: none;
    }

    /* ── Mic selector & level meter ── */
    #micRow {
      margin-bottom: 12px;
    }

    #micSelect {
      flex: 1;
      min-width: 180px;
      padding: 6px 8px;
      border-radius: 6px;
      border: 1px solid #d1d5db;
    }

    #refreshMicsBtn {
      padding: 6px 12px;
      font-size: 0.85rem;
    }

    .meter-wrap {
      display: flex;
      align-items: center;
      gap: 8px;
      margin-bottom: 12px;
    }

    .meter-wrap label {
      font-size: 0.85rem;
      white-space: nowrap;
    }

    #levelMeter {
      flex: 1;
      height: 14px;
      border-radius: 7px;
      background: #e5e7eb;
      overflow: hidden;
    }

    #levelBar {
      height: 100%;
      width: 0%;
      border-radius: 7px;
      background: #22c55e;
      transition: width 60ms linear;
    }

    #levelBar.hot {
      background: #ef4444;
    }

    #rmsValue {
      font-family: monospace;
      font-size: 0.8rem;
      width: 56px;
      text-align: right;
    }

    /* ── Push-to-talk ── */
    #pttBtn {
      padding: 10px 20px;
      font-size: 1rem;
      font-weight: 600;
      border-radius: 10px;
      border: 2px solid #2563eb;
      background: #eff6ff;
      color: #2563eb;
      cursor: pointer;
      user-select: none;
      touch-action: none;
    }

    #pttBtn:active,
    #pttBtn.active {
      background: #dc2626;
      color: white;
      border-color: #dc2626;
    }

    #pttBtn:disabled {
      opacity: 0.4;
      cursor: not-allowed;
    }
  </style>
</head>

<body>

  <h1>📹 Video + Voice Agent</h1>
  <p class="subtitle">Webcam + microphone → multimodal AI (vision + speech)</p>

  <div class="card">
    <video id="localVideo" autoplay playsinline muted></video>
    <canvas id="frameCanvas" style="display:none"></canvas>

    <div class="row" style="margin-top:16px">
      <input type="text" id="wsEndpoint" value="ws://localhost:8081" style="flex:1; min-width:260px" />
      <button id="connectBtn" class="primary">Connect</button>
      <button id="disconnectBtn" disabled>Disconnect</button>
    </div>

    <!-- ── Mic selector ── -->
    <div class="row" id="micRow">
      <label>Microphone:</label>
      <select id="micSelect">
        <option value="">-- click Refresh --</option>
      </select>
      <button id="refreshMicsBtn">🔄 Refresh</button>
    </div>

    <!-- ── Live level meter ── -->
    <div class="meter-wrap">
      <label>Mic level:</label>
      <div id="levelMeter">
        <div id="levelBar"></div>
      </div>
      <span id="rmsValue">0.000</span>
    </div>

    <div class="row">
      <label>Input mode:</label>
      <select id="inputMode">
        <option value="browser-stt">Browser STT</option>
        <option value="server-whisper">Server Whisper (VAD)</option>
        <option value="push-to-talk" selected>Push-to-Talk</option>
      </select>
      <label>Frames:</label>
      <select id="frameInterval">
        <option value="3000">every 3s</option>
        <option value="5000" selected>every 5s</option>
        <option value="10000">every 10s</option>
        <option value="0">manual only</option>
      </select>
    </div>

    <div class="row">
      <button id="startMediaBtn" disabled>📹🎤 Start Camera + Mic</button>
      <button id="stopMediaBtn" disabled>⏹ Stop</button>
      <button id="captureBtn" disabled>Capture Frame Now</button>
      <button id="pttBtn" disabled>🎙 Hold to Talk</button>
      <button id="interruptBtn" class="danger" disabled>✋ Interrupt</button>
    </div>

    <div class="status" id="status">
      <span class="dot disconnected"></span>Disconnected
    </div>
  </div>

  <h3>👤 You said</h3>
  <div id="transcript">—</div>

  <h3>🤖 Assistant</h3>
  <div id="assistant"></div>

  <div id="reasoningSection" class="hidden">
    <h3>💭 Reasoning</h3>
    <div id="reasoning"></div>
  </div>

  <div id="toolsSection" class="hidden">
    <h3>🛠️ Tools</h3>
    <div id="tools"></div>
  </div>

  <h3>📜 Log</h3>
  <div id="log"></div>

  <script>
    // ────────────────────────────────────────────────────────────────
    // State & Elements
    // ────────────────────────────────────────────────────────────────

    const els = {
      wsEndpoint: document.getElementById('wsEndpoint'),
      connectBtn: document.getElementById('connectBtn'),
      disconnectBtn: document.getElementById('disconnectBtn'),
      inputMode: document.getElementById('inputMode'),
      frameInterval: document.getElementById('frameInterval'),
      startMediaBtn: document.getElementById('startMediaBtn'),
      stopMediaBtn: document.getElementById('stopMediaBtn'),
      captureBtn: document.getElementById('captureBtn'),
      pttBtn: document.getElementById('pttBtn'),
      interruptBtn: document.getElementById('interruptBtn'),
      status: document.getElementById('status'),
      transcript: document.getElementById('transcript'),
      assistant: document.getElementById('assistant'),
      reasoningSec: document.getElementById('reasoningSection'),
      reasoning: document.getElementById('reasoning'),
      toolsSec: document.getElementById('toolsSection'),
      tools: document.getElementById('tools'),
      log: document.getElementById('log'),
      video: document.getElementById('localVideo'),
      canvas: document.getElementById('frameCanvas'),
      micSelect: document.getElementById('micSelect'),
      refreshMicsBtn: document.getElementById('refreshMicsBtn'),
      levelBar: document.getElementById('levelBar'),
      rmsValue: document.getElementById('rmsValue'),
    };

    let ws = null;
    let localStream = null;
    let audioOnlyStream = null; // audio-only view of localStream, so MediaRecorder captures speech without the video track
    let mediaRecorder = null;
    let audioChunks = [];
    let frameTimer = null;
    let audioQueue = [];
    let isPlaying = false;
    let currentSource = null;

    // Level-meter / VAD audio nodes (use browser-native sample rate)
    let meterCtx = null; // AudioContext for the meter (always running when media is on)
    let meterAnalyser = null;
    let meterSource = null;
    let meterRafId = null;

    // VAD-specific
    let silenceStart = null;
    let recordingStartTime = null;
    const SPEECH_THRESHOLD = 0.015;
    const SILENCE_THRESHOLD = 0.008;
    const SILENCE_DURATION = 1400; // ms
    const MIN_RECORDING_TIME = 600; // ms

    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    let recognition = null;

    // ────────────────────────────────────────────────────────────────
    // Helpers
    // ────────────────────────────────────────────────────────────────

    function log(...args) {
      const time = new Date().toLocaleTimeString([], { hour12: false });
      const line = `[${time}] ${args.join(' ')}\n`;
      els.log.textContent += line;
      els.log.scrollTop = els.log.scrollHeight;
    }

    function setStatus(text, state = 'disconnected') {
      els.status.innerHTML = `<span class="dot ${state}"></span>${text}`;
    }

    function enable(...btns) {
      btns.forEach(b => { if (els[b]) els[b].disabled = false; });
    }
    function disable(...btns) {
      btns.forEach(b => { if (els[b]) els[b].disabled = true; });
    }

    function resetUI() {
      els.assistant.textContent = '';
      els.reasoning.textContent = '';
      els.tools.textContent = '';
      els.reasoningSec.classList.add('hidden');
      els.toolsSec.classList.add('hidden');
    }

    // ────────────────────────────────────────────────────────────────
    // Mic enumeration
    // ────────────────────────────────────────────────────────────────

    async function refreshMics() {
      try {
        // Need a temporary stream to get labelled device list
        const tmp = await navigator.mediaDevices.getUserMedia({ audio: true });
        tmp.getTracks().forEach(t => t.stop());

        const devices = await navigator.mediaDevices.enumerateDevices();
        const mics = devices.filter(d => d.kind === 'audioinput');
        els.micSelect.innerHTML = '';
        mics.forEach((m, i) => {
          const opt = document.createElement('option');
          opt.value = m.deviceId;
          opt.textContent = m.label || `Microphone ${i + 1}`;
          els.micSelect.appendChild(opt);
        });
        log(`Found ${mics.length} microphone(s)`);
      } catch (err) {
        log('Mic enumeration failed:', err.message);
      }
    }

    els.refreshMicsBtn.onclick = refreshMics;
    // Auto-populate on page load
    refreshMics();

    // ────────────────────────────────────────────────────────────────
    // Live audio level meter (always-on when media is active)
    // Uses AnalyserNode + rAF – no ScriptProcessorNode needed.
    // ────────────────────────────────────────────────────────────────

    function startLevelMeter(stream) {
      // Use the browser's native sample rate (NO custom sampleRate!)
      meterCtx = new (window.AudioContext || window.webkitAudioContext)();
      meterSource = meterCtx.createMediaStreamSource(stream);
      meterAnalyser = meterCtx.createAnalyser();
      meterAnalyser.fftSize = 1024;
      meterSource.connect(meterAnalyser);
      // Do NOT connect to destination – we don't want to hear ourselves

      const buf = new Float32Array(meterAnalyser.fftSize);

      function tick() {
        meterAnalyser.getFloatTimeDomainData(buf);
        let sum = 0;
        for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
        const rms = Math.sqrt(sum / buf.length);

        // Update UI
        const pct = Math.min(rms / 0.15, 1) * 100; // 0.15 is "loud"
        els.levelBar.style.width = pct + '%';
        els.levelBar.classList.toggle('hot', rms > SPEECH_THRESHOLD);
        els.rmsValue.textContent = rms.toFixed(4);

        // If VAD mode is active, drive it from here
        if (els.inputMode.value === 'server-whisper') {
          vadTick(rms);
        }

        meterRafId = requestAnimationFrame(tick);
      }
      tick();
      log(`Level meter started (sampleRate=${meterCtx.sampleRate})`);
    }

    function stopLevelMeter() {
      if (meterRafId) { cancelAnimationFrame(meterRafId); meterRafId = null; }
      if (meterSource) { meterSource.disconnect(); meterSource = null; }
      if (meterAnalyser) { meterAnalyser.disconnect(); meterAnalyser = null; }
      if (meterCtx) { meterCtx.close(); meterCtx = null; }
      els.levelBar.style.width = '0%';
      els.rmsValue.textContent = '0.000';
    }

    // ────────────────────────────────────────────────────────────────
    // Frame capture & send
    // ────────────────────────────────────────────────────────────────

    function captureFrame(reason = 'timer') {
      if (!els.video.videoWidth) return;

      const ctx = els.canvas.getContext('2d');
      els.canvas.width = els.video.videoWidth;
      els.canvas.height = els.video.videoHeight;
      ctx.drawImage(els.video, 0, 0);

      const dataUrl = els.canvas.toDataURL('image/webp', 0.78);
      const base64 = dataUrl.split(',')[1];

      if (ws?.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({
          type: 'video_frame',
          sessionId: 'client-main',
          sequence: Date.now(),
          timestamp: Date.now(),
          triggerReason: reason,
          image: {
            data: base64,
            format: 'webp',
            width: els.canvas.width,
            height: els.canvas.height
          }
        }));
        log(`Frame sent (${(base64.length / 1000).toFixed(1)} kB) — ${reason}`);
      }
    }

    // ────────────────────────────────────────────────────────────────
    // Audio playback queue
    // ────────────────────────────────────────────────────────────────

    async function playNext() {
      if (isPlaying || audioQueue.length === 0) return;
      isPlaying = true;

      const { bytes, format } = audioQueue.shift();

      try {
        const ctx = new (window.AudioContext || window.webkitAudioContext)();
        const buffer = await ctx.decodeAudioData(
          bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.length)
        );
        const source = ctx.createBufferSource();
        source.buffer = buffer;
        source.connect(ctx.destination);
        currentSource = source;
        source.onended = () => {
          currentSource = null;
          isPlaying = false;
          ctx.close();
          playNext();
        };
        source.start(0);
        log(`Playing audio chunk (${bytes.length} bytes, ${format})`);
      } catch (err) {
        console.error('Audio decode/play error:', err);
        isPlaying = false;
        playNext();
      }
    }

    // ────────────────────────────────────────────────────────────────
    // WebSocket
    // ────────────────────────────────────────────────────────────────

    function connect() {
      const url = els.wsEndpoint.value.trim();
      if (!url) return log('No endpoint');

      setStatus('Connecting...', 'disconnected');
      ws = new WebSocket(url);

      ws.onopen = () => {
        setStatus('Connected', 'connected');
        enable('startMediaBtn', 'interruptBtn', 'captureBtn');
        disable('connectBtn');
        enable('disconnectBtn');
        log(`Connected to ${url}`);
      };

      ws.onclose = () => {
        setStatus('Disconnected', 'disconnected');
        disable('startMediaBtn', 'stopMediaBtn', 'captureBtn', 'interruptBtn', 'pttBtn');
        enable('connectBtn');
        disable('disconnectBtn');
        stopAllMedia();
        log('Disconnected');
        ws = null;
      };

      ws.onerror = (e) => {
        log('WebSocket error', e);
        setStatus('Error', 'disconnected');
      };

      ws.onmessage = (event) => {
        try {
          const msg = JSON.parse(event.data);
          handleMessage(msg);
        } catch (err) {
          log('Parse error:', err);
        }
      };
    }

    function disconnect() {
      if (ws) ws.close();
      stopAllMedia();
    }

    // ────────────────────────────────────────────────────────────────
    // Media (camera + mic)
    // ────────────────────────────────────────────────────────────────

    async function startMedia() {
      try {
        const audioConstraint = els.micSelect.value
          ? { deviceId: { exact: els.micSelect.value } }
          : true;

        localStream = await navigator.mediaDevices.getUserMedia({
          video: { width: { ideal: 640 }, height: { ideal: 480 } },
          audio: audioConstraint,
        });

        // Audio-only stream for MediaRecorder (otherwise it would record video too)
        audioOnlyStream = new MediaStream(localStream.getAudioTracks());

        // Log which mic was actually selected
        const audioTrack = localStream.getAudioTracks()[0];
        log(`Mic active: "${audioTrack?.label || 'unknown'}"`);

        els.video.srcObject = localStream;
        await els.video.play();

        enable('stopMediaBtn', 'pttBtn');
        disable('startMediaBtn');

        // Start the always-on level meter
        startLevelMeter(localStream);

        // Periodic frames
        const intervalMs = Number(els.frameInterval.value);
        if (intervalMs > 0) {
          frameTimer = setInterval(() => captureFrame('timer'), intervalMs);
          log(`Frame capture every ${intervalMs / 1000}s`);
        }

        // Start the selected input mode
        const mode = els.inputMode.value;
        if (mode === 'browser-stt') {
          startBrowserSTT();
        }
        // VAD and push-to-talk don't need extra init – they're driven by
        // the level-meter tick and button events respectively.

        setStatus('Listening...', 'listening');
        log(`Camera + Mic started, input mode: ${mode}`);
      } catch (err) {
        log('getUserMedia failed:', err.message);
      }
    }

    function stopAllMedia() {
      if (frameTimer) { clearInterval(frameTimer); frameTimer = null; }

      stopLevelMeter();

      if (localStream) {
        localStream.getTracks().forEach(t => t.stop());
        audioOnlyStream = null;
        localStream = null;
      }
      els.video.srcObject = null;

      if (mediaRecorder?.state === 'recording') mediaRecorder.stop();
      mediaRecorder = null;

      if (recognition) recognition.stop();
      recognition = null;

      silenceStart = null;
      recordingStartTime = null;
      audioChunks = [];

      disable('stopMediaBtn', 'pttBtn');
      enable('startMediaBtn');
      setStatus('Connected', 'connected');
      log('Media stopped');
    }

    // ────────────────────────────────────────────────────────────────
    // Shared: record a segment from localStream and send it
    // ────────────────────────────────────────────────────────────────

    function chosenMimeType() {
      for (const mt of [
        'audio/webm;codecs=opus',
        'audio/webm',
        'audio/ogg;codecs=opus',
        'audio/mp4',
      ]) {
        if (MediaRecorder.isTypeSupported(mt)) return mt;
      }
      return ''; // let browser pick default
    }

    function startRecording() {
      if (mediaRecorder?.state === 'recording') return;
      if (!audioOnlyStream) { log('No audio stream!'); return; }

      audioChunks = [];
      recordingStartTime = Date.now();
      silenceStart = null;

      const mimeType = chosenMimeType();
      const opts = mimeType ? { mimeType } : undefined;
      mediaRecorder = new MediaRecorder(audioOnlyStream, opts);

      mediaRecorder.ondataavailable = e => {
        if (e.data.size > 0) audioChunks.push(e.data);
      };

      mediaRecorder.onstop = async () => {
        const usedMime = mediaRecorder?.mimeType || mimeType || 'audio/webm';
        if (audioChunks.length === 0) {
          log('No audio chunks recorded');
          setStatus('Listening...', 'listening');
          return;
        }

        const blob = new Blob(audioChunks, { type: usedMime });
        if (blob.size < 800) {
          log(`Audio too short (${blob.size} bytes), skipping`);
          setStatus('Listening...', 'listening');
          return;
        }

        const arrayBuffer = await blob.arrayBuffer();
        const base64 = btoa(
          new Uint8Array(arrayBuffer).reduce((d, b) => d + String.fromCharCode(b), '')
        );

        if (ws?.readyState === WebSocket.OPEN) {
          ws.send(JSON.stringify({ type: 'audio', data: base64, format: usedMime }));
          log(`Sent audio (${(base64.length / 1000).toFixed(1)} kB, ${usedMime})`);
          els.transcript.textContent = 'Transcribing...';
        } else {
          log('WS not connected, audio dropped');
        }

        setStatus('Listening...', 'listening');
      };

      mediaRecorder.start(100); // timeslice 100ms
      setStatus('🔴 Recording...', 'speaking');
      log('Recording started');
    }

    function stopRecording() {
      if (mediaRecorder?.state === 'recording') {
        mediaRecorder.stop();
        silenceStart = null;
        recordingStartTime = null;
        setStatus('Processing...', 'connected');
        log('Recording stopped, sending...');
      }
    }

    // ────────────────────────────────────────────────────────────────
    // VAD (driven from the level-meter rAF loop)
    // ────────────────────────────────────────────────────────────────

    function vadTick(rms) {
      if (rms > SPEECH_THRESHOLD) {
        silenceStart = null;
        if (!mediaRecorder || mediaRecorder.state !== 'recording') {
          startRecording();
        }
      } else if (rms < SILENCE_THRESHOLD && mediaRecorder?.state === 'recording') {
        if (!silenceStart) {
          silenceStart = Date.now();
        } else if (Date.now() - silenceStart > SILENCE_DURATION) {
          if (recordingStartTime && (Date.now() - recordingStartTime) > MIN_RECORDING_TIME) {
            log('Silence → stopping');
            stopRecording();
          }
        }
      }
    }

    // ────────────────────────────────────────────────────────────────
    // Push-to-Talk
    // ────────────────────────────────────────────────────────────────

    function pttDown() {
      if (!localStream) return;
      els.pttBtn.classList.add('active');
      startRecording();
    }

    function pttUp() {
      els.pttBtn.classList.remove('active');
      stopRecording();
    }

    els.pttBtn.addEventListener('mousedown', pttDown);
    els.pttBtn.addEventListener('mouseup', pttUp);
    els.pttBtn.addEventListener('mouseleave', pttUp);
    els.pttBtn.addEventListener('touchstart', e => { e.preventDefault(); pttDown(); });
    els.pttBtn.addEventListener('touchend', e => { e.preventDefault(); pttUp(); });

    // Spacebar push-to-talk (only when mode is push-to-talk)
    let spaceHeld = false;
    document.addEventListener('keydown', e => {
      if (e.code === 'Space' && !spaceHeld && els.inputMode.value === 'push-to-talk'
        && localStream && !e.target.matches('input, textarea, select')) {
        e.preventDefault();
        spaceHeld = true;
        pttDown();
      }
    });
    document.addEventListener('keyup', e => {
      if (e.code === 'Space' && spaceHeld) {
        e.preventDefault();
        spaceHeld = false;
        pttUp();
      }
    });

    // ────────────────────────────────────────────────────────────────
    // Browser STT
    // ────────────────────────────────────────────────────────────────

    function startBrowserSTT() {
      if (!SpeechRecognition) { log('Web Speech API not supported'); return; }
      recognition = new SpeechRecognition();
      recognition.continuous = true;
      recognition.interimResults = true;
      recognition.lang = 'en-US';

      recognition.onresult = e => {
        const transcript = Array.from(e.results).map(r => r[0].transcript).join('');
        els.transcript.textContent = transcript;
        // Fire on the latest result (in continuous mode results accumulate, so
        // checking results[0] would re-send on every event after the first utterance)
        if (e.results[e.results.length - 1].isFinal) sendTranscript(transcript);
      };

      recognition.onerror = e => log('STT error:', e.error);
      recognition.start();
      log('Browser STT started');
    }

    // ────────────────────────────────────────────────────────────────
    // Sending transcript / interrupt
    // ────────────────────────────────────────────────────────────────

    function sendTranscript(text) {
      if (!ws || ws.readyState !== WebSocket.OPEN) return;
      ws.send(JSON.stringify({ type: 'transcript', text }));
      log(`Sent transcript: ${text}`);
      resetUI();
    }

    function interrupt() {
      if (ws?.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ type: 'interrupt', reason: 'user_button' }));
        log('Interrupt sent');
      }
      audioQueue = [];
      if (currentSource) { currentSource.stop(); currentSource = null; }
      isPlaying = false;
    }

    // ────────────────────────────────────────────────────────────────
    // Server → Client messages
    // ────────────────────────────────────────────────────────────────

    function handleMessage(msg) {
      switch (msg.type) {
        case 'transcription_result':
          els.transcript.textContent = msg.text || '(empty)';
          log(`Transcription: ${msg.text}`);
          break;

        case 'text_delta':
          els.assistant.textContent += msg.text || '';
          break;

        case 'reasoning_delta':
          els.reasoningSec.classList.remove('hidden');
          els.reasoning.textContent += msg.text || '';
          break;

        case 'tool_call':
        case 'tool_result':
          els.toolsSec.classList.remove('hidden');
          els.tools.innerHTML += `<div>${msg.type}: ${msg.toolName || '?'} → ${JSON.stringify(msg.result || msg.input || {})}</div>`;
          break;

        case 'audio_chunk':
        case 'audio': {
          // Block scope so `bytes` stays local to this case
          const bytes = Uint8Array.from(atob(msg.data), c => c.charCodeAt(0));
          audioQueue.push({ bytes, format: msg.format || 'mp3' });
          playNext();
          break;
        }

        case 'speech_interrupted':
          audioQueue = [];
          if (currentSource) currentSource.stop();
          isPlaying = false;
          log(`Speech interrupted: ${msg.reason || '?'}`);
          break;

        case 'response_complete':
          log('Response complete');
          break;

        case 'capture_frame':
          log(`Server requested frame: ${msg.reason}`);
          captureFrame(msg.reason || 'server_request');
          break;

        case 'frame_ack':
          break; // silent

        case 'session_init':
          log(`Session: ${msg.sessionId}`);
          break;

        case 'stream_start':
          resetUI();
          break;

        case 'stream_finish':
          log(`Stream finished: ${msg.finishReason}`);
          break;

        case 'speech_stream_start':
          break;

        case 'speech_stream_end':
          log('Speech done');
          break;

        case 'error':
          log(`ERROR: ${msg.error}`);
          console.error('Server error:', msg.error);
          break;

        case 'transcription_error':
          log(`Transcription error: ${msg.error}`);
          els.transcript.textContent = `Error: ${msg.error}`;
          break;

        default:
          if (msg.type?.includes('stream') || msg.type?.includes('step')) {
            // verbose stream events – log quietly
          } else {
            log(`[${msg.type}]`);
          }
      }
    }

    // ────────────────────────────────────────────────────────────────
    // Event listeners
    // ────────────────────────────────────────────────────────────────

    els.connectBtn.onclick = connect;
    els.disconnectBtn.onclick = disconnect;
    els.startMediaBtn.onclick = startMedia;
    els.stopMediaBtn.onclick = stopAllMedia;
    els.captureBtn.onclick = () => captureFrame('manual');
    els.interruptBtn.onclick = interrupt;

    els.frameInterval.onchange = () => {
      // Restart (or stop/start) the timer on any interval change, including
      // switching away from "manual only" while media is already running
      if (frameTimer) { clearInterval(frameTimer); frameTimer = null; }
      const ms = Number(els.frameInterval.value);
      if (ms > 0 && localStream) frameTimer = setInterval(() => captureFrame('timer'), ms);
    };

    document.getElementById('wsEndpoint').addEventListener('keypress', e => {
      if (e.key === 'Enter') connect();
    });

  </script>
</body>

</html>
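For reference, the `video_frame` message that captureFrame() sends (and that saveFrame() on the server below expects) has roughly this shape — a sketch inferred from the client and server code in this commit, not a published schema:

// Shape of the client → server "video_frame" message (inferred, not a formal schema).
interface VideoFrameMessage {
  type: "video_frame";
  sessionId: string;     // this client hardcodes "client-main"
  sequence: number;      // this client uses Date.now()
  timestamp: number;     // ms since epoch
  triggerReason: string; // "timer" | "manual" | "server_request" | ...
  image: {
    data: string;        // base64 image bytes, without the data: URL prefix
    format: string;      // "webp" in this client
    width: number;
    height: number;
  };
}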
161 example/ws-server-video.ts (new file)
@@ -0,0 +1,161 @@
// ws-server-video.ts
import "dotenv/config";
import { WebSocketServer } from "ws";
import { VideoAgent } from "../src/VideoAgent"; // adjust path
import { tool } from "ai";
import { z } from "zod";
import { openai } from "@ai-sdk/openai";
import { mkdirSync, writeFileSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";

// ── Frame saving ────────────────────────────────────────────────────────
const __dirname = typeof import.meta.dirname === "string"
  ? import.meta.dirname
  : dirname(fileURLToPath(import.meta.url));

const FRAMES_DIR = join(__dirname, "frames");
mkdirSync(FRAMES_DIR, { recursive: true });
console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`);

let frameCounter = 0;

function saveFrame(msg: {
  sequence?: number;
  timestamp?: number;
  triggerReason?: string;
  image: { data: string; format?: string; width?: number; height?: number };
}) {
  const idx = frameCounter++;
  const ext = msg.image.format === "jpeg" ? "jpg" : (msg.image.format || "webp");
  const ts = new Date(msg.timestamp ?? Date.now())
    .toISOString()
    .replace(/[:.]/g, "-");
  const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`;
  const filepath = join(FRAMES_DIR, filename);

  const buf = Buffer.from(msg.image.data, "base64");
  writeFileSync(filepath, buf);

  console.log(
    `[frames] Saved ${filename} (${(buf.length / 1024).toFixed(1)} kB` +
    `${msg.image.width ? `, ${msg.image.width}×${msg.image.height}` : ""}` +
    `, ${msg.triggerReason ?? "unknown"})`
  );
}

const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081";
const url = new URL(endpoint);
const port = Number(url.port || 8081);
const host = url.hostname || "localhost";


// ── Tools (same as demo.ts) ────────────────────────────────────────────
const weatherTool = tool({
  description: "Get the weather in a location",
  inputSchema: z.object({
    location: z.string().describe("The location to get the weather for"),
  }),
  execute: async ({ location }) => ({
    location,
    temperature: 72 + Math.floor(Math.random() * 21) - 10,
    conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][
      Math.floor(Math.random() * 4)
    ],
  }),
});

const timeTool = tool({
  description: "Get the current time",
  inputSchema: z.object({}),
  execute: async () => ({
    time: new Date().toLocaleTimeString(),
    timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
  }),
});
const wss = new WebSocketServer({ port, host });

wss.on("listening", () => {
  console.log(`[video-ws] listening on ${endpoint}`);
  console.log(`[video-ws] Open video-client.html and connect → ${endpoint}`);
});

wss.on("connection", (socket) => {
  console.log("[video-ws] ✓ client connected");

  const agent = new VideoAgent({
    model: openai("gpt-4o"), // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash…
    transcriptionModel: openai.transcription("whisper-1"),
    speechModel: openai.speech("gpt-4o-mini-tts"),
    instructions: `You are a helpful video+voice assistant.
You can SEE what the user is showing via webcam.
Describe what you see when it helps answer the question.
Keep spoken answers concise and natural.`,
    voice: "alloy",
    streamingSpeech: {
      minChunkSize: 25,
      maxChunkSize: 140,
      parallelGeneration: true,
      maxParallelRequests: 3,
    },
    tools: { getWeather: weatherTool, getTime: timeTool },
    // Tune these depending on your budget & latency goals
    maxContextFrames: 6, // very important — each frame ≈ 100–400 tokens
    maxFrameInputSize: 2_500_000, // ~2.5 MB
  });

  // Reuse most of the same event logging you have in ws-server.ts
  agent.on("text", (data: { role: string; text: string }) => {
    console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`);
  });
  agent.on("chunk:text_delta", (data: { id: string; text: string }) => {
    process.stdout.write(data.text || "");
  });
  agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => {
    console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB ${dimensions.width}×${dimensions.height}`);
  });
  agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`));

  // Audio and transcription events
  agent.on("audio_received", ({ size, format }) => {
    console.log(`[video] Audio received: ${size} bytes, format: ${format}`);
  });
  agent.on("transcription", ({ text, language }) => {
    console.log(`[video] Transcription: "${text}" (${language || "unknown"})`);
  });

  // Speech events
  agent.on("speech_start", () => console.log(`[video] Speech started`));
  agent.on("speech_complete", () => console.log(`[video] Speech complete`));
  agent.on("audio_chunk", ({ chunkId, text }) => {
    console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`);
  });

  // Error handling
  agent.on("error", (error: Error) => {
    console.error(`[video] ERROR:`, error);
  });
  agent.on("warning", (warning: string) => {
    console.warn(`[video] WARNING:`, warning);
  });

  agent.on("disconnected", () => {
    agent.destroy();
    console.log("[video-ws] ✗ client disconnected (agent destroyed)");
  });

  // ── Intercept raw messages to save frames to disk ────────────────────
  socket.on("message", (raw) => {
    try {
      const msg = JSON.parse(raw.toString());
      if (msg.type === "video_frame" && msg.image?.data) {
        saveFrame(msg);
      }
    } catch {
      // not JSON — ignore, agent will handle binary etc.
    }
  });

  // The crucial line — same as VoiceAgent
  agent.handleSocket(socket);
});
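A minimal way to exercise this server without the browser client — a sketch assuming the `ws` package from the imports above and one of the sample frames committed in example/frames/:

// smoke-test.ts — minimal sketch; sends one saved frame plus a transcript and
// prints whatever the agent streams back. Assumes `ws` is installed and the
// server above is running on ws://localhost:8081.
import WebSocket from "ws";
import { readFileSync } from "fs";

const sock = new WebSocket("ws://localhost:8081");
sock.on("open", () => {
  const data = readFileSync(
    "example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp"
  ).toString("base64");
  sock.send(JSON.stringify({
    type: "video_frame",
    sessionId: "smoke-test",
    sequence: 0,
    timestamp: Date.now(),
    triggerReason: "manual",
    image: { data, format: "webp" },
  }));
  // Same message type the browser client sends from sendTranscript()
  sock.send(JSON.stringify({ type: "transcript", text: "What do you see?" }));
});
sock.on("message", (raw) => console.log(raw.toString().slice(0, 200)));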
package.json
@@ -15,6 +15,7 @@
     "demo": "tsx example/demo.ts",
     "ws:server": "tsx example/ws-server.ts",
     "client": "node example/serve-client.js",
+    "ws:video": "tsx example/ws-server-video.ts",
     "prepublishOnly": "pnpm build"
   },
   "keywords": [
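With these scripts, the intended workflow appears to be: `pnpm ws:video` to start the agent server on ws://localhost:8081, `pnpm client` to serve the page on http://localhost:3102, then open the page and hit Connect.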
src/VideoAgent.ts
@@ -84,6 +84,10 @@ const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = {
 };
 
 export interface VideoAgentOptions {
+  /**
+   * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
+   * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
+   */
   model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
   transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1'))
   speechModel?: SpeechModel; // AI SDK Speech Model (e.g., openai.speech('gpt-4o-mini-tts'))
@@ -285,6 +289,7 @@ Use tools when needed to provide accurate information.`;
       // Handle raw audio data that needs transcription
       case "audio":
         if (typeof message.data !== "string" || !message.data) {
+          console.warn("Received empty or invalid audio message");
           this.emit("warning", "Received empty or invalid audio message");
           return;
         }
@@ -293,9 +298,15 @@ Use tools when needed to provide accurate information.`;
         // Force capture current frame when user speaks
         this.requestFrameCapture("user_request");
         console.log(
-          `Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`
+          `[audio handler] Received audio data (${(message.data.length / 1000).toFixed(1)}KB) for processing, format: ${message.format || "unknown"}`
         );
-        await this.processAudioInput(message);
+        try {
+          await this.processAudioInput(message);
+          console.log(`[audio handler] processAudioInput completed`);
+        } catch (audioError) {
+          console.error(`[audio handler] Error in processAudioInput:`, audioError);
+          this.emit("error", audioError);
+        }
         break;
 
       // Handle video frame from client
@@ -850,13 +861,20 @@ Use tools when needed to provide accurate information.`;
   /**
    * Process incoming audio data: transcribe and generate response
    */
-  private async processAudioInput(audioMessage: AudioData): Promise<void> {
+  private async processAudioInput(audioMessage: AudioData | { type: string; data: string; format?: string; sessionId?: string }): Promise<void> {
     if (!this.transcriptionModel) {
-      this.emit("error", new Error("Transcription model not configured for audio input"));
+      const error = new Error("Transcription model not configured for audio input");
+      console.error(error.message);
+      this.emit("error", error);
+      this.sendWebSocketMessage({
+        type: "error",
+        error: error.message,
+      });
       return;
     }
 
     try {
+      console.log(`[processAudioInput] Starting audio processing, data length: ${audioMessage.data?.length || 0}`);
       const audioBuffer = Buffer.from(audioMessage.data, "base64");
 
       if (audioBuffer.length > this.maxAudioInputSize) {
@@ -877,19 +895,23 @@ Use tools when needed to provide accurate information.`;
       this.emit("audio_received", {
         size: audioBuffer.length,
         format: audioMessage.format,
-        sessionId: audioMessage.sessionId,
+        sessionId: audioMessage.sessionId || this.sessionId,
       });
 
       console.log(
-        `Processing audio input: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`
+        `[processAudioInput] Processing audio: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`
       );
 
+      console.log(`[processAudioInput] Calling transcribeAudio...`);
       const transcribedText = await this.transcribeAudio(audioBuffer);
-      console.log(`Transcribed text: "${transcribedText}"`);
+      console.log(`[processAudioInput] Transcribed text: "${transcribedText}"`);
 
       if (transcribedText.trim()) {
+        console.log(`[processAudioInput] Enqueueing text input: "${transcribedText}"`);
         await this.enqueueTextInput(transcribedText);
+        console.log(`[processAudioInput] Text input processing complete`);
       } else {
+        console.warn(`[processAudioInput] Transcription returned empty text`);
         this.emit("warning", "Transcription returned empty text");
         this.sendWebSocketMessage({
           type: "transcription_error",
@@ -897,7 +919,7 @@ Use tools when needed to provide accurate information.`;
         });
       }
     } catch (error) {
-      console.error("Failed to process audio input:", error);
+      console.error("[processAudioInput] Failed to process audio input:", error);
       this.emit("error", error);
       this.sendWebSocketMessage({
         type: "transcription_error",
@@ -1049,28 +1071,38 @@ Use tools when needed to provide accurate information.`;
    * Drain the input queue, processing one request at a time
    */
   private async drainInputQueue(): Promise<void> {
-    if (this.processingQueue) return;
+    if (this.processingQueue) {
+      console.log(`[drainInputQueue] Already processing, skipping`);
+      return;
+    }
     this.processingQueue = true;
+    console.log(`[drainInputQueue] Starting to drain queue, ${this.inputQueue.length} items`);
 
     try {
       while (this.inputQueue.length > 0) {
         const item = this.inputQueue.shift()!;
+        console.log(`[drainInputQueue] Processing item: text="${item.text?.substring(0, 50)}...", hasFrame=${!!item.frame}`);
         try {
           let result: string;
           if (item.frame && item.text) {
+            console.log(`[drainInputQueue] Calling processMultimodalInput`);
             result = await this.processMultimodalInput(item.text, item.frame);
           } else if (item.text) {
+            console.log(`[drainInputQueue] Calling processUserInput`);
            result = await this.processUserInput(item.text);
           } else {
             result = "";
           }
+          console.log(`[drainInputQueue] Got result: "${result?.substring(0, 100)}..."`);
           item.resolve(result);
         } catch (error) {
+          console.error(`[drainInputQueue] Error processing item:`, error);
           item.reject(error);
         }
       }
     } finally {
       this.processingQueue = false;
+      console.log(`[drainInputQueue] Done draining queue`);
     }
   }
 
@@ -1173,6 +1205,7 @@ Use tools when needed to provide accurate information.`;
    * Process user input with streaming text generation
    */
   private async processUserInput(text: string): Promise<string> {
+    console.log(`[processUserInput] Starting with text: "${text}"`);
     this.isProcessing = true;
     this.currentStreamAbortController = new AbortController();
     const streamAbortSignal = this.currentStreamAbortController.signal;
@@ -1182,6 +1215,7 @@ Use tools when needed to provide accurate information.`;
 
       // Check if we have current frame data - if so, include it
       const hasVisualContext = !!this.currentFrameData;
+      console.log(`[processUserInput] hasVisualContext: ${hasVisualContext}`);
 
       let messages: ModelMessage[];
 
@@ -1207,6 +1241,10 @@ Use tools when needed to provide accurate information.`;
 
       this.trimHistory();
 
+      console.log(`[processUserInput] Calling streamText with ${messages.length} messages`);
+      console.log(`[processUserInput] Model:`, this.model);
+      console.log(`[processUserInput] Tools:`, Object.keys(this.tools));
+
       const result = streamText({
         model: this.model,
         system: this.instructions,
@@ -1218,6 +1256,7 @@ Use tools when needed to provide accurate information.`;
           this.handleStreamChunk(chunk);
         },
         onFinish: async (event) => {
+          console.log(`[processUserInput] onFinish called`);
           for (const step of event.steps) {
             for (const toolResult of step.toolResults) {
               this.emit("tool_result", {
@@ -1229,11 +1268,12 @@ Use tools when needed to provide accurate information.`;
           }
         },
         onError: ({ error }) => {
-          console.error("Stream error:", error);
+          console.error("[processUserInput] Stream error:", error);
           this.emit("error", error);
         },
       });
 
+      console.log(`[processUserInput] Calling processStreamResult`);
       return await this.processStreamResult(result);
     } catch (error) {
       this.pendingTextBuffer = "";
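The drain loop above follows a standard serialize-via-queue pattern; a standalone sketch of the same idea (names are illustrative, not taken from the library):

// Standalone sketch of the serialize-via-queue pattern used by drainInputQueue:
// producers enqueue a job plus its promise callbacks; one drain loop processes
// jobs strictly one at a time, even when drain() is triggered re-entrantly.
type Job<T> = {
  run: () => Promise<T>;
  resolve: (value: T) => void;
  reject: (err: unknown) => void;
};

const queue: Job<string>[] = [];
let draining = false; // plays the role of this.processingQueue

function enqueue(run: () => Promise<string>): Promise<string> {
  return new Promise((resolve, reject) => {
    queue.push({ run, resolve, reject });
    void drain();
  });
}

async function drain(): Promise<void> {
  if (draining) return; // re-entrant call: the active loop will pick the item up
  draining = true;
  try {
    while (queue.length > 0) {
      const job = queue.shift()!;
      try {
        job.resolve(await job.run());
      } catch (err) {
        job.reject(err);
      }
    }
  } finally {
    draining = false;
  }
}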