diff --git a/example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp b/example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp new file mode 100644 index 0000000..d05b820 Binary files /dev/null and b/example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp differ diff --git a/example/frames/frame_00001_2026-02-19T13-10-04-584Z.webp b/example/frames/frame_00001_2026-02-19T13-10-04-584Z.webp new file mode 100644 index 0000000..a0acc5a Binary files /dev/null and b/example/frames/frame_00001_2026-02-19T13-10-04-584Z.webp differ diff --git a/example/frames/frame_00002_2026-02-19T13-10-06-446Z.webp b/example/frames/frame_00002_2026-02-19T13-10-06-446Z.webp new file mode 100644 index 0000000..128aff2 Binary files /dev/null and b/example/frames/frame_00002_2026-02-19T13-10-06-446Z.webp differ diff --git a/example/frames/frame_00003_2026-02-19T13-10-14-543Z.webp b/example/frames/frame_00003_2026-02-19T13-10-14-543Z.webp new file mode 100644 index 0000000..095e541 Binary files /dev/null and b/example/frames/frame_00003_2026-02-19T13-10-14-543Z.webp differ diff --git a/example/frames/frame_00004_2026-02-19T13-10-24-540Z.webp b/example/frames/frame_00004_2026-02-19T13-10-24-540Z.webp new file mode 100644 index 0000000..f745871 Binary files /dev/null and b/example/frames/frame_00004_2026-02-19T13-10-24-540Z.webp differ diff --git a/example/serve-client.js b/example/serve-client.js index c8b1651..19b6082 100644 --- a/example/serve-client.js +++ b/example/serve-client.js @@ -2,19 +2,19 @@ const http = require('http'); const fs = require('fs'); const path = require('path'); -const PORT = 3000; +const PORT = 3102; // Create a simple HTTP server to serve the voice client HTML const server = http.createServer((req, res) => { if (req.url === '/' || req.url === '/index.html') { - const htmlPath = path.join(__dirname, 'voice-client.html'); + const htmlPath = path.join(__dirname, 'video-client.html'); fs.readFile(htmlPath, (err, data) => { if (err) { res.writeHead(500); res.end('Error 
loading voice-client.html'); return; } - res.writeHead(200, {'Content-Type': 'text/html'}); + res.writeHead(200, { 'Content-Type': 'text/html' }); res.end(data); }); } else { diff --git a/example/video-client.html b/example/video-client.html new file mode 100644 index 0000000..65304f8 --- /dev/null +++ b/example/video-client.html @@ -0,0 +1,998 @@ + + + + + + + Video + Voice Agent Client + + + + + +

📹 Video + Voice Agent

+

Webcam + microphone → multimodal AI (vision + speech)

+ +
+ + + +
+ + + +
+ + +
+ + + +
+ + +
+ +
+
+
+ 0.000 +
+ +
+ + + + +
+ +
+ + + + + +
+ +
+ Disconnected +
+
+ +

👤 You said

+
+ +

🤖 Assistant

+
+ + + + + +

📜 Log

+
+ + + + + \ No newline at end of file diff --git a/example/ws-server-video.ts b/example/ws-server-video.ts new file mode 100644 index 0000000..e155194 --- /dev/null +++ b/example/ws-server-video.ts @@ -0,0 +1,161 @@ +// ws-server-video.ts +import "dotenv/config"; +import { WebSocketServer } from "ws"; +import { VideoAgent } from "../src/VideoAgent"; // adjust path +import { tool } from "ai"; +import { z } from "zod"; +import { openai } from "@ai-sdk/openai"; +import { mkdirSync, writeFileSync } from "fs"; +import { join, dirname } from "path"; +import { fileURLToPath } from "url"; + +// ── Frame saving ──────────────────────────────────────────────────────── +const __dirname = typeof import.meta.dirname === "string" + ? import.meta.dirname + : dirname(fileURLToPath(import.meta.url)); + +const FRAMES_DIR = join(__dirname, "frames"); +mkdirSync(FRAMES_DIR, { recursive: true }); +console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`); + +let frameCounter = 0; + +function saveFrame(msg: { + sequence?: number; + timestamp?: number; + triggerReason?: string; + image: { data: string; format?: string; width?: number; height?: number }; +}) { + const idx = frameCounter++; + const ext = msg.image.format === "jpeg" ? "jpg" : (msg.image.format || "webp"); + const ts = new Date(msg.timestamp ?? Date.now()) + .toISOString() + .replace(/[:.]/g, "-"); + const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`; + const filepath = join(FRAMES_DIR, filename); + + const buf = Buffer.from(msg.image.data, "base64"); + writeFileSync(filepath, buf); + + console.log( + `[frames] Saved ${filename} (${(buf.length / 1024).toFixed(1)} kB` + + `${msg.image.width ? `, ${msg.image.width}×${msg.image.height}` : ""}` + + `, ${msg.triggerReason ?? 
"unknown"})` + ); +} + +const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081"; +const url = new URL(endpoint); +const port = Number(url.port || 8081); +const host = url.hostname || "localhost"; + + +// ── Tools (same as demo.ts) ──────────────────────────────────────────── +const weatherTool = tool({ + description: "Get the weather in a location", + inputSchema: z.object({ + location: z.string().describe("The location to get the weather for"), + }), + execute: async ({ location }) => ({ + location, + temperature: 72 + Math.floor(Math.random() * 21) - 10, + conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][ + Math.floor(Math.random() * 4) + ], + }), +}); + +const timeTool = tool({ + description: "Get the current time", + inputSchema: z.object({}), + execute: async () => ({ + time: new Date().toLocaleTimeString(), + timezone: Intl.DateTimeFormat().resolvedOptions().timeZone, + }), +}); +const wss = new WebSocketServer({ port, host }); + +wss.on("listening", () => { + console.log(`[video-ws] listening on ${endpoint}`); + console.log(`[video-ws] Open video-client.html and connect → ${endpoint}`); +}); + +wss.on("connection", (socket) => { + console.log("[video-ws] ✓ client connected"); + + const agent = new VideoAgent({ + model: openai("gpt-4o"), // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash… + transcriptionModel: openai.transcription("whisper-1"), + speechModel: openai.speech("gpt-4o-mini-tts"), + instructions: `You are a helpful video+voice assistant. +You can SEE what the user is showing via webcam. +Describe what you see when it helps answer the question. 
+Keep spoken answers concise and natural.`, + voice: "alloy", + streamingSpeech: { + minChunkSize: 25, + maxChunkSize: 140, + parallelGeneration: true, + maxParallelRequests: 3, + }, + tools: { getWeather: weatherTool, getTime: timeTool }, + // Tune these depending on your budget & latency goals + maxContextFrames: 6, // very important — each frame ≈ 100–400 tokens + maxFrameInputSize: 2_500_000, // ~2.5 MB + }); + + // Reuse most of the same event logging you have in ws-server.ts + agent.on("text", (data: { role: string; text: string }) => { + console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`); + }); + agent.on("chunk:text_delta", (data: { id: string; text: string }) => { + process.stdout.write(data.text || ""); + }); + agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => { + console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB ${dimensions.width}×${dimensions.height}`); + }); + agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`)); + + // Audio and transcription events + agent.on("audio_received", ({ size, format }) => { + console.log(`[video] Audio received: ${size} bytes, format: ${format}`); + }); + agent.on("transcription", ({ text, language }) => { + console.log(`[video] Transcription: "${text}" (${language || "unknown"})`); + }); + + // Speech events + agent.on("speech_start", () => console.log(`[video] Speech started`)); + agent.on("speech_complete", () => console.log(`[video] Speech complete`)); + agent.on("audio_chunk", ({ chunkId, text }) => { + console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`); + }); + + // Error handling + agent.on("error", (error: Error) => { + console.error(`[video] ERROR:`, error); + }); + agent.on("warning", (warning: string) => { + console.warn(`[video] WARNING:`, warning); + }); + + agent.on("disconnected", () => { + agent.destroy(); + console.log("[video-ws] ✗ client disconnected 
(agent destroyed)"); + }); + + // ── Intercept raw messages to save frames to disk ──────────────────── + socket.on("message", (raw) => { + try { + const msg = JSON.parse(raw.toString()); + if (msg.type === "video_frame" && msg.image?.data) { + saveFrame(msg); + } + } catch { + // not JSON — ignore, agent will handle binary etc. + } + }); + + // The crucial line — same as VoiceAgent + agent.handleSocket(socket); +}); \ No newline at end of file diff --git a/package.json b/package.json index e910961..07c7841 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "demo": "tsx example/demo.ts", "ws:server": "tsx example/ws-server.ts", "client": "node example/serve-client.js", + "ws:video": "tsx example/ws-server-video.ts", "prepublishOnly": "pnpm build" }, "keywords": [ @@ -56,4 +57,4 @@ "tsx": "^4.20.5", "typescript": "^5.9.3" } -} +} \ No newline at end of file diff --git a/src/VideoAgent.ts b/src/VideoAgent.ts index a8165fa..21697fd 100644 --- a/src/VideoAgent.ts +++ b/src/VideoAgent.ts @@ -84,6 +84,10 @@ const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = { }; export interface VideoAgentOptions { + /** + * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'), + * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames. 
+ */ model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o')) transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1')) speechModel?: SpeechModel; // AI SDK Speech Model (e.g., openai.speech('gpt-4o-mini-tts')) @@ -285,6 +289,7 @@ Use tools when needed to provide accurate information.`; // Handle raw audio data that needs transcription case "audio": if (typeof message.data !== "string" || !message.data) { + console.warn("Received empty or invalid audio message"); this.emit("warning", "Received empty or invalid audio message"); return; } @@ -293,9 +298,15 @@ Use tools when needed to provide accurate information.`; // Force capture current frame when user speaks this.requestFrameCapture("user_request"); console.log( - `Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}` + `[audio handler] Received audio data (${(message.data.length / 1000).toFixed(1)}KB) for processing, format: ${message.format || "unknown"}` ); - await this.processAudioInput(message); + try { + await this.processAudioInput(message); + console.log(`[audio handler] processAudioInput completed`); + } catch (audioError) { + console.error(`[audio handler] Error in processAudioInput:`, audioError); + this.emit("error", audioError); + } break; // Handle video frame from client @@ -850,13 +861,20 @@ Use tools when needed to provide accurate information.`; /** * Process incoming audio data: transcribe and generate response */ - private async processAudioInput(audioMessage: AudioData): Promise { + private async processAudioInput(audioMessage: AudioData | { type: string; data: string; format?: string; sessionId?: string }): Promise { if (!this.transcriptionModel) { - this.emit("error", new Error("Transcription model not configured for audio input")); + const error = new Error("Transcription model not configured for audio input"); + console.error(error.message); + this.emit("error", 
error); + this.sendWebSocketMessage({ + type: "error", + error: error.message, + }); return; } try { + console.log(`[processAudioInput] Starting audio processing, data length: ${audioMessage.data?.length || 0}`); const audioBuffer = Buffer.from(audioMessage.data, "base64"); if (audioBuffer.length > this.maxAudioInputSize) { @@ -877,19 +895,23 @@ Use tools when needed to provide accurate information.`; this.emit("audio_received", { size: audioBuffer.length, format: audioMessage.format, - sessionId: audioMessage.sessionId, + sessionId: audioMessage.sessionId || this.sessionId, }); console.log( - `Processing audio input: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}` + `[processAudioInput] Processing audio: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}` ); + console.log(`[processAudioInput] Calling transcribeAudio...`); const transcribedText = await this.transcribeAudio(audioBuffer); - console.log(`Transcribed text: "${transcribedText}"`); + console.log(`[processAudioInput] Transcribed text: "${transcribedText}"`); if (transcribedText.trim()) { + console.log(`[processAudioInput] Enqueueing text input: "${transcribedText}"`); await this.enqueueTextInput(transcribedText); + console.log(`[processAudioInput] Text input processing complete`); } else { + console.warn(`[processAudioInput] Transcription returned empty text`); this.emit("warning", "Transcription returned empty text"); this.sendWebSocketMessage({ type: "transcription_error", @@ -897,7 +919,7 @@ Use tools when needed to provide accurate information.`; }); } } catch (error) { - console.error("Failed to process audio input:", error); + console.error("[processAudioInput] Failed to process audio input:", error); this.emit("error", error); this.sendWebSocketMessage({ type: "transcription_error", @@ -1049,28 +1071,38 @@ Use tools when needed to provide accurate information.`; * Drain the input queue, processing one request at a time */ private async 
drainInputQueue(): Promise { - if (this.processingQueue) return; + if (this.processingQueue) { + console.log(`[drainInputQueue] Already processing, skipping`); + return; + } this.processingQueue = true; + console.log(`[drainInputQueue] Starting to drain queue, ${this.inputQueue.length} items`); try { while (this.inputQueue.length > 0) { const item = this.inputQueue.shift()!; + console.log(`[drainInputQueue] Processing item: text="${item.text?.substring(0, 50)}...", hasFrame=${!!item.frame}`); try { let result: string; if (item.frame && item.text) { + console.log(`[drainInputQueue] Calling processMultimodalInput`); result = await this.processMultimodalInput(item.text, item.frame); } else if (item.text) { + console.log(`[drainInputQueue] Calling processUserInput`); result = await this.processUserInput(item.text); } else { result = ""; } + console.log(`[drainInputQueue] Got result: "${result?.substring(0, 100)}..."`); item.resolve(result); } catch (error) { + console.error(`[drainInputQueue] Error processing item:`, error); item.reject(error); } } } finally { this.processingQueue = false; + console.log(`[drainInputQueue] Done draining queue`); } } @@ -1173,6 +1205,7 @@ Use tools when needed to provide accurate information.`; * Process user input with streaming text generation */ private async processUserInput(text: string): Promise { + console.log(`[processUserInput] Starting with text: "${text}"`); this.isProcessing = true; this.currentStreamAbortController = new AbortController(); const streamAbortSignal = this.currentStreamAbortController.signal; @@ -1182,6 +1215,7 @@ Use tools when needed to provide accurate information.`; // Check if we have current frame data - if so, include it const hasVisualContext = !!this.currentFrameData; + console.log(`[processUserInput] hasVisualContext: ${hasVisualContext}`); let messages: ModelMessage[]; @@ -1207,6 +1241,10 @@ Use tools when needed to provide accurate information.`; this.trimHistory(); + console.log(`[processUserInput] 
Calling streamText with ${messages.length} messages`); + console.log(`[processUserInput] Model:`, this.model); + console.log(`[processUserInput] Tools:`, Object.keys(this.tools)); + const result = streamText({ model: this.model, system: this.instructions, @@ -1218,6 +1256,7 @@ Use tools when needed to provide accurate information.`; this.handleStreamChunk(chunk); }, onFinish: async (event) => { + console.log(`[processUserInput] onFinish called`); for (const step of event.steps) { for (const toolResult of step.toolResults) { this.emit("tool_result", { @@ -1229,11 +1268,12 @@ Use tools when needed to provide accurate information.`; } }, onError: ({ error }) => { - console.error("Stream error:", error); + console.error("[processUserInput] Stream error:", error); this.emit("error", error); }, }); + console.log(`[processUserInput] Calling processStreamResult`); return await this.processStreamResult(result); } catch (error) { this.pendingTextBuffer = "";