diff --git a/example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp b/example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp
new file mode 100644
index 0000000..d05b820
Binary files /dev/null and b/example/frames/frame_00000_2026-02-19T13-10-00-344Z.webp differ
diff --git a/example/frames/frame_00001_2026-02-19T13-10-04-584Z.webp b/example/frames/frame_00001_2026-02-19T13-10-04-584Z.webp
new file mode 100644
index 0000000..a0acc5a
Binary files /dev/null and b/example/frames/frame_00001_2026-02-19T13-10-04-584Z.webp differ
diff --git a/example/frames/frame_00002_2026-02-19T13-10-06-446Z.webp b/example/frames/frame_00002_2026-02-19T13-10-06-446Z.webp
new file mode 100644
index 0000000..128aff2
Binary files /dev/null and b/example/frames/frame_00002_2026-02-19T13-10-06-446Z.webp differ
diff --git a/example/frames/frame_00003_2026-02-19T13-10-14-543Z.webp b/example/frames/frame_00003_2026-02-19T13-10-14-543Z.webp
new file mode 100644
index 0000000..095e541
Binary files /dev/null and b/example/frames/frame_00003_2026-02-19T13-10-14-543Z.webp differ
diff --git a/example/frames/frame_00004_2026-02-19T13-10-24-540Z.webp b/example/frames/frame_00004_2026-02-19T13-10-24-540Z.webp
new file mode 100644
index 0000000..f745871
Binary files /dev/null and b/example/frames/frame_00004_2026-02-19T13-10-24-540Z.webp differ
diff --git a/example/serve-client.js b/example/serve-client.js
index c8b1651..19b6082 100644
--- a/example/serve-client.js
+++ b/example/serve-client.js
@@ -2,19 +2,19 @@ const http = require('http');
const fs = require('fs');
const path = require('path');
-const PORT = 3000;
+const PORT = 3102;
// Create a simple HTTP server to serve the voice client HTML
const server = http.createServer((req, res) => {
if (req.url === '/' || req.url === '/index.html') {
- const htmlPath = path.join(__dirname, 'voice-client.html');
+ const htmlPath = path.join(__dirname, 'video-client.html');
fs.readFile(htmlPath, (err, data) => {
if (err) {
res.writeHead(500);
res.end('Error loading voice-client.html');
return;
}
- res.writeHead(200, {'Content-Type': 'text/html'});
+ res.writeHead(200, { 'Content-Type': 'text/html' });
res.end(data);
});
} else {
diff --git a/example/video-client.html b/example/video-client.html
new file mode 100644
index 0000000..65304f8
--- /dev/null
+++ b/example/video-client.html
@@ -0,0 +1,998 @@
+
+
+
+
+
+
+ Video + Voice Agent Client
+
+
+
+
+
+ 📹 Video + Voice Agent
+ Webcam + microphone → multimodal AI (vision + speech)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Disconnected
+
+
+
+ 👤 You said
+ —
+
+ 🤖 Assistant
+
+
+
+
+
+
+ 📜 Log
+
+
+
+
+
+
\ No newline at end of file
diff --git a/example/ws-server-video.ts b/example/ws-server-video.ts
new file mode 100644
index 0000000..e155194
--- /dev/null
+++ b/example/ws-server-video.ts
@@ -0,0 +1,161 @@
+// ws-server-video.ts
+import "dotenv/config";
+import { WebSocketServer } from "ws";
+import { VideoAgent } from "../src/VideoAgent"; // adjust path
+import { tool } from "ai";
+import { z } from "zod";
+import { openai } from "@ai-sdk/openai";
+import { mkdirSync, writeFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+
+// ── Frame saving ────────────────────────────────────────────────────────
+const __dirname = typeof import.meta.dirname === "string"
+ ? import.meta.dirname
+ : dirname(fileURLToPath(import.meta.url));
+
+const FRAMES_DIR = join(__dirname, "frames");
+mkdirSync(FRAMES_DIR, { recursive: true });
+console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`);
+
+let frameCounter = 0;
+
+function saveFrame(msg: {
+ sequence?: number;
+ timestamp?: number;
+ triggerReason?: string;
+ image: { data: string; format?: string; width?: number; height?: number };
+}) {
+ const idx = frameCounter++;
+ const ext = msg.image.format === "jpeg" ? "jpg" : (msg.image.format || "webp");
+ const ts = new Date(msg.timestamp ?? Date.now())
+ .toISOString()
+ .replace(/[:.]/g, "-");
+ const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`;
+ const filepath = join(FRAMES_DIR, filename);
+
+ const buf = Buffer.from(msg.image.data, "base64");
+ writeFileSync(filepath, buf);
+
+ console.log(
+ `[frames] Saved ${filename} (${(buf.length / 1024).toFixed(1)} kB` +
+ `${msg.image.width ? `, ${msg.image.width}×${msg.image.height}` : ""}` +
+ `, ${msg.triggerReason ?? "unknown"})`
+ );
+}
+
+const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081";
+const url = new URL(endpoint);
+const port = Number(url.port || 8081);
+const host = url.hostname || "localhost";
+
+
+// ── Tools (same as demo.ts) ────────────────────────────────────────────
+const weatherTool = tool({
+ description: "Get the weather in a location",
+ inputSchema: z.object({
+ location: z.string().describe("The location to get the weather for"),
+ }),
+ execute: async ({ location }) => ({
+ location,
+ temperature: 72 + Math.floor(Math.random() * 21) - 10,
+ conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][
+ Math.floor(Math.random() * 4)
+ ],
+ }),
+});
+
+const timeTool = tool({
+ description: "Get the current time",
+ inputSchema: z.object({}),
+ execute: async () => ({
+ time: new Date().toLocaleTimeString(),
+ timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
+ }),
+});
+const wss = new WebSocketServer({ port, host });
+
+wss.on("listening", () => {
+ console.log(`[video-ws] listening on ${endpoint}`);
+ console.log(`[video-ws] Open video-client.html and connect → ${endpoint}`);
+});
+
+wss.on("connection", (socket) => {
+ console.log("[video-ws] ✓ client connected");
+
+ const agent = new VideoAgent({
+ model: openai("gpt-4o"), // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash…
+ transcriptionModel: openai.transcription("whisper-1"),
+ speechModel: openai.speech("gpt-4o-mini-tts"),
+ instructions: `You are a helpful video+voice assistant.
+You can SEE what the user is showing via webcam.
+Describe what you see when it helps answer the question.
+Keep spoken answers concise and natural.`,
+ voice: "alloy",
+ streamingSpeech: {
+ minChunkSize: 25,
+ maxChunkSize: 140,
+ parallelGeneration: true,
+ maxParallelRequests: 3,
+ },
+ tools: { getWeather: weatherTool, getTime: timeTool },
+ // Tune these depending on your budget & latency goals
+ maxContextFrames: 6, // very important — each frame ≈ 100–400 tokens
+ maxFrameInputSize: 2_500_000, // ~2.5 MB
+ });
+
+ // Reuse most of the same event logging you have in ws-server.ts
+ agent.on("text", (data: { role: string; text: string }) => {
+ console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`);
+ });
+ agent.on("chunk:text_delta", (data: { id: string; text: string }) => {
+ process.stdout.write(data.text || "");
+ });
+ agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => {
+ console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB ${dimensions.width}×${dimensions.height}`);
+ });
+ agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`));
+
+ // Audio and transcription events
+ agent.on("audio_received", ({ size, format }) => {
+ console.log(`[video] Audio received: ${size} bytes, format: ${format}`);
+ });
+ agent.on("transcription", ({ text, language }) => {
+ console.log(`[video] Transcription: "${text}" (${language || "unknown"})`);
+ });
+
+ // Speech events
+ agent.on("speech_start", () => console.log(`[video] Speech started`));
+ agent.on("speech_complete", () => console.log(`[video] Speech complete`));
+ agent.on("audio_chunk", ({ chunkId, text }) => {
+ console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`);
+ });
+
+ // Error handling
+ agent.on("error", (error: Error) => {
+ console.error(`[video] ERROR:`, error);
+ });
+ agent.on("warning", (warning: string) => {
+ console.warn(`[video] WARNING:`, warning);
+ });
+
+ agent.on("disconnected", () => {
+ agent.destroy();
+ console.log("[video-ws] ✗ client disconnected (agent destroyed)");
+ });
+
+ // ── Intercept raw messages to save frames to disk ────────────────────
+ socket.on("message", (raw) => {
+ try {
+ const msg = JSON.parse(raw.toString());
+ if (msg.type === "video_frame" && msg.image?.data) {
+ saveFrame(msg);
+ }
+ } catch {
+ // not JSON — ignore, agent will handle binary etc.
+ }
+ });
+
+ // The crucial line — same as VoiceAgent
+ agent.handleSocket(socket);
+});
\ No newline at end of file
diff --git a/package.json b/package.json
index e910961..07c7841 100644
--- a/package.json
+++ b/package.json
@@ -15,6 +15,7 @@
"demo": "tsx example/demo.ts",
"ws:server": "tsx example/ws-server.ts",
"client": "node example/serve-client.js",
+ "ws:video": "tsx example/ws-server-video.ts",
"prepublishOnly": "pnpm build"
},
"keywords": [
@@ -56,4 +57,4 @@
"tsx": "^4.20.5",
"typescript": "^5.9.3"
}
-}
+}
\ No newline at end of file
diff --git a/src/VideoAgent.ts b/src/VideoAgent.ts
index a8165fa..21697fd 100644
--- a/src/VideoAgent.ts
+++ b/src/VideoAgent.ts
@@ -84,6 +84,10 @@ const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = {
};
export interface VideoAgentOptions {
+ /**
+ * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
+ * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
+ */
model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1'))
speechModel?: SpeechModel; // AI SDK Speech Model (e.g., openai.speech('gpt-4o-mini-tts'))
@@ -285,6 +289,7 @@ Use tools when needed to provide accurate information.`;
// Handle raw audio data that needs transcription
case "audio":
if (typeof message.data !== "string" || !message.data) {
+ console.warn("Received empty or invalid audio message");
this.emit("warning", "Received empty or invalid audio message");
return;
}
@@ -293,9 +298,15 @@ Use tools when needed to provide accurate information.`;
// Force capture current frame when user speaks
this.requestFrameCapture("user_request");
console.log(
- `Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`
+ `[audio handler] Received audio data (${(message.data.length / 1000).toFixed(1)}KB) for processing, format: ${message.format || "unknown"}`
);
- await this.processAudioInput(message);
+ try {
+ await this.processAudioInput(message);
+ console.log(`[audio handler] processAudioInput completed`);
+ } catch (audioError) {
+ console.error(`[audio handler] Error in processAudioInput:`, audioError);
+ this.emit("error", audioError);
+ }
break;
// Handle video frame from client
@@ -850,13 +861,20 @@ Use tools when needed to provide accurate information.`;
/**
* Process incoming audio data: transcribe and generate response
*/
- private async processAudioInput(audioMessage: AudioData): Promise {
+ private async processAudioInput(audioMessage: AudioData | { type: string; data: string; format?: string; sessionId?: string }): Promise {
if (!this.transcriptionModel) {
- this.emit("error", new Error("Transcription model not configured for audio input"));
+ const error = new Error("Transcription model not configured for audio input");
+ console.error(error.message);
+ this.emit("error", error);
+ this.sendWebSocketMessage({
+ type: "error",
+ error: error.message,
+ });
return;
}
try {
+ console.log(`[processAudioInput] Starting audio processing, data length: ${audioMessage.data?.length || 0}`);
const audioBuffer = Buffer.from(audioMessage.data, "base64");
if (audioBuffer.length > this.maxAudioInputSize) {
@@ -877,19 +895,23 @@ Use tools when needed to provide accurate information.`;
this.emit("audio_received", {
size: audioBuffer.length,
format: audioMessage.format,
- sessionId: audioMessage.sessionId,
+ sessionId: audioMessage.sessionId || this.sessionId,
});
console.log(
- `Processing audio input: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`
+ `[processAudioInput] Processing audio: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`
);
+ console.log(`[processAudioInput] Calling transcribeAudio...`);
const transcribedText = await this.transcribeAudio(audioBuffer);
- console.log(`Transcribed text: "${transcribedText}"`);
+ console.log(`[processAudioInput] Transcribed text: "${transcribedText}"`);
if (transcribedText.trim()) {
+ console.log(`[processAudioInput] Enqueueing text input: "${transcribedText}"`);
await this.enqueueTextInput(transcribedText);
+ console.log(`[processAudioInput] Text input processing complete`);
} else {
+ console.warn(`[processAudioInput] Transcription returned empty text`);
this.emit("warning", "Transcription returned empty text");
this.sendWebSocketMessage({
type: "transcription_error",
@@ -897,7 +919,7 @@ Use tools when needed to provide accurate information.`;
});
}
} catch (error) {
- console.error("Failed to process audio input:", error);
+ console.error("[processAudioInput] Failed to process audio input:", error);
this.emit("error", error);
this.sendWebSocketMessage({
type: "transcription_error",
@@ -1049,28 +1071,38 @@ Use tools when needed to provide accurate information.`;
* Drain the input queue, processing one request at a time
*/
private async drainInputQueue(): Promise {
- if (this.processingQueue) return;
+ if (this.processingQueue) {
+ console.log(`[drainInputQueue] Already processing, skipping`);
+ return;
+ }
this.processingQueue = true;
+ console.log(`[drainInputQueue] Starting to drain queue, ${this.inputQueue.length} items`);
try {
while (this.inputQueue.length > 0) {
const item = this.inputQueue.shift()!;
+ console.log(`[drainInputQueue] Processing item: text="${item.text?.substring(0, 50)}...", hasFrame=${!!item.frame}`);
try {
let result: string;
if (item.frame && item.text) {
+ console.log(`[drainInputQueue] Calling processMultimodalInput`);
result = await this.processMultimodalInput(item.text, item.frame);
} else if (item.text) {
+ console.log(`[drainInputQueue] Calling processUserInput`);
result = await this.processUserInput(item.text);
} else {
result = "";
}
+ console.log(`[drainInputQueue] Got result: "${result?.substring(0, 100)}..."`);
item.resolve(result);
} catch (error) {
+ console.error(`[drainInputQueue] Error processing item:`, error);
item.reject(error);
}
}
} finally {
this.processingQueue = false;
+ console.log(`[drainInputQueue] Done draining queue`);
}
}
@@ -1173,6 +1205,7 @@ Use tools when needed to provide accurate information.`;
* Process user input with streaming text generation
*/
private async processUserInput(text: string): Promise {
+ console.log(`[processUserInput] Starting with text: "${text}"`);
this.isProcessing = true;
this.currentStreamAbortController = new AbortController();
const streamAbortSignal = this.currentStreamAbortController.signal;
@@ -1182,6 +1215,7 @@ Use tools when needed to provide accurate information.`;
// Check if we have current frame data - if so, include it
const hasVisualContext = !!this.currentFrameData;
+ console.log(`[processUserInput] hasVisualContext: ${hasVisualContext}`);
let messages: ModelMessage[];
@@ -1207,6 +1241,10 @@ Use tools when needed to provide accurate information.`;
this.trimHistory();
+ console.log(`[processUserInput] Calling streamText with ${messages.length} messages`);
+ console.log(`[processUserInput] Model:`, this.model);
+ console.log(`[processUserInput] Tools:`, Object.keys(this.tools));
+
const result = streamText({
model: this.model,
system: this.instructions,
@@ -1218,6 +1256,7 @@ Use tools when needed to provide accurate information.`;
this.handleStreamChunk(chunk);
},
onFinish: async (event) => {
+ console.log(`[processUserInput] onFinish called`);
for (const step of event.steps) {
for (const toolResult of step.toolResults) {
this.emit("tool_result", {
@@ -1229,11 +1268,12 @@ Use tools when needed to provide accurate information.`;
}
},
onError: ({ error }) => {
- console.error("Stream error:", error);
+ console.error("[processUserInput] Stream error:", error);
this.emit("error", error);
},
});
+ console.log(`[processUserInput] Calling processStreamResult`);
return await this.processStreamResult(result);
} catch (error) {
this.pendingTextBuffer = "";