feat(example): video streaming

2026-03-02 18:36:39 +00:00 · 2026-02-19 18:42:06 +05:30
parent bbe354b70b
commit c5542fc156
10 changed files with 1214 additions and 14 deletions
--- a/example/ws-server-video.ts
+++ b/example/ws-server-video.ts
@@ -0,0 +1,161 @@
+// ws-server-video.ts
+import "dotenv/config";
+import { WebSocketServer } from "ws";
+import { VideoAgent } from "../src/VideoAgent";   // adjust path
+import { tool } from "ai";
+import { z } from "zod";
+import { openai } from "@ai-sdk/openai";
+import { mkdirSync, writeFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+
+// ── Frame saving ────────────────────────────────────────────────────────
+// Directory of this module: import.meta.dirname when it is a string, else derived from import.meta.url.
+const resolveModuleDir = () =>
+    typeof import.meta.dirname === "string" ? import.meta.dirname : dirname(fileURLToPath(import.meta.url));
+const __dirname = resolveModuleDir();
+const FRAMES_DIR = join(__dirname, "frames");
+mkdirSync(FRAMES_DIR, { recursive: true }); // created at startup, including any missing parent directories
+console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`);
+// Running index that saveFrame() post-increments to number the files it writes.
+let frameCounter = 0;
+
+/**
+ * Decodes one base64 `video_frame` image, writes it into FRAMES_DIR and logs a summary.
+ * `image.format` and `timestamp` are client-supplied: the extension is limited to a short
+ * alphanumeric token (else "webp") so it cannot carry path separators, and an unparseable
+ * timestamp falls back to the current time instead of making toISOString() throw.
+ */
+function saveFrame(msg: {
+    sequence?: number;
+    timestamp?: number;
+    triggerReason?: string;
+    image: { data: string; format?: string; width?: number; height?: number };
+}) {
+    const idx = frameCounter++;
+    const { format, width, height } = msg.image;
+    const ext = format === "jpeg" ? "jpg" : typeof format === "string" && /^[A-Za-z0-9]{1,8}$/.test(format) ? format : "webp";
+    const when = new Date(msg.timestamp ?? Date.now());
+    const ts = (Number.isNaN(when.getTime()) ? new Date() : when).toISOString().replace(/[:.]/g, "-");
+    const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`;
+    const buf = Buffer.from(msg.image.data, "base64");
+    writeFileSync(join(FRAMES_DIR, filename), buf);
+    const dims = width && height ? `, ${width}×${height}` : ""; // print dimensions only when both are known
+    console.log(`[frames] Saved ${filename}  (${(buf.length / 1024).toFixed(1)} kB${dims}, ${msg.triggerReason ?? "unknown"})`);
+}
+
+// Listen address: VIDEO_WS_ENDPOINT, or ws://localhost:8081 when unset/empty (new URL throws if it is malformed).
+const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081";
+const url = new URL(endpoint);
+const port = Number(url.port || 8081); // 8081 whenever url.port is the empty string
+// URL.hostname keeps the square brackets of an IPv6 literal ("[::1]"); strip them so the listen host is a bare address.
+const host = url.hostname.replace(/^\[(.*)\]$/, "$1") || "localhost";
+// ── Tools (same as demo.ts) ────────────────────────────────────────────
+// Demo weather tool: it fabricates a reading for the requested location
+// from random numbers rather than looking anything up.
+const weatherTool = tool({
+    description: "Get the weather in a location",
+    inputSchema: z.object({ location: z.string().describe("The location to get the weather for") }),
+    execute: async ({ location }) => {
+        // Two random draws, temperature first: an integer in 62..82, then one of four conditions.
+        const temperature = 62 + Math.floor(Math.random() * 21);
+        const skies = ["sunny", "cloudy", "rainy", "partly cloudy"];
+        const conditions = skies[Math.floor(Math.random() * 4)];
+        return { location, temperature, conditions };
+    },
+});
+
+const timeTool = tool({ // demo clock tool: locale-formatted current time plus the resolved time zone
+    description: "Get the current time",
+    inputSchema: z.object({}),
+    execute: async () => {
+        const time = new Date().toLocaleTimeString();
+        return { time, timezone: Intl.DateTimeFormat().resolvedOptions().timeZone };
+    },
+});
+/** Logs the configured endpoint string (not the parsed host/port) once the server is listening. */
+function announceListening(): void {
+    console.log(`[video-ws] listening on ${endpoint}`);
+    console.log(`[video-ws] Open video-client.html and connect → ${endpoint}`);
+}
+const wss = new WebSocketServer({ port, host }).on("listening", announceListening);
+
+// Per-client wiring: create a VideoAgent, log its events, copy incoming frames to disk, then hand it the socket.
+wss.on("connection", (socket) => {
+    console.log("[video-ws] ✓ client connected");
+    const agent = new VideoAgent({
+        model: openai("gpt-4o"),               // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash…
+        transcriptionModel: openai.transcription("whisper-1"),
+        speechModel: openai.speech("gpt-4o-mini-tts"),
+        instructions: `You are a helpful video+voice assistant.
+You can SEE what the user is showing via webcam.
+Describe what you see when it helps answer the question.
+Keep spoken answers concise and natural.`,
+        voice: "alloy",
+        streamingSpeech: {
+            minChunkSize: 25,
+            maxChunkSize: 140,
+            parallelGeneration: true,
+            maxParallelRequests: 3,
+        },
+        tools: { getWeather: weatherTool, getTime: timeTool },
+        // Tune these depending on your budget & latency goals
+        maxContextFrames: 6,           // very important — each frame ≈ 100–400 tokens
+        maxFrameInputSize: 2_500_000,  // ~2.5 MB
+    });
+
+    // Reuse most of the same event logging you have in ws-server.ts
+    agent.on("text", (data: { role: string; text: string }) => {
+        console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`);
+    });
+    agent.on("chunk:text_delta", (data: { id: string; text: string }) => {
+        process.stdout.write(data.text || "");
+    });
+    agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => {
+        console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB  ${dimensions.width}×${dimensions.height}`);
+    });
+    agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`));
+    // Audio and transcription events
+    agent.on("audio_received", ({ size, format }) => {
+        console.log(`[video] Audio received: ${size} bytes, format: ${format}`);
+    });
+    agent.on("transcription", ({ text, language }) => {
+        console.log(`[video] Transcription: "${text}" (${language || "unknown"})`);
+    });
+    // Speech events
+    agent.on("speech_start", () => console.log(`[video] Speech started`));
+    agent.on("speech_complete", () => console.log(`[video] Speech complete`));
+    agent.on("audio_chunk", ({ chunkId, text }) => {
+        console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`);
+    });
+    // Error handling
+    agent.on("error", (error: Error) => {
+        console.error(`[video] ERROR:`, error);
+    });
+    agent.on("warning", (warning: string) => {
+        console.warn(`[video] WARNING:`, warning);
+    });
+    // Destroy the agent once it emits "disconnected".
+    agent.on("disconnected", () => {
+        agent.destroy();
+        console.log("[video-ws] ✗ client disconnected (agent destroyed)");
+    });
+    // ── Intercept raw messages to save frames to disk ────────────────────
+    // Parsing and saving are guarded separately so a failed save is logged rather than swallowed as "not JSON".
+    socket.on("message", (raw) => {
+        let msg;
+        try {
+            msg = JSON.parse(raw.toString());
+        } catch {
+            return; // not JSON — ignore, agent will handle binary etc.
+        }
+        if (msg?.type !== "video_frame" || !msg.image?.data) return;
+        try {
+            saveFrame(msg);
+        } catch (err) {
+            console.error("[frames] Failed to save frame:", err);
+        }
+    });
+    // The crucial line — same as VoiceAgent
+    agent.handleSocket(socket);
+});