// ws-server-video.ts import "dotenv/config"; import { WebSocketServer } from "ws"; import { VideoAgent } from "../src/VideoAgent"; // adjust path import { tool } from "ai"; import { z } from "zod"; import { openai } from "@ai-sdk/openai"; import { mkdirSync, writeFileSync } from "fs"; import { join, dirname } from "path"; import { fileURLToPath } from "url"; // ── Frame saving ──────────────────────────────────────────────────────── const __dirname = typeof import.meta.dirname === "string" ? import.meta.dirname : dirname(fileURLToPath(import.meta.url)); const FRAMES_DIR = join(__dirname, "frames"); mkdirSync(FRAMES_DIR, { recursive: true }); console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`); let frameCounter = 0; function saveFrame(msg: { sequence?: number; timestamp?: number; triggerReason?: string; image: { data: string; format?: string; width?: number; height?: number }; }) { const idx = frameCounter++; const ext = msg.image.format === "jpeg" ? "jpg" : (msg.image.format || "webp"); const ts = new Date(msg.timestamp ?? Date.now()) .toISOString() .replace(/[:.]/g, "-"); const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`; const filepath = join(FRAMES_DIR, filename); const buf = Buffer.from(msg.image.data, "base64"); writeFileSync(filepath, buf); console.log( `[frames] Saved ${filename} (${(buf.length / 1024).toFixed(1)} kB` + `${msg.image.width ? `, ${msg.image.width}×${msg.image.height}` : ""}` + `, ${msg.triggerReason ?? "unknown"})` ); } const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081"; const url = new URL(endpoint); const port = Number(url.port || 8081); const host = url.hostname || "localhost"; // ── Tools (same as demo.ts) ──────────────────────────────────────────── const weatherTool = tool({ description: "Get the weather in a location", inputSchema: z.object({ location: z.string().describe("The location to get the weather for"), }), execute: async ({ location }) => ({ location, temperature: 72 + Math.floor(Math.random() * 21) - 10, conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][ Math.floor(Math.random() * 4) ], }), }); const timeTool = tool({ description: "Get the current time", inputSchema: z.object({}), execute: async () => ({ time: new Date().toLocaleTimeString(), timezone: Intl.DateTimeFormat().resolvedOptions().timeZone, }), }); const wss = new WebSocketServer({ port, host }); wss.on("listening", () => { console.log(`[video-ws] listening on ${endpoint}`); console.log(`[video-ws] Open video-client.html and connect → ${endpoint}`); }); wss.on("connection", (socket) => { console.log("[video-ws] ✓ client connected"); const agent = new VideoAgent({ model: openai("gpt-4o"), // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash… transcriptionModel: openai.transcription("whisper-1"), speechModel: openai.speech("gpt-4o-mini-tts"), instructions: `You are a helpful video+voice assistant. You can SEE what the user is showing via webcam. Describe what you see when it helps answer the question. Keep spoken answers concise and natural.`, voice: "alloy", streamingSpeech: { minChunkSize: 25, maxChunkSize: 140, parallelGeneration: true, maxParallelRequests: 3, }, tools: { getWeather: weatherTool, getTime: timeTool }, // Tune these depending on your budget & latency goals maxContextFrames: 6, // very important — each frame ≈ 100–400 tokens maxFrameInputSize: 2_500_000, // ~2.5 MB }); // Reuse most of the same event logging you have in ws-server.ts agent.on("text", (data: { role: string; text: string }) => { console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`); }); agent.on("chunk:text_delta", (data: { id: string; text: string }) => { process.stdout.write(data.text || ""); }); agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => { console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB ${dimensions.width}×${dimensions.height}`); }); agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`)); // Audio and transcription events agent.on("audio_received", ({ size, format }) => { console.log(`[video] Audio received: ${size} bytes, format: ${format}`); }); agent.on("transcription", ({ text, language }) => { console.log(`[video] Transcription: "${text}" (${language || "unknown"})`); }); // Speech events agent.on("speech_start", () => console.log(`[video] Speech started`)); agent.on("speech_complete", () => console.log(`[video] Speech complete`)); agent.on("audio_chunk", ({ chunkId, text }) => { console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`); }); // Error handling agent.on("error", (error: Error) => { console.error(`[video] ERROR:`, error); }); agent.on("warning", (warning: string) => { console.warn(`[video] WARNING:`, warning); }); agent.on("disconnected", () => { agent.destroy(); console.log("[video-ws] ✗ client disconnected (agent destroyed)"); }); // ── Intercept raw messages to save frames to disk ──────────────────── socket.on("message", (raw) => { try { const msg = JSON.parse(raw.toString()); if (msg.type === "video_frame" && msg.image?.data) { saveFrame(msg); } } catch { // not JSON — ignore, agent will handle binary etc. } }); // The crucial line — same as VoiceAgent agent.handleSocket(socket); });