feat: Introduce new core components for conversation and speech management

- Added ConversationManager for managing conversation history with configurable limits.
- Implemented InputQueue for serial processing of input items.
- Created SpeechManager for handling text-to-speech generation and streaming.
- Developed StreamProcessor for processing LLM streams and forwarding events.
- Added TranscriptionManager for audio transcription using AI SDK.
- Introduced WebSocketManager for managing WebSocket connections and messaging.
- Updated VoiceAgent to support new architecture and improved socket handling.
- Refactored index files to export new core components.
This commit is contained in:
Bijit Mondal
2026-02-23 16:15:49 +05:30
parent 4dd30b89c0
commit 5e7eb469ae
71 changed files with 5175 additions and 19 deletions

571
dist/VideoAgent.new.js vendored Normal file
View File

@@ -0,0 +1,571 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.VideoAgent = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const core_1 = require("./core");
/** Default maximum frame input size (5 MB) */
const DEFAULT_MAX_FRAME_SIZE = 5 * 1024 * 1024;
/** Default video agent config */
const DEFAULT_VIDEO_AGENT_CONFIG = {
maxContextFrames: 10,
};
// ── VideoAgent class ────────────────────────────────────
class VideoAgent extends events_1.EventEmitter {
model;
instructions;
stopWhen;
endpoint;
tools = {};
isDestroyed = false;
_isProcessing = false;
// Abort controller for the current LLM stream
currentStreamAbortController;
// ── Managers ─────────────────────────────────────────
ws;
speech;
conversation;
transcription;
inputQueue;
// ── Video-specific state ────────────────────────────
sessionId;
frameSequence = 0;
lastFrameTimestamp = 0;
lastFrameHash;
frameContextBuffer = [];
currentFrameData;
videoConfig;
maxFrameInputSize;
constructor(options) {
super();
this.model = options.model;
this.instructions =
options.instructions ||
`You are a helpful multimodal AI assistant that can see through the user's camera and hear their voice.
When analyzing images, be concise but informative. Describe what you see when asked.
Keep responses conversational since they will be spoken aloud.
Use tools when needed to provide accurate information.`;
this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
this.endpoint = options.endpoint;
this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
this.sessionId = options.sessionId || this.generateSessionId();
this.videoConfig = {
...DEFAULT_VIDEO_AGENT_CONFIG,
maxContextFrames: options.maxContextFrames ?? DEFAULT_VIDEO_AGENT_CONFIG.maxContextFrames,
};
if (options.tools) {
this.tools = { ...options.tools };
}
// ── Initialize managers ─────────────────────────
this.ws = new core_1.WebSocketManager();
this.speech = new core_1.SpeechManager({
speechModel: options.speechModel,
voice: options.voice,
speechInstructions: options.speechInstructions,
outputFormat: options.outputFormat,
streamingSpeech: options.streamingSpeech,
});
this.conversation = new core_1.ConversationManager({
history: options.history,
});
this.transcription = new core_1.TranscriptionManager({
transcriptionModel: options.transcriptionModel,
maxAudioInputSize: options.maxAudioInputSize,
});
this.inputQueue = new core_1.InputQueue();
// ── Wire managers to WebSocket send ─────────────
const sendMsg = (msg) => this.ws.send(msg);
this.speech.sendMessage = sendMsg;
this.transcription.sendMessage = sendMsg;
// ── Wire input queue processor ──────────────────
this.inputQueue.processor = (item) => this.processQueueItem(item);
// ── Bubble events from managers ─────────────────
this.bubbleEvents(this.ws, ["connected", "error"]);
this.bubbleEvents(this.speech, [
"speech_start",
"speech_complete",
"speech_interrupted",
"speech_chunk_queued",
"audio_chunk",
"audio",
"error",
]);
this.bubbleEvents(this.conversation, [
"history_cleared",
"history_trimmed",
]);
this.bubbleEvents(this.transcription, [
"transcription",
"audio_received",
"error",
"warning",
]);
// ── Handle WebSocket lifecycle ──────────────────
this.ws.on("disconnected", () => {
this.cleanupOnDisconnect();
this.emit("disconnected");
});
this.ws.on("message", (message) => this.handleMessage(message));
}
// ══════════════════════════════════════════════════════
// Public API
// ══════════════════════════════════════════════════════
registerTools(tools) {
this.tools = { ...this.tools, ...tools };
}
async transcribeAudio(audioData) {
return this.transcription.transcribeAudio(audioData);
}
async generateSpeechFromText(text, abortSignal) {
return this.speech.generateSpeechFromText(text, abortSignal);
}
interruptSpeech(reason = "interrupted") {
this.speech.interruptSpeech(reason);
}
interruptCurrentResponse(reason = "interrupted") {
if (this.currentStreamAbortController) {
this.currentStreamAbortController.abort();
this.currentStreamAbortController = undefined;
}
this.speech.interruptSpeech(reason);
}
async connect(url) {
this.ensureNotDestroyed();
const wsUrl = url || this.endpoint || "ws://localhost:8080";
await this.ws.connect(wsUrl);
}
handleSocket(socket) {
this.ensureNotDestroyed();
this.ws.handleSocket(socket);
}
async sendText(text) {
this.ensureNotDestroyed();
if (!text || !text.trim()) {
throw new Error("Text input cannot be empty");
}
return this.enqueueTextInput(text);
}
async sendAudio(audioData) {
this.ensureNotDestroyed();
await this.handleAudioInput(audioData);
}
async sendAudioBuffer(audioBuffer) {
this.ensureNotDestroyed();
const base64Audio = Buffer.from(audioBuffer).toString("base64");
await this.handleAudioInput(base64Audio);
}
/**
* Send a video frame with optional text query for vision analysis
*/
async sendFrame(frameData, query, options) {
this.ensureNotDestroyed();
const frame = {
type: "video_frame",
sessionId: this.sessionId,
sequence: this.frameSequence++,
timestamp: Date.now(),
triggerReason: "user_request",
previousFrameRef: this.lastFrameHash,
image: {
data: frameData,
format: options?.format || "webp",
width: options?.width || 640,
height: options?.height || 480,
},
};
// Update local frame state
await this.handleVideoFrame(frame);
if (query) {
return this.enqueueMultimodalInput(query, frame);
}
return "";
}
/**
* Request client to capture and send a frame
*/
requestFrameCapture(reason) {
this.ws.send({
type: "capture_frame",
reason,
timestamp: Date.now(),
});
this.emit("frame_requested", { reason });
}
getConfig() {
return { ...this.videoConfig };
}
updateConfig(config) {
this.videoConfig = { ...this.videoConfig, ...config };
this.emit("config_changed", this.videoConfig);
}
startListening() {
this.emit("listening");
}
stopListening() {
this.emit("stopped");
}
clearHistory() {
this.conversation.clearHistory();
this.frameContextBuffer = [];
}
getHistory() {
return this.conversation.getHistory();
}
setHistory(history) {
this.conversation.setHistory(history);
}
getFrameContext() {
return [...this.frameContextBuffer];
}
getSessionId() {
return this.sessionId;
}
disconnect() {
this.ws.disconnect();
}
destroy() {
this.isDestroyed = true;
this.cleanupOnDisconnect();
this.ws.disconnect();
this.conversation.clearHistory();
this.frameContextBuffer = [];
this.tools = {};
this.removeAllListeners();
}
// ── Getters ─────────────────────────────────────────
get connected() {
return this.ws.isConnected;
}
get processing() {
return this._isProcessing;
}
get speaking() {
return this.speech.isSpeaking;
}
get pendingSpeechChunks() {
return this.speech.pendingChunkCount;
}
get destroyed() {
return this.isDestroyed;
}
get currentFrameSequence() {
return this.frameSequence;
}
get hasVisualContext() {
return !!this.currentFrameData;
}
// ══════════════════════════════════════════════════════
// Private — message handling
// ══════════════════════════════════════════════════════
async handleMessage(message) {
try {
switch (message.type) {
case "transcript":
if (typeof message.text !== "string" || !message.text.trim()) {
this.emit("warning", "Received empty or invalid transcript message");
return;
}
this.interruptCurrentResponse("user_speaking");
this.requestFrameCapture("user_request");
await this.enqueueTextInput(message.text);
break;
case "audio":
if (typeof message.data !== "string" || !message.data) {
this.emit("warning", "Received empty or invalid audio message");
return;
}
this.interruptCurrentResponse("user_speaking");
this.requestFrameCapture("user_request");
try {
await this.handleAudioInput(message.data, message.format);
}
catch (audioError) {
this.emit("error", audioError);
}
break;
case "video_frame":
await this.handleVideoFrame(message);
break;
case "interrupt":
this.interruptCurrentResponse(message.reason || "client_request");
break;
case "client_ready":
this.handleClientReady(message);
break;
}
}
catch (err) {
this.emit("error", err);
}
}
handleClientReady(message) {
this.ws.send({
type: "session_init",
sessionId: this.sessionId,
});
this.emit("client_ready", message.capabilities);
}
// ══════════════════════════════════════════════════════
// Private — audio
// ══════════════════════════════════════════════════════
async handleAudioInput(base64Audio, format) {
const text = await this.transcription.processAudioInput(base64Audio, format);
if (text) {
await this.enqueueTextInput(text);
}
}
// ══════════════════════════════════════════════════════
// Private — video frames
// ══════════════════════════════════════════════════════
async handleVideoFrame(frame) {
try {
if (!frame.image?.data) {
this.emit("warning", "Received empty or invalid video frame");
return;
}
const frameSize = Buffer.from(frame.image.data, "base64").length;
if (frameSize > this.maxFrameInputSize) {
const sizeMB = (frameSize / (1024 * 1024)).toFixed(1);
const maxMB = (this.maxFrameInputSize / (1024 * 1024)).toFixed(1);
this.emit("error", new Error(`Frame too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`));
return;
}
const frameHash = this.hashFrame(frame.image.data);
this.lastFrameTimestamp = frame.timestamp;
this.lastFrameHash = frameHash;
this.currentFrameData = frame.image.data;
this.addFrameToContext({
sequence: frame.sequence,
timestamp: frame.timestamp,
triggerReason: frame.triggerReason,
frameHash,
});
this.emit("frame_received", {
sequence: frame.sequence,
timestamp: frame.timestamp,
triggerReason: frame.triggerReason,
size: frameSize,
dimensions: { width: frame.image.width, height: frame.image.height },
});
this.ws.send({
type: "frame_ack",
sequence: frame.sequence,
timestamp: Date.now(),
});
}
catch (error) {
this.emit("error", error);
}
}
addFrameToContext(context) {
this.frameContextBuffer.push(context);
if (this.frameContextBuffer.length > this.videoConfig.maxContextFrames) {
this.frameContextBuffer.shift();
}
}
hashFrame(data) {
let hash = 0;
for (let i = 0; i < data.length; i++) {
const char = data.charCodeAt(i);
hash = ((hash << 5) - hash) + char;
hash = hash & hash;
}
return `frame_${this.frameSequence}_${Math.abs(hash).toString(16)}`;
}
generateSessionId() {
const timestamp = Date.now().toString(36);
const randomPart = Math.random().toString(36).substring(2, 10);
return `vs_${timestamp}_${randomPart}`;
}
// ══════════════════════════════════════════════════════
// Private — input queue
// ══════════════════════════════════════════════════════
enqueueTextInput(text) {
return new Promise((resolve, reject) => {
this.inputQueue.enqueue({ text, resolve, reject });
});
}
enqueueMultimodalInput(text, frame) {
return new Promise((resolve, reject) => {
this.inputQueue.enqueue({ text, frame, resolve, reject });
});
}
/**
* Route queued items to the correct processor.
*/
async processQueueItem(item) {
if (item.frame && item.text) {
return this.processMultimodalInput(item.text, item.frame);
}
else if (item.text) {
return this.processUserInput(item.text);
}
return "";
}
// ══════════════════════════════════════════════════════
// Private — multimodal content building
// ══════════════════════════════════════════════════════
buildMultimodalContent(text, frameData) {
const content = [];
if (this.frameContextBuffer.length > 0) {
const contextSummary = `[Visual context: ${this.frameContextBuffer.length} frames captured, latest at ${new Date(this.lastFrameTimestamp).toISOString()}]`;
content.push({ type: "text", text: contextSummary });
}
const imageData = frameData || this.currentFrameData;
if (imageData) {
content.push({ type: "image", image: imageData });
}
content.push({ type: "text", text });
return content;
}
// ══════════════════════════════════════════════════════
// Private — LLM processing
// ══════════════════════════════════════════════════════
/**
* Shared streamText invocation used by both processUserInput and processMultimodalInput.
*/
async runStream(messages, abortSignal) {
const result = (0, ai_1.streamText)({
model: this.model,
system: this.instructions,
messages,
tools: this.tools,
stopWhen: this.stopWhen,
abortSignal,
onChunk: ({ chunk }) => {
(0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
},
onFinish: async (event) => {
for (const step of event.steps) {
for (const toolResult of step.toolResults) {
this.emit("tool_result", {
name: toolResult.toolName,
toolCallId: toolResult.toolCallId,
result: toolResult.output,
});
}
}
},
onError: ({ error }) => {
this.emit("error", error);
},
});
const streamResult = await (0, core_1.processFullStream)(result, {
onTextDelta: (delta) => this.speech.processTextDelta(delta),
onTextEnd: () => this.speech.flushPendingText(),
sendMessage: (msg) => this.ws.send(msg),
emitEvent: (event, data) => this.emit(event, data),
}, {
sessionId: this.sessionId,
frameContext: this.frameContextBuffer.length > 0
? {
frameCount: this.frameContextBuffer.length,
lastFrameSequence: this.frameContextBuffer[this.frameContextBuffer.length - 1]
?.sequence,
}
: undefined,
});
// Add assistant response to history
if (streamResult.fullText) {
this.conversation.addMessage({
role: "assistant",
content: streamResult.fullText,
});
}
// Flush remaining speech & wait for queue
this.speech.flushPendingText();
if (this.speech.queueDonePromise) {
await this.speech.queueDonePromise;
}
return streamResult.fullText;
}
/**
* Process text-only input (with optional visual context from latest frame).
*/
async processUserInput(text) {
this._isProcessing = true;
this.currentStreamAbortController = new AbortController();
try {
this.emit("text", { role: "user", text });
const hasVisual = !!this.currentFrameData;
let messages;
if (hasVisual) {
const content = this.buildMultimodalContent(text);
this.conversation.addMessage({
role: "user",
content: [{ type: "text", text: `[Visual context] ${text}` }],
});
messages = [
...this.conversation.getHistoryRef().slice(0, -1),
{ role: "user", content },
];
}
else {
this.conversation.addMessage({ role: "user", content: text });
messages = this.conversation.getHistoryRef();
}
return await this.runStream(messages, this.currentStreamAbortController.signal);
}
catch (error) {
this.speech.reset();
throw error;
}
finally {
this._isProcessing = false;
this.currentStreamAbortController = undefined;
}
}
/**
* Process multimodal input (text + explicit video frame).
*/
async processMultimodalInput(text, frame) {
this._isProcessing = true;
this.currentStreamAbortController = new AbortController();
try {
this.emit("text", { role: "user", text, hasImage: true });
const content = this.buildMultimodalContent(text, frame.image.data);
this.conversation.addMessage({
role: "user",
content: [{ type: "text", text: `[Image attached] ${text}` }],
});
const messages = [
...this.conversation.getHistoryRef().slice(0, -1),
{ role: "user", content },
];
return await this.runStream(messages, this.currentStreamAbortController.signal);
}
catch (error) {
this.speech.reset();
throw error;
}
finally {
this._isProcessing = false;
this.currentStreamAbortController = undefined;
}
}
// ══════════════════════════════════════════════════════
// Private — helpers
// ══════════════════════════════════════════════════════
ensureNotDestroyed() {
if (this.isDestroyed) {
throw new Error("VideoAgent has been destroyed and cannot be used");
}
}
cleanupOnDisconnect() {
if (this.currentStreamAbortController) {
this.currentStreamAbortController.abort();
this.currentStreamAbortController = undefined;
}
this.speech.reset();
this._isProcessing = false;
this.currentFrameData = undefined;
this.inputQueue.rejectAll(new Error("Connection closed"));
}
bubbleEvents(source, events) {
for (const event of events) {
source.on(event, (...args) => this.emit(event, ...args));
}
}
}
exports.VideoAgent = VideoAgent;
//# sourceMappingURL=VideoAgent.new.js.map