6 Commits

Author SHA1 Message Date
Bijit Mondal
bf4ba8ea77 1.0.1 2026-02-23 16:16:03 +05:30
Bijit Mondal
5e7eb469ae feat: Introduce new core components for conversation and speech management
- Added ConversationManager for managing conversation history with configurable limits.
- Implemented InputQueue for serial processing of input items.
- Created SpeechManager for handling text-to-speech generation and streaming.
- Developed StreamProcessor for processing LLM streams and forwarding events.
- Added TranscriptionManager for audio transcription using AI SDK.
- Introduced WebSocketManager for managing WebSocket connections and messaging.
- Updated VoiceAgent to support new architecture and improved socket handling.
- Refactored index files to export new core components.
2026-02-23 16:15:49 +05:30
Bijit Mondal
4dd30b89c0 Refactor code structure for improved readability and maintainability 2026-02-20 16:19:08 +05:30
Bijit Mondal
97a3078578 1.0.0 2026-02-20 16:17:14 +05:30
Bijit Mondal
990d17abe7 refactor(VideoAgent): remove unnecessary console logs for cleaner output 2026-02-20 16:16:18 +05:30
Bijit Mondal
c5542fc156 feat(example): video streaming 2026-02-19 18:42:06 +05:30
76 changed files with 6379 additions and 144 deletions

View File

@@ -53,6 +53,10 @@ interface FrameContext {
description?: string; description?: string;
} }
export interface VideoAgentOptions { export interface VideoAgentOptions {
/**
* AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
* anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
*/
model: LanguageModel; model: LanguageModel;
transcriptionModel?: TranscriptionModel; transcriptionModel?: TranscriptionModel;
speechModel?: SpeechModel; speechModel?: SpeechModel;

File diff suppressed because one or more lines are too long

66
dist/VideoAgent.js vendored
View File

@@ -71,7 +71,7 @@ Use tools when needed to provide accurate information.`;
this.endpoint = options.endpoint; this.endpoint = options.endpoint;
this.voice = options.voice || "alloy"; this.voice = options.voice || "alloy";
this.speechInstructions = options.speechInstructions; this.speechInstructions = options.speechInstructions;
this.outputFormat = options.outputFormat || "mp3"; this.outputFormat = options.outputFormat || "opus";
this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE; this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE; this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
// Generate or use provided session ID // Generate or use provided session ID
@@ -142,7 +142,6 @@ Use tools when needed to provide accurate information.`;
this.socket.on("message", async (data) => { this.socket.on("message", async (data) => {
try { try {
const message = JSON.parse(data.toString()); const message = JSON.parse(data.toString());
console.log(`Received WebSocket message of type: ${message.type}`);
switch (message.type) { switch (message.type) {
// Handle transcribed text from the client/STT // Handle transcribed text from the client/STT
case "transcript": case "transcript":
@@ -154,7 +153,6 @@ Use tools when needed to provide accurate information.`;
this.interruptCurrentResponse("user_speaking"); this.interruptCurrentResponse("user_speaking");
// Force capture current frame when user speaks // Force capture current frame when user speaks
this.requestFrameCapture("user_request"); this.requestFrameCapture("user_request");
console.log(`Processing transcript: "${message.text}"`);
await this.enqueueTextInput(message.text); await this.enqueueTextInput(message.text);
break; break;
// Handle raw audio data that needs transcription // Handle raw audio data that needs transcription
@@ -167,8 +165,12 @@ Use tools when needed to provide accurate information.`;
this.interruptCurrentResponse("user_speaking"); this.interruptCurrentResponse("user_speaking");
// Force capture current frame when user speaks // Force capture current frame when user speaks
this.requestFrameCapture("user_request"); this.requestFrameCapture("user_request");
console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`); try {
await this.processAudioInput(message); await this.processAudioInput(message);
}
catch (audioError) {
this.emit("error", audioError);
}
break; break;
// Handle video frame from client // Handle video frame from client
case "video_frame": case "video_frame":
@@ -176,7 +178,6 @@ Use tools when needed to provide accurate information.`;
break; break;
// Handle explicit interrupt request from client // Handle explicit interrupt request from client
case "interrupt": case "interrupt":
console.log(`Received interrupt request: ${message.reason || "client_request"}`);
this.interruptCurrentResponse(message.reason || "client_request"); this.interruptCurrentResponse(message.reason || "client_request");
break; break;
// Handle client ready signal // Handle client ready signal
@@ -184,22 +185,19 @@ Use tools when needed to provide accurate information.`;
this.handleClientReady(message); this.handleClientReady(message);
break; break;
default: default:
console.log(`Unknown message type: ${message.type}`); break;
} }
} }
catch (err) { catch (err) {
console.error("Failed to process message:", err);
this.emit("error", err); this.emit("error", err);
} }
}); });
this.socket.on("close", () => { this.socket.on("close", () => {
console.log("Disconnected");
this.isConnected = false; this.isConnected = false;
this.cleanupOnDisconnect(); this.cleanupOnDisconnect();
this.emit("disconnected"); this.emit("disconnected");
}); });
this.socket.on("error", (error) => { this.socket.on("error", (error) => {
console.error("WebSocket error:", error);
this.emit("error", error); this.emit("error", error);
}); });
} }
@@ -207,7 +205,6 @@ Use tools when needed to provide accurate information.`;
* Handle client ready signal * Handle client ready signal
*/ */
handleClientReady(message) { handleClientReady(message) {
console.log(`Client ready, capabilities: ${JSON.stringify(message.capabilities || {})}`);
// Send session info to client // Send session info to client
this.sendWebSocketMessage({ this.sendWebSocketMessage({
type: "session_init", type: "session_init",
@@ -259,10 +256,8 @@ Use tools when needed to provide accurate information.`;
sequence: frame.sequence, sequence: frame.sequence,
timestamp: Date.now(), timestamp: Date.now(),
}); });
console.log(`Received frame #${frame.sequence} (${frame.triggerReason}): ${(frameSize / 1024).toFixed(1)}KB, ${frame.image.width}x${frame.image.height}`);
} }
catch (error) { catch (error) {
console.error("Failed to handle video frame:", error);
this.emit("error", error); this.emit("error", error);
} }
} }
@@ -331,13 +326,11 @@ Use tools when needed to provide accurate information.`;
if (!this.transcriptionModel) { if (!this.transcriptionModel) {
throw new Error("Transcription model not configured"); throw new Error("Transcription model not configured");
} }
console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
try { try {
const result = await (0, ai_1.experimental_transcribe)({ const result = await (0, ai_1.experimental_transcribe)({
model: this.transcriptionModel, model: this.transcriptionModel,
audio: audioData, audio: audioData,
}); });
console.log(`Whisper transcription result: "${result.text}", language: ${result.language || "unknown"}`);
this.emit("transcription", { this.emit("transcription", {
text: result.text, text: result.text,
language: result.language, language: result.language,
@@ -351,7 +344,6 @@ Use tools when needed to provide accurate information.`;
return result.text; return result.text;
} }
catch (error) { catch (error) {
console.error("Whisper transcription failed:", error);
throw error; throw error;
} }
} }
@@ -513,17 +505,13 @@ Use tools when needed to provide accurate information.`;
this.currentSpeechAbortController = new AbortController(); this.currentSpeechAbortController = new AbortController();
} }
try { try {
console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`);
const audioData = await this.generateSpeechFromText(chunk.text, this.currentSpeechAbortController.signal); const audioData = await this.generateSpeechFromText(chunk.text, this.currentSpeechAbortController.signal);
console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
return audioData; return audioData;
} }
catch (error) { catch (error) {
if (error.name === "AbortError") { if (error.name === "AbortError") {
console.log(`Audio generation aborted for chunk ${chunk.id}`);
return null; return null;
} }
console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
this.emit("error", error); this.emit("error", error);
return null; return null;
} }
@@ -535,25 +523,21 @@ Use tools when needed to provide accurate information.`;
if (this.isSpeaking) if (this.isSpeaking)
return; return;
this.isSpeaking = true; this.isSpeaking = true;
console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
this.emit("speech_start", { streaming: true }); this.emit("speech_start", { streaming: true });
this.sendWebSocketMessage({ type: "speech_stream_start" }); this.sendWebSocketMessage({ type: "speech_stream_start" });
try { try {
while (this.speechChunkQueue.length > 0) { while (this.speechChunkQueue.length > 0) {
const chunk = this.speechChunkQueue[0]; const chunk = this.speechChunkQueue[0];
console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`);
if (!chunk.audioPromise) { if (!chunk.audioPromise) {
chunk.audioPromise = this.generateChunkAudio(chunk); chunk.audioPromise = this.generateChunkAudio(chunk);
} }
const audioData = await chunk.audioPromise; const audioData = await chunk.audioPromise;
if (!this.isSpeaking) { if (!this.isSpeaking) {
console.log(`Speech interrupted during chunk #${chunk.id}`);
break; break;
} }
this.speechChunkQueue.shift(); this.speechChunkQueue.shift();
if (audioData) { if (audioData) {
const base64Audio = Buffer.from(audioData).toString("base64"); const base64Audio = Buffer.from(audioData).toString("base64");
console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`);
this.sendWebSocketMessage({ this.sendWebSocketMessage({
type: "audio_chunk", type: "audio_chunk",
chunkId: chunk.id, chunkId: chunk.id,
@@ -569,14 +553,10 @@ Use tools when needed to provide accurate information.`;
uint8Array: audioData, uint8Array: audioData,
}); });
} }
else {
console.log(`No audio data generated for chunk #${chunk.id}`);
}
if (this.streamingSpeechConfig.parallelGeneration) { if (this.streamingSpeechConfig.parallelGeneration) {
const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length; const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
const toStart = Math.min(this.streamingSpeechConfig.maxParallelRequests - activeRequests, this.speechChunkQueue.length); const toStart = Math.min(this.streamingSpeechConfig.maxParallelRequests - activeRequests, this.speechChunkQueue.length);
if (toStart > 0) { if (toStart > 0) {
console.log(`Starting parallel generation for ${toStart} more chunks`);
for (let i = 0; i < toStart; i++) { for (let i = 0; i < toStart; i++) {
const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise); const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise);
if (nextChunk) { if (nextChunk) {
@@ -588,7 +568,6 @@ Use tools when needed to provide accurate information.`;
} }
} }
catch (error) { catch (error) {
console.error("Error in speech queue processing:", error);
this.emit("error", error); this.emit("error", error);
} }
finally { finally {
@@ -599,7 +578,6 @@ Use tools when needed to provide accurate information.`;
this.speechQueueDoneResolve = undefined; this.speechQueueDoneResolve = undefined;
this.speechQueueDonePromise = undefined; this.speechQueueDonePromise = undefined;
} }
console.log(`Speech queue processing complete`);
this.sendWebSocketMessage({ type: "speech_stream_end" }); this.sendWebSocketMessage({ type: "speech_stream_end" });
this.emit("speech_complete", { streaming: true }); this.emit("speech_complete", { streaming: true });
} }
@@ -631,7 +609,12 @@ Use tools when needed to provide accurate information.`;
*/ */
async processAudioInput(audioMessage) { async processAudioInput(audioMessage) {
if (!this.transcriptionModel) { if (!this.transcriptionModel) {
this.emit("error", new Error("Transcription model not configured for audio input")); const error = new Error("Transcription model not configured for audio input");
this.emit("error", error);
this.sendWebSocketMessage({
type: "error",
error: error.message,
});
return; return;
} }
try { try {
@@ -649,11 +632,9 @@ Use tools when needed to provide accurate information.`;
this.emit("audio_received", { this.emit("audio_received", {
size: audioBuffer.length, size: audioBuffer.length,
format: audioMessage.format, format: audioMessage.format,
sessionId: audioMessage.sessionId, sessionId: audioMessage.sessionId || this.sessionId,
}); });
console.log(`Processing audio input: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`);
const transcribedText = await this.transcribeAudio(audioBuffer); const transcribedText = await this.transcribeAudio(audioBuffer);
console.log(`Transcribed text: "${transcribedText}"`);
if (transcribedText.trim()) { if (transcribedText.trim()) {
await this.enqueueTextInput(transcribedText); await this.enqueueTextInput(transcribedText);
} }
@@ -666,7 +647,6 @@ Use tools when needed to provide accurate information.`;
} }
} }
catch (error) { catch (error) {
console.error("Failed to process audio input:", error);
this.emit("error", error); this.emit("error", error);
this.sendWebSocketMessage({ this.sendWebSocketMessage({
type: "transcription_error", type: "transcription_error",
@@ -796,8 +776,9 @@ Use tools when needed to provide accurate information.`;
* Drain the input queue, processing one request at a time * Drain the input queue, processing one request at a time
*/ */
async drainInputQueue() { async drainInputQueue() {
if (this.processingQueue) if (this.processingQueue) {
return; return;
}
this.processingQueue = true; this.processingQueue = true;
try { try {
while (this.inputQueue.length > 0) { while (this.inputQueue.length > 0) {
@@ -889,7 +870,6 @@ Use tools when needed to provide accurate information.`;
} }
}, },
onError: ({ error }) => { onError: ({ error }) => {
console.error("Stream error:", error);
this.emit("error", error); this.emit("error", error);
}, },
}); });
@@ -960,7 +940,6 @@ Use tools when needed to provide accurate information.`;
} }
}, },
onError: ({ error }) => { onError: ({ error }) => {
console.error("Stream error:", error);
this.emit("error", error); this.emit("error", error);
}, },
}); });
@@ -1203,21 +1182,10 @@ Use tools when needed to provide accurate information.`;
return; return;
try { try {
if (this.socket.readyState === ws_1.WebSocket.OPEN) { if (this.socket.readyState === ws_1.WebSocket.OPEN) {
if (message.type === "audio_chunk" || message.type === "audio") {
const { data, ...rest } = message;
console.log(`Sending WebSocket message: ${message.type}`, data ? `(${(data.length / 1000).toFixed(1)}KB audio data)` : "", rest);
}
else {
console.log(`Sending WebSocket message: ${message.type}`);
}
this.socket.send(JSON.stringify(message)); this.socket.send(JSON.stringify(message));
} }
else {
console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
}
} }
catch (error) { catch (error) {
console.error("Failed to send WebSocket message:", error);
this.emit("error", error); this.emit("error", error);
} }
} }
@@ -1225,14 +1193,12 @@ Use tools when needed to provide accurate information.`;
* Start listening for voice/video input * Start listening for voice/video input
*/ */
startListening() { startListening() {
console.log("Starting video agent...");
this.emit("listening"); this.emit("listening");
} }
/** /**
* Stop listening for voice/video input * Stop listening for voice/video input
*/ */
stopListening() { stopListening() {
console.log("Stopping video agent...");
this.emit("stopped"); this.emit("stopped");
} }
/** /**

File diff suppressed because one or more lines are too long

175
dist/VideoAgent.new.d.ts vendored Normal file
View File

@@ -0,0 +1,175 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
/**
 * Why a video frame was captured: a detected scene change, an explicit
 * user request, a timer, or the first frame of a session ("initial").
 * Passed to requestFrameCapture and carried on each VideoFrame.
 */
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
/**
 * Video frame message exchanged with the client over the WebSocket.
 */
interface VideoFrame {
    type: "video_frame";
    /** Session this frame belongs to. */
    sessionId: string;
    /** Monotonically increasing frame number within the session. */
    sequence: number;
    /** Capture time in epoch milliseconds (Date.now()). */
    timestamp: number;
    /** What caused this frame to be captured. */
    triggerReason: FrameTriggerReason;
    /** Hash of the previously captured frame, when one exists. */
    previousFrameRef?: string;
    /** The captured image payload. */
    image: {
        /** Encoded image data as a string — presumably base64; confirm against the client encoder. */
        data: string;
        /** Image encoding, e.g. "webp" (the default used by sendFrame). */
        format: string;
        /** Frame width in pixels. */
        width: number;
        /** Frame height in pixels. */
        height: number;
    };
}
/**
 * Audio message exchanged with the client over the WebSocket.
 */
interface AudioData {
    type: "audio";
    /** Session this audio belongs to. */
    sessionId: string;
    /** Base64-encoded audio payload (sendAudioBuffer encodes buffers this way). */
    data: string;
    /** Audio container/codec format, e.g. "mp3" or "opus". */
    format: string;
    /** Sample rate in Hz, if known. */
    sampleRate?: number;
    /** Clip duration, if known — units not specified here; TODO confirm seconds vs ms. */
    duration?: number;
    /** Send time in epoch milliseconds. */
    timestamp: number;
}
/**
 * Runtime-tunable backend configuration for video processing.
 * Read via VideoAgent.getConfig() and merged via updateConfig().
 */
interface VideoAgentConfig {
    /** Maximum frames to keep in context buffer for conversation history */
    maxContextFrames: number;
}
/**
 * Lightweight record of a received frame, kept in a bounded buffer
 * (maxContextFrames) so visual conversation history survives without
 * retaining full image payloads.
 */
interface FrameContext {
    /** Frame number within the session. */
    sequence: number;
    /** Capture time in epoch milliseconds. */
    timestamp: number;
    /** What caused the frame to be captured. */
    triggerReason: FrameTriggerReason;
    /** Content hash of the frame image (see the private hashFrame helper). */
    frameHash: string;
    /** Optional textual description of the frame — producer not visible here; TODO confirm. */
    description?: string;
}
/**
 * Constructor options for {@link VideoAgent}.
 */
export interface VideoAgentOptions {
    /**
     * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
     * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
     */
    model: LanguageModel;
    /** AI SDK model for speech-to-text; audio input cannot be transcribed when omitted. */
    transcriptionModel?: TranscriptionModel;
    /** AI SDK model for text-to-speech; forwarded to the speech manager. */
    speechModel?: SpeechModel;
    /** System prompt. A built-in multimodal assistant prompt is used when omitted. */
    instructions?: string;
    /** Stop condition forwarded to streamText (defaults to stepCountIs(5)). */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Initial tool set; more tools can be added later via registerTools(). */
    tools?: Record<string, Tool>;
    /** Default WebSocket URL used by connect() when no explicit URL is given. */
    endpoint?: string;
    /** Voice identifier passed to the speech model. */
    voice?: string;
    /** Extra instructions passed to the speech model. */
    speechInstructions?: string;
    /** Audio output format for synthesized speech (e.g. "mp3", "opus"). */
    outputFormat?: string;
    /** Streaming-speech tuning; merged with the speech manager's defaults. */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Conversation-history limits; merged with the conversation manager's defaults. */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes; forwarded to the transcription manager. */
    maxAudioInputSize?: number;
    /** Maximum frame input size in bytes (default: 5 MB) */
    maxFrameInputSize?: number;
    /** Maximum frames to keep in context buffer (default: 10) */
    maxContextFrames?: number;
    /** Session ID for this video agent instance */
    sessionId?: string;
}
/**
 * Multimodal (video + audio + text) agent over a WebSocket transport.
 *
 * Streams chat responses from a vision-capable AI SDK model and delegates
 * transcription, speech synthesis, conversation history, and input queueing
 * to dedicated core managers, re-emitting their events through EventEmitter.
 */
export declare class VideoAgent extends EventEmitter {
    private model;
    private instructions;
    private stopWhen;
    private endpoint?;
    private tools;
    private isDestroyed;
    private _isProcessing;
    /** Abort controller for the in-flight LLM stream, if any. */
    private currentStreamAbortController?;
    /** Core managers: WebSocket transport, TTS, history, STT, serial input queue. */
    private ws;
    private speech;
    private conversation;
    private transcription;
    private inputQueue;
    /** Video-specific session state. */
    private sessionId;
    private frameSequence;
    private lastFrameTimestamp;
    private lastFrameHash?;
    private frameContextBuffer;
    private currentFrameData?;
    private videoConfig;
    private maxFrameInputSize;
    constructor(options: VideoAgentOptions);
    /** Merge additional tools into the agent's tool set. */
    registerTools(tools: Record<string, Tool>): void;
    /** Transcribe raw audio to text via the configured transcription model. */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /** Synthesize speech for `text`; cancellable through `abortSignal`. */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /** Stop any in-progress speech output. */
    interruptSpeech(reason?: string): void;
    /** Abort the current LLM stream (if any) and stop speech output. */
    interruptCurrentResponse(reason?: string): void;
    /** Connect as a client; falls back to the `endpoint` option, then ws://localhost:8080. */
    connect(url?: string): Promise<void>;
    /** Adopt an already-accepted server-side socket. */
    handleSocket(socket: WebSocket): void;
    /** Enqueue a text turn; rejects on empty/whitespace-only input. */
    sendText(text: string): Promise<string>;
    /** Enqueue base64-encoded audio for transcription and processing. */
    sendAudio(audioData: string): Promise<void>;
    /** Like sendAudio, but accepts a raw buffer and base64-encodes it internally. */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Send a video frame with optional text query for vision analysis
     */
    sendFrame(frameData: string, query?: string, options?: {
        width?: number;
        height?: number;
        format?: string;
    }): Promise<string>;
    /**
     * Request client to capture and send a frame
     */
    requestFrameCapture(reason: FrameTriggerReason): void;
    /** Snapshot copy of the current video config. */
    getConfig(): VideoAgentConfig;
    /** Merge partial config changes; emits "config_changed". */
    updateConfig(config: Partial<VideoAgentConfig>): void;
    /** Emits "listening". */
    startListening(): void;
    /** Emits "stopped". */
    stopListening(): void;
    /** Clear conversation history and the frame-context buffer. */
    clearHistory(): void;
    getHistory(): ModelMessage[];
    setHistory(history: ModelMessage[]): void;
    /** Copy of the retained frame-context records. */
    getFrameContext(): FrameContext[];
    getSessionId(): string;
    disconnect(): void;
    /** Permanently tear down: disconnect, clear state and tools, remove all listeners. */
    destroy(): void;
    get connected(): boolean;
    get processing(): boolean;
    get speaking(): boolean;
    get pendingSpeechChunks(): number;
    get destroyed(): boolean;
    get currentFrameSequence(): number;
    /** True when a current frame payload is held (currentFrameData is set). */
    get hasVisualContext(): boolean;
    private handleMessage;
    private handleClientReady;
    private handleAudioInput;
    private handleVideoFrame;
    private addFrameToContext;
    private hashFrame;
    private generateSessionId;
    private enqueueTextInput;
    private enqueueMultimodalInput;
    /**
     * Route queued items to the correct processor.
     */
    private processQueueItem;
    private buildMultimodalContent;
    /**
     * Shared streamText invocation used by both processUserInput and processMultimodalInput.
     */
    private runStream;
    /**
     * Process text-only input (with optional visual context from latest frame).
     */
    private processUserInput;
    /**
     * Process multimodal input (text + explicit video frame).
     */
    private processMultimodalInput;
    private ensureNotDestroyed;
    private cleanupOnDisconnect;
    private bubbleEvents;
}
export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
export type { StreamingSpeechConfig, HistoryConfig } from "./types";
//# sourceMappingURL=VideoAgent.new.d.ts.map

1
dist/VideoAgent.new.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"VideoAgent.new.d.ts","sourceRoot":"","sources":["../src/VideoAgent.new.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACH,UAAU,EACV,KAAK,aAAa,EAElB,KAAK,IAAI,EACT,KAAK,YAAY,EACjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACnB,MAAM,IAAI,CAAC;AACZ,OAAO,EACH,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EACrB,MAAM,SAAS,CAAC;AAcjB;;GAEG;AACH,KAAK,kBAAkB,GAAG,cAAc,GAAG,cAAc,GAAG,OAAO,GAAG,SAAS,CAAC;AAEhF;;GAEG;AACH,UAAU,UAAU;IAChB,IAAI,EAAE,aAAa,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,kBAAkB,CAAC;IAClC,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,KAAK,EAAE;QACH,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;KAClB,CAAC;CACL;AAED;;GAEG;AACH,UAAU,SAAS;IACf,IAAI,EAAE,OAAO,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,UAAU,gBAAgB;IACtB,wEAAwE;IACxE,gBAAgB,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,UAAU,YAAY;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,kBAAkB,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;CACxB;AAYD,MAAM,WAAW,iBAAiB;IAC9B;;;OAGG;IACH,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wDAAwD;IACxD,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,6DAA6D;IAC7D,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;CACtB;AAUD,qBAAa,UAAW,SAAQ,YAAY;IACxC,OAAO,CAAC,KAAK,C
AAgB;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,aAAa,CAAS;IAG9B,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,EAAE,CAAmB;IAC7B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAA6B;IAG/C,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,aAAa,CAAK;IAC1B,OAAO,CAAC,kBAAkB,CAAK;IAC/B,OAAO,CAAC,aAAa,CAAC,CAAS;IAC/B,OAAO,CAAC,kBAAkB,CAAsB;IAChD,OAAO,CAAC,gBAAgB,CAAC,CAAS;IAClC,OAAO,CAAC,WAAW,CAAmB;IACtC,OAAO,CAAC,iBAAiB,CAAS;gBAEtB,OAAO,EAAE,iBAAiB;IAmF/B,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAInC,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAIhE,sBAAsB,CAC/B,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GAC1B,OAAO,CAAC,UAAU,CAAC;IAIf,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAIrD,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAQxD,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAM1C,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAK/B,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQvC,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAK3C,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;OAEG;IACU,SAAS,CAClB,SAAS,EAAE,MAAM,EACjB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,CAAC,EAAE;QAAE,KAAK,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,GAC/D,OAAO,CAAC,MAAM,CAAC;IA4BlB;;OAEG;IACI,mBAAmB,CAAC,MAAM,EAAE,kBAAkB,GAAG,IAAI;IASrD,SAAS,IAAI,gBAAgB;IAI7B,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,gBAAgB,CAAC,GAAG,IAAI;IAK5D,cAAc;IAId,aAAa;IAIb,YAAY;IAKZ,UAAU,IAAI,YAAY,EAAE;IAI5B,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC,eAAe,IAAI,YAAY,EAAE;IAIjC,YAAY,IAAI,MAAM;IAItB,UAAU;IAIV,OAAO;IAYP,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,oBAAoB,IAAI,MAAM,CAEjC;IAED,IAAI,gBAAgB,IAAI,OAAO,CAE9B;YAMa,aAAa;IA4C3B,OAAO,CAAC,iBAAiB;YAYX,gBAAgB;YAchB,gBAAgB;IAgD9B,OAAO,CAAC,iBAAiB;IAOzB,OAAO,CAAC,SAAS;IAU
jB,OAAO,CAAC,iBAAiB;IAUzB,OAAO,CAAC,gBAAgB;IAMxB,OAAO,CAAC,sBAAsB;IAM9B;;OAEG;YACW,gBAAgB;IAa9B,OAAO,CAAC,sBAAsB;IA0B9B;;OAEG;YACW,SAAS;IAqEvB;;OAEG;YACW,gBAAgB;IAsC9B;;OAEG;YACW,sBAAsB;IAuCpC,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,mBAAmB;IAW3B,OAAO,CAAC,YAAY;CAKvB;AAGD,YAAY,EACR,UAAU,EACV,SAAS,EACT,gBAAgB,EAChB,YAAY,EACZ,kBAAkB,GACrB,CAAC;AAGF,YAAY,EAAE,qBAAqB,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC"}

571
dist/VideoAgent.new.js vendored Normal file
View File

@@ -0,0 +1,571 @@
"use strict";
// CommonJS interop preamble emitted by the TypeScript compiler.
Object.defineProperty(exports, "__esModule", { value: true });
exports.VideoAgent = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const core_1 = require("./core");
/** Default maximum accepted size of an inbound video frame, in bytes (5 MB). */
const DEFAULT_MAX_FRAME_SIZE = 5 * 1024 * 1024;
/** Default video agent config: keep at most 10 frames of visual context. */
const DEFAULT_VIDEO_AGENT_CONFIG = {
    maxContextFrames: 10,
};
// ── VideoAgent class ────────────────────────────────────
class VideoAgent extends events_1.EventEmitter {
model;
instructions;
stopWhen;
endpoint;
tools = {};
isDestroyed = false;
_isProcessing = false;
// Abort controller for the current LLM stream
currentStreamAbortController;
// ── Managers ─────────────────────────────────────────
ws;
speech;
conversation;
transcription;
inputQueue;
// ── Video-specific state ────────────────────────────
sessionId;
frameSequence = 0;
lastFrameTimestamp = 0;
lastFrameHash;
frameContextBuffer = [];
currentFrameData;
videoConfig;
maxFrameInputSize;
constructor(options) {
super();
this.model = options.model;
this.instructions =
options.instructions ||
`You are a helpful multimodal AI assistant that can see through the user's camera and hear their voice.
When analyzing images, be concise but informative. Describe what you see when asked.
Keep responses conversational since they will be spoken aloud.
Use tools when needed to provide accurate information.`;
this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
this.endpoint = options.endpoint;
this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
this.sessionId = options.sessionId || this.generateSessionId();
this.videoConfig = {
...DEFAULT_VIDEO_AGENT_CONFIG,
maxContextFrames: options.maxContextFrames ?? DEFAULT_VIDEO_AGENT_CONFIG.maxContextFrames,
};
if (options.tools) {
this.tools = { ...options.tools };
}
// ── Initialize managers ─────────────────────────
this.ws = new core_1.WebSocketManager();
this.speech = new core_1.SpeechManager({
speechModel: options.speechModel,
voice: options.voice,
speechInstructions: options.speechInstructions,
outputFormat: options.outputFormat,
streamingSpeech: options.streamingSpeech,
});
this.conversation = new core_1.ConversationManager({
history: options.history,
});
this.transcription = new core_1.TranscriptionManager({
transcriptionModel: options.transcriptionModel,
maxAudioInputSize: options.maxAudioInputSize,
});
this.inputQueue = new core_1.InputQueue();
// ── Wire managers to WebSocket send ─────────────
const sendMsg = (msg) => this.ws.send(msg);
this.speech.sendMessage = sendMsg;
this.transcription.sendMessage = sendMsg;
// ── Wire input queue processor ──────────────────
this.inputQueue.processor = (item) => this.processQueueItem(item);
// ── Bubble events from managers ─────────────────
this.bubbleEvents(this.ws, ["connected", "error"]);
this.bubbleEvents(this.speech, [
"speech_start",
"speech_complete",
"speech_interrupted",
"speech_chunk_queued",
"audio_chunk",
"audio",
"error",
]);
this.bubbleEvents(this.conversation, [
"history_cleared",
"history_trimmed",
]);
this.bubbleEvents(this.transcription, [
"transcription",
"audio_received",
"error",
"warning",
]);
// ── Handle WebSocket lifecycle ──────────────────
this.ws.on("disconnected", () => {
this.cleanupOnDisconnect();
this.emit("disconnected");
});
this.ws.on("message", (message) => this.handleMessage(message));
}
// ══════════════════════════════════════════════════════
// Public API
// ══════════════════════════════════════════════════════
registerTools(tools) {
this.tools = { ...this.tools, ...tools };
}
async transcribeAudio(audioData) {
return this.transcription.transcribeAudio(audioData);
}
async generateSpeechFromText(text, abortSignal) {
return this.speech.generateSpeechFromText(text, abortSignal);
}
interruptSpeech(reason = "interrupted") {
this.speech.interruptSpeech(reason);
}
interruptCurrentResponse(reason = "interrupted") {
if (this.currentStreamAbortController) {
this.currentStreamAbortController.abort();
this.currentStreamAbortController = undefined;
}
this.speech.interruptSpeech(reason);
}
async connect(url) {
this.ensureNotDestroyed();
const wsUrl = url || this.endpoint || "ws://localhost:8080";
await this.ws.connect(wsUrl);
}
handleSocket(socket) {
this.ensureNotDestroyed();
this.ws.handleSocket(socket);
}
async sendText(text) {
this.ensureNotDestroyed();
if (!text || !text.trim()) {
throw new Error("Text input cannot be empty");
}
return this.enqueueTextInput(text);
}
async sendAudio(audioData) {
this.ensureNotDestroyed();
await this.handleAudioInput(audioData);
}
async sendAudioBuffer(audioBuffer) {
this.ensureNotDestroyed();
const base64Audio = Buffer.from(audioBuffer).toString("base64");
await this.handleAudioInput(base64Audio);
}
/**
* Send a video frame with optional text query for vision analysis
*/
async sendFrame(frameData, query, options) {
this.ensureNotDestroyed();
const frame = {
type: "video_frame",
sessionId: this.sessionId,
sequence: this.frameSequence++,
timestamp: Date.now(),
triggerReason: "user_request",
previousFrameRef: this.lastFrameHash,
image: {
data: frameData,
format: options?.format || "webp",
width: options?.width || 640,
height: options?.height || 480,
},
};
// Update local frame state
await this.handleVideoFrame(frame);
if (query) {
return this.enqueueMultimodalInput(query, frame);
}
return "";
}
/**
* Request client to capture and send a frame
*/
requestFrameCapture(reason) {
this.ws.send({
type: "capture_frame",
reason,
timestamp: Date.now(),
});
this.emit("frame_requested", { reason });
}
getConfig() {
return { ...this.videoConfig };
}
updateConfig(config) {
this.videoConfig = { ...this.videoConfig, ...config };
this.emit("config_changed", this.videoConfig);
}
startListening() {
this.emit("listening");
}
stopListening() {
this.emit("stopped");
}
clearHistory() {
this.conversation.clearHistory();
this.frameContextBuffer = [];
}
getHistory() {
return this.conversation.getHistory();
}
setHistory(history) {
this.conversation.setHistory(history);
}
getFrameContext() {
return [...this.frameContextBuffer];
}
getSessionId() {
return this.sessionId;
}
disconnect() {
this.ws.disconnect();
}
destroy() {
this.isDestroyed = true;
this.cleanupOnDisconnect();
this.ws.disconnect();
this.conversation.clearHistory();
this.frameContextBuffer = [];
this.tools = {};
this.removeAllListeners();
}
    // ── Getters ─────────────────────────────────────────
    /** True while the underlying WebSocket is connected. */
    get connected() {
        return this.ws.isConnected;
    }
    /** True while a queued input is being processed by the LLM. */
    get processing() {
        return this._isProcessing;
    }
    /** True while speech output is being generated/streamed. */
    get speaking() {
        return this.speech.isSpeaking;
    }
    /** Number of speech chunks still queued for synthesis/delivery. */
    get pendingSpeechChunks() {
        return this.speech.pendingChunkCount;
    }
    /** True once destroy() has been called. */
    get destroyed() {
        return this.isDestroyed;
    }
    /** Sequence number that will be assigned to the next outgoing frame. */
    get currentFrameSequence() {
        return this.frameSequence;
    }
    /** True when at least one frame has been received and retained. */
    get hasVisualContext() {
        return !!this.currentFrameData;
    }
// ══════════════════════════════════════════════════════
// Private — message handling
// ══════════════════════════════════════════════════════
    /**
     * Dispatch an inbound WebSocket message by its `type` field.
     * Malformed payloads emit 'warning'; all other failures emit 'error'.
     * User speech (transcript/audio) barges in: the current response is
     * interrupted and a fresh frame is requested before processing.
     */
    async handleMessage(message) {
        try {
            switch (message.type) {
                case "transcript":
                    // Client-side transcription: text arrives ready to process.
                    if (typeof message.text !== "string" || !message.text.trim()) {
                        this.emit("warning", "Received empty or invalid transcript message");
                        return;
                    }
                    this.interruptCurrentResponse("user_speaking");
                    this.requestFrameCapture("user_request");
                    await this.enqueueTextInput(message.text);
                    break;
                case "audio":
                    // Server-side transcription path: base64 audio payload.
                    if (typeof message.data !== "string" || !message.data) {
                        this.emit("warning", "Received empty or invalid audio message");
                        return;
                    }
                    this.interruptCurrentResponse("user_speaking");
                    this.requestFrameCapture("user_request");
                    try {
                        await this.handleAudioInput(message.data, message.format);
                    }
                    catch (audioError) {
                        // Keep the connection alive; surface transcription failures.
                        this.emit("error", audioError);
                    }
                    break;
                case "video_frame":
                    await this.handleVideoFrame(message);
                    break;
                case "interrupt":
                    this.interruptCurrentResponse(message.reason || "client_request");
                    break;
                case "client_ready":
                    this.handleClientReady(message);
                    break;
            }
        }
        catch (err) {
            this.emit("error", err);
        }
    }
handleClientReady(message) {
this.ws.send({
type: "session_init",
sessionId: this.sessionId,
});
this.emit("client_ready", message.capabilities);
}
// ══════════════════════════════════════════════════════
// Private — audio
// ══════════════════════════════════════════════════════
async handleAudioInput(base64Audio, format) {
const text = await this.transcription.processAudioInput(base64Audio, format);
if (text) {
await this.enqueueTextInput(text);
}
}
// ══════════════════════════════════════════════════════
// Private — video frames
// ══════════════════════════════════════════════════════
    /**
     * Ingest an inbound video frame: validate, size-check, record it as the
     * current visual context, buffer its metadata, and ack the client.
     * Oversized or empty frames are dropped (error/warning emitted).
     */
    async handleVideoFrame(frame) {
        try {
            if (!frame.image?.data) {
                this.emit("warning", "Received empty or invalid video frame");
                return;
            }
            // Decoded byte size of the base64 payload, compared against the cap.
            const frameSize = Buffer.from(frame.image.data, "base64").length;
            if (frameSize > this.maxFrameInputSize) {
                const sizeMB = (frameSize / (1024 * 1024)).toFixed(1);
                const maxMB = (this.maxFrameInputSize / (1024 * 1024)).toFixed(1);
                this.emit("error", new Error(`Frame too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`));
                return;
            }
            // NOTE(review): hashFrame tags the hash with this.frameSequence,
            // which may differ from frame.sequence for client-originated frames.
            const frameHash = this.hashFrame(frame.image.data);
            this.lastFrameTimestamp = frame.timestamp;
            this.lastFrameHash = frameHash;
            // This frame becomes the implicit visual context for text-only input.
            this.currentFrameData = frame.image.data;
            this.addFrameToContext({
                sequence: frame.sequence,
                timestamp: frame.timestamp,
                triggerReason: frame.triggerReason,
                frameHash,
            });
            this.emit("frame_received", {
                sequence: frame.sequence,
                timestamp: frame.timestamp,
                triggerReason: frame.triggerReason,
                size: frameSize,
                dimensions: { width: frame.image.width, height: frame.image.height },
            });
            // Acknowledge receipt so the client can pace its frame uploads.
            this.ws.send({
                type: "frame_ack",
                sequence: frame.sequence,
                timestamp: Date.now(),
            });
        }
        catch (error) {
            this.emit("error", error);
        }
    }
addFrameToContext(context) {
this.frameContextBuffer.push(context);
if (this.frameContextBuffer.length > this.videoConfig.maxContextFrames) {
this.frameContextBuffer.shift();
}
}
hashFrame(data) {
let hash = 0;
for (let i = 0; i < data.length; i++) {
const char = data.charCodeAt(i);
hash = ((hash << 5) - hash) + char;
hash = hash & hash;
}
return `frame_${this.frameSequence}_${Math.abs(hash).toString(16)}`;
}
generateSessionId() {
const timestamp = Date.now().toString(36);
const randomPart = Math.random().toString(36).substring(2, 10);
return `vs_${timestamp}_${randomPart}`;
}
// ══════════════════════════════════════════════════════
// Private — input queue
// ══════════════════════════════════════════════════════
enqueueTextInput(text) {
return new Promise((resolve, reject) => {
this.inputQueue.enqueue({ text, resolve, reject });
});
}
enqueueMultimodalInput(text, frame) {
return new Promise((resolve, reject) => {
this.inputQueue.enqueue({ text, frame, resolve, reject });
});
}
/**
* Route queued items to the correct processor.
*/
async processQueueItem(item) {
if (item.frame && item.text) {
return this.processMultimodalInput(item.text, item.frame);
}
else if (item.text) {
return this.processUserInput(item.text);
}
return "";
}
// ══════════════════════════════════════════════════════
// Private — multimodal content building
// ══════════════════════════════════════════════════════
buildMultimodalContent(text, frameData) {
const content = [];
if (this.frameContextBuffer.length > 0) {
const contextSummary = `[Visual context: ${this.frameContextBuffer.length} frames captured, latest at ${new Date(this.lastFrameTimestamp).toISOString()}]`;
content.push({ type: "text", text: contextSummary });
}
const imageData = frameData || this.currentFrameData;
if (imageData) {
content.push({ type: "image", image: imageData });
}
content.push({ type: "text", text });
return content;
}
// ══════════════════════════════════════════════════════
// Private — LLM processing
// ══════════════════════════════════════════════════════
/**
* Shared streamText invocation used by both processUserInput and processMultimodalInput.
*/
    /**
     * Shared streamText invocation used by both processUserInput and
     * processMultimodalInput. Streams the LLM response, forwards chunks to
     * listeners and the speech pipeline, records the assistant message, and
     * resolves with the full response text once speech delivery settles.
     */
    async runStream(messages, abortSignal) {
        const result = (0, ai_1.streamText)({
            model: this.model,
            system: this.instructions,
            messages,
            tools: this.tools,
            stopWhen: this.stopWhen,
            abortSignal,
            // Per-chunk events (text deltas, tool calls, …) are translated by
            // handleStreamChunk into this agent's event vocabulary.
            onChunk: ({ chunk }) => {
                (0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
            },
            // After the stream finishes, surface every tool result from every step.
            onFinish: async (event) => {
                for (const step of event.steps) {
                    for (const toolResult of step.toolResults) {
                        this.emit("tool_result", {
                            name: toolResult.toolName,
                            toolCallId: toolResult.toolCallId,
                            result: toolResult.output,
                        });
                    }
                }
            },
            onError: ({ error }) => {
                this.emit("error", error);
            },
        });
        // Drain the full stream: text deltas feed speech, messages go out over
        // the socket, and events are re-emitted on this agent.
        const streamResult = await (0, core_1.processFullStream)(result, {
            onTextDelta: (delta) => this.speech.processTextDelta(delta),
            onTextEnd: () => this.speech.flushPendingText(),
            sendMessage: (msg) => this.ws.send(msg),
            emitEvent: (event, data) => this.emit(event, data),
        }, {
            sessionId: this.sessionId,
            // Attach a summary of buffered frames so clients can correlate.
            frameContext: this.frameContextBuffer.length > 0
                ? {
                    frameCount: this.frameContextBuffer.length,
                    lastFrameSequence: this.frameContextBuffer[this.frameContextBuffer.length - 1]
                        ?.sequence,
                }
                : undefined,
        });
        // Add assistant response to history (skipped when the stream produced no text).
        if (streamResult.fullText) {
            this.conversation.addMessage({
                role: "assistant",
                content: streamResult.fullText,
            });
        }
        // Flush remaining speech & wait for the speech queue to drain.
        this.speech.flushPendingText();
        if (this.speech.queueDonePromise) {
            await this.speech.queueDonePromise;
        }
        return streamResult.fullText;
    }
    /**
     * Process text-only input (with optional visual context from the latest
     * received frame). Called serially by the input queue; resolves with the
     * assistant's full text response.
     */
    async processUserInput(text) {
        this._isProcessing = true;
        this.currentStreamAbortController = new AbortController();
        try {
            this.emit("text", { role: "user", text });
            const hasVisual = !!this.currentFrameData;
            let messages;
            if (hasVisual) {
                // Send the image to the model, but store only a text marker in
                // history so image payloads don't accumulate there.
                const content = this.buildMultimodalContent(text);
                this.conversation.addMessage({
                    role: "user",
                    content: [{ type: "text", text: `[Visual context] ${text}` }],
                });
                // slice(0, -1) drops the marker we just added and substitutes
                // the full multimodal message for this one call.
                messages = [
                    ...this.conversation.getHistoryRef().slice(0, -1),
                    { role: "user", content },
                ];
            }
            else {
                this.conversation.addMessage({ role: "user", content: text });
                messages = this.conversation.getHistoryRef();
            }
            return await this.runStream(messages, this.currentStreamAbortController.signal);
        }
        catch (error) {
            // Drop any partially queued speech before propagating.
            this.speech.reset();
            throw error;
        }
        finally {
            this._isProcessing = false;
            this.currentStreamAbortController = undefined;
        }
    }
    /**
     * Process multimodal input (text + explicit video frame). Called serially
     * by the input queue; resolves with the assistant's full text response.
     */
    async processMultimodalInput(text, frame) {
        this._isProcessing = true;
        this.currentStreamAbortController = new AbortController();
        try {
            this.emit("text", { role: "user", text, hasImage: true });
            const content = this.buildMultimodalContent(text, frame.image.data);
            // Store a text-only marker in history (image payloads are not kept),
            // then substitute the full multimodal message for this call only.
            this.conversation.addMessage({
                role: "user",
                content: [{ type: "text", text: `[Image attached] ${text}` }],
            });
            const messages = [
                ...this.conversation.getHistoryRef().slice(0, -1),
                { role: "user", content },
            ];
            return await this.runStream(messages, this.currentStreamAbortController.signal);
        }
        catch (error) {
            // Drop any partially queued speech before propagating.
            this.speech.reset();
            throw error;
        }
        finally {
            this._isProcessing = false;
            this.currentStreamAbortController = undefined;
        }
    }
// ══════════════════════════════════════════════════════
// Private — helpers
// ══════════════════════════════════════════════════════
    // Guard: public entry points must not run after destroy().
    ensureNotDestroyed() {
        if (this.isDestroyed) {
            throw new Error("VideoAgent has been destroyed and cannot be used");
        }
    }
cleanupOnDisconnect() {
if (this.currentStreamAbortController) {
this.currentStreamAbortController.abort();
this.currentStreamAbortController = undefined;
}
this.speech.reset();
this._isProcessing = false;
this.currentFrameData = undefined;
this.inputQueue.rejectAll(new Error("Connection closed"));
}
bubbleEvents(source, events) {
for (const event of events) {
source.on(event, (...args) => this.emit(event, ...args));
}
}
}
exports.VideoAgent = VideoAgent;
//# sourceMappingURL=VideoAgent.new.js.map

1
dist/VideoAgent.new.js.map vendored Normal file

File diff suppressed because one or more lines are too long

23
dist/VoiceAgent.d.ts vendored
View File

@@ -20,6 +20,25 @@ export interface VoiceAgentOptions {
/** Maximum audio input size in bytes (default: 10 MB) */ /** Maximum audio input size in bytes (default: 10 MB) */
maxAudioInputSize?: number; maxAudioInputSize?: number;
} }
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export declare class VoiceAgent extends EventEmitter { export declare class VoiceAgent extends EventEmitter {
private socket?; private socket?;
private tools; private tools;
@@ -120,6 +139,10 @@ export declare class VoiceAgent extends EventEmitter {
* Attach an existing WebSocket (server-side usage). * Attach an existing WebSocket (server-side usage).
* Use this when a WS server accepts a connection and you want the * Use this when a WS server accepts a connection and you want the
* agent to handle messages on that socket. * agent to handle messages on that socket.
*
* **Note:** Calling this while a socket is already attached will cleanly
* tear down the previous connection first. Each `VoiceAgent` instance
* supports only one socket at a time — create a new agent per user.
*/ */
handleSocket(socket: WebSocket): void; handleSocket(socket: WebSocket): void;
/** /**

View File

@@ -1 +1 @@
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAInB,MAAM,SAAS,CAAC;AAEjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IA8BtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,
SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;OAIG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"} 
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAInB,MAAM,SAAS,CAAC;AAEjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IA8BtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC
;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;;;;;OAQG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}

25
dist/VoiceAgent.js vendored
View File

@@ -5,6 +5,25 @@ const ws_1 = require("ws");
const events_1 = require("events"); const events_1 = require("events");
const ai_1 = require("ai"); const ai_1 = require("ai");
const types_1 = require("./types"); const types_1 = require("./types");
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
class VoiceAgent extends events_1.EventEmitter { class VoiceAgent extends events_1.EventEmitter {
socket; socket;
tools = {}; tools = {};
@@ -50,7 +69,7 @@ class VoiceAgent extends events_1.EventEmitter {
this.endpoint = options.endpoint; this.endpoint = options.endpoint;
this.voice = options.voice || "alloy"; this.voice = options.voice || "alloy";
this.speechInstructions = options.speechInstructions; this.speechInstructions = options.speechInstructions;
this.outputFormat = options.outputFormat || "mp3"; this.outputFormat = options.outputFormat || "opus";
this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE; this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
if (options.tools) { if (options.tools) {
this.tools = { ...options.tools }; this.tools = { ...options.tools };
@@ -576,6 +595,10 @@ class VoiceAgent extends events_1.EventEmitter {
* Attach an existing WebSocket (server-side usage). * Attach an existing WebSocket (server-side usage).
* Use this when a WS server accepts a connection and you want the * Use this when a WS server accepts a connection and you want the
* agent to handle messages on that socket. * agent to handle messages on that socket.
*
* **Note:** Calling this while a socket is already attached will cleanly
* tear down the previous connection first. Each `VoiceAgent` instance
* supports only one socket at a time — create a new agent per user.
*/ */
handleSocket(socket) { handleSocket(socket) {
this.ensureNotDestroyed(); this.ensureNotDestroyed();

File diff suppressed because one or more lines are too long

137
dist/VoiceAgent.new.d.ts vendored Normal file
View File

@@ -0,0 +1,137 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
export interface VoiceAgentOptions {
    /** AI SDK language model used to generate chat responses. */
    model: LanguageModel;
    /** Model used to transcribe inbound audio (speech-to-text). */
    transcriptionModel?: TranscriptionModel;
    /** Model used to synthesize outbound speech (text-to-speech). */
    speechModel?: SpeechModel;
    /** System prompt for the chat model. */
    instructions?: string;
    /** Stop condition forwarded to streamText (controls multi-step tool use). */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Initial tool set, keyed by tool name. */
    tools?: Record<string, Tool>;
    /** Default WebSocket URL used by connect() when no URL is passed. */
    endpoint?: string;
    /** Voice preset forwarded to the speech model. */
    voice?: string;
    /** Extra instructions forwarded to the speech model. */
    speechInstructions?: string;
    /** Audio output format forwarded to the speech model (e.g. "mp3", "opus"). */
    outputFormat?: string;
    /** Configuration for streaming speech generation */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Configuration for conversation history memory limits */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes (default: 10 MB) */
    maxAudioInputSize?: number;
}
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export declare class VoiceAgent extends EventEmitter {
    private model;
    private instructions;
    private stopWhen;
    private endpoint?;
    private tools;
    private isDestroyed;
    private _isProcessing;
    private currentStreamAbortController?;
    private ws;
    private speech;
    private conversation;
    private transcription;
    private inputQueue;
    constructor(options: VoiceAgentOptions);
    /** Merge additional tools into the agent's tool set. */
    registerTools(tools: Record<string, Tool>): void;
    /**
     * Transcribe audio data to text using the configured transcription model.
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /**
     * Generate speech from text using the configured speech model.
     */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     */
    interruptSpeech(reason?: string): void;
    /**
     * Interrupt both the current LLM stream and ongoing speech.
     */
    interruptCurrentResponse(reason?: string): void;
    /**
     * Connect to a WebSocket server by URL.
     */
    connect(url?: string): Promise<void>;
    /**
     * Attach an existing WebSocket (server-side usage).
     */
    handleSocket(socket: WebSocket): void;
    /**
     * Send text input for processing (bypasses transcription).
     */
    sendText(text: string): Promise<string>;
    /**
     * Send base64 audio data to be transcribed and processed.
     */
    sendAudio(audioData: string): Promise<void>;
    /**
     * Send raw audio buffer to be transcribed and processed.
     */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Generate speech for full text at once (non-streaming fallback).
     */
    generateAndSendSpeechFull(text: string): Promise<void>;
    /** Start listening for voice input */
    startListening(): void;
    /** Stop listening for voice input */
    stopListening(): void;
    /** Clear conversation history */
    clearHistory(): void;
    /** Get current conversation history */
    getHistory(): ModelMessage[];
    /** Set conversation history (useful for restoring sessions) */
    setHistory(history: ModelMessage[]): void;
    /** Disconnect from WebSocket and stop all in-flight work */
    disconnect(): void;
    /**
     * Permanently destroy the agent, releasing all resources.
     */
    destroy(): void;
    /** True while the underlying WebSocket is connected. */
    get connected(): boolean;
    /** True while a queued input is being processed by the LLM. */
    get processing(): boolean;
    /** True while speech output is being generated/streamed. */
    get speaking(): boolean;
    /** Number of speech chunks still queued for synthesis/delivery. */
    get pendingSpeechChunks(): number;
    /** True once destroy() has been called. */
    get destroyed(): boolean;
    private handleMessage;
    private handleAudioInput;
    private enqueueInput;
    /**
     * Process user input with streaming text generation.
     * Called serially by the input queue.
     */
    private processUserInput;
    private ensureNotDestroyed;
    /**
     * Clean up all in-flight state when the connection drops.
     */
    private cleanupOnDisconnect;
    /**
     * Forward select events from a child emitter to this agent.
     */
    private bubbleEvents;
}
//# sourceMappingURL=VoiceAgent.new.d.ts.map

1
dist/VoiceAgent.new.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"VoiceAgent.new.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.new.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,KAAK,aAAa,EAElB,KAAK,IAAI,EACT,KAAK,YAAY,EACjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EACL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EACnB,MAAM,SAAS,CAAC;AAYjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAOD;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,aAAa,CAAS;IAG9B,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,EAAE,CAAmB;IAC7B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAA6B;gBAEnC,OAAO,EAAE,iBAAiB;IAyE/B,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7E;;OAEG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAItB;;OAEG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAI5D;;OAEG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAQrE;;OAEG;IACU,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAMjD;;OAEG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAK5C;;OAEG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;OAEG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,
CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;OAEG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAInE,sCAAsC;IACtC,cAAc;IAKd,qCAAqC;IACrC,aAAa;IAKb,iCAAiC;IACjC,YAAY;IAIZ,uCAAuC;IACvC,UAAU,IAAI,YAAY,EAAE;IAI5B,+DAA+D;IAC/D,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC,4DAA4D;IAC5D,UAAU;IAIV;;OAEG;IACH,OAAO;IAWP,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;YAIa,aAAa;YAoCb,gBAAgB;IAe9B,OAAO,CAAC,YAAY;IAQpB;;;OAGG;YACW,gBAAgB;IAyE9B,OAAO,CAAC,kBAAkB;IAM1B;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAU3B;;OAEG;IACH,OAAO,CAAC,YAAY;CAKrB"}

379
dist/VoiceAgent.new.js vendored Normal file
View File

@@ -0,0 +1,379 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.VoiceAgent = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const core_1 = require("./core");
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
class VoiceAgent extends events_1.EventEmitter {
model;
instructions;
stopWhen;
endpoint;
tools = {};
isDestroyed = false;
_isProcessing = false;
// Abort controller for the current LLM stream
currentStreamAbortController;
// ── Managers ──────────────────────────────────────────
ws;
speech;
conversation;
transcription;
inputQueue;
    constructor(options) {
        super();
        this.model = options.model;
        this.instructions =
            options.instructions || "You are a helpful voice assistant.";
        // Default stop condition: at most 5 LLM steps (tool-call rounds).
        this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
        this.endpoint = options.endpoint;
        if (options.tools) {
            this.tools = { ...options.tools };
        }
        // ── Initialize managers ──────────────────────────────
        this.ws = new core_1.WebSocketManager();
        this.speech = new core_1.SpeechManager({
            speechModel: options.speechModel,
            voice: options.voice,
            speechInstructions: options.speechInstructions,
            outputFormat: options.outputFormat,
            streamingSpeech: options.streamingSpeech,
        });
        this.conversation = new core_1.ConversationManager({
            history: options.history,
        });
        this.transcription = new core_1.TranscriptionManager({
            transcriptionModel: options.transcriptionModel,
            maxAudioInputSize: options.maxAudioInputSize,
        });
        this.inputQueue = new core_1.InputQueue();
        // ── Wire managers to the WebSocket send function ─────
        const sendMsg = (msg) => this.ws.send(msg);
        this.speech.sendMessage = sendMsg;
        this.transcription.sendMessage = sendMsg;
        // ── Wire the input queue processor (serializes all user input) ──
        this.inputQueue.processor = (item) => this.processUserInput(item.text);
        // ── Bubble events from managers ──────────────────────
        this.bubbleEvents(this.ws, [
            "connected",
            "error",
        ]);
        this.bubbleEvents(this.speech, [
            "speech_start",
            "speech_complete",
            "speech_interrupted",
            "speech_chunk_queued",
            "audio_chunk",
            "audio",
            "error",
        ]);
        this.bubbleEvents(this.conversation, [
            "history_cleared",
            "history_trimmed",
        ]);
        this.bubbleEvents(this.transcription, [
            "transcription",
            "audio_received",
            "error",
            "warning",
        ]);
        // ── Handle WebSocket lifecycle events ────────────────
        // On disconnect: abort in-flight work before notifying listeners.
        this.ws.on("disconnected", () => {
            this.cleanupOnDisconnect();
            this.emit("disconnected");
        });
        this.ws.on("message", (message) => this.handleMessage(message));
    }
// ── Public API ────────────────────────────────────────
registerTools(tools) {
this.tools = { ...this.tools, ...tools };
}
    /**
     * Transcribe audio data to text using the configured transcription model.
     * Delegates to the TranscriptionManager.
     */
    async transcribeAudio(audioData) {
        return this.transcription.transcribeAudio(audioData);
    }
    /**
     * Generate speech from text using the configured speech model.
     * Delegates to the SpeechManager.
     */
    async generateSpeechFromText(text, abortSignal) {
        return this.speech.generateSpeechFromText(text, abortSignal);
    }
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     * Does not abort the LLM stream — see interruptCurrentResponse for that.
     */
    interruptSpeech(reason = "interrupted") {
        this.speech.interruptSpeech(reason);
    }
/**
* Interrupt both the current LLM stream and ongoing speech.
*/
interruptCurrentResponse(reason = "interrupted") {
if (this.currentStreamAbortController) {
this.currentStreamAbortController.abort();
this.currentStreamAbortController = undefined;
}
this.speech.interruptSpeech(reason);
}
/**
* Connect to a WebSocket server by URL.
*/
async connect(url) {
this.ensureNotDestroyed();
const wsUrl = url || this.endpoint || "ws://localhost:8080";
await this.ws.connect(wsUrl);
}
/**
* Attach an existing WebSocket (server-side usage).
*/
handleSocket(socket) {
this.ensureNotDestroyed();
this.ws.handleSocket(socket);
}
/**
* Send text input for processing (bypasses transcription).
*/
async sendText(text) {
this.ensureNotDestroyed();
if (!text || !text.trim()) {
throw new Error("Text input cannot be empty");
}
return this.enqueueInput(text);
}
/**
* Send base64 audio data to be transcribed and processed.
*/
async sendAudio(audioData) {
this.ensureNotDestroyed();
await this.handleAudioInput(audioData);
}
/**
* Send raw audio buffer to be transcribed and processed.
*/
async sendAudioBuffer(audioBuffer) {
this.ensureNotDestroyed();
const base64Audio = Buffer.from(audioBuffer).toString("base64");
await this.handleAudioInput(base64Audio);
}
/**
* Generate speech for full text at once (non-streaming fallback).
*/
async generateAndSendSpeechFull(text) {
return this.speech.generateAndSendSpeechFull(text);
}
    /** Start listening for voice input (logs, then emits "listening"; no audio capture happens here). */
    startListening() {
        console.log("Starting voice agent...");
        this.emit("listening");
    }
    /** Stop listening for voice input (logs, then emits "stopped"). */
    stopListening() {
        console.log("Stopping voice agent...");
        this.emit("stopped");
    }
/** Clear conversation history */
clearHistory() {
this.conversation.clearHistory();
}
/** Get current conversation history */
getHistory() {
return this.conversation.getHistory();
}
/** Set conversation history (useful for restoring sessions) */
setHistory(history) {
this.conversation.setHistory(history);
}
/** Disconnect from WebSocket and stop all in-flight work */
disconnect() {
this.ws.disconnect();
}
    /**
     * Permanently destroy the agent, releasing all resources.
     * After this, public methods guarded by ensureNotDestroyed() throw.
     */
    destroy() {
        this.isDestroyed = true; // flip first so re-entrant public calls are rejected
        this.cleanupOnDisconnect(); // abort stream + speech, reject queued input
        this.ws.disconnect();
        this.conversation.clearHistory();
        this.tools = {};
        this.removeAllListeners(); // detach every external event listener
    }
    // ── Getters ───────────────────────────────────────────
    /** True while the underlying WebSocket connection is open. */
    get connected() {
        return this.ws.isConnected;
    }
    /** True while a user input is being processed by the LLM. */
    get processing() {
        return this._isProcessing;
    }
    /** True while the speech manager is draining its chunk queue. */
    get speaking() {
        return this.speech.isSpeaking;
    }
    /** Number of speech chunks still queued for synthesis/sending. */
    get pendingSpeechChunks() {
        return this.speech.pendingChunkCount;
    }
    /** True once destroy() has been called. */
    get destroyed() {
        return this.isDestroyed;
    }
// ── Private: message handling ─────────────────────────
async handleMessage(message) {
try {
console.log(`Received WebSocket message of type: ${message.type}`);
if (message.type === "transcript") {
if (typeof message.text !== "string" || !message.text.trim()) {
this.emit("warning", "Received empty or invalid transcript message");
return;
}
this.interruptCurrentResponse("user_speaking");
console.log(`Processing transcript: "${message.text}"`);
await this.enqueueInput(message.text);
}
else if (message.type === "audio") {
if (typeof message.data !== "string" || !message.data) {
this.emit("warning", "Received empty or invalid audio message");
return;
}
this.interruptCurrentResponse("user_speaking");
console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`);
await this.handleAudioInput(message.data, message.format);
}
else if (message.type === "interrupt") {
console.log(`Received interrupt request: ${message.reason || "client_request"}`);
this.interruptCurrentResponse(message.reason || "client_request");
}
}
catch (err) {
console.error("Failed to process message:", err);
this.emit("error", err);
}
}
// ── Private: audio ────────────────────────────────────
async handleAudioInput(base64Audio, format) {
const text = await this.transcription.processAudioInput(base64Audio, format);
if (text) {
await this.enqueueInput(text);
}
}
// ── Private: input queue ──────────────────────────────
enqueueInput(text) {
return new Promise((resolve, reject) => {
this.inputQueue.enqueue({ text, resolve, reject });
});
}
    // ── Private: LLM processing ───────────────────────────
    /**
     * Process user input with streaming text generation.
     * Called serially by the input queue — at most one turn runs at a time.
     * Returns the assistant's full text for this turn.
     */
    async processUserInput(text) {
        this._isProcessing = true;
        // Fresh controller per turn; interruptCurrentResponse() aborts it.
        this.currentStreamAbortController = new AbortController();
        const streamAbortSignal = this.currentStreamAbortController.signal;
        try {
            this.emit("text", { role: "user", text });
            this.conversation.addMessage({ role: "user", content: text });
            const result = (0, ai_1.streamText)({
                model: this.model,
                system: this.instructions,
                // Live reference: streamText sees the history including the
                // user message just appended above.
                messages: this.conversation.getHistoryRef(),
                tools: this.tools,
                stopWhen: this.stopWhen,
                abortSignal: streamAbortSignal,
                onChunk: ({ chunk }) => {
                    // Forward each stream chunk as agent events.
                    (0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
                },
                onFinish: async (event) => {
                    // Emit one "tool_result" per tool result across all steps.
                    for (const step of event.steps) {
                        for (const toolResult of step.toolResults) {
                            this.emit("tool_result", {
                                name: toolResult.toolName,
                                toolCallId: toolResult.toolCallId,
                                result: toolResult.output,
                            });
                        }
                    }
                },
                onError: ({ error }) => {
                    console.error("Stream error:", error);
                    this.emit("error", error);
                },
            });
            // Drain the stream: text deltas feed streaming speech, and all
            // parts are forwarded over the socket / as events.
            const streamResult = await (0, core_1.processFullStream)(result, {
                onTextDelta: (delta) => this.speech.processTextDelta(delta),
                onTextEnd: () => this.speech.flushPendingText(),
                sendMessage: (msg) => this.ws.send(msg),
                emitEvent: (event, data) => this.emit(event, data),
            });
            // Add assistant response to history
            if (streamResult.fullText) {
                this.conversation.addMessage({
                    role: "assistant",
                    content: streamResult.fullText,
                });
            }
            // Flush any remaining speech
            this.speech.flushPendingText();
            // Wait for all speech chunks to complete
            if (this.speech.queueDonePromise) {
                await this.speech.queueDonePromise;
            }
            return streamResult.fullText;
        }
        catch (error) {
            // Clean up speech state on error
            this.speech.reset();
            throw error;
        }
        finally {
            this._isProcessing = false;
            this.currentStreamAbortController = undefined;
        }
    }
// ── Private: helpers ──────────────────────────────────
ensureNotDestroyed() {
if (this.isDestroyed) {
throw new Error("VoiceAgent has been destroyed and cannot be used");
}
}
/**
* Clean up all in-flight state when the connection drops.
*/
cleanupOnDisconnect() {
if (this.currentStreamAbortController) {
this.currentStreamAbortController.abort();
this.currentStreamAbortController = undefined;
}
this.speech.reset();
this._isProcessing = false;
this.inputQueue.rejectAll(new Error("Connection closed"));
}
/**
* Forward select events from a child emitter to this agent.
*/
bubbleEvents(source, events) {
for (const event of events) {
source.on(event, (...args) => this.emit(event, ...args));
}
}
}
exports.VoiceAgent = VoiceAgent;
//# sourceMappingURL=VoiceAgent.new.js.map

1
dist/VoiceAgent.new.js.map vendored Normal file

File diff suppressed because one or more lines are too long

46
dist/core/ConversationManager.d.ts vendored Normal file
View File

@@ -0,0 +1,46 @@
import { EventEmitter } from "events";
import { type ModelMessage } from "ai";
import { type HistoryConfig } from "../types";
export interface ConversationManagerOptions {
    /** Optional overrides for the history limits (e.g. maxMessages, maxTotalChars). */
    history?: Partial<HistoryConfig>;
}
/**
 * Manages conversation history (ModelMessage[]) with configurable
 * limits on message count and total character size.
 *
 * Emits "history_cleared" (on clearHistory) and "history_trimmed"
 * ({ removedCount, reason }) when limits force eviction.
 */
export declare class ConversationManager extends EventEmitter {
    /** Backing message store, oldest first. */
    private conversationHistory;
    /** Resolved limits: defaults merged with constructor options. */
    private historyConfig;
    constructor(options?: ConversationManagerOptions);
    /**
     * Add a message to history and trim if needed.
     */
    addMessage(message: ModelMessage): void;
    /**
     * Get a copy of the current history.
     */
    getHistory(): ModelMessage[];
    /**
     * Get a direct reference to the history array.
     * Use with caution — prefer getHistory() for safety.
     */
    getHistoryRef(): ModelMessage[];
    /**
     * Replace the entire conversation history.
     */
    setHistory(history: ModelMessage[]): void;
    /**
     * Clear all conversation history.
     */
    clearHistory(): void;
    /**
     * Get the number of messages in history.
     */
    get length(): number;
    /**
     * Trim conversation history to stay within configured limits.
     * Removes oldest messages (always in pairs to preserve user/assistant turns).
     */
    private trimHistory;
}
//# sourceMappingURL=ConversationManager.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ConversationManager.d.ts","sourceRoot":"","sources":["../../src/core/ConversationManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAAE,KAAK,YAAY,EAAE,MAAM,IAAI,CAAC;AACvC,OAAO,EAAE,KAAK,aAAa,EAA0B,MAAM,UAAU,CAAC;AAEtE,MAAM,WAAW,0BAA0B;IACzC,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;CAClC;AAED;;;GAGG;AACH,qBAAa,mBAAoB,SAAQ,YAAY;IACnD,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,aAAa,CAAgB;gBAEzB,OAAO,GAAE,0BAA+B;IAQpD;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,GAAG,IAAI;IAKvC;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;;OAGG;IACH,aAAa,IAAI,YAAY,EAAE;IAI/B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,IAAI;IAIzC;;OAEG;IACH,YAAY,IAAI,IAAI;IAKpB;;OAEG;IACH,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED;;;OAGG;IACH,OAAO,CAAC,WAAW;CAgDpB"}

106
dist/core/ConversationManager.js vendored Normal file
View File

@@ -0,0 +1,106 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ConversationManager = void 0;
const events_1 = require("events");
const types_1 = require("../types");
/**
 * Keeps the running conversation (ModelMessage[]) bounded by two
 * configurable limits: a message count and a total character budget.
 * Emits "history_cleared" and "history_trimmed" as state changes.
 */
class ConversationManager extends events_1.EventEmitter {
    conversationHistory = [];
    historyConfig;
    constructor(options = {}) {
        super();
        // Defaults first, caller-supplied overrides second.
        this.historyConfig = {
            ...types_1.DEFAULT_HISTORY_CONFIG,
            ...options.history,
        };
    }
    /** Append one message, then enforce the configured limits. */
    addMessage(message) {
        this.conversationHistory.push(message);
        this.trimHistory();
    }
    /** Defensive copy of the current history. */
    getHistory() {
        return this.conversationHistory.slice();
    }
    /**
     * Direct reference to the backing array — external mutations are
     * visible to this manager. Prefer getHistory() unless aliasing is
     * intentional.
     */
    getHistoryRef() {
        return this.conversationHistory;
    }
    /** Replace the entire conversation history (the input is copied). */
    setHistory(history) {
        this.conversationHistory = history.slice();
    }
    /** Drop every message and announce it via "history_cleared". */
    clearHistory() {
        this.conversationHistory = [];
        this.emit("history_cleared");
    }
    /** Number of messages currently stored. */
    get length() {
        return this.conversationHistory.length;
    }
    /** Character length of a message's content (non-strings are JSON-encoded). */
    static contentLength(msg) {
        const content = typeof msg.content === "string"
            ? msg.content
            : JSON.stringify(msg.content);
        return content.length;
    }
    /**
     * Enforce maxMessages and maxTotalChars by evicting the oldest entries.
     * Count-based eviction removes an even number of messages so that
     * user/assistant turn pairs stay intact; size-based eviction always
     * keeps at least the two most recent messages.
     */
    trimHistory() {
        const { maxMessages, maxTotalChars } = this.historyConfig;
        if (maxMessages > 0 && this.conversationHistory.length > maxMessages) {
            const excess = this.conversationHistory.length - maxMessages;
            // Round the removal count up to an even number.
            const toRemove = excess + (excess % 2);
            this.conversationHistory.splice(0, toRemove);
            this.emit("history_trimmed", {
                removedCount: toRemove,
                reason: "max_messages",
            });
        }
        if (maxTotalChars > 0) {
            let totalChars = 0;
            for (const msg of this.conversationHistory) {
                totalChars += ConversationManager.contentLength(msg);
            }
            let removedCount = 0;
            while (totalChars > maxTotalChars &&
                this.conversationHistory.length > 2) {
                const removed = this.conversationHistory.shift();
                if (removed) {
                    totalChars -= ConversationManager.contentLength(removed);
                    removedCount++;
                }
            }
            if (removedCount > 0) {
                this.emit("history_trimmed", {
                    removedCount,
                    reason: "max_total_chars",
                });
            }
        }
    }
}
exports.ConversationManager = ConversationManager;
//# sourceMappingURL=ConversationManager.js.map

1
dist/core/ConversationManager.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"ConversationManager.js","sourceRoot":"","sources":["../../src/core/ConversationManager.ts"],"names":[],"mappings":";;;AAAA,mCAAsC;AAEtC,oCAAsE;AAMtE;;;GAGG;AACH,MAAa,mBAAoB,SAAQ,qBAAY;IAC3C,mBAAmB,GAAmB,EAAE,CAAC;IACzC,aAAa,CAAgB;IAErC,YAAY,UAAsC,EAAE;QAClD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,aAAa,GAAG;YACnB,GAAG,8BAAsB;YACzB,GAAG,OAAO,CAAC,OAAO;SACnB,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,OAAqB;QAC9B,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,CAAC,WAAW,EAAE,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,CAAC,GAAG,IAAI,CAAC,mBAAmB,CAAC,CAAC;IACvC,CAAC;IAED;;;OAGG;IACH,aAAa;QACX,OAAO,IAAI,CAAC,mBAAmB,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,OAAuB;QAChC,IAAI,CAAC,mBAAmB,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC;IAC1C,CAAC;IAED;;OAEG;IACH,YAAY;QACV,IAAI,CAAC,mBAAmB,GAAG,EAAE,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC;IACzC,CAAC;IAED;;;OAGG;IACK,WAAW;QACjB,MAAM,EAAE,WAAW,EAAE,aAAa,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC;QAE1D,wBAAwB;QACxB,IAAI,WAAW,GAAG,CAAC,IAAI,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,WAAW,EAAE,CAAC;YACrE,MAAM,MAAM,GAAG,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,WAAW,CAAC;YAC7D,iDAAiD;YACjD,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YACxD,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;YAC7C,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;gBAC3B,YAAY,EAAE,QAAQ;gBACtB,MAAM,EAAE,cAAc;aACvB,CAAC,CAAC;QACL,CAAC;QAED,gCAAgC;QAChC,IAAI,aAAa,GAAG,CAAC,EAAE,CAAC;YACtB,IAAI,UAAU,GAAG,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;gBAC5D,MAAM,OAAO,GACX,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ;oBAC7B,CAAC,CAAC,GAAG,CAAC,OAAO;oBACb,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBAClC,OAAO,GAAG,GAAG,OAAO,CAAC,MAAM,CAAC;YAC9B,CAAC,EAAE,CAAC,CAAC,CAAC;YAEN,IAAI,YAAY,GAAG,CAAC,CAAC;YACrB,OACE,UAAU,GAAG,aAAa;gBAC1B,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,CAAC,EACnC,CAAC;gBACD,MAAM,OAAO,GAAG,IAAI,CAAC,mBAAmB,CAAC,KAAK,EAAE,CAAC;gBACjD,IAAI,OAAO,E
AAE,CAAC;oBACZ,MAAM,OAAO,GACX,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ;wBACjC,CAAC,CAAC,OAAO,CAAC,OAAO;wBACjB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;oBACtC,UAAU,IAAI,OAAO,CAAC,MAAM,CAAC;oBAC7B,YAAY,EAAE,CAAC;gBACjB,CAAC;YACH,CAAC;YACD,IAAI,YAAY,GAAG,CAAC,EAAE,CAAC;gBACrB,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;oBAC3B,YAAY;oBACZ,MAAM,EAAE,iBAAiB;iBAC1B,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;CACF;AA7GD,kDA6GC"}

33
dist/core/InputQueue.d.ts vendored Normal file
View File

@@ -0,0 +1,33 @@
/**
 * A generic serial input queue that ensures only one processor runs at a time.
 *
 * @template T The shape of each queued item (must include resolve/reject)
 */
export interface QueueItem<T = string> {
    /** Called with the processor's result when the item completes. */
    resolve: (v: T) => void;
    /** Called when processing fails or the queue is flushed via rejectAll. */
    reject: (e: unknown) => void;
}
export declare class InputQueue<T extends QueueItem<any>> {
    /** FIFO of items awaiting processing. */
    private queue;
    /** True while the drain loop is running. */
    private processing;
    /** Callback invoked for each item — must return a resolved value */
    processor: (item: T) => Promise<any>;
    /**
     * Enqueue an item for serial processing.
     * Draining starts immediately if the queue is idle.
     */
    enqueue(item: T): void;
    /**
     * Reject all pending items (used on disconnect/destroy).
     */
    rejectAll(reason: Error): void;
    /**
     * Number of items waiting in the queue.
     */
    get length(): number;
    /**
     * Whether the queue is currently processing an item.
     */
    get isProcessing(): boolean;
    private drain;
}
//# sourceMappingURL=InputQueue.d.ts.map

1
dist/core/InputQueue.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"InputQueue.d.ts","sourceRoot":"","sources":["../../src/core/InputQueue.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,SAAS,CAAC,CAAC,GAAG,MAAM;IACnC,OAAO,EAAE,CAAC,CAAC,EAAE,CAAC,KAAK,IAAI,CAAC;IACxB,MAAM,EAAE,CAAC,CAAC,EAAE,OAAO,KAAK,IAAI,CAAC;CAC9B;AAED,qBAAa,UAAU,CAAC,CAAC,SAAS,SAAS,CAAC,GAAG,CAAC;IAC9C,OAAO,CAAC,KAAK,CAAW;IACxB,OAAO,CAAC,UAAU,CAAS;IAE3B,oEAAoE;IAC7D,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,OAAO,CAAC,GAAG,CAAC,CAAkB;IAE7D;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,CAAC,GAAG,IAAI;IAKtB;;OAEG;IACH,SAAS,CAAC,MAAM,EAAE,KAAK,GAAG,IAAI;IAQ9B;;OAEG;IACH,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED;;OAEG;IACH,IAAI,YAAY,IAAI,OAAO,CAE1B;YAIa,KAAK;CAkBpB"}

61
dist/core/InputQueue.js vendored Normal file
View File

@@ -0,0 +1,61 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.InputQueue = void 0;
class InputQueue {
queue = [];
processing = false;
/** Callback invoked for each item — must return a resolved value */
processor = async () => "";
/**
* Enqueue an item for serial processing.
*/
enqueue(item) {
this.queue.push(item);
this.drain();
}
/**
* Reject all pending items (used on disconnect/destroy).
*/
rejectAll(reason) {
for (const item of this.queue) {
item.reject(reason);
}
this.queue = [];
this.processing = false;
}
/**
* Number of items waiting in the queue.
*/
get length() {
return this.queue.length;
}
/**
* Whether the queue is currently processing an item.
*/
get isProcessing() {
return this.processing;
}
// ── Private ──────────────────────────────────────────
async drain() {
if (this.processing)
return;
this.processing = true;
try {
while (this.queue.length > 0) {
const item = this.queue.shift();
try {
const result = await this.processor(item);
item.resolve(result);
}
catch (error) {
item.reject(error);
}
}
}
finally {
this.processing = false;
}
}
}
exports.InputQueue = InputQueue;
//# sourceMappingURL=InputQueue.js.map

1
dist/core/InputQueue.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"InputQueue.js","sourceRoot":"","sources":["../../src/core/InputQueue.ts"],"names":[],"mappings":";;;AAUA,MAAa,UAAU;IACb,KAAK,GAAQ,EAAE,CAAC;IAChB,UAAU,GAAG,KAAK,CAAC;IAE3B,oEAAoE;IAC7D,SAAS,GAA8B,KAAK,IAAI,EAAE,CAAC,EAAE,CAAC;IAE7D;;OAEG;IACH,OAAO,CAAC,IAAO;QACb,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtB,IAAI,CAAC,KAAK,EAAE,CAAC;IACf,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,MAAa;QACrB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC9B,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QAChB,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,IAAI,YAAY;QACd,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;IAED,wDAAwD;IAEhD,KAAK,CAAC,KAAK;QACjB,IAAI,IAAI,CAAC,UAAU;YAAE,OAAO;QAC5B,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;QAEvB,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;gBACjC,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;oBAC1C,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;gBAAS,CAAC;YACT,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;QAC1B,CAAC;IACH,CAAC;CACF;AA5DD,gCA4DC"}

83
dist/core/SpeechManager.d.ts vendored Normal file
View File

@@ -0,0 +1,83 @@
import { EventEmitter } from "events";
import { type SpeechModel } from "ai";
import { type StreamingSpeechConfig } from "../types";
export interface SpeechManagerOptions {
    /** AI SDK speech model; when omitted, all TTS operations are no-ops. */
    speechModel?: SpeechModel;
    /** TTS voice id (defaults to "alloy"). */
    voice?: string;
    /** Optional delivery/style instructions forwarded to the speech model. */
    speechInstructions?: string;
    /** Audio output format sent to clients (defaults to "opus"). */
    outputFormat?: string;
    /** Overrides for chunk sizing and parallel-generation limits. */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
}
/**
 * Manages text-to-speech generation, streaming speech chunking,
 * parallel TTS requests, and speech interruption.
 */
export declare class SpeechManager extends EventEmitter {
    private speechModel?;
    private voice;
    private speechInstructions?;
    private outputFormat;
    private streamingSpeechConfig;
    /** Shared abort controller covering in-flight TTS requests. */
    private currentSpeechAbortController?;
    /** FIFO of chunks awaiting synthesis and/or sending. */
    private speechChunkQueue;
    private nextChunkId;
    private _isSpeaking;
    /** LLM text received but not yet cut into a speakable chunk. */
    private pendingTextBuffer;
    private speechQueueDonePromise?;
    private speechQueueDoneResolve?;
    /** Callback to send messages over the WebSocket */
    sendMessage: (message: Record<string, unknown>) => void;
    constructor(options: SpeechManagerOptions);
    get isSpeaking(): boolean;
    get pendingChunkCount(): number;
    get hasSpeechModel(): boolean;
    /**
     * Returns a promise that resolves when the speech queue is fully drained.
     * Returns undefined if there is nothing queued.
     */
    get queueDonePromise(): Promise<void> | undefined;
    /**
     * Generate speech from text using the configured speech model.
     */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /**
     * Generate speech for full text at once (non-streaming fallback).
     */
    generateAndSendSpeechFull(text: string): Promise<void>;
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     */
    interruptSpeech(reason?: string): void;
    /**
     * Process a text delta for streaming speech.
     * Call this as text chunks arrive from the LLM.
     */
    processTextDelta(textDelta: string): void;
    /**
     * Flush any remaining text in the buffer to speech.
     * Call this when the LLM stream ends.
     */
    flushPendingText(): void;
    /**
     * Reset all speech state (used on disconnect / cleanup).
     */
    reset(): void;
    /**
     * Extract complete sentences from text buffer.
     * Returns [extractedSentences, remainingBuffer].
     */
    private extractSentences;
    /**
     * Queue a text chunk for speech generation.
     */
    private queueSpeechChunk;
    /**
     * Generate audio for a single chunk.
     */
    private generateChunkAudio;
    /**
     * Process the speech queue and send audio chunks in order.
     */
    private processSpeechQueue;
}
//# sourceMappingURL=SpeechManager.d.ts.map

1
dist/core/SpeechManager.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"SpeechManager.d.ts","sourceRoot":"","sources":["../../src/core/SpeechManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAEL,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAE3B,MAAM,UAAU,CAAC;AAElB,MAAM,WAAW,oBAAoB;IACnC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;CAClD;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,YAAY;IAC7C,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,qBAAqB,CAAwB;IAErD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;IAE5C,mDAAmD;IAC5C,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAa;gBAE/D,OAAO,EAAE,oBAAoB;IAYzC,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,iBAAiB,IAAI,MAAM,CAE9B;IAED,IAAI,cAAc,IAAI,OAAO,CAE5B;IAED;;;OAGG;IACH,IAAI,gBAAgB,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,SAAS,CAEhD;IAED;;OAEG;IACG,sBAAsB,CAC1B,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;OAEG;IACG,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA4B5D;;OAEG;IACH,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgCrD;;;OAGG;IACH,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;IAazC;;;OAGG;IACH,gBAAgB,IAAI,IAAI;IAOxB;;OAEG;IACH,KAAK,IAAI,IAAI;IAkBb;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA+CxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAwCxB;;OAEG;YACW,kBAAkB;IAiChC;;OAEG;YACW,kBAAkB;CA0GjC"}

356
dist/core/SpeechManager.js vendored Normal file
View File

@@ -0,0 +1,356 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SpeechManager = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const types_1 = require("../types");
/**
 * Manages text-to-speech generation, streaming speech chunking,
 * parallel TTS requests, and speech interruption.
 *
 * Flow: LLM text deltas are buffered (processTextDelta), cut into
 * sentence-sized chunks (extractSentences), queued (queueSpeechChunk),
 * synthesized — optionally several at a time — and sent to clients in
 * FIFO order (processSpeechQueue). interruptSpeech() supports barge-in.
 */
class SpeechManager extends events_1.EventEmitter {
    // Optional AI SDK speech model; when absent, TTS paths are no-ops.
    speechModel;
    // TTS voice id (defaults to "alloy" in the constructor).
    voice;
    // Optional delivery/style instructions forwarded to the model.
    speechInstructions;
    // Audio format sent to clients (defaults to "opus").
    outputFormat;
    // Chunk sizing / parallelism settings (defaults merged with options).
    streamingSpeechConfig;
    // Shared abort controller covering in-flight TTS requests.
    currentSpeechAbortController;
    // FIFO of chunks awaiting synthesis and/or sending.
    speechChunkQueue = [];
    // Next chunk id; wrapped before Number.MAX_SAFE_INTEGER.
    nextChunkId = 0;
    // True while processSpeechQueue is draining the chunk queue.
    _isSpeaking = false;
    // LLM text received but not yet cut into a speakable chunk.
    pendingTextBuffer = "";
    // Promise-based signal for speech queue completion
    speechQueueDonePromise;
    speechQueueDoneResolve;
    /** Callback to send messages over the WebSocket */
    sendMessage = () => { };
    constructor(options) {
        super();
        this.speechModel = options.speechModel;
        this.voice = options.voice || "alloy";
        this.speechInstructions = options.speechInstructions;
        this.outputFormat = options.outputFormat || "opus";
        this.streamingSpeechConfig = {
            ...types_1.DEFAULT_STREAMING_SPEECH_CONFIG,
            ...options.streamingSpeech,
        };
    }
    get isSpeaking() {
        return this._isSpeaking;
    }
    get pendingChunkCount() {
        return this.speechChunkQueue.length;
    }
    get hasSpeechModel() {
        return !!this.speechModel;
    }
    /**
     * Returns a promise that resolves when the speech queue is fully drained.
     * Returns undefined if there is nothing queued.
     */
    get queueDonePromise() {
        return this.speechQueueDonePromise;
    }
    /**
     * Generate speech from text using the configured speech model.
     * Throws when no speech model was configured.
     */
    async generateSpeechFromText(text, abortSignal) {
        if (!this.speechModel) {
            throw new Error("Speech model not configured");
        }
        const result = await (0, ai_1.experimental_generateSpeech)({
            model: this.speechModel,
            text,
            voice: this.voice,
            instructions: this.speechInstructions,
            outputFormat: this.outputFormat,
            abortSignal,
        });
        return result.audio.uint8Array;
    }
    /**
     * Generate speech for full text at once (non-streaming fallback).
     * Errors are reported via the "error" event, not thrown.
     */
    async generateAndSendSpeechFull(text) {
        if (!this.speechModel)
            return;
        try {
            this.emit("speech_start", { text, streaming: false });
            const audioData = await this.generateSpeechFromText(text);
            const base64Audio = Buffer.from(audioData).toString("base64");
            this.sendMessage({
                type: "audio",
                data: base64Audio,
                format: this.outputFormat,
            });
            this.emit("audio", {
                data: base64Audio,
                format: this.outputFormat,
                uint8Array: audioData,
            });
            this.emit("speech_complete", { text, streaming: false });
        }
        catch (error) {
            console.error("Failed to generate speech:", error);
            this.emit("error", error);
        }
    }
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     * No-op when nothing is speaking and nothing is queued.
     */
    interruptSpeech(reason = "interrupted") {
        if (!this._isSpeaking && this.speechChunkQueue.length === 0) {
            return;
        }
        // Abort any pending speech generation requests
        if (this.currentSpeechAbortController) {
            this.currentSpeechAbortController.abort();
            this.currentSpeechAbortController = undefined;
        }
        // Clear the speech queue
        this.speechChunkQueue = [];
        this.pendingTextBuffer = "";
        this._isSpeaking = false;
        // Resolve any pending speech-done waiters so callers can finish
        if (this.speechQueueDoneResolve) {
            this.speechQueueDoneResolve();
            this.speechQueueDoneResolve = undefined;
            this.speechQueueDonePromise = undefined;
        }
        // Notify clients to stop audio playback
        this.sendMessage({
            type: "speech_interrupted",
            reason,
        });
        this.emit("speech_interrupted", { reason });
    }
    /**
     * Process a text delta for streaming speech.
     * Call this as text chunks arrive from the LLM.
     */
    processTextDelta(textDelta) {
        if (!this.speechModel)
            return;
        this.pendingTextBuffer += textDelta;
        // Cut complete sentences out of the buffer; the tail stays buffered.
        const [sentences, remaining] = this.extractSentences(this.pendingTextBuffer);
        this.pendingTextBuffer = remaining;
        for (const sentence of sentences) {
            this.queueSpeechChunk(sentence);
        }
    }
    /**
     * Flush any remaining text in the buffer to speech.
     * Call this when the LLM stream ends.
     */
    flushPendingText() {
        if (!this.speechModel || !this.pendingTextBuffer.trim())
            return;
        this.queueSpeechChunk(this.pendingTextBuffer);
        this.pendingTextBuffer = "";
    }
    /**
     * Reset all speech state (used on disconnect / cleanup).
     * Like interruptSpeech(), but unconditional and without client
     * notification.
     */
    reset() {
        if (this.currentSpeechAbortController) {
            this.currentSpeechAbortController.abort();
            this.currentSpeechAbortController = undefined;
        }
        this.speechChunkQueue = [];
        this.pendingTextBuffer = "";
        this._isSpeaking = false;
        if (this.speechQueueDoneResolve) {
            this.speechQueueDoneResolve();
            this.speechQueueDoneResolve = undefined;
            this.speechQueueDonePromise = undefined;
        }
    }
    // ── Private helpers ─────────────────────────────────────────
    /**
     * Extract complete sentences from text buffer.
     * Returns [extractedSentences, remainingBuffer].
     * Sentences shorter than minChunkSize are merged into the previous
     * sentence; an over-long tail is force-split at a clause boundary.
     */
    extractSentences(text) {
        const sentences = [];
        let remaining = text;
        // Match sentences ending with . ! ? followed by space or end of string
        const sentenceEndPattern = /[.!?]+(?:\s+|$)/g;
        let lastIndex = 0;
        let match;
        while ((match = sentenceEndPattern.exec(text)) !== null) {
            const sentence = text
                .slice(lastIndex, match.index + match[0].length)
                .trim();
            if (sentence.length >= this.streamingSpeechConfig.minChunkSize) {
                sentences.push(sentence);
                lastIndex = match.index + match[0].length;
            }
            else if (sentences.length > 0) {
                // Append short sentence to previous one
                sentences[sentences.length - 1] += " " + sentence;
                lastIndex = match.index + match[0].length;
            }
        }
        remaining = text.slice(lastIndex);
        // If remaining text is too long, force split at clause boundaries
        if (remaining.length > this.streamingSpeechConfig.maxChunkSize) {
            const clausePattern = /[,;:]\s+/g;
            let clauseMatch;
            let splitIndex = 0;
            while ((clauseMatch = clausePattern.exec(remaining)) !== null) {
                if (clauseMatch.index >= this.streamingSpeechConfig.minChunkSize) {
                    splitIndex = clauseMatch.index + clauseMatch[0].length;
                    break;
                }
            }
            if (splitIndex > 0) {
                sentences.push(remaining.slice(0, splitIndex).trim());
                remaining = remaining.slice(splitIndex);
            }
        }
        return [sentences, remaining];
    }
    /**
     * Queue a text chunk for speech generation.
     * A chunk with a non-null audioPromise counts as an in-flight TTS
     * request for the parallel-generation limit.
     */
    queueSpeechChunk(text) {
        if (!this.speechModel || !text.trim())
            return;
        // Wrap chunk ID to prevent unbounded growth in very long sessions
        if (this.nextChunkId >= Number.MAX_SAFE_INTEGER) {
            this.nextChunkId = 0;
        }
        const chunk = {
            id: this.nextChunkId++,
            text: text.trim(),
        };
        // Create the speech-done promise if not already present
        if (!this.speechQueueDonePromise) {
            this.speechQueueDonePromise = new Promise((resolve) => {
                this.speechQueueDoneResolve = resolve;
            });
        }
        // Start generating audio immediately (parallel generation)
        if (this.streamingSpeechConfig.parallelGeneration) {
            const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
            if (activeRequests < this.streamingSpeechConfig.maxParallelRequests) {
                chunk.audioPromise = this.generateChunkAudio(chunk);
            }
        }
        this.speechChunkQueue.push(chunk);
        this.emit("speech_chunk_queued", { id: chunk.id, text: chunk.text });
        // Start processing queue if not already
        if (!this._isSpeaking) {
            this.processSpeechQueue();
        }
    }
    /**
     * Generate audio for a single chunk.
     * Returns null when generation was aborted or failed (failures emit
     * "error" rather than throwing).
     */
    async generateChunkAudio(chunk) {
        if (!this.currentSpeechAbortController) {
            this.currentSpeechAbortController = new AbortController();
        }
        try {
            console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`);
            const audioData = await this.generateSpeechFromText(chunk.text, this.currentSpeechAbortController.signal);
            console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
            return audioData;
        }
        catch (error) {
            if (error.name === "AbortError") {
                console.log(`Audio generation aborted for chunk ${chunk.id}`);
                return null;
            }
            console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
            this.emit("error", error);
            return null;
        }
    }
    /**
     * Process the speech queue and send audio chunks in order.
     * Runs until the queue is empty or interruptSpeech()/reset() clears
     * _isSpeaking; re-entrant calls are no-ops.
     */
    async processSpeechQueue() {
        if (this._isSpeaking)
            return;
        this._isSpeaking = true;
        console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
        this.emit("speech_start", { streaming: true });
        this.sendMessage({ type: "speech_stream_start" });
        try {
            while (this.speechChunkQueue.length > 0) {
                const chunk = this.speechChunkQueue[0];
                console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`);
                // Ensure audio generation has started
                if (!chunk.audioPromise) {
                    chunk.audioPromise = this.generateChunkAudio(chunk);
                }
                // Wait for this chunk's audio
                const audioData = await chunk.audioPromise;
                // Check if we were interrupted while waiting
                if (!this._isSpeaking) {
                    console.log(`Speech interrupted during chunk #${chunk.id}`);
                    break;
                }
                // Remove from queue after processing
                this.speechChunkQueue.shift();
                if (audioData) {
                    const base64Audio = Buffer.from(audioData).toString("base64");
                    console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`);
                    // Send audio chunk via WebSocket
                    this.sendMessage({
                        type: "audio_chunk",
                        chunkId: chunk.id,
                        data: base64Audio,
                        format: this.outputFormat,
                        text: chunk.text,
                    });
                    // Emit for local handling
                    this.emit("audio_chunk", {
                        chunkId: chunk.id,
                        data: base64Audio,
                        format: this.outputFormat,
                        text: chunk.text,
                        uint8Array: audioData,
                    });
                }
                else {
                    console.log(`No audio data generated for chunk #${chunk.id}`);
                }
                // Start generating next chunks in parallel
                if (this.streamingSpeechConfig.parallelGeneration) {
                    const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
                    const toStart = Math.min(this.streamingSpeechConfig.maxParallelRequests - activeRequests, this.speechChunkQueue.length);
                    if (toStart > 0) {
                        console.log(`Starting parallel generation for ${toStart} more chunks`);
                        for (let i = 0; i < toStart; i++) {
                            const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise);
                            if (nextChunk) {
                                nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
                            }
                        }
                    }
                }
            }
        }
        catch (error) {
            console.error("Error in speech queue processing:", error);
            this.emit("error", error);
        }
        finally {
            this._isSpeaking = false;
            this.currentSpeechAbortController = undefined;
            // Signal that the speech queue is fully drained
            if (this.speechQueueDoneResolve) {
                this.speechQueueDoneResolve();
                this.speechQueueDoneResolve = undefined;
                this.speechQueueDonePromise = undefined;
            }
            console.log(`Speech queue processing complete`);
            this.sendMessage({ type: "speech_stream_end" });
            this.emit("speech_complete", { streaming: true });
        }
    }
}
exports.SpeechManager = SpeechManager;
//# sourceMappingURL=SpeechManager.js.map

1
dist/core/SpeechManager.js.map vendored Normal file

File diff suppressed because one or more lines are too long

42
dist/core/StreamProcessor.d.ts vendored Normal file
View File

@@ -0,0 +1,42 @@
import { type streamText } from "ai";
/**
 * Result of processing a full LLM stream.
 */
export interface StreamResult {
    /** Concatenation of all `text-delta` parts. */
    fullText: string;
    /** Concatenation of all `reasoning-delta` parts ("" if none). */
    fullReasoning: string;
    /** Every tool invocation observed in the stream, in order. */
    allToolCalls: Array<{
        toolName: string;
        toolCallId: string;
        input: unknown;
    }>;
    /** Every tool result observed in the stream, in order. */
    allToolResults: Array<{
        toolName: string;
        toolCallId: string;
        output: unknown;
    }>;
    /** Raw `source` stream parts, in arrival order. */
    allSources: Array<unknown>;
    /** Files extracted from `file` stream parts. */
    allFiles: Array<unknown>;
}
export interface StreamProcessorCallbacks {
    /** Called when a text delta arrives (for streaming speech, etc.) */
    onTextDelta?: (text: string) => void;
    /** Called when a text-end part arrives (flush speech, etc.) */
    onTextEnd?: () => void;
    /** Send a WebSocket message */
    sendMessage: (message: Record<string, unknown>) => void;
    /** Emit an event on the agent */
    emitEvent: (event: string, data?: unknown) => void;
}
/**
 * Processes the fullStream from an AI SDK `streamText` call,
 * forwarding events to WebSocket clients and collecting the complete response.
 *
 * This is a standalone function (not a class) because it has no persistent state.
 *
 * @param result - Return value of `streamText`; its `fullStream` is consumed.
 * @param callbacks - Forwarding hooks (WebSocket send, agent emit, text hooks).
 * @param extraResponseFields - Extra fields merged into the final
 *   `response_complete` message.
 * @returns The aggregated stream contents.
 */
export declare function processFullStream(result: ReturnType<typeof streamText>, callbacks: StreamProcessorCallbacks, extraResponseFields?: Record<string, unknown>): Promise<StreamResult>;
/**
 * Handle onChunk callback events and emit them.
 */
export declare function handleStreamChunk(chunk: any, emitEvent: (event: string, data?: unknown) => void): void;
//# sourceMappingURL=StreamProcessor.d.ts.map

1
dist/core/StreamProcessor.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"StreamProcessor.d.ts","sourceRoot":"","sources":["../../src/core/StreamProcessor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,UAAU,EAAE,MAAM,IAAI,CAAC;AAErC;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,aAAa,EAAE,MAAM,CAAC;IACtB,YAAY,EAAE,KAAK,CAAC;QAClB,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,KAAK,EAAE,OAAO,CAAC;KAChB,CAAC,CAAC;IACH,cAAc,EAAE,KAAK,CAAC;QACpB,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC,CAAC;IACH,UAAU,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAC3B,QAAQ,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;CAC1B;AAED,MAAM,WAAW,wBAAwB;IACvC,oEAAoE;IACpE,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;IACrC,+DAA+D;IAC/D,SAAS,CAAC,EAAE,MAAM,IAAI,CAAC;IACvB,+BAA+B;IAC/B,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAC;IACxD,iCAAiC;IACjC,SAAS,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,OAAO,KAAK,IAAI,CAAC;CACpD;AAED;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,MAAM,EAAE,UAAU,CAAC,OAAO,UAAU,CAAC,EACrC,SAAS,EAAE,wBAAwB,EACnC,mBAAmB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC5C,OAAO,CAAC,YAAY,CAAC,CAkMvB;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,KAAK,EAAE,GAAG,EACV,SAAS,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,OAAO,KAAK,IAAI,GACjD,IAAI,CA+CN"}

228
dist/core/StreamProcessor.js vendored Normal file
View File

@@ -0,0 +1,228 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.processFullStream = processFullStream;
exports.handleStreamChunk = handleStreamChunk;
/**
 * Processes the fullStream from an AI SDK `streamText` call,
 * forwarding events to WebSocket clients and collecting the complete response.
 *
 * This is a standalone function (not a class) because it has no persistent state.
 *
 * Every recognized stream part is mirrored as a snake_case WebSocket message
 * via `sendMessage`; text/reasoning deltas and tool calls/results are also
 * accumulated and returned. A final `response_complete` message (merged with
 * `extraResponseFields`) is sent after the stream ends.
 *
 * @param result - Return value of `streamText`; its `fullStream` async
 *   iterable is consumed to completion.
 * @param callbacks - `sendMessage` (WebSocket), `emitEvent` (agent), plus
 *   optional `onTextDelta` / `onTextEnd` hooks for streaming speech.
 * @param extraResponseFields - Extra fields merged into the final
 *   `response_complete` message.
 * @returns Aggregated text, reasoning, tool calls/results, sources and files.
 */
async function processFullStream(result, callbacks, extraResponseFields) {
    const { onTextDelta, onTextEnd, sendMessage, emitEvent } = callbacks;
    // Accumulators for the returned StreamResult.
    let fullText = "";
    let fullReasoning = "";
    const allToolCalls = [];
    const allToolResults = [];
    const allSources = [];
    const allFiles = [];
    for await (const part of result.fullStream) {
        switch (part.type) {
            // ── Stream lifecycle ──────────────────────────────
            case "start":
                sendMessage({ type: "stream_start" });
                break;
            case "finish":
                // Emit the complete assistant text once the stream finishes.
                emitEvent("text", { role: "assistant", text: fullText });
                sendMessage({
                    type: "stream_finish",
                    finishReason: part.finishReason,
                    usage: part.totalUsage,
                });
                break;
            case "error":
                emitEvent("error", part.error);
                sendMessage({
                    type: "stream_error",
                    error: String(part.error),
                });
                break;
            case "abort":
                emitEvent("abort", { reason: part.reason });
                sendMessage({
                    type: "stream_abort",
                    reason: part.reason,
                });
                break;
            // ── Step lifecycle ────────────────────────────────
            case "start-step":
                sendMessage({
                    type: "step_start",
                    warnings: part.warnings,
                });
                break;
            case "finish-step":
                sendMessage({
                    type: "step_finish",
                    finishReason: part.finishReason,
                    usage: part.usage,
                });
                break;
            // ── Text streaming ────────────────────────────────
            case "text-start":
                sendMessage({ type: "text_start", id: part.id });
                break;
            case "text-delta":
                fullText += part.text;
                onTextDelta?.(part.text);
                sendMessage({
                    type: "text_delta",
                    id: part.id,
                    text: part.text,
                });
                break;
            case "text-end":
                onTextEnd?.();
                sendMessage({ type: "text_end", id: part.id });
                break;
            // ── Reasoning streaming ───────────────────────────
            case "reasoning-start":
                sendMessage({ type: "reasoning_start", id: part.id });
                break;
            case "reasoning-delta":
                fullReasoning += part.text;
                sendMessage({
                    type: "reasoning_delta",
                    id: part.id,
                    text: part.text,
                });
                break;
            case "reasoning-end":
                sendMessage({ type: "reasoning_end", id: part.id });
                break;
            // ── Tool input streaming ──────────────────────────
            case "tool-input-start":
                sendMessage({
                    type: "tool_input_start",
                    id: part.id,
                    toolName: part.toolName,
                });
                break;
            case "tool-input-delta":
                sendMessage({
                    type: "tool_input_delta",
                    id: part.id,
                    delta: part.delta,
                });
                break;
            case "tool-input-end":
                sendMessage({ type: "tool_input_end", id: part.id });
                break;
            // ── Tool execution ────────────────────────────────
            case "tool-call":
                allToolCalls.push({
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    input: part.input,
                });
                sendMessage({
                    type: "tool_call",
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    input: part.input,
                });
                break;
            case "tool-result":
                allToolResults.push({
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    output: part.output,
                });
                // NOTE: outgoing key is `result`, matching the client protocol,
                // while the collected field is named `output`.
                sendMessage({
                    type: "tool_result",
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    result: part.output,
                });
                break;
            case "tool-error":
                // Tool errors are forwarded but not collected into the result.
                sendMessage({
                    type: "tool_error",
                    toolName: part.toolName,
                    toolCallId: part.toolCallId,
                    error: String(part.error),
                });
                break;
            // ── Sources and files ─────────────────────────────
            case "source":
                // The whole part is kept for sources; only `.file` for files.
                allSources.push(part);
                sendMessage({
                    type: "source",
                    source: part,
                });
                break;
            case "file":
                allFiles.push(part.file);
                sendMessage({
                    type: "file",
                    file: part.file,
                });
                break;
        }
    }
    // Send the complete response
    sendMessage({
        type: "response_complete",
        text: fullText,
        reasoning: fullReasoning || undefined,
        toolCalls: allToolCalls,
        toolResults: allToolResults,
        sources: allSources.length > 0 ? allSources : undefined,
        files: allFiles.length > 0 ? allFiles : undefined,
        ...extraResponseFields,
    });
    return {
        fullText,
        fullReasoning,
        allToolCalls,
        allToolResults,
        allSources,
        allFiles,
    };
}
/**
* Handle onChunk callback events and emit them.
*/
function handleStreamChunk(chunk, emitEvent) {
switch (chunk.type) {
case "text-delta":
emitEvent("chunk:text_delta", { id: chunk.id, text: chunk.text });
break;
case "reasoning-delta":
emitEvent("chunk:reasoning_delta", {
id: chunk.id,
text: chunk.text,
});
break;
case "tool-call":
emitEvent("chunk:tool_call", {
toolName: chunk.toolName,
toolCallId: chunk.toolCallId,
input: chunk.input,
});
break;
case "tool-result":
emitEvent("chunk:tool_result", {
toolName: chunk.toolName,
toolCallId: chunk.toolCallId,
result: chunk.output,
});
break;
case "tool-input-start":
emitEvent("chunk:tool_input_start", {
id: chunk.id,
toolName: chunk.toolName,
});
break;
case "tool-input-delta":
emitEvent("chunk:tool_input_delta", {
id: chunk.id,
delta: chunk.delta,
});
break;
case "source":
emitEvent("chunk:source", chunk);
break;
}
}
//# sourceMappingURL=StreamProcessor.js.map

1
dist/core/StreamProcessor.js.map vendored Normal file

File diff suppressed because one or more lines are too long

28
dist/core/TranscriptionManager.d.ts vendored Normal file
View File

@@ -0,0 +1,28 @@
import { EventEmitter } from "events";
import { type TranscriptionModel } from "ai";
export interface TranscriptionManagerOptions {
    /** AI SDK transcription model; audio input is rejected without one. */
    transcriptionModel?: TranscriptionModel;
    /** Maximum accepted audio payload in bytes (defaults to DEFAULT_MAX_AUDIO_SIZE). */
    maxAudioInputSize?: number;
}
/**
 * Handles audio transcription using the AI SDK transcription model
 * and validation of incoming audio data.
 */
export declare class TranscriptionManager extends EventEmitter {
    private transcriptionModel?;
    private maxAudioInputSize;
    /** Callback to send messages over the WebSocket */
    sendMessage: (message: Record<string, unknown>) => void;
    constructor(options?: TranscriptionManagerOptions);
    /** Whether a transcription model was configured. */
    get hasTranscriptionModel(): boolean;
    /**
     * Transcribe audio data to text.
     * Throws if no transcription model is configured or transcription fails.
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /**
     * Process incoming base64-encoded audio: validate, decode, transcribe.
     * Returns the transcribed text, or null if invalid / empty.
     */
    processAudioInput(base64Audio: string, format?: string): Promise<string | null>;
}
//# sourceMappingURL=TranscriptionManager.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TranscriptionManager.d.ts","sourceRoot":"","sources":["../../src/core/TranscriptionManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAEL,KAAK,kBAAkB,EACxB,MAAM,IAAI,CAAC;AAGZ,MAAM,WAAW,2BAA2B;IAC1C,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;GAGG;AACH,qBAAa,oBAAqB,SAAQ,YAAY;IACpD,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,iBAAiB,CAAS;IAElC,mDAAmD;IAC5C,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAY;gBAE9D,OAAO,GAAE,2BAAgC;IAOrD,IAAI,qBAAqB,IAAI,OAAO,CAEnC;IAED;;OAEG;IACG,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAsCtE;;;OAGG;IACG,iBAAiB,CACrB,WAAW,EAAE,MAAM,EACnB,MAAM,CAAC,EAAE,MAAM,GACd,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;CA2D1B"}

106
dist/core/TranscriptionManager.js vendored Normal file
View File

@@ -0,0 +1,106 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TranscriptionManager = void 0;
const events_1 = require("events");
const ai_1 = require("ai");
const types_1 = require("../types");
/**
 * Handles audio transcription using the AI SDK transcription model
 * and validation of incoming audio data.
 *
 * Emits "transcription", "audio_received", "warning", and "error" events,
 * and notifies the connected client through the `sendMessage` callback.
 */
class TranscriptionManager extends events_1.EventEmitter {
    transcriptionModel;
    maxAudioInputSize;
    /** Callback to send messages over the WebSocket */
    sendMessage = () => { };
    constructor(options = {}) {
        super();
        const { transcriptionModel, maxAudioInputSize } = options;
        this.transcriptionModel = transcriptionModel;
        this.maxAudioInputSize = maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
    }
    /** Whether a transcription model was supplied at construction time. */
    get hasTranscriptionModel() {
        return Boolean(this.transcriptionModel);
    }
    /**
     * Transcribe audio data to text.
     *
     * On success, emits "transcription" and sends a "transcription_result"
     * message to the client; on failure, logs and rethrows.
     */
    async transcribeAudio(audioData) {
        if (!this.transcriptionModel) {
            throw new Error("Transcription model not configured");
        }
        console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
        try {
            const transcription = await (0, ai_1.experimental_transcribe)({
                model: this.transcriptionModel,
                audio: audioData,
            });
            console.log(`Whisper transcription result: "${transcription.text}", language: ${transcription.language || "unknown"}`);
            const payload = {
                text: transcription.text,
                language: transcription.language,
            };
            this.emit("transcription", payload);
            // Send transcription to client for immediate feedback
            this.sendMessage({ type: "transcription_result", ...payload });
            return transcription.text;
        }
        catch (error) {
            console.error("Whisper transcription failed:", error);
            throw error;
        }
    }
    /**
     * Process incoming base64-encoded audio: validate, decode, transcribe.
     * Returns the transcribed text, or null if invalid / empty.
     */
    async processAudioInput(base64Audio, format) {
        if (!this.transcriptionModel) {
            const missingModel = new Error("Transcription model not configured for audio input");
            this.emit("error", missingModel);
            this.sendMessage({ type: "error", error: missingModel.message });
            return null;
        }
        try {
            const decoded = Buffer.from(base64Audio, "base64");
            // Reject oversized payloads before attempting transcription.
            if (decoded.length > this.maxAudioInputSize) {
                const sizeMB = (decoded.length / (1024 * 1024)).toFixed(1);
                const maxMB = (this.maxAudioInputSize / (1024 * 1024)).toFixed(1);
                this.emit("error", new Error(`Audio input too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`));
                return null;
            }
            if (decoded.length === 0) {
                this.emit("warning", "Received empty audio data");
                return null;
            }
            this.emit("audio_received", { size: decoded.length, format });
            console.log(`Processing audio input: ${decoded.length} bytes, format: ${format || "unknown"}`);
            const transcribedText = await this.transcribeAudio(decoded);
            console.log(`Transcribed text: "${transcribedText}"`);
            // Whisper can return whitespace-only output; treat it as a failure.
            if (!transcribedText.trim()) {
                this.emit("warning", "Transcription returned empty text");
                this.sendMessage({
                    type: "transcription_error",
                    error: "Whisper returned empty text",
                });
                return null;
            }
            return transcribedText;
        }
        catch (error) {
            console.error("Failed to process audio input:", error);
            this.emit("error", error);
            this.sendMessage({
                type: "transcription_error",
                error: `Transcription failed: ${error.message || String(error)}`,
            });
            return null;
        }
    }
}
exports.TranscriptionManager = TranscriptionManager;
//# sourceMappingURL=TranscriptionManager.js.map

1
dist/core/TranscriptionManager.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"TranscriptionManager.js","sourceRoot":"","sources":["../../src/core/TranscriptionManager.ts"],"names":[],"mappings":";;;AAAA,mCAAsC;AACtC,2BAGY;AACZ,oCAAkD;AAOlD;;;GAGG;AACH,MAAa,oBAAqB,SAAQ,qBAAY;IAC5C,kBAAkB,CAAsB;IACxC,iBAAiB,CAAS;IAElC,mDAAmD;IAC5C,WAAW,GAA+C,GAAG,EAAE,GAAE,CAAC,CAAC;IAE1E,YAAY,UAAuC,EAAE;QACnD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,CAAC;QACrD,IAAI,CAAC,iBAAiB;YACpB,OAAO,CAAC,iBAAiB,IAAI,8BAAsB,CAAC;IACxD,CAAC;IAED,IAAI,qBAAqB;QACvB,OAAO,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC;IACnC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,eAAe,CAAC,SAA8B;QAClD,IAAI,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,CAAC,GAAG,CACT,WAAW,SAAS,CAAC,UAAU,qCAAqC,CACrE,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAA,4BAAU,EAAC;gBAC9B,KAAK,EAAE,IAAI,CAAC,kBAAkB;gBAC9B,KAAK,EAAE,SAAS;aACjB,CAAC,CAAC;YAEH,OAAO,CAAC,GAAG,CACT,kCAAkC,MAAM,CAAC,IAAI,gBAAgB,MAAM,CAAC,QAAQ,IAAI,SAAS,EAAE,CAC5F,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE;gBACzB,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC1B,CAAC,CAAC;YAEH,sDAAsD;YACtD,IAAI,CAAC,WAAW,CAAC;gBACf,IAAI,EAAE,sBAAsB;gBAC5B,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC1B,CAAC,CAAC;YAEH,OAAO,MAAM,CAAC,IAAI,CAAC;QACrB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,+BAA+B,EAAE,KAAK,CAAC,CAAC;YACtD,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,iBAAiB,CACrB,WAAmB,EACnB,MAAe;QAEf,IAAI,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,KAAK,CACrB,oDAAoD,CACrD,CAAC;YACF,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC1B,IAAI,CAAC,WAAW,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YAC1D,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;YAEvD,sBAAsB;YACtB,IAAI,WAAW,CAAC,MAAM,GAAG,IAAI,CAAC,iBAAiB,EAAE,CAAC;gBAChD,MAAM,MAAM,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBAC/D,MAAM,KAAK,GAAG,CAAC,IA
AI,CAAC,iBAAiB,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBAClE,IAAI,CAAC,IAAI,CACP,OAAO,EACP,IAAI,KAAK,CACP,0BAA0B,MAAM,0BAA0B,KAAK,KAAK,CACrE,CACF,CAAC;gBACF,OAAO,IAAI,CAAC;YACd,CAAC;YAED,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC7B,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,2BAA2B,CAAC,CAAC;gBAClD,OAAO,IAAI,CAAC;YACd,CAAC;YAED,IAAI,CAAC,IAAI,CAAC,gBAAgB,EAAE,EAAE,IAAI,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;YAClE,OAAO,CAAC,GAAG,CACT,2BAA2B,WAAW,CAAC,MAAM,mBAAmB,MAAM,IAAI,SAAS,EAAE,CACtF,CAAC;YAEF,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,WAAW,CAAC,CAAC;YAChE,OAAO,CAAC,GAAG,CAAC,sBAAsB,eAAe,GAAG,CAAC,CAAC;YAEtD,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;gBAC5B,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,mCAAmC,CAAC,CAAC;gBAC1D,IAAI,CAAC,WAAW,CAAC;oBACf,IAAI,EAAE,qBAAqB;oBAC3B,KAAK,EAAE,6BAA6B;iBACrC,CAAC,CAAC;gBACH,OAAO,IAAI,CAAC;YACd,CAAC;YAED,OAAO,eAAe,CAAC;QACzB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,gCAAgC,EAAE,KAAK,CAAC,CAAC;YACvD,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC1B,IAAI,CAAC,WAAW,CAAC;gBACf,IAAI,EAAE,qBAAqB;gBAC3B,KAAK,EAAE,yBAA0B,KAAe,CAAC,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,EAAE;aAC5E,CAAC,CAAC;YACH,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;CACF;AA7HD,oDA6HC"}

35
dist/core/WebSocketManager.d.ts vendored Normal file
View File

@@ -0,0 +1,35 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
/**
 * Manages a single WebSocket connection lifecycle.
 * Handles connecting, attaching existing sockets, sending messages,
 * and clean disconnection.
 */
export declare class WebSocketManager extends EventEmitter {
    private socket?;
    private _isConnected;
    /** True while a socket is attached and has not closed. */
    get isConnected(): boolean;
    /** The currently attached socket, if any. */
    get currentSocket(): WebSocket | undefined;
    /**
     * Connect to a WebSocket server by URL.
     * Resolves once the socket opens; rejects on the first connection error.
     * Any previously attached socket is disconnected first.
     */
    connect(url: string): Promise<void>;
    /**
     * Attach an existing WebSocket (server-side usage).
     * Marks the manager connected immediately and emits "connected".
     */
    handleSocket(socket: WebSocket): void;
    /**
     * Send a JSON message via WebSocket if connected.
     * Gracefully handles send failures (e.g., socket closing mid-send).
     */
    send(message: Record<string, unknown>): void;
    /**
     * Disconnect and clean up the current socket.
     */
    disconnect(): void;
    /**
     * Attach internal event listeners on the current socket.
     */
    private attachListeners;
}
//# sourceMappingURL=WebSocketManager.d.ts.map

1
dist/core/WebSocketManager.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"WebSocketManager.d.ts","sourceRoot":"","sources":["../../src/core/WebSocketManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AAEtC;;;;GAIG;AACH,qBAAa,gBAAiB,SAAQ,YAAY;IAChD,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,YAAY,CAAS;IAE7B,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,aAAa,IAAI,SAAS,GAAG,SAAS,CAEzC;IAED;;OAEG;IACH,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA0BnC;;OAEG;IACH,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAYrC;;;OAGG;IACH,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI;IAgB5C;;OAEG;IACH,UAAU,IAAI,IAAI;IAmBlB;;OAEG;IACH,OAAO,CAAC,eAAe;CAuBxB"}

126
dist/core/WebSocketManager.js vendored Normal file
View File

@@ -0,0 +1,126 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.WebSocketManager = void 0;
const ws_1 = require("ws");
const events_1 = require("events");
/**
 * Manages a single WebSocket connection lifecycle.
 * Handles connecting, attaching existing sockets, sending messages,
 * and clean disconnection.
 *
 * Emits "connected", "disconnected", "message", and "error".
 */
class WebSocketManager extends events_1.EventEmitter {
    socket;
    _isConnected = false;
    /** True while a socket is attached and has not closed. */
    get isConnected() {
        return this._isConnected;
    }
    /** The underlying socket, if any. */
    get currentSocket() {
        return this.socket;
    }
    /**
     * Connect to a WebSocket server by URL.
     * Resolves once the socket opens; rejects on the first connection error.
     */
    connect(url) {
        // Drop any previous socket before opening a new one.
        if (this.socket) {
            this.disconnect();
        }
        return new Promise((resolve, reject) => {
            try {
                const ws = new ws_1.WebSocket(url);
                this.socket = ws;
                this.attachListeners();
                ws.once("open", () => {
                    this._isConnected = true;
                    this.emit("connected");
                    resolve();
                });
                ws.once("error", reject);
            }
            catch (err) {
                reject(err);
            }
        });
    }
    /**
     * Attach an existing WebSocket (server-side usage).
     */
    handleSocket(socket) {
        // Drop any previous socket before adopting the new one.
        if (this.socket) {
            this.disconnect();
        }
        this.socket = socket;
        this._isConnected = true;
        this.attachListeners();
        this.emit("connected");
    }
    /**
     * Send a JSON message via WebSocket if connected.
     * Gracefully handles send failures (e.g., socket closing mid-send).
     */
    send(message) {
        const ws = this.socket;
        if (!ws || !this._isConnected)
            return;
        try {
            if (ws.readyState === ws_1.WebSocket.OPEN) {
                ws.send(JSON.stringify(message));
            }
            else {
                console.warn(`Cannot send message, socket state: ${ws.readyState}`);
            }
        }
        catch (err) {
            // Socket may have closed between the readyState check and send()
            console.error("Failed to send WebSocket message:", err);
            this.emit("error", err);
        }
    }
    /**
     * Disconnect and clean up the current socket.
     */
    disconnect() {
        const ws = this.socket;
        if (!ws)
            return;
        try {
            ws.removeAllListeners();
            const state = ws.readyState;
            if (state === ws_1.WebSocket.OPEN || state === ws_1.WebSocket.CONNECTING) {
                ws.close();
            }
        }
        catch {
            // Ignore close errors — socket may already be dead
        }
        this.socket = undefined;
        this._isConnected = false;
    }
    /**
     * Attach internal event listeners on the current socket.
     */
    attachListeners() {
        const ws = this.socket;
        if (!ws)
            return;
        ws.on("message", (data) => {
            try {
                const parsed = JSON.parse(data.toString());
                this.emit("message", parsed);
            }
            catch (err) {
                console.error("Failed to parse WebSocket message:", err);
                this.emit("error", err);
            }
        });
        ws.on("close", () => {
            this._isConnected = false;
            this.emit("disconnected");
        });
        ws.on("error", (error) => {
            console.error("WebSocket error:", error);
            this.emit("error", error);
        });
    }
}
exports.WebSocketManager = WebSocketManager;
//# sourceMappingURL=WebSocketManager.js.map

1
dist/core/WebSocketManager.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"WebSocketManager.js","sourceRoot":"","sources":["../../src/core/WebSocketManager.ts"],"names":[],"mappings":";;;AAAA,2BAA+B;AAC/B,mCAAsC;AAEtC;;;;GAIG;AACH,MAAa,gBAAiB,SAAQ,qBAAY;IACxC,MAAM,CAAa;IACnB,YAAY,GAAG,KAAK,CAAC;IAE7B,IAAI,WAAW;QACb,OAAO,IAAI,CAAC,YAAY,CAAC;IAC3B,CAAC;IAED,IAAI,aAAa;QACf,OAAO,IAAI,CAAC,MAAM,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,GAAW;QACjB,yCAAyC;QACzC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC;QAED,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,IAAI,CAAC;gBACH,IAAI,CAAC,MAAM,GAAG,IAAI,cAAS,CAAC,GAAG,CAAC,CAAC;gBACjC,IAAI,CAAC,eAAe,EAAE,CAAC;gBAEvB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE;oBAC5B,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;oBACzB,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;oBACvB,OAAO,EAAE,CAAC;gBACZ,CAAC,CAAC,CAAC;gBAEH,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;oBAClC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAChB,CAAC,CAAC,CAAC;YACL,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,CAAC,KAAK,CAAC,CAAC;YAChB,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,MAAiB;QAC5B,yCAAyC;QACzC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC;QAED,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QACzB,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACzB,CAAC;IAED;;;OAGG;IACH,IAAI,CAAC,OAAgC;QACnC,IAAI,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,YAAY;YAAE,OAAO;QAE/C,IAAI,CAAC;YACH,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU,KAAK,cAAS,CAAC,IAAI,EAAE,CAAC;gBAC9C,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;YAC5C,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,sCAAsC,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/E,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,iEAAiE;YACjE,OAAO,CAAC,KAAK,CAAC,mCAAmC,EAAE,KAAK,CAAC,CAAC;YAC1D,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,OAAO;QAEzB,IAAI,CAAC;YACH,IAAI,CAAC,MAAM,CAAC,kBAAkB,EAAE,CAAC;YACjC,IACE,IAAI,CAAC,MAAM,C
AAC,UAAU,KAAK,cAAS,CAAC,IAAI;gBACzC,IAAI,CAAC,MAAM,CAAC,UAAU,KAAK,cAAS,CAAC,UAAU,EAC/C,CAAC;gBACD,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;YACtB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,mDAAmD;QACrD,CAAC;QAED,IAAI,CAAC,MAAM,GAAG,SAAS,CAAC;QACxB,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,eAAe;QACrB,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,OAAO;QAEzB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,EAAE;YACjC,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAC5C,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAChC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,CAAC,KAAK,CAAC,oCAAoC,EAAE,GAAG,CAAC,CAAC;gBACzD,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YAC3B,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;YAC1B,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YAChC,OAAO,CAAC,KAAK,CAAC,kBAAkB,EAAE,KAAK,CAAC,CAAC;YACzC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5B,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AA5HD,4CA4HC"}

7
dist/core/index.d.ts vendored Normal file
View File

@@ -0,0 +1,7 @@
export { WebSocketManager } from "./WebSocketManager";
export { SpeechManager, type SpeechManagerOptions } from "./SpeechManager";
export { ConversationManager, type ConversationManagerOptions, } from "./ConversationManager";
export { TranscriptionManager, type TranscriptionManagerOptions, } from "./TranscriptionManager";
export { processFullStream, handleStreamChunk, type StreamResult, type StreamProcessorCallbacks, } from "./StreamProcessor";
export { InputQueue, type QueueItem } from "./InputQueue";
//# sourceMappingURL=index.d.ts.map

1
dist/core/index.d.ts.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,KAAK,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAC3E,OAAO,EACL,mBAAmB,EACnB,KAAK,0BAA0B,GAChC,MAAM,uBAAuB,CAAC;AAC/B,OAAO,EACL,oBAAoB,EACpB,KAAK,2BAA2B,GACjC,MAAM,wBAAwB,CAAC;AAChC,OAAO,EACL,iBAAiB,EACjB,iBAAiB,EACjB,KAAK,YAAY,EACjB,KAAK,wBAAwB,GAC9B,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,UAAU,EAAE,KAAK,SAAS,EAAE,MAAM,cAAc,CAAC"}

17
dist/core/index.js vendored Normal file
View File

@@ -0,0 +1,17 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.InputQueue = exports.handleStreamChunk = exports.processFullStream = exports.TranscriptionManager = exports.ConversationManager = exports.SpeechManager = exports.WebSocketManager = void 0;
var WebSocketManager_1 = require("./WebSocketManager");
Object.defineProperty(exports, "WebSocketManager", { enumerable: true, get: function () { return WebSocketManager_1.WebSocketManager; } });
var SpeechManager_1 = require("./SpeechManager");
Object.defineProperty(exports, "SpeechManager", { enumerable: true, get: function () { return SpeechManager_1.SpeechManager; } });
var ConversationManager_1 = require("./ConversationManager");
Object.defineProperty(exports, "ConversationManager", { enumerable: true, get: function () { return ConversationManager_1.ConversationManager; } });
var TranscriptionManager_1 = require("./TranscriptionManager");
Object.defineProperty(exports, "TranscriptionManager", { enumerable: true, get: function () { return TranscriptionManager_1.TranscriptionManager; } });
var StreamProcessor_1 = require("./StreamProcessor");
Object.defineProperty(exports, "processFullStream", { enumerable: true, get: function () { return StreamProcessor_1.processFullStream; } });
Object.defineProperty(exports, "handleStreamChunk", { enumerable: true, get: function () { return StreamProcessor_1.handleStreamChunk; } });
var InputQueue_1 = require("./InputQueue");
Object.defineProperty(exports, "InputQueue", { enumerable: true, get: function () { return InputQueue_1.InputQueue; } });
//# sourceMappingURL=index.js.map

1
dist/core/index.js.map vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":";;;AAAA,uDAAsD;AAA7C,oHAAA,gBAAgB,OAAA;AACzB,iDAA2E;AAAlE,8GAAA,aAAa,OAAA;AACtB,6DAG+B;AAF7B,0HAAA,mBAAmB,OAAA;AAGrB,+DAGgC;AAF9B,4HAAA,oBAAoB,OAAA;AAGtB,qDAK2B;AAJzB,oHAAA,iBAAiB,OAAA;AACjB,oHAAA,iBAAiB,OAAA;AAInB,2CAA0D;AAAjD,wGAAA,UAAU,OAAA"}

4
dist/index.d.ts vendored
View File

@@ -1,4 +1,4 @@
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent"; export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent.new";
export { VideoAgent, type VideoAgentOptions, type VideoFrame, type AudioData, type VideoAgentConfig, type FrameContext, type FrameTriggerReason, } from "./VideoAgent"; export { VideoAgent, type VideoAgentOptions, type VideoFrame, type AudioData, type VideoAgentConfig, type FrameContext, type FrameTriggerReason, } from "./VideoAgent.new";
export { type SpeechChunk, type StreamingSpeechConfig, type HistoryConfig, type StopWhenCondition, DEFAULT_STREAMING_SPEECH_CONFIG, DEFAULT_HISTORY_CONFIG, DEFAULT_MAX_AUDIO_SIZE, } from "./types"; export { type SpeechChunk, type StreamingSpeechConfig, type HistoryConfig, type StopWhenCondition, DEFAULT_STREAMING_SPEECH_CONFIG, DEFAULT_HISTORY_CONFIG, DEFAULT_MAX_AUDIO_SIZE, } from "./types";
//# sourceMappingURL=index.d.ts.map //# sourceMappingURL=index.d.ts.map

2
dist/index.d.ts.map vendored
View File

@@ -1 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAClE,OAAO,EACH,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,KAAK,kBAAkB,GAC1B,MAAM,cAAc,CAAC;AAGtB,OAAO,EACH,KAAK,WAAW,EAChB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,+BAA+B,EAC/B,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,SAAS,CAAC"} {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AACtE,OAAO,EACH,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,KAAK,kBAAkB,GAC1B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACH,KAAK,WAAW,EAChB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,+BAA+B,EAC/B,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,SAAS,CAAC"}

8
dist/index.js vendored
View File

@@ -2,10 +2,10 @@
Object.defineProperty(exports, "__esModule", { value: true }); Object.defineProperty(exports, "__esModule", { value: true });
exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = exports.VideoAgent = exports.VoiceAgent = void 0; exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = exports.VideoAgent = exports.VoiceAgent = void 0;
// Agents // Agents
var VoiceAgent_1 = require("./VoiceAgent"); var VoiceAgent_new_1 = require("./VoiceAgent.new");
Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_1.VoiceAgent; } }); Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_new_1.VoiceAgent; } });
var VideoAgent_1 = require("./VideoAgent"); var VideoAgent_new_1 = require("./VideoAgent.new");
Object.defineProperty(exports, "VideoAgent", { enumerable: true, get: function () { return VideoAgent_1.VideoAgent; } }); Object.defineProperty(exports, "VideoAgent", { enumerable: true, get: function () { return VideoAgent_new_1.VideoAgent; } });
// Shared types // Shared types
var types_1 = require("./types"); var types_1 = require("./types");
Object.defineProperty(exports, "DEFAULT_STREAMING_SPEECH_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_STREAMING_SPEECH_CONFIG; } }); Object.defineProperty(exports, "DEFAULT_STREAMING_SPEECH_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_STREAMING_SPEECH_CONFIG; } });

2
dist/index.js.map vendored
View File

@@ -1 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,SAAS;AACT,2CAAkE;AAAzD,wGAAA,UAAU,OAAA;AACnB,2CAQsB;AAPlB,wGAAA,UAAU,OAAA;AASd,eAAe;AACf,iCAQiB;AAHb,wHAAA,+BAA+B,OAAA;AAC/B,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA"} {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,SAAS;AACT,mDAAsE;AAA7D,4GAAA,UAAU,OAAA;AACnB,mDAQ0B;AAPtB,4GAAA,UAAU,OAAA;AASd,eAAe;AACf,iCAQiB;AAHb,wHAAA,+BAA+B,OAAA;AAC/B,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA"}

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

View File

@@ -2,19 +2,19 @@ const http = require('http');
const fs = require('fs'); const fs = require('fs');
const path = require('path'); const path = require('path');
const PORT = 3000; const PORT = 3102;
// Create a simple HTTP server to serve the voice client HTML // Create a simple HTTP server to serve the voice client HTML
const server = http.createServer((req, res) => { const server = http.createServer((req, res) => {
if (req.url === '/' || req.url === '/index.html') { if (req.url === '/' || req.url === '/index.html') {
const htmlPath = path.join(__dirname, 'voice-client.html'); const htmlPath = path.join(__dirname, 'video-client.html');
fs.readFile(htmlPath, (err, data) => { fs.readFile(htmlPath, (err, data) => {
if (err) { if (err) {
res.writeHead(500); res.writeHead(500);
res.end('Error loading voice-client.html'); res.end('Error loading voice-client.html');
return; return;
} }
res.writeHead(200, {'Content-Type': 'text/html'}); res.writeHead(200, { 'Content-Type': 'text/html' });
res.end(data); res.end(data);
}); });
} else { } else {

998
example/video-client.html Normal file
View File

@@ -0,0 +1,998 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Video + Voice Agent Client</title>
<style>
body {
font-family: system-ui, sans-serif;
max-width: 1000px;
margin: 20px auto;
padding: 0 16px;
background: #f9fafb;
color: #111827;
}
h1 {
margin-bottom: 8px;
}
.subtitle {
color: #6b7280;
font-size: 0.95rem;
margin-bottom: 24px;
}
.card {
background: white;
border: 1px solid #e5e7eb;
border-radius: 12px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
}
.row {
display: flex;
flex-wrap: wrap;
gap: 12px;
align-items: center;
margin-bottom: 16px;
}
video {
width: 100%;
max-width: 520px;
border-radius: 10px;
background: #000;
aspect-ratio: 4 / 3;
}
button {
padding: 10px 16px;
border-radius: 8px;
border: 1px solid #d1d5db;
background: white;
cursor: pointer;
font-weight: 500;
}
button.primary {
background: #2563eb;
color: white;
border-color: #2563eb;
}
button.danger {
background: #dc2626;
color: white;
border-color: #dc2626;
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.status {
font-weight: 600;
margin: 8px 0;
font-size: 0.95rem;
}
.dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 50%;
margin-right: 8px;
}
.dot.disconnected {
background: #9ca3af;
}
.dot.connected {
background: #22c55e;
}
.dot.listening {
background: #f59e0b;
animation: pulse 1.5s infinite;
}
.dot.speaking {
background: #3b82f6;
animation: pulse 1.2s infinite;
}
@keyframes pulse {
0%,
100% {
opacity: 1
}
50% {
opacity: 0.6
}
}
#transcript,
#assistant,
#reasoning,
#tools {
min-height: 48px;
padding: 12px;
border-radius: 8px;
background: #f3f4f6;
border-left: 4px solid #9ca3af;
margin-bottom: 16px;
white-space: pre-wrap;
}
#transcript {
border-left-color: #2563eb;
}
#assistant {
border-left-color: #22c55e;
}
#reasoning {
border-left-color: #f59e0b;
font-style: italic;
color: #4b5563;
}
#tools {
border-left-color: #8b5cf6;
font-size: 0.9rem;
}
#log {
background: #0f172a;
color: #e2e8f0;
font-family: 'SF Mono', monospace;
font-size: 0.82rem;
padding: 12px;
border-radius: 8px;
max-height: 240px;
overflow-y: auto;
white-space: pre-wrap;
}
.hidden {
display: none;
}
/* ── Mic selector & level meter ── */
#micRow {
margin-bottom: 12px;
}
#micSelect {
flex: 1;
min-width: 180px;
padding: 6px 8px;
border-radius: 6px;
border: 1px solid #d1d5db;
}
#refreshMicsBtn {
padding: 6px 12px;
font-size: 0.85rem;
}
.meter-wrap {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 12px;
}
.meter-wrap label {
font-size: 0.85rem;
white-space: nowrap;
}
#levelMeter {
flex: 1;
height: 14px;
border-radius: 7px;
background: #e5e7eb;
overflow: hidden;
}
#levelBar {
height: 100%;
width: 0%;
border-radius: 7px;
background: #22c55e;
transition: width 60ms linear;
}
#levelBar.hot {
background: #ef4444;
}
#rmsValue {
font-family: monospace;
font-size: 0.8rem;
width: 56px;
text-align: right;
}
/* ── Push-to-talk ── */
#pttBtn {
padding: 10px 20px;
font-size: 1rem;
font-weight: 600;
border-radius: 10px;
border: 2px solid #2563eb;
background: #eff6ff;
color: #2563eb;
cursor: pointer;
user-select: none;
touch-action: none;
}
#pttBtn:active,
#pttBtn.active {
background: #dc2626;
color: white;
border-color: #dc2626;
}
#pttBtn:disabled {
opacity: 0.4;
cursor: not-allowed;
}
</style>
</head>
<body>
<h1>📹 Video + Voice Agent</h1>
<p class="subtitle">Webcam + microphone → multimodal AI (vision + speech)</p>
<div class="card">
<video id="localVideo" autoplay playsinline muted></video>
<canvas id="frameCanvas" style="display:none"></canvas>
<div class="row" style="margin-top:16px">
<input type="text" id="wsEndpoint" value="ws://localhost:8081" style="flex:1; min-width:260px" />
<button id="connectBtn" class="primary">Connect</button>
<button id="disconnectBtn" disabled>Disconnect</button>
</div>
<!-- ── Mic selector ── -->
<div class="row" id="micRow">
<label>Microphone:</label>
<select id="micSelect">
<option value="">-- click Refresh --</option>
</select>
<button id="refreshMicsBtn">🔄 Refresh</button>
</div>
<!-- ── Live level meter ── -->
<div class="meter-wrap">
<label>Mic level:</label>
<div id="levelMeter">
<div id="levelBar"></div>
</div>
<span id="rmsValue">0.000</span>
</div>
<div class="row">
<label>Input mode:</label>
<select id="inputMode">
<option value="browser-stt">Browser STT</option>
<option value="server-whisper">Server Whisper (VAD)</option>
<option value="push-to-talk" selected>Push-to-Talk</option>
</select>
<label>Frames:</label>
<select id="frameInterval">
<option value="3000">every 3s</option>
<option value="5000" selected>every 5s</option>
<option value="10000">every 10s</option>
<option value="0">manual only</option>
</select>
</div>
<div class="row">
<button id="startMediaBtn" disabled>📹🎤 Start Camera + Mic</button>
<button id="stopMediaBtn" disabled>⏹ Stop</button>
<button id="captureBtn" disabled>Capture Frame Now</button>
<button id="pttBtn" disabled>🎙 Hold to Talk</button>
<button id="interruptBtn" class="danger" disabled>✋ Interrupt</button>
</div>
<div class="status" id="status">
<span class="dot disconnected"></span>Disconnected
</div>
</div>
<h3>👤 You said</h3>
<div id="transcript"></div>
<h3>🤖 Assistant</h3>
<div id="assistant"></div>
<div id="reasoningSection" class="hidden">
<h3>💭 Reasoning</h3>
<div id="reasoning"></div>
</div>
<div id="toolsSection" class="hidden">
<h3>🛠️ Tools</h3>
<div id="tools"></div>
</div>
<h3>📜 Log</h3>
<div id="log"></div>
<script>
// ────────────────────────────────────────────────────────────────
// State & Elements
// ────────────────────────────────────────────────────────────────
const els = {
wsEndpoint: document.getElementById('wsEndpoint'),
connectBtn: document.getElementById('connectBtn'),
disconnectBtn: document.getElementById('disconnectBtn'),
inputMode: document.getElementById('inputMode'),
frameInterval: document.getElementById('frameInterval'),
startMediaBtn: document.getElementById('startMediaBtn'),
stopMediaBtn: document.getElementById('stopMediaBtn'),
captureBtn: document.getElementById('captureBtn'),
pttBtn: document.getElementById('pttBtn'),
interruptBtn: document.getElementById('interruptBtn'),
status: document.getElementById('status'),
transcript: document.getElementById('transcript'),
assistant: document.getElementById('assistant'),
reasoningSec: document.getElementById('reasoningSection'),
reasoning: document.getElementById('reasoning'),
toolsSec: document.getElementById('toolsSection'),
tools: document.getElementById('tools'),
log: document.getElementById('log'),
video: document.getElementById('localVideo'),
canvas: document.getElementById('frameCanvas'),
micSelect: document.getElementById('micSelect'),
refreshMicsBtn: document.getElementById('refreshMicsBtn'),
levelBar: document.getElementById('levelBar'),
rmsValue: document.getElementById('rmsValue'),
};
let ws = null;
let localStream = null;
let audioOnlyStream = null; // ← ADD THIS
let mediaRecorder = null;
let audioChunks = [];
let frameTimer = null;
let audioQueue = [];
let isPlaying = false;
let currentSource = null;
// Level-meter / VAD audio nodes (use browser-native sample rate)
let meterCtx = null; // AudioContext for the meter (always running when media is on)
let meterAnalyser = null;
let meterSource = null;
let meterRafId = null;
// VAD-specific
let silenceStart = null;
let recordingStartTime = null;
const SPEECH_THRESHOLD = 0.015;
const SILENCE_THRESHOLD = 0.008;
const SILENCE_DURATION = 1400; // ms
const MIN_RECORDING_TIME = 600; // ms
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
let recognition = null;
// ────────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────────
// Append one timestamped line to the on-page log pane and keep it scrolled.
function log(...args) {
  const stamp = new Date().toLocaleTimeString([], { hour12: false });
  els.log.textContent += `[${stamp}] ${args.join(' ')}\n`;
  els.log.scrollTop = els.log.scrollHeight;
}
// Render the status pill: a colored dot (CSS state class) followed by text.
function setStatus(text, state = 'disconnected') {
  const dot = `<span class="dot ${state}"></span>`;
  els.status.innerHTML = dot + text;
}
// Enable every named control that exists in the element map.
function enable(...names) {
  for (const name of names) {
    if (els[name]) els[name].disabled = false;
  }
}
// Disable every named control that exists in the element map.
function disable(...names) {
  for (const name of names) {
    if (els[name]) els[name].disabled = true;
  }
}
// Clear the assistant output panes and re-hide the optional sections.
function resetUI() {
  for (const pane of [els.assistant, els.reasoning, els.tools]) {
    pane.textContent = '';
  }
  els.reasoningSec.classList.add('hidden');
  els.toolsSec.classList.add('hidden');
}
// ────────────────────────────────────────────────────────────────
// Mic enumeration
// ────────────────────────────────────────────────────────────────
// Populate the microphone <select>. A short-lived getUserMedia call is
// made first because enumerateDevices() hides labels until the page has
// microphone permission.
async function refreshMics() {
  try {
    const probe = await navigator.mediaDevices.getUserMedia({ audio: true });
    for (const track of probe.getTracks()) track.stop();
    const inputs = (await navigator.mediaDevices.enumerateDevices())
      .filter(d => d.kind === 'audioinput');
    els.micSelect.innerHTML = '';
    inputs.forEach((mic, i) => {
      const opt = document.createElement('option');
      opt.value = mic.deviceId;
      opt.textContent = mic.label || `Microphone ${i + 1}`;
      els.micSelect.appendChild(opt);
    });
    log(`Found ${inputs.length} microphone(s)`);
  } catch (err) {
    log('Mic enumeration failed:', err.message);
  }
}
els.refreshMicsBtn.onclick = refreshMics;
// Auto-populate on page load
refreshMics();
// ────────────────────────────────────────────────────────────────
// Live audio level meter (always-on when media is active)
// Uses AnalyserNode + rAF — no ScriptProcessorNode needed.
// ────────────────────────────────────────────────────────────────
// Start the always-on input level meter for `stream`.
// Taps the stream through an AnalyserNode in a fresh AudioContext and
// runs a requestAnimationFrame loop that (a) updates the level bar and
// RMS readout, and (b) drives the VAD when "server-whisper" mode is on.
function startLevelMeter(stream) {
  // Use the browser's native sample rate (NO custom sampleRate!)
  meterCtx = new (window.AudioContext || window.webkitAudioContext)();
  meterSource = meterCtx.createMediaStreamSource(stream);
  meterAnalyser = meterCtx.createAnalyser();
  meterAnalyser.fftSize = 1024;
  meterSource.connect(meterAnalyser);
  // Do NOT connect to destination — we don't want to hear ourselves
  const buf = new Float32Array(meterAnalyser.fftSize);
  function tick() {
    // Root-mean-square amplitude of the latest time-domain window.
    meterAnalyser.getFloatTimeDomainData(buf);
    let sum = 0;
    for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
    const rms = Math.sqrt(sum / buf.length);
    // Update UI
    const pct = Math.min(rms / 0.15, 1) * 100; // 0.15 is "loud"
    els.levelBar.style.width = pct + '%';
    els.levelBar.classList.toggle('hot', rms > SPEECH_THRESHOLD);
    els.rmsValue.textContent = rms.toFixed(4);
    // If VAD mode is active, drive it from here
    if (els.inputMode.value === 'server-whisper') {
      vadTick(rms);
    }
    meterRafId = requestAnimationFrame(tick);
  }
  tick();
  log(`Level meter started (sampleRate=${meterCtx.sampleRate})`);
}
// Tear down the level meter: cancel the rAF loop, disconnect and drop
// the audio nodes, close the context, and zero the UI readouts.
function stopLevelMeter() {
  if (meterRafId) {
    cancelAnimationFrame(meterRafId);
    meterRafId = null;
  }
  if (meterSource) {
    meterSource.disconnect();
    meterSource = null;
  }
  if (meterAnalyser) {
    meterAnalyser.disconnect();
    meterAnalyser = null;
  }
  if (meterCtx) {
    meterCtx.close();
    meterCtx = null;
  }
  els.levelBar.style.width = '0%';
  els.rmsValue.textContent = '0.000';
}
// ────────────────────────────────────────────────────────────────
// Frame capture & send
// ────────────────────────────────────────────────────────────────
// Grab the current <video> frame, encode it as webp, and ship it to the
// server as a `video_frame` message. No-op until video has dimensions.
function captureFrame(reason = 'timer') {
  if (!els.video.videoWidth) return;
  const { canvas, video } = els;
  canvas.width = video.videoWidth;
  canvas.height = video.videoHeight;
  canvas.getContext('2d').drawImage(video, 0, 0);
  const base64 = canvas.toDataURL('image/webp', 0.78).split(',')[1];
  if (ws?.readyState !== WebSocket.OPEN) return;
  ws.send(JSON.stringify({
    type: 'video_frame',
    sessionId: 'client-main',
    sequence: Date.now(),
    timestamp: Date.now(),
    triggerReason: reason,
    image: {
      data: base64,
      format: 'webp',
      width: canvas.width,
      height: canvas.height
    }
  }));
  log(`Frame sent (${(base64.length / 1000).toFixed(1)} kB) — ${reason}`);
}
// ────────────────────────────────────────────────────────────────
// Audio playback queue
// ────────────────────────────────────────────────────────────────
// Play the next queued assistant audio chunk, one chunk at a time.
// `isPlaying` serializes playback; each chunk gets its own AudioContext,
// which is closed in onended before chaining into the next chunk.
// NOTE(review): a fresh AudioContext per chunk is heavyweight but works;
// left as-is to preserve behavior.
async function playNext() {
  if (isPlaying || audioQueue.length === 0) return;
  isPlaying = true;
  const { bytes, format } = audioQueue.shift();
  try {
    const ctx = new (window.AudioContext || window.webkitAudioContext)();
    // decodeAudioData needs an ArrayBuffer spanning exactly this chunk.
    const buffer = await ctx.decodeAudioData(
      bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.length)
    );
    const source = ctx.createBufferSource();
    source.buffer = buffer;
    source.connect(ctx.destination);
    currentSource = source; // kept so interrupt() can stop playback mid-chunk
    source.onended = () => {
      currentSource = null;
      isPlaying = false;
      ctx.close();
      playNext(); // chain into the next queued chunk
    };
    source.start(0);
    log(`Playing audio chunk (${bytes.length} bytes, ${format})`);
  } catch (err) {
    console.error('Audio decode/play error:', err);
    isPlaying = false;
    playNext();
  }
}
// ────────────────────────────────────────────────────────────────
// WebSocket
// ────────────────────────────────────────────────────────────────
// Open a WebSocket to the entered endpoint and wire its lifecycle
// handlers to the UI (status pill, button enablement, message routing).
function connect() {
  const url = els.wsEndpoint.value.trim();
  if (!url) return log('No endpoint');
  setStatus('Connecting...', 'disconnected');
  ws = new WebSocket(url);
  ws.onopen = () => {
    setStatus('Connected', 'connected');
    enable('startMediaBtn', 'interruptBtn', 'captureBtn');
    disable('connectBtn');
    enable('disconnectBtn');
    log(`Connected to ${url}`);
  };
  ws.onclose = () => {
    // Full UI reset: media is stopped too, since frames/audio have nowhere to go.
    setStatus('Disconnected', 'disconnected');
    disable('startMediaBtn', 'stopMediaBtn', 'captureBtn', 'interruptBtn', 'pttBtn');
    enable('connectBtn');
    disable('disconnectBtn');
    stopAllMedia();
    log('Disconnected');
    ws = null;
  };
  ws.onerror = (e) => {
    log('WebSocket error', e);
    setStatus('Error', 'disconnected');
  };
  ws.onmessage = (event) => {
    // All expected server traffic is JSON; anything else is logged and dropped.
    try {
      const msg = JSON.parse(event.data);
      handleMessage(msg);
    } catch (err) {
      log('Parse error:', err);
    }
  };
}
// Close the socket (ws.onclose performs the UI reset) and stop local media.
function disconnect() {
  ws?.close();
  stopAllMedia();
}
// ────────────────────────────────────────────────────────────────
// Media (camera + mic)
// ────────────────────────────────────────────────────────────────
// Acquire camera + mic (honoring the selected input device), then start
// the level meter, the periodic frame timer, and the chosen input mode.
async function startMedia() {
  try {
    // Pin the stream to the dropdown's device when one is selected.
    const audioConstraint = els.micSelect.value
      ? { deviceId: { exact: els.micSelect.value } }
      : true;
    localStream = await navigator.mediaDevices.getUserMedia({
      video: { width: { ideal: 640 }, height: { ideal: 480 } },
      audio: audioConstraint,
    });
    // Audio-only view of the same tracks — MediaRecorder records this so
    // audio uploads don't carry the video track.
    audioOnlyStream = new MediaStream(localStream.getAudioTracks());
    // Log which mic was actually selected
    const audioTrack = localStream.getAudioTracks()[0];
    log(`Mic active: "${audioTrack?.label || 'unknown'}"`);
    els.video.srcObject = localStream;
    await els.video.play();
    enable('stopMediaBtn', 'pttBtn');
    disable('startMediaBtn');
    // Start the always-on level meter
    startLevelMeter(localStream);
    // Periodic frames (0 = manual capture only)
    const intervalMs = Number(els.frameInterval.value);
    if (intervalMs > 0) {
      frameTimer = setInterval(() => captureFrame('timer'), intervalMs);
      log(`Frame capture every ${intervalMs / 1000}s`);
    }
    // Start the selected input mode
    const mode = els.inputMode.value;
    if (mode === 'browser-stt') {
      startBrowserSTT();
    }
    // VAD and push-to-talk don't need extra init — they're driven by
    // the level-meter tick and button events respectively.
    setStatus('Listening...', 'listening');
    log(`Camera + Mic started, input mode: ${mode}`);
  } catch (err) {
    log('getUserMedia failed:', err.message);
  }
}
// Release every media resource: frame timer, level meter, camera/mic
// tracks, any active recorder or speech recognition, and VAD state.
function stopAllMedia() {
  if (frameTimer) {
    clearInterval(frameTimer);
    frameTimer = null;
  }
  stopLevelMeter();
  if (localStream) {
    for (const track of localStream.getTracks()) track.stop();
    audioOnlyStream = null;
    localStream = null;
  }
  els.video.srcObject = null;
  if (mediaRecorder?.state === 'recording') mediaRecorder.stop();
  mediaRecorder = null;
  if (recognition) recognition.stop();
  recognition = null;
  silenceStart = null;
  recordingStartTime = null;
  audioChunks = [];
  disable('stopMediaBtn', 'pttBtn');
  enable('startMediaBtn');
  setStatus('Connected', 'connected');
  log('Media stopped');
}
// ────────────────────────────────────────────────────────────────
// Shared: record a segment from localStream and send it
// ────────────────────────────────────────────────────────────────
// Pick the first MediaRecorder MIME type this browser supports,
// preferring webm/opus. Returns '' to let the browser use its default.
function chosenMimeType() {
  const candidates = [
    'audio/webm;codecs=opus',
    'audio/webm',
    'audio/ogg;codecs=opus',
    'audio/mp4',
  ];
  const supported = candidates.find(mt => MediaRecorder.isTypeSupported(mt));
  return supported || '';
}
// Begin capturing microphone audio into `audioChunks` via MediaRecorder.
// The onstop handler (fired by stopRecording or the VAD) assembles the
// blob, base64-encodes it, and ships it to the server as an `audio`
// message. Tiny recordings are dropped as noise.
function startRecording() {
  if (mediaRecorder?.state === 'recording') return; // already capturing
  if (!audioOnlyStream) { log('No audio stream!'); return; }
  audioChunks = [];
  recordingStartTime = Date.now();
  silenceStart = null;
  const mimeType = chosenMimeType();
  const opts = mimeType ? { mimeType } : undefined;
  mediaRecorder = new MediaRecorder(audioOnlyStream, opts);
  mediaRecorder.ondataavailable = e => {
    if (e.data.size > 0) audioChunks.push(e.data);
  };
  mediaRecorder.onstop = async () => {
    // Prefer the recorder's actual MIME type; it can differ from the request.
    const usedMime = mediaRecorder?.mimeType || mimeType || 'audio/webm';
    if (audioChunks.length === 0) {
      log('No audio chunks recorded');
      setStatus('Listening...', 'listening');
      return;
    }
    const blob = new Blob(audioChunks, { type: usedMime });
    // Sub-800-byte blobs are key clicks / breath, not speech — skip them.
    if (blob.size < 800) {
      log(`Audio too short (${blob.size} bytes), skipping`);
      setStatus('Listening...', 'listening');
      return;
    }
    const arrayBuffer = await blob.arrayBuffer();
    const base64 = btoa(
      new Uint8Array(arrayBuffer).reduce((d, b) => d + String.fromCharCode(b), '')
    );
    if (ws?.readyState === WebSocket.OPEN) {
      ws.send(JSON.stringify({ type: 'audio', data: base64, format: usedMime }));
      log(`Sent audio (${(base64.length / 1000).toFixed(1)} kB, ${usedMime})`);
      els.transcript.textContent = 'Transcribing...';
    } else {
      log('WS not connected, audio dropped');
    }
    setStatus('Listening...', 'listening');
  };
  mediaRecorder.start(100); // timeslice 100ms so ondataavailable fires steadily
  setStatus('🔴 Recording...', 'speaking');
  log('Recording started');
}
// Finalize an in-progress recording; the recorder's onstop handler then
// encodes and sends the captured audio.
function stopRecording() {
  if (mediaRecorder?.state !== 'recording') return;
  mediaRecorder.stop();
  silenceStart = null;
  recordingStartTime = null;
  setStatus('Processing...', 'connected');
  log('Recording stopped, sending...');
}
// ────────────────────────────────────────────────────────────────
// VAD (driven from the level-meter rAF loop)
// ────────────────────────────────────────────────────────────────
// Voice-activity detection, called once per rAF tick with the current RMS.
// Hysteresis: recording starts above SPEECH_THRESHOLD and only stops after
// SILENCE_DURATION ms below SILENCE_THRESHOLD (once MIN_RECORDING_TIME is met).
function vadTick(rms) {
  const recording = mediaRecorder?.state === 'recording';
  if (rms > SPEECH_THRESHOLD) {
    silenceStart = null;
    if (!recording) startRecording();
    return;
  }
  if (rms < SILENCE_THRESHOLD && recording) {
    if (!silenceStart) {
      silenceStart = Date.now();
      return;
    }
    const silentLongEnough = Date.now() - silenceStart > SILENCE_DURATION;
    const recordedLongEnough =
      recordingStartTime && (Date.now() - recordingStartTime) > MIN_RECORDING_TIME;
    if (silentLongEnough && recordedLongEnough) {
      log('Silence → stopping');
      stopRecording();
    }
  }
}
// ────────────────────────────────────────────────────────────────
// Push-to-Talk
// ────────────────────────────────────────────────────────────────
// Push-to-talk press: highlight the button and start capturing audio.
function pttDown() {
  if (localStream) {
    els.pttBtn.classList.add('active');
    startRecording();
  }
}
// Push-to-talk release: finish the recording and un-highlight the button.
function pttUp() {
  stopRecording();
  els.pttBtn.classList.remove('active');
}
els.pttBtn.addEventListener('mousedown', pttDown);
els.pttBtn.addEventListener('mouseup', pttUp);
els.pttBtn.addEventListener('mouseleave', pttUp);
els.pttBtn.addEventListener('touchstart', e => { e.preventDefault(); pttDown(); });
els.pttBtn.addEventListener('touchend', e => { e.preventDefault(); pttUp(); });
// Spacebar acts as push-to-talk — only in push-to-talk mode, only while
// media is live, and never while typing in a form control.
let spaceHeld = false; // guards against keydown auto-repeat re-triggering pttDown
document.addEventListener('keydown', e => {
  if (e.code === 'Space' && !spaceHeld && els.inputMode.value === 'push-to-talk'
    && localStream && !e.target.matches('input, textarea, select')) {
    e.preventDefault();
    spaceHeld = true;
    pttDown();
  }
});
document.addEventListener('keyup', e => {
  if (e.code === 'Space' && spaceHeld) {
    e.preventDefault();
    spaceHeld = false;
    pttUp();
  }
});
// ────────────────────────────────────────────────────────────────
// Browser STT
// ────────────────────────────────────────────────────────────────
// Continuous in-browser speech recognition via the Web Speech API.
// Interim results are mirrored into the transcript pane; once the most
// recent result is final, the accumulated transcript is sent upstream.
function startBrowserSTT() {
  if (!SpeechRecognition) { log('Web Speech API not supported'); return; }
  recognition = new SpeechRecognition();
  recognition.continuous = true;
  recognition.interimResults = true;
  recognition.lang = 'en-US';
  recognition.onresult = e => {
    const transcript = Array.from(e.results).map(r => r[0].transcript).join('');
    els.transcript.textContent = transcript;
    // BUG FIX: with continuous=true, e.results accumulates across
    // utterances, so e.results[0].isFinal only ever fires for the first
    // utterance. Check the most recent result instead.
    const latest = e.results[e.results.length - 1];
    if (latest.isFinal) sendTranscript(transcript);
  };
  recognition.onerror = e => log('STT error:', e.error);
  recognition.start();
  log('Browser STT started');
}
// ────────────────────────────────────────────────────────────────
// Sending transcript / interrupt
// ────────────────────────────────────────────────────────────────
// Forward a final transcript to the agent and clear the output panes.
function sendTranscript(text) {
  if (ws?.readyState !== WebSocket.OPEN) return;
  ws.send(JSON.stringify({ type: 'transcript', text }));
  log(`Sent transcript: ${text}`);
  resetUI();
}
// Ask the server to stop responding, then kill local audio playback.
function interrupt() {
  if (ws?.readyState === WebSocket.OPEN) {
    ws.send(JSON.stringify({ type: 'interrupt', reason: 'user_button' }));
    log('Interrupt sent');
  }
  audioQueue = [];
  if (currentSource) {
    currentSource.stop();
    currentSource = null;
  }
  isPlaying = false;
}
// ────────────────────────────────────────────────────────────────
// Server → Client messages
// ────────────────────────────────────────────────────────────────
// Dispatch one parsed server → client message to the matching UI action.
function handleMessage(msg) {
  switch (msg.type) {
    case 'transcription_result': {
      els.transcript.textContent = msg.text || '(empty)';
      log(`Transcription: ${msg.text}`);
      break;
    }
    case 'text_delta': {
      els.assistant.textContent += msg.text || '';
      break;
    }
    case 'reasoning_delta': {
      els.reasoningSec.classList.remove('hidden');
      els.reasoning.textContent += msg.text || '';
      break;
    }
    case 'tool_call':
    case 'tool_result': {
      els.toolsSec.classList.remove('hidden');
      els.tools.innerHTML += `<div>${msg.type}: ${msg.toolName || '?'}${JSON.stringify(msg.result || msg.input || {})}</div>`;
      break;
    }
    case 'audio_chunk':
    case 'audio': {
      // Decode base64 payload and enqueue for serialized playback.
      const bytes = Uint8Array.from(atob(msg.data), c => c.charCodeAt(0));
      audioQueue.push({ bytes, format: msg.format || 'mp3' });
      playNext();
      break;
    }
    case 'speech_interrupted': {
      audioQueue = [];
      if (currentSource) currentSource.stop();
      isPlaying = false;
      log(`Speech interrupted: ${msg.reason || '?'}`);
      break;
    }
    case 'response_complete': {
      log('Response complete');
      break;
    }
    case 'capture_frame': {
      log(`Server requested frame: ${msg.reason}`);
      captureFrame(msg.reason || 'server_request');
      break;
    }
    case 'frame_ack': {
      break; // silent
    }
    case 'session_init': {
      log(`Session: ${msg.sessionId}`);
      break;
    }
    case 'stream_start': {
      resetUI();
      break;
    }
    case 'stream_finish': {
      log(`Stream finished: ${msg.finishReason}`);
      break;
    }
    case 'speech_stream_start': {
      break;
    }
    case 'speech_stream_end': {
      log('Speech done');
      break;
    }
    case 'error': {
      log(`ERROR: ${msg.error}`);
      console.error('Server error:', msg.error);
      break;
    }
    case 'transcription_error': {
      log(`Transcription error: ${msg.error}`);
      els.transcript.textContent = `Error: ${msg.error}`;
      break;
    }
    default: {
      // Verbose stream/step events are ignored quietly; everything else
      // gets a one-line note so new message types are visible.
      if (!(msg.type?.includes('stream') || msg.type?.includes('step'))) {
        log(`[${msg.type}]`);
      }
    }
  }
}
// ────────────────────────────────────────────────────────────────
// Event listeners
// ────────────────────────────────────────────────────────────────
els.connectBtn.onclick = connect;
els.disconnectBtn.onclick = disconnect;
els.startMediaBtn.onclick = startMedia;
els.stopMediaBtn.onclick = stopAllMedia;
els.captureBtn.onclick = () => captureFrame('manual');
els.interruptBtn.onclick = interrupt;
els.frameInterval.onchange = () => {
if (frameTimer) {
clearInterval(frameTimer);
const ms = Number(els.frameInterval.value);
if (ms > 0) frameTimer = setInterval(() => captureFrame('timer'), ms);
}
};
document.getElementById('wsEndpoint').addEventListener('keypress', e => {
if (e.key === 'Enter') connect();
});
</script>
</body>
</html>

161
example/ws-server-video.ts Normal file
View File

@@ -0,0 +1,161 @@
// ws-server-video.ts
import "dotenv/config";
import { WebSocketServer } from "ws";
import { VideoAgent } from "../src/VideoAgent.new"; // adjust path
import { tool } from "ai";
import { z } from "zod";
import { openai } from "@ai-sdk/openai";
import { mkdirSync, writeFileSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";
// ── Frame saving ────────────────────────────────────────────────────────
const __dirname = typeof import.meta.dirname === "string"
? import.meta.dirname
: dirname(fileURLToPath(import.meta.url));
const FRAMES_DIR = join(__dirname, "frames");
mkdirSync(FRAMES_DIR, { recursive: true });
console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`);
let frameCounter = 0;
/**
 * Persist one received video frame to FRAMES_DIR.
 *
 * File names carry a zero-padded monotonically increasing index plus the
 * frame's timestamp (ISO-8601 with ':' and '.' replaced so it is path-safe).
 *
 * @param msg - Parsed `video_frame` message; `image.data` is base64.
 */
function saveFrame(msg: {
  sequence?: number;
  timestamp?: number;
  triggerReason?: string;
  image: { data: string; format?: string; width?: number; height?: number };
}) {
  const idx = frameCounter++;
  // Normalize "jpeg" to the conventional "jpg" extension; default to webp.
  const ext = msg.image.format === "jpeg" ? "jpg" : (msg.image.format || "webp");
  const ts = new Date(msg.timestamp ?? Date.now())
    .toISOString()
    .replace(/[:.]/g, "-");
  const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`;
  const filepath = join(FRAMES_DIR, filename);
  const buf = Buffer.from(msg.image.data, "base64");
  writeFileSync(filepath, buf);
  // BUG FIX: the log previously printed the literal "$(unknown)" instead of
  // interpolating the written file's name.
  console.log(
    `[frames] Saved ${filename} (${(buf.length / 1024).toFixed(1)} kB` +
      `${msg.image.width ? `, ${msg.image.width}×${msg.image.height}` : ""}` +
      `, ${msg.triggerReason ?? "unknown"})`
  );
}
const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081";
const url = new URL(endpoint);
const port = Number(url.port || 8081);
const host = url.hostname || "localhost";
// ── Tools (same as demo.ts) ────────────────────────────────────────────
// Demo tool: returns a pseudo-random weather report for any location
// (temperature 62-82 °F, one of four fixed condition strings).
const weatherTool = tool({
  description: "Get the weather in a location",
  inputSchema: z.object({
    location: z.string().describe("The location to get the weather for"),
  }),
  execute: async ({ location }) => ({
    location,
    temperature: 72 + Math.floor(Math.random() * 21) - 10,
    conditions: ["sunny", "cloudy", "rainy", "partly cloudy"][
      Math.floor(Math.random() * 4)
    ],
  }),
});
// Demo tool: reports the server's local time and IANA timezone.
const timeTool = tool({
  description: "Get the current time",
  inputSchema: z.object({}),
  execute: async () => ({
    time: new Date().toLocaleTimeString(),
    timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
  }),
});
const wss = new WebSocketServer({ port, host });
wss.on("listening", () => {
console.log(`[video-ws] listening on ${endpoint}`);
console.log(`[video-ws] Open video-client.html and connect → ${endpoint}`);
});
// One VideoAgent per WebSocket connection; destroyed when the client leaves.
wss.on("connection", (socket) => {
  console.log("[video-ws] ✓ client connected");
  const agent = new VideoAgent({
    model: openai("gpt-4o"), // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash…
    transcriptionModel: openai.transcription("whisper-1"),
    speechModel: openai.speech("gpt-4o-mini-tts"),
    instructions: `You are a helpful video+voice assistant.
You can SEE what the user is showing via webcam.
Describe what you see when it helps answer the question.
Keep spoken answers concise and natural.`,
    voice: "echo",
    streamingSpeech: {
      minChunkSize: 25,
      maxChunkSize: 140,
      parallelGeneration: true,
      maxParallelRequests: 3,
    },
    tools: { getWeather: weatherTool, getTime: timeTool },
    // Tune these depending on your budget & latency goals
    maxContextFrames: 6, // very important — each frame ≈ 100-400 tokens
    maxFrameInputSize: 2_500_000, // ~2.5 MB
  });
  // Reuse most of the same event logging you have in ws-server.ts
  agent.on("text", (data: { role: string; text: string }) => {
    console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`);
  });
  agent.on("chunk:text_delta", (data: { id: string; text: string }) => {
    process.stdout.write(data.text || "");
  });
  agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => {
    console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB ${dimensions.width}×${dimensions.height}`);
  });
  agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`));
  // Audio and transcription events
  agent.on("audio_received", ({ size, format }) => {
    console.log(`[video] Audio received: ${size} bytes, format: ${format}`);
  });
  agent.on("transcription", ({ text, language }) => {
    console.log(`[video] Transcription: "${text}" (${language || "unknown"})`);
  });
  // Speech events
  agent.on("speech_start", () => console.log(`[video] Speech started`));
  agent.on("speech_complete", () => console.log(`[video] Speech complete`));
  agent.on("audio_chunk", ({ chunkId, text }) => {
    console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`);
  });
  // Error handling
  agent.on("error", (error: Error) => {
    console.error(`[video] ERROR:`, error);
  });
  agent.on("warning", (warning: string) => {
    console.warn(`[video] WARNING:`, warning);
  });
  agent.on("disconnected", () => {
    agent.destroy();
    console.log("[video-ws] ✗ client disconnected (agent destroyed)");
  });
  // ── Intercept raw messages to save frames to disk ────────────────────
  socket.on("message", (raw) => {
    try {
      const msg = JSON.parse(raw.toString());
      if (msg.type === "video_frame" && msg.image?.data) {
        saveFrame(msg);
      }
    } catch {
      // not JSON — ignore, agent will handle binary etc.
    }
  });
  // The crucial line — same as VoiceAgent: hand the raw socket to the agent.
  agent.handleSocket(socket);
});

View File

@@ -1,6 +1,6 @@
{ {
"name": "voice-agent-ai-sdk", "name": "voice-agent-ai-sdk",
"version": "0.2.1-beta.0", "version": "1.0.1",
"description": "Voice AI Agent with ai-sdk", "description": "Voice AI Agent with ai-sdk",
"main": "dist/index.js", "main": "dist/index.js",
"types": "dist/index.d.ts", "types": "dist/index.d.ts",
@@ -15,6 +15,7 @@
"demo": "tsx example/demo.ts", "demo": "tsx example/demo.ts",
"ws:server": "tsx example/ws-server.ts", "ws:server": "tsx example/ws-server.ts",
"client": "node example/serve-client.js", "client": "node example/serve-client.js",
"ws:video": "tsx example/ws-server-video.ts",
"prepublishOnly": "pnpm build" "prepublishOnly": "pnpm build"
}, },
"keywords": [ "keywords": [

818
src/VideoAgent.new.ts Normal file
View File

@@ -0,0 +1,818 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import {
streamText,
type LanguageModel,
stepCountIs,
type Tool,
type ModelMessage,
type TranscriptionModel,
type SpeechModel,
} from "ai";
import {
type StreamingSpeechConfig,
type HistoryConfig,
} from "./types";
import {
WebSocketManager,
SpeechManager,
ConversationManager,
TranscriptionManager,
InputQueue,
type QueueItem,
processFullStream,
handleStreamChunk,
} from "./core";
// ── Video-specific types ────────────────────────────────
/**
 * Trigger reasons for frame capture.
 * - "scene_change": client detected a visual change
 * - "user_request": captured because the user spoke / asked
 * - "timer": periodic capture
 * - "initial": first frame of a session
 */
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
/**
 * Video frame data structure sent to/from the client.
 */
interface VideoFrame {
  type: "video_frame";
  // Session this frame belongs to
  sessionId: string;
  // Client-side monotonically increasing frame counter
  sequence: number;
  // Capture time in epoch milliseconds
  timestamp: number;
  triggerReason: FrameTriggerReason;
  // Hash of the previous frame, if any (for client-side dedup/reference)
  previousFrameRef?: string;
  image: {
    // Base64-encoded image payload
    data: string;
    // Image encoding, e.g. "webp" or "jpeg"
    format: string;
    width: number;
    height: number;
  };
}
/**
 * Audio data structure received from the client.
 */
interface AudioData {
  type: "audio";
  sessionId: string;
  // Base64-encoded audio payload
  data: string;
  // Audio encoding, e.g. "webm" or "wav"
  format: string;
  sampleRate?: number;
  // Duration in seconds, if the client knows it
  duration?: number;
  timestamp: number;
}
/**
 * Backend configuration for video processing.
 */
interface VideoAgentConfig {
  /** Maximum frames to keep in context buffer for conversation history */
  maxContextFrames: number;
}
/**
 * Frame context for maintaining visual conversation history.
 * Only lightweight metadata is buffered; raw image data is not retained here.
 */
interface FrameContext {
  sequence: number;
  timestamp: number;
  triggerReason: FrameTriggerReason;
  frameHash: string;
  description?: string;
}
/** Default maximum frame input size (5 MB) */
const DEFAULT_MAX_FRAME_SIZE = 5 * 1024 * 1024;
/** Default video agent config */
const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = {
  maxContextFrames: 10,
};
// ── Options & queue item ────────────────────────────────
export interface VideoAgentOptions {
  /**
   * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
   * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
   */
  model: LanguageModel;
  // AI SDK transcription model; audio input is rejected when absent
  transcriptionModel?: TranscriptionModel;
  // AI SDK speech model used for text-to-speech output
  speechModel?: SpeechModel;
  // System prompt; a multimodal default is used when omitted
  instructions?: string;
  // streamText stop condition; defaults to stepCountIs(5)
  stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  tools?: Record<string, Tool>;
  // WebSocket URL used by connect() when no explicit URL is passed
  endpoint?: string;
  voice?: string;
  speechInstructions?: string;
  outputFormat?: string;
  streamingSpeech?: Partial<StreamingSpeechConfig>;
  history?: Partial<HistoryConfig>;
  maxAudioInputSize?: number;
  /** Maximum frame input size in bytes (default: 5 MB) */
  maxFrameInputSize?: number;
  /** Maximum frames to keep in context buffer (default: 10) */
  maxContextFrames?: number;
  /** Session ID for this video agent instance */
  sessionId?: string;
}
/** Shape of items in the video agent's input queue */
interface VideoInputItem extends QueueItem<string> {
  // Text query (always present for processable items)
  text?: string;
  // Explicit frame attached to the query, if any
  frame?: VideoFrame;
}
// ── VideoAgent class ────────────────────────────────────
/**
 * Multimodal (video + audio + text) agent built on the AI SDK.
 *
 * Owns a WebSocketManager for transport, a SpeechManager for TTS output,
 * a ConversationManager for history, a TranscriptionManager for STT, and a
 * serial InputQueue so only one LLM request runs at a time. Video frames are
 * validated, hashed and tracked in a rolling metadata buffer; the most recent
 * frame's image data is attached to text queries as visual context.
 */
export class VideoAgent extends EventEmitter {
  private model: LanguageModel;
  private instructions: string;
  private stopWhen: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  private endpoint?: string;
  private tools: Record<string, Tool> = {};
  private isDestroyed = false;
  private _isProcessing = false;
  // Abort controller for the current LLM stream
  private currentStreamAbortController?: AbortController;

  // ── Managers ─────────────────────────────────────────
  private ws: WebSocketManager;
  private speech: SpeechManager;
  private conversation: ConversationManager;
  private transcription: TranscriptionManager;
  private inputQueue: InputQueue<VideoInputItem>;

  // ── Video-specific state ────────────────────────────
  private sessionId: string;
  // Monotonic counter for frames created locally via sendFrame()
  private frameSequence = 0;
  // Timestamp / hash of the most recently accepted frame
  private lastFrameTimestamp = 0;
  private lastFrameHash?: string;
  // Rolling window of lightweight metadata for accepted frames
  private frameContextBuffer: FrameContext[] = [];
  // Base64 image data of the most recently accepted frame (visual context)
  private currentFrameData?: string;
  private videoConfig: VideoAgentConfig;
  private maxFrameInputSize: number;

  constructor(options: VideoAgentOptions) {
    super();
    this.model = options.model;
    this.instructions =
      options.instructions ||
      `You are a helpful multimodal AI assistant that can see through the user's camera and hear their voice.
When analyzing images, be concise but informative. Describe what you see when asked.
Keep responses conversational since they will be spoken aloud.
Use tools when needed to provide accurate information.`;
    this.stopWhen = options.stopWhen || stepCountIs(5);
    this.endpoint = options.endpoint;
    this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
    this.sessionId = options.sessionId || this.generateSessionId();
    this.videoConfig = {
      ...DEFAULT_VIDEO_AGENT_CONFIG,
      maxContextFrames:
        options.maxContextFrames ?? DEFAULT_VIDEO_AGENT_CONFIG.maxContextFrames,
    };
    if (options.tools) {
      this.tools = { ...options.tools };
    }

    // ── Initialize managers ─────────────────────────
    this.ws = new WebSocketManager();
    this.speech = new SpeechManager({
      speechModel: options.speechModel,
      voice: options.voice,
      speechInstructions: options.speechInstructions,
      outputFormat: options.outputFormat,
      streamingSpeech: options.streamingSpeech,
    });
    this.conversation = new ConversationManager({
      history: options.history,
    });
    this.transcription = new TranscriptionManager({
      transcriptionModel: options.transcriptionModel,
      maxAudioInputSize: options.maxAudioInputSize,
    });
    this.inputQueue = new InputQueue<VideoInputItem>();

    // ── Wire managers to WebSocket send ─────────────
    const sendMsg = (msg: Record<string, unknown>) => this.ws.send(msg);
    this.speech.sendMessage = sendMsg;
    this.transcription.sendMessage = sendMsg;

    // ── Wire input queue processor ──────────────────
    this.inputQueue.processor = (item) => this.processQueueItem(item);

    // ── Bubble events from managers ─────────────────
    this.bubbleEvents(this.ws, ["connected", "error"]);
    this.bubbleEvents(this.speech, [
      "speech_start",
      "speech_complete",
      "speech_interrupted",
      "speech_chunk_queued",
      "audio_chunk",
      "audio",
      "error",
    ]);
    this.bubbleEvents(this.conversation, [
      "history_cleared",
      "history_trimmed",
    ]);
    this.bubbleEvents(this.transcription, [
      "transcription",
      "audio_received",
      "error",
      "warning",
    ]);

    // ── Handle WebSocket lifecycle ──────────────────
    this.ws.on("disconnected", () => {
      this.cleanupOnDisconnect();
      this.emit("disconnected");
    });
    this.ws.on("message", (message: any) => this.handleMessage(message));
  }

  // ══════════════════════════════════════════════════════
  // Public API
  // ══════════════════════════════════════════════════════

  /** Merge additional tools into the agent's tool set. */
  public registerTools(tools: Record<string, Tool>) {
    this.tools = { ...this.tools, ...tools };
  }

  /** Transcribe raw audio bytes via the configured transcription model. */
  public async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    return this.transcription.transcribeAudio(audioData);
  }

  /** Generate TTS audio for the given text via the configured speech model. */
  public async generateSpeechFromText(
    text: string,
    abortSignal?: AbortSignal
  ): Promise<Uint8Array> {
    return this.speech.generateSpeechFromText(text, abortSignal);
  }

  /** Stop any in-flight speech output; does not abort the LLM stream. */
  public interruptSpeech(reason: string = "interrupted"): void {
    this.speech.interruptSpeech(reason);
  }

  /** Abort the current LLM stream (if any) and stop speech output. */
  public interruptCurrentResponse(reason: string = "interrupted"): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.interruptSpeech(reason);
  }

  /**
   * Connect as a WebSocket client.
   * @param url Target URL; falls back to options.endpoint, then ws://localhost:8080.
   */
  public async connect(url?: string): Promise<void> {
    this.ensureNotDestroyed();
    const wsUrl = url || this.endpoint || "ws://localhost:8080";
    await this.ws.connect(wsUrl);
  }

  /** Attach an already-accepted server-side WebSocket to this agent. */
  public handleSocket(socket: WebSocket): void {
    this.ensureNotDestroyed();
    this.ws.handleSocket(socket);
  }

  /**
   * Queue a text query for the LLM.
   * @returns The assistant's full response text once the stream completes.
   * @throws If the text is empty or the agent is destroyed.
   */
  public async sendText(text: string): Promise<string> {
    this.ensureNotDestroyed();
    if (!text || !text.trim()) {
      throw new Error("Text input cannot be empty");
    }
    return this.enqueueTextInput(text);
  }

  /** Transcribe base64 audio and queue the resulting text (if non-empty). */
  public async sendAudio(audioData: string): Promise<void> {
    this.ensureNotDestroyed();
    await this.handleAudioInput(audioData);
  }

  /** Convenience wrapper: base64-encode an audio buffer and process it. */
  public async sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void> {
    this.ensureNotDestroyed();
    const base64Audio = Buffer.from(audioBuffer).toString("base64");
    await this.handleAudioInput(base64Audio);
  }

  /**
   * Send a video frame with optional text query for vision analysis.
   *
   * The frame is validated against maxFrameInputSize. When a query is given
   * and the frame is rejected (invalid or too large), this throws instead of
   * silently sending an oversized image to the model.
   *
   * @returns The assistant's response text when a query was given, else "".
   */
  public async sendFrame(
    frameData: string,
    query?: string,
    options?: { width?: number; height?: number; format?: string }
  ): Promise<string> {
    this.ensureNotDestroyed();
    const frame: VideoFrame = {
      type: "video_frame",
      sessionId: this.sessionId,
      sequence: this.frameSequence++,
      timestamp: Date.now(),
      triggerReason: "user_request",
      previousFrameRef: this.lastFrameHash,
      image: {
        data: frameData,
        format: options?.format || "webp",
        width: options?.width || 640,
        height: options?.height || 480,
      },
    };
    // Validate and record the frame; `accepted` is false for invalid/oversized frames
    const accepted = await this.handleVideoFrame(frame);
    if (query) {
      if (!accepted) {
        // Do not forward a rejected frame to the LLM (fixes size-limit bypass)
        throw new Error(
          "Frame rejected (invalid or exceeds maxFrameInputSize); query not sent"
        );
      }
      return this.enqueueMultimodalInput(query, frame);
    }
    return "";
  }

  /**
   * Request client to capture and send a frame.
   */
  public requestFrameCapture(reason: FrameTriggerReason): void {
    this.ws.send({
      type: "capture_frame",
      reason,
      timestamp: Date.now(),
    });
    this.emit("frame_requested", { reason });
  }

  /** Snapshot of the current video configuration. */
  public getConfig(): VideoAgentConfig {
    return { ...this.videoConfig };
  }

  /** Merge configuration changes and emit "config_changed". */
  public updateConfig(config: Partial<VideoAgentConfig>): void {
    this.videoConfig = { ...this.videoConfig, ...config };
    this.emit("config_changed", this.videoConfig);
  }

  // Lifecycle notifications only — no capture is performed server-side.
  startListening() {
    this.emit("listening");
  }

  stopListening() {
    this.emit("stopped");
  }

  /** Clear conversation history and the frame metadata buffer. */
  clearHistory() {
    this.conversation.clearHistory();
    this.frameContextBuffer = [];
  }

  getHistory(): ModelMessage[] {
    return this.conversation.getHistory();
  }

  setHistory(history: ModelMessage[]) {
    this.conversation.setHistory(history);
  }

  /** Copy of the rolling frame-metadata buffer. */
  getFrameContext(): FrameContext[] {
    return [...this.frameContextBuffer];
  }

  getSessionId(): string {
    return this.sessionId;
  }

  disconnect() {
    this.ws.disconnect();
  }

  /** Tear down the agent permanently; all further public calls throw. */
  destroy() {
    this.isDestroyed = true;
    this.cleanupOnDisconnect();
    this.ws.disconnect();
    this.conversation.clearHistory();
    this.frameContextBuffer = [];
    this.tools = {};
    this.removeAllListeners();
  }

  // ── Getters ─────────────────────────────────────────
  get connected(): boolean {
    return this.ws.isConnected;
  }

  get processing(): boolean {
    return this._isProcessing;
  }

  get speaking(): boolean {
    return this.speech.isSpeaking;
  }

  get pendingSpeechChunks(): number {
    return this.speech.pendingChunkCount;
  }

  get destroyed(): boolean {
    return this.isDestroyed;
  }

  get currentFrameSequence(): number {
    return this.frameSequence;
  }

  get hasVisualContext(): boolean {
    return !!this.currentFrameData;
  }

  // ══════════════════════════════════════════════════════
  // Private — message handling
  // ══════════════════════════════════════════════════════

  /**
   * Route one parsed WebSocket message. User speech (transcript/audio)
   * interrupts any in-flight response and asks the client for a fresh frame
   * so the model sees current visual context.
   */
  private async handleMessage(message: any): Promise<void> {
    try {
      switch (message.type) {
        case "transcript":
          if (typeof message.text !== "string" || !message.text.trim()) {
            this.emit("warning", "Received empty or invalid transcript message");
            return;
          }
          this.interruptCurrentResponse("user_speaking");
          this.requestFrameCapture("user_request");
          await this.enqueueTextInput(message.text);
          break;
        case "audio":
          if (typeof message.data !== "string" || !message.data) {
            this.emit("warning", "Received empty or invalid audio message");
            return;
          }
          this.interruptCurrentResponse("user_speaking");
          this.requestFrameCapture("user_request");
          try {
            await this.handleAudioInput(message.data, message.format);
          } catch (audioError) {
            this.emit("error", audioError);
          }
          break;
        case "video_frame":
          await this.handleVideoFrame(message);
          break;
        case "interrupt":
          this.interruptCurrentResponse(message.reason || "client_request");
          break;
        case "client_ready":
          this.handleClientReady(message);
          break;
      }
    } catch (err) {
      this.emit("error", err);
    }
  }

  /** Acknowledge a ready client with its session id. */
  private handleClientReady(message: any): void {
    this.ws.send({
      type: "session_init",
      sessionId: this.sessionId,
    });
    this.emit("client_ready", message.capabilities);
  }

  // ══════════════════════════════════════════════════════
  // Private — audio
  // ══════════════════════════════════════════════════════

  /** Transcribe base64 audio and enqueue the text when non-empty. */
  private async handleAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<void> {
    const text = await this.transcription.processAudioInput(base64Audio, format);
    if (text) {
      await this.enqueueTextInput(text);
    }
  }

  // ══════════════════════════════════════════════════════
  // Private — video frames
  // ══════════════════════════════════════════════════════

  /**
   * Validate, record, and acknowledge an incoming frame.
   * @returns true when the frame was accepted; false when it was rejected
   *          (missing data, over the size limit, or processing failed).
   */
  private async handleVideoFrame(frame: VideoFrame): Promise<boolean> {
    try {
      if (!frame.image?.data) {
        this.emit("warning", "Received empty or invalid video frame");
        return false;
      }
      // Decoded byte size, not base64 length, is what the limit applies to
      const frameSize = Buffer.from(frame.image.data, "base64").length;
      if (frameSize > this.maxFrameInputSize) {
        const sizeMB = (frameSize / (1024 * 1024)).toFixed(1);
        const maxMB = (this.maxFrameInputSize / (1024 * 1024)).toFixed(1);
        this.emit(
          "error",
          new Error(`Frame too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`)
        );
        return false;
      }
      // Label the hash with the frame's own sequence (client frames carry
      // their own counter; this.frameSequence only tracks local sendFrame calls)
      const frameHash = this.hashFrame(frame.image.data, frame.sequence);
      this.lastFrameTimestamp = frame.timestamp;
      this.lastFrameHash = frameHash;
      this.currentFrameData = frame.image.data;
      this.addFrameToContext({
        sequence: frame.sequence,
        timestamp: frame.timestamp,
        triggerReason: frame.triggerReason,
        frameHash,
      });
      this.emit("frame_received", {
        sequence: frame.sequence,
        timestamp: frame.timestamp,
        triggerReason: frame.triggerReason,
        size: frameSize,
        dimensions: { width: frame.image.width, height: frame.image.height },
      });
      this.ws.send({
        type: "frame_ack",
        sequence: frame.sequence,
        timestamp: Date.now(),
      });
      return true;
    } catch (error) {
      this.emit("error", error);
      return false;
    }
  }

  /** Append frame metadata, evicting the oldest entry past the cap. */
  private addFrameToContext(context: FrameContext): void {
    this.frameContextBuffer.push(context);
    if (this.frameContextBuffer.length > this.videoConfig.maxContextFrames) {
      this.frameContextBuffer.shift();
    }
  }

  /**
   * Cheap 32-bit string hash (h*31+c style) used only to label/reference
   * frames in context — not cryptographic, collisions are tolerable.
   */
  private hashFrame(data: string, sequence: number): string {
    let hash = 0;
    for (let i = 0; i < data.length; i++) {
      const char = data.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // force into 32-bit integer range
    }
    return `frame_${sequence}_${Math.abs(hash).toString(16)}`;
  }

  /** Generate a session id like "vs_<time36>_<rand36>". */
  private generateSessionId(): string {
    const timestamp = Date.now().toString(36);
    const randomPart = Math.random().toString(36).substring(2, 10);
    return `vs_${timestamp}_${randomPart}`;
  }

  // ══════════════════════════════════════════════════════
  // Private — input queue
  // ══════════════════════════════════════════════════════

  private enqueueTextInput(text: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.inputQueue.enqueue({ text, resolve, reject });
    });
  }

  private enqueueMultimodalInput(text: string, frame: VideoFrame): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.inputQueue.enqueue({ text, frame, resolve, reject });
    });
  }

  /**
   * Route queued items to the correct processor.
   */
  private async processQueueItem(item: VideoInputItem): Promise<string> {
    if (item.frame && item.text) {
      return this.processMultimodalInput(item.text, item.frame);
    } else if (item.text) {
      return this.processUserInput(item.text);
    }
    return "";
  }

  // ══════════════════════════════════════════════════════
  // Private — multimodal content building
  // ══════════════════════════════════════════════════════

  /**
   * Build a user message content array: optional visual-context summary,
   * then the image (explicit frame or latest accepted frame), then the text.
   */
  private buildMultimodalContent(
    text: string,
    frameData?: string
  ): Array<{ type: "text"; text: string } | { type: "image"; image: string }> {
    const content: Array<
      { type: "text"; text: string } | { type: "image"; image: string }
    > = [];
    if (this.frameContextBuffer.length > 0) {
      const contextSummary = `[Visual context: ${this.frameContextBuffer.length} frames captured, latest at ${new Date(this.lastFrameTimestamp).toISOString()}]`;
      content.push({ type: "text", text: contextSummary });
    }
    const imageData = frameData || this.currentFrameData;
    if (imageData) {
      content.push({ type: "image", image: imageData });
    }
    content.push({ type: "text", text });
    return content;
  }

  // ══════════════════════════════════════════════════════
  // Private — LLM processing
  // ══════════════════════════════════════════════════════

  /**
   * Shared streamText invocation used by both processUserInput and processMultimodalInput.
   * Streams text to speech chunking, forwards events/messages, appends the
   * assistant reply to history, and waits for queued speech to drain.
   */
  private async runStream(
    messages: ModelMessage[],
    abortSignal: AbortSignal
  ): Promise<string> {
    const result = streamText({
      model: this.model,
      system: this.instructions,
      messages,
      tools: this.tools,
      stopWhen: this.stopWhen,
      abortSignal,
      onChunk: ({ chunk }) => {
        handleStreamChunk(chunk, (event, data) => this.emit(event, data));
      },
      onFinish: async (event) => {
        // Surface every tool result from every step to listeners
        for (const step of event.steps) {
          for (const toolResult of step.toolResults) {
            this.emit("tool_result", {
              name: toolResult.toolName,
              toolCallId: toolResult.toolCallId,
              result: toolResult.output,
            });
          }
        }
      },
      onError: ({ error }) => {
        this.emit("error", error);
      },
    });
    const streamResult = await processFullStream(
      result,
      {
        onTextDelta: (delta) => this.speech.processTextDelta(delta),
        onTextEnd: () => this.speech.flushPendingText(),
        sendMessage: (msg) => this.ws.send(msg),
        emitEvent: (event, data) => this.emit(event, data),
      },
      {
        sessionId: this.sessionId,
        frameContext:
          this.frameContextBuffer.length > 0
            ? {
                frameCount: this.frameContextBuffer.length,
                lastFrameSequence:
                  this.frameContextBuffer[this.frameContextBuffer.length - 1]
                    ?.sequence,
              }
            : undefined,
      }
    );
    // Add assistant response to history
    if (streamResult.fullText) {
      this.conversation.addMessage({
        role: "assistant",
        content: streamResult.fullText,
      });
    }
    // Flush remaining speech & wait for queue
    this.speech.flushPendingText();
    if (this.speech.queueDonePromise) {
      await this.speech.queueDonePromise;
    }
    return streamResult.fullText;
  }

  /**
   * Process text-only input (with optional visual context from latest frame).
   * The stored history keeps a text placeholder instead of the image bytes.
   */
  private async processUserInput(text: string): Promise<string> {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    try {
      this.emit("text", { role: "user", text });
      const hasVisual = !!this.currentFrameData;
      let messages: ModelMessage[];
      if (hasVisual) {
        const content = this.buildMultimodalContent(text);
        this.conversation.addMessage({
          role: "user",
          content: [{ type: "text", text: `[Visual context] ${text}` }],
        });
        // Send the real multimodal content in place of the stored placeholder
        messages = [
          ...this.conversation.getHistoryRef().slice(0, -1),
          { role: "user", content },
        ];
      } else {
        this.conversation.addMessage({ role: "user", content: text });
        messages = this.conversation.getHistoryRef();
      }
      return await this.runStream(
        messages,
        this.currentStreamAbortController.signal
      );
    } catch (error) {
      this.speech.reset();
      throw error;
    } finally {
      this._isProcessing = false;
      this.currentStreamAbortController = undefined;
    }
  }

  /**
   * Process multimodal input (text + explicit video frame).
   */
  private async processMultimodalInput(
    text: string,
    frame: VideoFrame
  ): Promise<string> {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    try {
      this.emit("text", { role: "user", text, hasImage: true });
      const content = this.buildMultimodalContent(text, frame.image.data);
      // History stores a text placeholder; the image goes only to the model
      this.conversation.addMessage({
        role: "user",
        content: [{ type: "text", text: `[Image attached] ${text}` }],
      });
      const messages: ModelMessage[] = [
        ...this.conversation.getHistoryRef().slice(0, -1),
        { role: "user", content },
      ];
      return await this.runStream(
        messages,
        this.currentStreamAbortController.signal
      );
    } catch (error) {
      this.speech.reset();
      throw error;
    } finally {
      this._isProcessing = false;
      this.currentStreamAbortController = undefined;
    }
  }

  // ══════════════════════════════════════════════════════
  // Private — helpers
  // ══════════════════════════════════════════════════════

  private ensureNotDestroyed(): void {
    if (this.isDestroyed) {
      throw new Error("VideoAgent has been destroyed and cannot be used");
    }
  }

  /** Abort streams, reset speech, drop visual context, fail queued items. */
  private cleanupOnDisconnect(): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.reset();
    this._isProcessing = false;
    this.currentFrameData = undefined;
    this.inputQueue.rejectAll(new Error("Connection closed"));
  }

  /** Re-emit the listed events from a manager on this agent. */
  private bubbleEvents(source: EventEmitter, events: string[]): void {
    for (const event of events) {
      source.on(event, (...args: any[]) => this.emit(event, ...args));
    }
  }
}
// Export types for external use (type-only: erased at compile time)
export type {
  VideoFrame,
  AudioData,
  VideoAgentConfig,
  FrameContext,
  FrameTriggerReason,
};
// Re-export shared types so consumers can import everything from this module
export type { StreamingSpeechConfig, HistoryConfig } from "./types";

View File

@@ -84,6 +84,10 @@ const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = {
}; };
export interface VideoAgentOptions { export interface VideoAgentOptions {
/**
* AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
* anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
*/
model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o')) model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1')) transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1'))
speechModel?: SpeechModel; // AI SDK Speech Model (e.g., openai.speech('gpt-4o-mini-tts')) speechModel?: SpeechModel; // AI SDK Speech Model (e.g., openai.speech('gpt-4o-mini-tts'))
@@ -183,7 +187,7 @@ Use tools when needed to provide accurate information.`;
this.endpoint = options.endpoint; this.endpoint = options.endpoint;
this.voice = options.voice || "alloy"; this.voice = options.voice || "alloy";
this.speechInstructions = options.speechInstructions; this.speechInstructions = options.speechInstructions;
this.outputFormat = options.outputFormat || "mp3"; this.outputFormat = options.outputFormat || "opus";
this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE; this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE; this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
@@ -265,7 +269,6 @@ Use tools when needed to provide accurate information.`;
this.socket.on("message", async (data) => { this.socket.on("message", async (data) => {
try { try {
const message = JSON.parse(data.toString()); const message = JSON.parse(data.toString());
console.log(`Received WebSocket message of type: ${message.type}`);
switch (message.type) { switch (message.type) {
// Handle transcribed text from the client/STT // Handle transcribed text from the client/STT
@@ -278,7 +281,6 @@ Use tools when needed to provide accurate information.`;
this.interruptCurrentResponse("user_speaking"); this.interruptCurrentResponse("user_speaking");
// Force capture current frame when user speaks // Force capture current frame when user speaks
this.requestFrameCapture("user_request"); this.requestFrameCapture("user_request");
console.log(`Processing transcript: "${message.text}"`);
await this.enqueueTextInput(message.text); await this.enqueueTextInput(message.text);
break; break;
@@ -292,10 +294,11 @@ Use tools when needed to provide accurate information.`;
this.interruptCurrentResponse("user_speaking"); this.interruptCurrentResponse("user_speaking");
// Force capture current frame when user speaks // Force capture current frame when user speaks
this.requestFrameCapture("user_request"); this.requestFrameCapture("user_request");
console.log( try {
`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`
);
await this.processAudioInput(message); await this.processAudioInput(message);
} catch (audioError) {
this.emit("error", audioError);
}
break; break;
// Handle video frame from client // Handle video frame from client
@@ -305,7 +308,6 @@ Use tools when needed to provide accurate information.`;
// Handle explicit interrupt request from client // Handle explicit interrupt request from client
case "interrupt": case "interrupt":
console.log(`Received interrupt request: ${message.reason || "client_request"}`);
this.interruptCurrentResponse(message.reason || "client_request"); this.interruptCurrentResponse(message.reason || "client_request");
break; break;
@@ -315,23 +317,20 @@ Use tools when needed to provide accurate information.`;
break; break;
default: default:
console.log(`Unknown message type: ${message.type}`); break;
} }
} catch (err) { } catch (err) {
console.error("Failed to process message:", err);
this.emit("error", err); this.emit("error", err);
} }
}); });
this.socket.on("close", () => { this.socket.on("close", () => {
console.log("Disconnected");
this.isConnected = false; this.isConnected = false;
this.cleanupOnDisconnect(); this.cleanupOnDisconnect();
this.emit("disconnected"); this.emit("disconnected");
}); });
this.socket.on("error", (error) => { this.socket.on("error", (error) => {
console.error("WebSocket error:", error);
this.emit("error", error); this.emit("error", error);
}); });
} }
@@ -340,8 +339,6 @@ Use tools when needed to provide accurate information.`;
* Handle client ready signal * Handle client ready signal
*/ */
private handleClientReady(message: any): void { private handleClientReady(message: any): void {
console.log(`Client ready, capabilities: ${JSON.stringify(message.capabilities || {})}`);
// Send session info to client // Send session info to client
this.sendWebSocketMessage({ this.sendWebSocketMessage({
type: "session_init", type: "session_init",
@@ -403,12 +400,7 @@ Use tools when needed to provide accurate information.`;
sequence: frame.sequence, sequence: frame.sequence,
timestamp: Date.now(), timestamp: Date.now(),
}); });
console.log(
`Received frame #${frame.sequence} (${frame.triggerReason}): ${(frameSize / 1024).toFixed(1)}KB, ${frame.image.width}x${frame.image.height}`
);
} catch (error) { } catch (error) {
console.error("Failed to handle video frame:", error);
this.emit("error", error); this.emit("error", error);
} }
} }
@@ -485,18 +477,12 @@ Use tools when needed to provide accurate information.`;
throw new Error("Transcription model not configured"); throw new Error("Transcription model not configured");
} }
console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
try { try {
const result = await transcribe({ const result = await transcribe({
model: this.transcriptionModel, model: this.transcriptionModel,
audio: audioData, audio: audioData,
}); });
console.log(
`Whisper transcription result: "${result.text}", language: ${result.language || "unknown"}`
);
this.emit("transcription", { this.emit("transcription", {
text: result.text, text: result.text,
language: result.language, language: result.language,
@@ -511,7 +497,6 @@ Use tools when needed to provide accurate information.`;
return result.text; return result.text;
} catch (error) { } catch (error) {
console.error("Whisper transcription failed:", error);
throw error; throw error;
} }
} }
@@ -707,21 +692,15 @@ Use tools when needed to provide accurate information.`;
} }
try { try {
console.log(
`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`
);
const audioData = await this.generateSpeechFromText( const audioData = await this.generateSpeechFromText(
chunk.text, chunk.text,
this.currentSpeechAbortController.signal this.currentSpeechAbortController.signal
); );
console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
return audioData; return audioData;
} catch (error) { } catch (error) {
if ((error as Error).name === "AbortError") { if ((error as Error).name === "AbortError") {
console.log(`Audio generation aborted for chunk ${chunk.id}`);
return null; return null;
} }
console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
this.emit("error", error); this.emit("error", error);
return null; return null;
} }
@@ -734,7 +713,6 @@ Use tools when needed to provide accurate information.`;
if (this.isSpeaking) return; if (this.isSpeaking) return;
this.isSpeaking = true; this.isSpeaking = true;
console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
this.emit("speech_start", { streaming: true }); this.emit("speech_start", { streaming: true });
this.sendWebSocketMessage({ type: "speech_stream_start" }); this.sendWebSocketMessage({ type: "speech_stream_start" });
@@ -742,10 +720,6 @@ Use tools when needed to provide accurate information.`;
while (this.speechChunkQueue.length > 0) { while (this.speechChunkQueue.length > 0) {
const chunk = this.speechChunkQueue[0]; const chunk = this.speechChunkQueue[0];
console.log(
`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`
);
if (!chunk.audioPromise) { if (!chunk.audioPromise) {
chunk.audioPromise = this.generateChunkAudio(chunk); chunk.audioPromise = this.generateChunkAudio(chunk);
} }
@@ -753,7 +727,6 @@ Use tools when needed to provide accurate information.`;
const audioData = await chunk.audioPromise; const audioData = await chunk.audioPromise;
if (!this.isSpeaking) { if (!this.isSpeaking) {
console.log(`Speech interrupted during chunk #${chunk.id}`);
break; break;
} }
@@ -761,9 +734,6 @@ Use tools when needed to provide accurate information.`;
if (audioData) { if (audioData) {
const base64Audio = Buffer.from(audioData).toString("base64"); const base64Audio = Buffer.from(audioData).toString("base64");
console.log(
`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`
);
this.sendWebSocketMessage({ this.sendWebSocketMessage({
type: "audio_chunk", type: "audio_chunk",
@@ -780,8 +750,6 @@ Use tools when needed to provide accurate information.`;
text: chunk.text, text: chunk.text,
uint8Array: audioData, uint8Array: audioData,
}); });
} else {
console.log(`No audio data generated for chunk #${chunk.id}`);
} }
if (this.streamingSpeechConfig.parallelGeneration) { if (this.streamingSpeechConfig.parallelGeneration) {
@@ -792,7 +760,6 @@ Use tools when needed to provide accurate information.`;
); );
if (toStart > 0) { if (toStart > 0) {
console.log(`Starting parallel generation for ${toStart} more chunks`);
for (let i = 0; i < toStart; i++) { for (let i = 0; i < toStart; i++) {
const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise); const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise);
if (nextChunk) { if (nextChunk) {
@@ -803,7 +770,6 @@ Use tools when needed to provide accurate information.`;
} }
} }
} catch (error) { } catch (error) {
console.error("Error in speech queue processing:", error);
this.emit("error", error); this.emit("error", error);
} finally { } finally {
this.isSpeaking = false; this.isSpeaking = false;
@@ -815,7 +781,6 @@ Use tools when needed to provide accurate information.`;
this.speechQueueDonePromise = undefined; this.speechQueueDonePromise = undefined;
} }
console.log(`Speech queue processing complete`);
this.sendWebSocketMessage({ type: "speech_stream_end" }); this.sendWebSocketMessage({ type: "speech_stream_end" });
this.emit("speech_complete", { streaming: true }); this.emit("speech_complete", { streaming: true });
} }
@@ -850,9 +815,14 @@ Use tools when needed to provide accurate information.`;
/** /**
* Process incoming audio data: transcribe and generate response * Process incoming audio data: transcribe and generate response
*/ */
private async processAudioInput(audioMessage: AudioData): Promise<void> { private async processAudioInput(audioMessage: AudioData | { type: string; data: string; format?: string; sessionId?: string }): Promise<void> {
if (!this.transcriptionModel) { if (!this.transcriptionModel) {
this.emit("error", new Error("Transcription model not configured for audio input")); const error = new Error("Transcription model not configured for audio input");
this.emit("error", error);
this.sendWebSocketMessage({
type: "error",
error: error.message,
});
return; return;
} }
@@ -877,15 +847,11 @@ Use tools when needed to provide accurate information.`;
this.emit("audio_received", { this.emit("audio_received", {
size: audioBuffer.length, size: audioBuffer.length,
format: audioMessage.format, format: audioMessage.format,
sessionId: audioMessage.sessionId, sessionId: audioMessage.sessionId || this.sessionId,
}); });
console.log(
`Processing audio input: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`
);
const transcribedText = await this.transcribeAudio(audioBuffer); const transcribedText = await this.transcribeAudio(audioBuffer);
console.log(`Transcribed text: "${transcribedText}"`);
if (transcribedText.trim()) { if (transcribedText.trim()) {
await this.enqueueTextInput(transcribedText); await this.enqueueTextInput(transcribedText);
@@ -897,7 +863,6 @@ Use tools when needed to provide accurate information.`;
}); });
} }
} catch (error) { } catch (error) {
console.error("Failed to process audio input:", error);
this.emit("error", error); this.emit("error", error);
this.sendWebSocketMessage({ this.sendWebSocketMessage({
type: "transcription_error", type: "transcription_error",
@@ -1049,7 +1014,9 @@ Use tools when needed to provide accurate information.`;
* Drain the input queue, processing one request at a time * Drain the input queue, processing one request at a time
*/ */
private async drainInputQueue(): Promise<void> { private async drainInputQueue(): Promise<void> {
if (this.processingQueue) return; if (this.processingQueue) {
return;
}
this.processingQueue = true; this.processingQueue = true;
try { try {
@@ -1151,7 +1118,6 @@ Use tools when needed to provide accurate information.`;
} }
}, },
onError: ({ error }) => { onError: ({ error }) => {
console.error("Stream error:", error);
this.emit("error", error); this.emit("error", error);
}, },
}); });
@@ -1229,7 +1195,6 @@ Use tools when needed to provide accurate information.`;
} }
}, },
onError: ({ error }) => { onError: ({ error }) => {
console.error("Stream error:", error);
this.emit("error", error); this.emit("error", error);
}, },
}); });
@@ -1513,23 +1478,9 @@ Use tools when needed to provide accurate information.`;
try { try {
if (this.socket.readyState === WebSocket.OPEN) { if (this.socket.readyState === WebSocket.OPEN) {
if (message.type === "audio_chunk" || message.type === "audio") {
const { data, ...rest } = message as any;
console.log(
`Sending WebSocket message: ${message.type}`,
data ? `(${(data.length / 1000).toFixed(1)}KB audio data)` : "",
rest
);
} else {
console.log(`Sending WebSocket message: ${message.type}`);
}
this.socket.send(JSON.stringify(message)); this.socket.send(JSON.stringify(message));
} else {
console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
} }
} catch (error) { } catch (error) {
console.error("Failed to send WebSocket message:", error);
this.emit("error", error); this.emit("error", error);
} }
} }
@@ -1538,7 +1489,6 @@ Use tools when needed to provide accurate information.`;
* Start listening for voice/video input * Start listening for voice/video input
*/ */
startListening() { startListening() {
console.log("Starting video agent...");
this.emit("listening"); this.emit("listening");
} }
@@ -1546,7 +1496,6 @@ Use tools when needed to provide accurate information.`;
* Stop listening for voice/video input * Stop listening for voice/video input
*/ */
stopListening() { stopListening() {
console.log("Stopping video agent...");
this.emit("stopped"); this.emit("stopped");
} }

484
src/VoiceAgent.new.ts Normal file
View File

@@ -0,0 +1,484 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
import {
streamText,
type LanguageModel,
stepCountIs,
type Tool,
type ModelMessage,
type TranscriptionModel,
type SpeechModel,
} from "ai";
import {
type StreamingSpeechConfig,
type HistoryConfig,
} from "./types";
import {
WebSocketManager,
SpeechManager,
ConversationManager,
TranscriptionManager,
InputQueue,
type QueueItem,
processFullStream,
handleStreamChunk,
} from "./core";
export interface VoiceAgentOptions {
  /** AI SDK language model used to generate chat responses via streamText. */
  model: LanguageModel;
  /** Speech-to-text model; required only when audio input is sent. */
  transcriptionModel?: TranscriptionModel;
  /** Text-to-speech model; when omitted, responses are text-only. */
  speechModel?: SpeechModel;
  /** System prompt (default: "You are a helpful voice assistant."). */
  instructions?: string;
  /** Stop condition for multi-step tool runs (default: stepCountIs(5)). */
  stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  /** Tools exposed to the model; more can be added later via registerTools(). */
  tools?: Record<string, Tool>;
  /** Default WebSocket URL for connect() (fallback: ws://localhost:8080). */
  endpoint?: string;
  /** TTS voice identifier (default: "alloy"). */
  voice?: string;
  /** Optional delivery/style instructions forwarded to the TTS model. */
  speechInstructions?: string;
  /** Audio format for generated speech (default: "opus"). */
  outputFormat?: string;
  /** Configuration for streaming speech generation */
  streamingSpeech?: Partial<StreamingSpeechConfig>;
  /** Configuration for conversation history memory limits */
  history?: Partial<HistoryConfig>;
  /** Maximum audio input size in bytes (default: 10 MB) */
  maxAudioInputSize?: number;
}
/** Shape of items in the voice agent's input queue */
interface VoiceInputItem extends QueueItem<string> {
text: string;
}
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export class VoiceAgent extends EventEmitter {
  private model: LanguageModel;
  private instructions: string;
  private stopWhen: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  private endpoint?: string;
  private tools: Record<string, Tool> = {};
  private isDestroyed = false;
  private _isProcessing = false;

  // Abort controller for the current LLM stream
  private currentStreamAbortController?: AbortController;

  // ── Managers ──────────────────────────────────────────
  private ws: WebSocketManager;
  private speech: SpeechManager;
  private conversation: ConversationManager;
  private transcription: TranscriptionManager;
  private inputQueue: InputQueue<VoiceInputItem>;

  constructor(options: VoiceAgentOptions) {
    super();
    this.model = options.model;
    this.instructions =
      options.instructions || "You are a helpful voice assistant.";
    this.stopWhen = options.stopWhen || stepCountIs(5);
    this.endpoint = options.endpoint;

    if (options.tools) {
      this.tools = { ...options.tools };
    }

    // ── Initialize managers ──────────────────────────────
    this.ws = new WebSocketManager();
    this.speech = new SpeechManager({
      speechModel: options.speechModel,
      voice: options.voice,
      speechInstructions: options.speechInstructions,
      outputFormat: options.outputFormat,
      streamingSpeech: options.streamingSpeech,
    });
    this.conversation = new ConversationManager({
      history: options.history,
    });
    this.transcription = new TranscriptionManager({
      transcriptionModel: options.transcriptionModel,
      maxAudioInputSize: options.maxAudioInputSize,
    });
    this.inputQueue = new InputQueue<VoiceInputItem>();

    // ── Wire managers to the WebSocket send function ─────
    const sendMsg = (msg: Record<string, unknown>) => this.ws.send(msg);
    this.speech.sendMessage = sendMsg;
    this.transcription.sendMessage = sendMsg;

    // ── Wire the input queue processor ───────────────────
    this.inputQueue.processor = (item) => this.processUserInput(item.text);

    // ── Bubble events from managers ──────────────────────
    this.bubbleEvents(this.ws, [
      "connected",
      "error",
    ]);
    this.bubbleEvents(this.speech, [
      "speech_start",
      "speech_complete",
      "speech_interrupted",
      "speech_chunk_queued",
      "audio_chunk",
      "audio",
      "error",
    ]);
    this.bubbleEvents(this.conversation, [
      "history_cleared",
      "history_trimmed",
    ]);
    this.bubbleEvents(this.transcription, [
      "transcription",
      "audio_received",
      "error",
      "warning",
    ]);

    // ── Handle WebSocket lifecycle events ────────────────
    this.ws.on("disconnected", () => {
      this.cleanupOnDisconnect();
      this.emit("disconnected");
    });
    this.ws.on("message", (message: any) => this.handleMessage(message));
  }

  // ── Public API ────────────────────────────────────────

  /** Merge additional tools into the agent's tool set. */
  public registerTools(tools: Record<string, Tool>) {
    this.tools = { ...this.tools, ...tools };
  }

  /**
   * Transcribe audio data to text using the configured transcription model.
   */
  public async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    return this.transcription.transcribeAudio(audioData);
  }

  /**
   * Generate speech from text using the configured speech model.
   */
  public async generateSpeechFromText(
    text: string,
    abortSignal?: AbortSignal
  ): Promise<Uint8Array> {
    return this.speech.generateSpeechFromText(text, abortSignal);
  }

  /**
   * Interrupt ongoing speech generation and playback (barge-in support).
   */
  public interruptSpeech(reason: string = "interrupted"): void {
    this.speech.interruptSpeech(reason);
  }

  /**
   * Interrupt both the current LLM stream and ongoing speech.
   */
  public interruptCurrentResponse(reason: string = "interrupted"): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.interruptSpeech(reason);
  }

  /**
   * Connect to a WebSocket server by URL.
   */
  public async connect(url?: string): Promise<void> {
    this.ensureNotDestroyed();
    const wsUrl = url || this.endpoint || "ws://localhost:8080";
    await this.ws.connect(wsUrl);
  }

  /**
   * Attach an existing WebSocket (server-side usage).
   */
  public handleSocket(socket: WebSocket): void {
    this.ensureNotDestroyed();
    this.ws.handleSocket(socket);
  }

  /**
   * Send text input for processing (bypasses transcription).
   */
  public async sendText(text: string): Promise<string> {
    this.ensureNotDestroyed();
    if (!text || !text.trim()) {
      throw new Error("Text input cannot be empty");
    }
    return this.enqueueInput(text);
  }

  /**
   * Send base64 audio data to be transcribed and processed.
   */
  public async sendAudio(audioData: string): Promise<void> {
    this.ensureNotDestroyed();
    await this.handleAudioInput(audioData);
  }

  /**
   * Send raw audio buffer to be transcribed and processed.
   */
  public async sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void> {
    this.ensureNotDestroyed();
    const base64Audio = Buffer.from(audioBuffer).toString("base64");
    await this.handleAudioInput(base64Audio);
  }

  /**
   * Generate speech for full text at once (non-streaming fallback).
   */
  public async generateAndSendSpeechFull(text: string): Promise<void> {
    return this.speech.generateAndSendSpeechFull(text);
  }

  /** Start listening for voice input */
  startListening() {
    this.emit("listening");
  }

  /** Stop listening for voice input */
  stopListening() {
    this.emit("stopped");
  }

  /** Clear conversation history */
  clearHistory() {
    this.conversation.clearHistory();
  }

  /** Get current conversation history */
  getHistory(): ModelMessage[] {
    return this.conversation.getHistory();
  }

  /** Set conversation history (useful for restoring sessions) */
  setHistory(history: ModelMessage[]) {
    this.conversation.setHistory(history);
  }

  /** Disconnect from WebSocket and stop all in-flight work */
  disconnect() {
    this.ws.disconnect();
  }

  /**
   * Permanently destroy the agent, releasing all resources.
   */
  destroy() {
    this.isDestroyed = true;
    this.cleanupOnDisconnect();
    this.ws.disconnect();
    this.conversation.clearHistory();
    this.tools = {};
    this.removeAllListeners();
  }

  // ── Getters ───────────────────────────────────────────

  get connected(): boolean {
    return this.ws.isConnected;
  }

  get processing(): boolean {
    return this._isProcessing;
  }

  get speaking(): boolean {
    return this.speech.isSpeaking;
  }

  get pendingSpeechChunks(): number {
    return this.speech.pendingChunkCount;
  }

  get destroyed(): boolean {
    return this.isDestroyed;
  }

  // ── Private: message handling ─────────────────────────

  /**
   * Route a parsed WebSocket message to the right handler.
   * Any failure is reported via the "error" event rather than thrown.
   */
  private async handleMessage(message: any): Promise<void> {
    try {
      if (message.type === "transcript") {
        if (typeof message.text !== "string" || !message.text.trim()) {
          this.emit("warning", "Received empty or invalid transcript message");
          return;
        }
        // Barge-in: a new user utterance cancels the in-flight response.
        this.interruptCurrentResponse("user_speaking");
        await this.enqueueInput(message.text);
      } else if (message.type === "audio") {
        if (typeof message.data !== "string" || !message.data) {
          this.emit("warning", "Received empty or invalid audio message");
          return;
        }
        this.interruptCurrentResponse("user_speaking");
        await this.handleAudioInput(message.data, message.format);
      } else if (message.type === "interrupt") {
        this.interruptCurrentResponse(message.reason || "client_request");
      }
    } catch (err) {
      this.emit("error", err);
    }
  }

  // ── Private: audio ────────────────────────────────────

  /** Transcribe incoming base64 audio and enqueue any resulting text. */
  private async handleAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<void> {
    const text = await this.transcription.processAudioInput(
      base64Audio,
      format
    );
    if (text) {
      await this.enqueueInput(text);
    }
  }

  // ── Private: input queue ──────────────────────────────

  /** Enqueue text for serial processing; resolves with the response text. */
  private enqueueInput(text: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.inputQueue.enqueue({ text, resolve, reject });
    });
  }

  // ── Private: LLM processing ───────────────────────────

  /**
   * Process user input with streaming text generation.
   * Called serially by the input queue.
   */
  private async processUserInput(text: string): Promise<string> {
    this._isProcessing = true;
    this.currentStreamAbortController = new AbortController();
    const streamAbortSignal = this.currentStreamAbortController.signal;

    try {
      this.emit("text", { role: "user", text });
      this.conversation.addMessage({ role: "user", content: text });

      const result = streamText({
        model: this.model,
        system: this.instructions,
        messages: this.conversation.getHistoryRef(),
        tools: this.tools,
        stopWhen: this.stopWhen,
        abortSignal: streamAbortSignal,
        onChunk: ({ chunk }) => {
          handleStreamChunk(chunk, (event, data) => this.emit(event, data));
        },
        onFinish: async (event) => {
          for (const step of event.steps) {
            for (const toolResult of step.toolResults) {
              this.emit("tool_result", {
                name: toolResult.toolName,
                toolCallId: toolResult.toolCallId,
                result: toolResult.output,
              });
            }
          }
        },
        onError: ({ error }) => {
          this.emit("error", error);
        },
      });

      const streamResult = await processFullStream(result, {
        onTextDelta: (delta) => this.speech.processTextDelta(delta),
        onTextEnd: () => this.speech.flushPendingText(),
        sendMessage: (msg) => this.ws.send(msg),
        emitEvent: (event, data) => this.emit(event, data),
      });

      // Add assistant response to history
      if (streamResult.fullText) {
        this.conversation.addMessage({
          role: "assistant",
          content: streamResult.fullText,
        });
      }

      // Flush any remaining speech
      this.speech.flushPendingText();

      // Wait for all speech chunks to complete
      if (this.speech.queueDonePromise) {
        await this.speech.queueDonePromise;
      }

      return streamResult.fullText;
    } catch (error) {
      // Clean up speech state on error
      this.speech.reset();
      throw error;
    } finally {
      this._isProcessing = false;
      this.currentStreamAbortController = undefined;
    }
  }

  // ── Private: helpers ──────────────────────────────────

  private ensureNotDestroyed(): void {
    if (this.isDestroyed) {
      throw new Error("VoiceAgent has been destroyed and cannot be used");
    }
  }

  /**
   * Clean up all in-flight state when the connection drops.
   */
  private cleanupOnDisconnect(): void {
    if (this.currentStreamAbortController) {
      this.currentStreamAbortController.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.reset();
    this._isProcessing = false;
    this.inputQueue.rejectAll(new Error("Connection closed"));
  }

  /**
   * Forward select events from a child emitter to this agent.
   */
  private bubbleEvents(source: EventEmitter, events: string[]): void {
    for (const event of events) {
      source.on(event, (...args: any[]) => this.emit(event, ...args));
    }
  }
}

View File

@@ -39,6 +39,25 @@ export interface VoiceAgentOptions {
maxAudioInputSize?: number; maxAudioInputSize?: number;
} }
/**
* A single-session voice agent that manages one WebSocket connection at a time.
*
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
* input queue, speech state, and WebSocket. It is designed for **one user per
* instance**. To support multiple concurrent users, create a separate
* `VoiceAgent` for each connection:
*
* ```ts
* wss.on("connection", (socket) => {
* const agent = new VoiceAgent({ model, ... });
* agent.handleSocket(socket);
* agent.on("disconnected", () => agent.destroy());
* });
* ```
*
* Sharing a single instance across multiple users will cause conversation
* history cross-contamination, interleaved audio, and unpredictable behavior.
*/
export class VoiceAgent extends EventEmitter { export class VoiceAgent extends EventEmitter {
private socket?: WebSocket; private socket?: WebSocket;
private tools: Record<string, Tool> = {}; private tools: Record<string, Tool> = {};
@@ -90,7 +109,7 @@ export class VoiceAgent extends EventEmitter {
this.endpoint = options.endpoint; this.endpoint = options.endpoint;
this.voice = options.voice || "alloy"; this.voice = options.voice || "alloy";
this.speechInstructions = options.speechInstructions; this.speechInstructions = options.speechInstructions;
this.outputFormat = options.outputFormat || "mp3"; this.outputFormat = options.outputFormat || "opus";
this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE; this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
if (options.tools) { if (options.tools) {
this.tools = { ...options.tools }; this.tools = { ...options.tools };
@@ -695,6 +714,10 @@ export class VoiceAgent extends EventEmitter {
* Attach an existing WebSocket (server-side usage). * Attach an existing WebSocket (server-side usage).
* Use this when a WS server accepts a connection and you want the * Use this when a WS server accepts a connection and you want the
* agent to handle messages on that socket. * agent to handle messages on that socket.
*
* **Note:** Calling this while a socket is already attached will cleanly
* tear down the previous connection first. Each `VoiceAgent` instance
* supports only one socket at a time — create a new agent per user.
*/ */
public handleSocket(socket: WebSocket): void { public handleSocket(socket: WebSocket): void {
this.ensureNotDestroyed(); this.ensureNotDestroyed();

View File

@@ -0,0 +1,122 @@
import { EventEmitter } from "events";
import { type ModelMessage } from "ai";
import { type HistoryConfig, DEFAULT_HISTORY_CONFIG } from "../types";
export interface ConversationManagerOptions {
history?: Partial<HistoryConfig>;
}
/**
 * Manages conversation history (ModelMessage[]) with configurable
 * limits on message count and total character size.
 */
export class ConversationManager extends EventEmitter {
  private messages: ModelMessage[] = [];
  private readonly limits: HistoryConfig;

  constructor(options: ConversationManagerOptions = {}) {
    super();
    this.limits = { ...DEFAULT_HISTORY_CONFIG, ...options.history };
  }

  /** Append a message, then enforce the configured limits. */
  addMessage(message: ModelMessage): void {
    this.messages.push(message);
    this.trimHistory();
  }

  /** A defensive copy of the current history. */
  getHistory(): ModelMessage[] {
    return this.messages.slice();
  }

  /**
   * The live history array itself.
   * Use with caution — prefer getHistory() for safety.
   */
  getHistoryRef(): ModelMessage[] {
    return this.messages;
  }

  /** Replace the entire conversation history (the input is copied). */
  setHistory(history: ModelMessage[]): void {
    this.messages = history.slice();
  }

  /** Drop every message and announce the wipe. */
  clearHistory(): void {
    this.messages = [];
    this.emit("history_cleared");
  }

  /** Number of messages currently held. */
  get length(): number {
    return this.messages.length;
  }

  /** Character count of a message, stringifying structured content. */
  private static sizeOf(msg: ModelMessage): number {
    const text =
      typeof msg.content === "string"
        ? msg.content
        : JSON.stringify(msg.content);
    return text.length;
  }

  /**
   * Enforce both limits, oldest-first: count-based trimming first,
   * then character-based trimming.
   */
  private trimHistory(): void {
    this.trimByCount();
    this.trimByChars();
  }

  /**
   * Drop the oldest messages when over the count limit. Removal is
   * rounded up to an even number so user/assistant turn pairs survive.
   */
  private trimByCount(): void {
    const { maxMessages } = this.limits;
    if (maxMessages <= 0 || this.messages.length <= maxMessages) {
      return;
    }
    const excess = this.messages.length - maxMessages;
    const dropCount = excess + (excess % 2); // round up to even
    this.messages.splice(0, dropCount);
    this.emit("history_trimmed", {
      removedCount: dropCount,
      reason: "max_messages",
    });
  }

  /**
   * Drop oldest messages until total character size fits, always
   * keeping at least the two most recent messages.
   */
  private trimByChars(): void {
    const { maxTotalChars } = this.limits;
    if (maxTotalChars <= 0) {
      return;
    }
    let total = 0;
    for (const msg of this.messages) {
      total += ConversationManager.sizeOf(msg);
    }
    let dropped = 0;
    while (total > maxTotalChars && this.messages.length > 2) {
      const oldest = this.messages.shift();
      if (oldest) {
        total -= ConversationManager.sizeOf(oldest);
        dropped++;
      }
    }
    if (dropped > 0) {
      this.emit("history_trimmed", {
        removedCount: dropped,
        reason: "max_total_chars",
      });
    }
  }
}

71
src/core/InputQueue.ts Normal file
View File

@@ -0,0 +1,71 @@
/**
 * A generic serial input queue that ensures only one processor runs at a time.
 *
 * @template T The shape of each queued item (must include resolve/reject)
 */
export interface QueueItem<T = string> {
  resolve: (v: T) => void;
  reject: (e: unknown) => void;
}

export class InputQueue<T extends QueueItem<any>> {
  private pending: T[] = [];
  private active = false;

  /** Callback invoked for each item — must return a resolved value */
  public processor: (item: T) => Promise<any> = async () => "";

  /** Add an item; draining starts automatically when the queue is idle. */
  enqueue(item: T): void {
    this.pending.push(item);
    void this.pump();
  }

  /** Fail every waiting item (used on disconnect/destroy). */
  rejectAll(reason: Error): void {
    const dropped = this.pending;
    this.pending = [];
    for (const item of dropped) {
      item.reject(reason);
    }
    this.active = false;
  }

  /** Count of items still waiting in the queue. */
  get length(): number {
    return this.pending.length;
  }

  /** True while an item is being processed. */
  get isProcessing(): boolean {
    return this.active;
  }

  // ── Private ──────────────────────────────────────────

  /** Serially process items until the queue is empty; re-entrancy is guarded. */
  private async pump(): Promise<void> {
    if (this.active) return;
    this.active = true;
    try {
      for (let next = this.pending.shift(); next; next = this.pending.shift()) {
        try {
          next.resolve(await this.processor(next));
        } catch (err) {
          next.reject(err);
        }
      }
    } finally {
      this.active = false;
    }
  }
}

453
src/core/SpeechManager.ts Normal file
View File

@@ -0,0 +1,453 @@
import { EventEmitter } from "events";
import {
experimental_generateSpeech as generateSpeech,
type SpeechModel,
} from "ai";
import {
type SpeechChunk,
type StreamingSpeechConfig,
DEFAULT_STREAMING_SPEECH_CONFIG,
} from "../types";
export interface SpeechManagerOptions {
  /** AI SDK speech (TTS) model; speech features are disabled when omitted. */
  speechModel?: SpeechModel;
  /** Voice identifier passed to the speech model (default: "alloy"). */
  voice?: string;
  /** Optional delivery/style instructions forwarded to the speech model. */
  speechInstructions?: string;
  /** Audio format for generated speech (default: "opus"). */
  outputFormat?: string;
  /** Overrides merged over the streaming speech chunking defaults. */
  streamingSpeech?: Partial<StreamingSpeechConfig>;
}
/**
* Manages text-to-speech generation, streaming speech chunking,
* parallel TTS requests, and speech interruption.
*/
export class SpeechManager extends EventEmitter {
private speechModel?: SpeechModel;
private voice: string;
private speechInstructions?: string;
private outputFormat: string;
private streamingSpeechConfig: StreamingSpeechConfig;
private currentSpeechAbortController?: AbortController;
private speechChunkQueue: SpeechChunk[] = [];
private nextChunkId = 0;
private _isSpeaking = false;
private pendingTextBuffer = "";
// Promise-based signal for speech queue completion
private speechQueueDonePromise?: Promise<void>;
private speechQueueDoneResolve?: () => void;
/** Callback to send messages over the WebSocket */
public sendMessage: (message: Record<string, unknown>) => void = () => { };
  constructor(options: SpeechManagerOptions) {
    super();
    this.speechModel = options.speechModel;
    // Fall back to sensible defaults when the caller leaves these unset.
    this.voice = options.voice || "alloy";
    this.speechInstructions = options.speechInstructions;
    this.outputFormat = options.outputFormat || "opus";
    this.streamingSpeechConfig = {
      ...DEFAULT_STREAMING_SPEECH_CONFIG,
      ...options.streamingSpeech,
    };
  }

  /** Whether the speech queue is currently being drained. */
  get isSpeaking(): boolean {
    return this._isSpeaking;
  }

  /** Number of chunks still waiting in the speech queue. */
  get pendingChunkCount(): number {
    return this.speechChunkQueue.length;
  }

  /** True when a TTS model was configured. */
  get hasSpeechModel(): boolean {
    return !!this.speechModel;
  }

  /**
   * Returns a promise that resolves when the speech queue is fully drained.
   * Returns undefined if there is nothing queued.
   */
  get queueDonePromise(): Promise<void> | undefined {
    return this.speechQueueDonePromise;
  }
/**
* Generate speech from text using the configured speech model.
*/
async generateSpeechFromText(
text: string,
abortSignal?: AbortSignal
): Promise<Uint8Array> {
if (!this.speechModel) {
throw new Error("Speech model not configured");
}
const result = await generateSpeech({
model: this.speechModel,
text,
voice: this.voice,
instructions: this.speechInstructions,
outputFormat: this.outputFormat,
abortSignal,
});
return result.audio.uint8Array;
}
/**
* Generate speech for full text at once (non-streaming fallback).
*/
async generateAndSendSpeechFull(text: string): Promise<void> {
if (!this.speechModel) return;
try {
this.emit("speech_start", { text, streaming: false });
const audioData = await this.generateSpeechFromText(text);
const base64Audio = Buffer.from(audioData).toString("base64");
this.sendMessage({
type: "audio",
data: base64Audio,
format: this.outputFormat,
});
this.emit("audio", {
data: base64Audio,
format: this.outputFormat,
uint8Array: audioData,
});
this.emit("speech_complete", { text, streaming: false });
} catch (error) {
console.error("Failed to generate speech:", error);
this.emit("error", error);
}
}
  /**
   * Interrupt ongoing speech generation and playback (barge-in support).
   *
   * No-op when nothing is speaking or queued. Otherwise tears down in a
   * fixed order: abort in-flight TTS requests, drop queued chunks and
   * buffered text, release queue-done waiters, then notify the client via
   * a `speech_interrupted` message and mirror it as a local event.
   *
   * @param reason - Label forwarded to the client and to event listeners.
   */
  interruptSpeech(reason: string = "interrupted"): void {
    if (!this._isSpeaking && this.speechChunkQueue.length === 0) {
      return;
    }

    // Abort any pending speech generation requests
    if (this.currentSpeechAbortController) {
      this.currentSpeechAbortController.abort();
      this.currentSpeechAbortController = undefined;
    }

    // Clear the speech queue
    this.speechChunkQueue = [];
    this.pendingTextBuffer = "";
    this._isSpeaking = false;

    // Resolve any pending speech-done waiters so callers can finish
    if (this.speechQueueDoneResolve) {
      this.speechQueueDoneResolve();
      this.speechQueueDoneResolve = undefined;
      this.speechQueueDonePromise = undefined;
    }

    // Notify clients to stop audio playback
    this.sendMessage({
      type: "speech_interrupted",
      reason,
    });

    this.emit("speech_interrupted", { reason });
  }
/**
* Process a text delta for streaming speech.
* Call this as text chunks arrive from the LLM.
*/
processTextDelta(textDelta: string): void {
if (!this.speechModel) return;
this.pendingTextBuffer += textDelta;
const [sentences, remaining] = this.extractSentences(this.pendingTextBuffer);
this.pendingTextBuffer = remaining;
for (const sentence of sentences) {
this.queueSpeechChunk(sentence);
}
}
/**
* Flush any remaining text in the buffer to speech.
* Call this when the LLM stream ends.
*/
flushPendingText(): void {
if (!this.speechModel || !this.pendingTextBuffer.trim()) return;
this.queueSpeechChunk(this.pendingTextBuffer);
this.pendingTextBuffer = "";
}
  /**
   * Reset all speech state (used on disconnect / cleanup).
   *
   * Aborts any in-flight TTS request, drops queued chunks and buffered
   * text, and releases queue-done waiters. Unlike interruptSpeech(), this
   * sends no WebSocket message and emits no event.
   */
  reset(): void {
    if (this.currentSpeechAbortController) {
      this.currentSpeechAbortController.abort();
      this.currentSpeechAbortController = undefined;
    }
    this.speechChunkQueue = [];
    this.pendingTextBuffer = "";
    this._isSpeaking = false;
    // Wake anyone awaiting queue completion so they don't hang forever.
    if (this.speechQueueDoneResolve) {
      this.speechQueueDoneResolve();
      this.speechQueueDoneResolve = undefined;
      this.speechQueueDonePromise = undefined;
    }
  }
// ── Private helpers ─────────────────────────────────────────
  /**
   * Extract complete sentences from text buffer.
   * Returns [extractedSentences, remainingBuffer].
   *
   * A "sentence" ends at one or more of `.!?` followed by whitespace or
   * end of input. Sentences shorter than `minChunkSize` are merged into
   * the previous extracted sentence when one exists; otherwise they stay
   * in the buffer so later deltas can extend them. If the leftover buffer
   * exceeds `maxChunkSize`, it is force-split once at the first clause
   * boundary (`,;:` + whitespace) at or beyond `minChunkSize`.
   */
  private extractSentences(text: string): [string[], string] {
    const sentences: string[] = [];
    let remaining = text;

    // Match sentences ending with . ! ? followed by space or end of string
    const sentenceEndPattern = /[.!?]+(?:\s+|$)/g;
    let lastIndex = 0;
    let match;

    while ((match = sentenceEndPattern.exec(text)) !== null) {
      const sentence = text
        .slice(lastIndex, match.index + match[0].length)
        .trim();
      if (sentence.length >= this.streamingSpeechConfig.minChunkSize) {
        sentences.push(sentence);
        lastIndex = match.index + match[0].length;
      } else if (sentences.length > 0) {
        // Append short sentence to previous one
        sentences[sentences.length - 1] += " " + sentence;
        lastIndex = match.index + match[0].length;
      }
      // NOTE: a too-short sentence with no predecessor is deliberately
      // left in place (lastIndex not advanced) so it accumulates with
      // the text that follows it.
    }

    remaining = text.slice(lastIndex);

    // If remaining text is too long, force split at clause boundaries
    if (remaining.length > this.streamingSpeechConfig.maxChunkSize) {
      const clausePattern = /[,;:]\s+/g;
      let clauseMatch;
      let splitIndex = 0;
      // Take only the first clause boundary that yields a big-enough chunk.
      while ((clauseMatch = clausePattern.exec(remaining)) !== null) {
        if (clauseMatch.index >= this.streamingSpeechConfig.minChunkSize) {
          splitIndex = clauseMatch.index + clauseMatch[0].length;
          break;
        }
      }
      if (splitIndex > 0) {
        sentences.push(remaining.slice(0, splitIndex).trim());
        remaining = remaining.slice(splitIndex);
      }
    }

    return [sentences, remaining];
  }
  /**
   * Queue a text chunk for speech generation.
   *
   * Ignores empty text and does nothing without a speech model. May kick
   * off TTS generation immediately (parallel mode, capped by
   * maxParallelRequests) and starts the queue drain if it is not already
   * running. Emits `speech_chunk_queued` for each accepted chunk.
   */
  private queueSpeechChunk(text: string): void {
    if (!this.speechModel || !text.trim()) return;

    // Wrap chunk ID to prevent unbounded growth in very long sessions
    if (this.nextChunkId >= Number.MAX_SAFE_INTEGER) {
      this.nextChunkId = 0;
    }

    const chunk: SpeechChunk = {
      id: this.nextChunkId++,
      text: text.trim(),
    };

    // Create the speech-done promise if not already present
    if (!this.speechQueueDonePromise) {
      this.speechQueueDonePromise = new Promise<void>((resolve) => {
        this.speechQueueDoneResolve = resolve;
      });
    }

    // Start generating audio immediately (parallel generation),
    // but only while under the configured in-flight request cap.
    if (this.streamingSpeechConfig.parallelGeneration) {
      const activeRequests = this.speechChunkQueue.filter(
        (c) => c.audioPromise
      ).length;
      if (activeRequests < this.streamingSpeechConfig.maxParallelRequests) {
        chunk.audioPromise = this.generateChunkAudio(chunk);
      }
    }

    this.speechChunkQueue.push(chunk);
    this.emit("speech_chunk_queued", { id: chunk.id, text: chunk.text });

    // Start processing queue if not already
    if (!this._isSpeaking) {
      this.processSpeechQueue();
    }
  }
/**
* Generate audio for a single chunk.
*/
private async generateChunkAudio(
chunk: SpeechChunk
): Promise<Uint8Array | null> {
if (!this.currentSpeechAbortController) {
this.currentSpeechAbortController = new AbortController();
}
try {
console.log(
`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`
);
const audioData = await this.generateSpeechFromText(
chunk.text,
this.currentSpeechAbortController.signal
);
console.log(
`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`
);
return audioData;
} catch (error) {
if ((error as Error).name === "AbortError") {
console.log(`Audio generation aborted for chunk ${chunk.id}`);
return null;
}
console.error(
`Failed to generate audio for chunk ${chunk.id}:`,
error
);
this.emit("error", error);
return null;
}
}
/**
 * Process the speech queue and send audio chunks in order.
 *
 * Only one drain loop runs at a time (`_isSpeaking` is the re-entrancy
 * guard); clearing `_isSpeaking` externally interrupts the loop. Chunks
 * are delivered strictly in queue order even when their audio was
 * generated in parallel.
 */
private async processSpeechQueue(): Promise<void> {
  if (this._isSpeaking) return; // a drain loop is already running
  this._isSpeaking = true;
  console.log(
    `Starting speech queue processing with ${this.speechChunkQueue.length} chunks`
  );
  this.emit("speech_start", { streaming: true });
  this.sendMessage({ type: "speech_stream_start" });
  try {
    while (this.speechChunkQueue.length > 0) {
      // Peek at the head — it is only shifted after the await below, so
      // an interruption while waiting leaves the chunk in the queue.
      const chunk = this.speechChunkQueue[0];
      console.log(
        `Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`
      );
      // Ensure audio generation has started
      if (!chunk.audioPromise) {
        chunk.audioPromise = this.generateChunkAudio(chunk);
      }
      // Wait for this chunk's audio
      const audioData = await chunk.audioPromise;
      // Check if we were interrupted while waiting
      if (!this._isSpeaking) {
        console.log(`Speech interrupted during chunk #${chunk.id}`);
        break;
      }
      // Remove from queue after processing
      this.speechChunkQueue.shift();
      if (audioData) {
        const base64Audio = Buffer.from(audioData).toString("base64");
        console.log(
          `Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`
        );
        // Send audio chunk via WebSocket
        this.sendMessage({
          type: "audio_chunk",
          chunkId: chunk.id,
          data: base64Audio,
          format: this.outputFormat,
          text: chunk.text,
        });
        // Emit for local handling
        this.emit("audio_chunk", {
          chunkId: chunk.id,
          data: base64Audio,
          format: this.outputFormat,
          text: chunk.text,
          uint8Array: audioData,
        });
      } else {
        // generateChunkAudio returned null (aborted or failed); skip it.
        console.log(`No audio data generated for chunk #${chunk.id}`);
      }
      // Start generating next chunks in parallel
      if (this.streamingSpeechConfig.parallelGeneration) {
        // NOTE(review): this counts every chunk that has an audioPromise
        // as "active", including ones already settled — so the parallel
        // slots may be under-filled. Confirm whether that is intended.
        const activeRequests = this.speechChunkQueue.filter(
          (c) => c.audioPromise
        ).length;
        const toStart = Math.min(
          this.streamingSpeechConfig.maxParallelRequests - activeRequests,
          this.speechChunkQueue.length
        );
        if (toStart > 0) {
          console.log(
            `Starting parallel generation for ${toStart} more chunks`
          );
          for (let i = 0; i < toStart; i++) {
            const nextChunk = this.speechChunkQueue.find(
              (c) => !c.audioPromise
            );
            if (nextChunk) {
              nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
            }
          }
        }
      }
    }
  } catch (error) {
    console.error("Error in speech queue processing:", error);
    this.emit("error", error);
  } finally {
    // Always restore idle state and wake any awaiters of the drain
    // promise — this runs on normal completion, error, and interruption.
    this._isSpeaking = false;
    this.currentSpeechAbortController = undefined;
    // Signal that the speech queue is fully drained
    if (this.speechQueueDoneResolve) {
      this.speechQueueDoneResolve();
      this.speechQueueDoneResolve = undefined;
      this.speechQueueDonePromise = undefined;
    }
    console.log(`Speech queue processing complete`);
    this.sendMessage({ type: "speech_stream_end" });
    this.emit("speech_complete", { streaming: true });
  }
}
}

293
src/core/StreamProcessor.ts Normal file
View File

@@ -0,0 +1,293 @@
import { type streamText } from "ai";
/**
 * Result of processing a full LLM stream.
 */
export interface StreamResult {
  // Concatenation of every text-delta received on the stream.
  fullText: string;
  // Concatenation of every reasoning-delta received ("" when none).
  fullReasoning: string;
  // Every tool invocation observed on the stream, in arrival order.
  allToolCalls: Array<{
    toolName: string;
    toolCallId: string;
    input: unknown;
  }>;
  // Every tool result observed on the stream, in arrival order.
  allToolResults: Array<{
    toolName: string;
    toolCallId: string;
    output: unknown;
  }>;
  // Raw "source" stream parts, forwarded exactly as received.
  allSources: Array<unknown>;
  // The `file` payloads of "file" stream parts.
  allFiles: Array<unknown>;
}
export interface StreamProcessorCallbacks {
  /** Called when a text delta arrives (for streaming speech, etc.) */
  onTextDelta?: (text: string) => void;
  /** Called when a text-end part arrives (flush speech, etc.) */
  onTextEnd?: () => void;
  /** Send a WebSocket message */
  sendMessage: (message: Record<string, unknown>) => void;
  /** Emit an event on the agent */
  emitEvent: (event: string, data?: unknown) => void;
}
/**
 * Processes the fullStream from an AI SDK `streamText` call,
 * forwarding events to WebSocket clients and collecting the complete response.
 *
 * This is a standalone function (not a class) because it has no persistent state.
 *
 * @param result - return value of `streamText`; its `fullStream` is consumed here
 * @param callbacks - sinks for WebSocket messages, agent events, and text deltas
 * @param extraResponseFields - extra fields merged into the final
 *   `response_complete` message (agent-specific metadata)
 * @returns the accumulated text/reasoning plus all tool calls, results,
 *   sources, and files observed on the stream
 */
export async function processFullStream(
  result: ReturnType<typeof streamText>,
  callbacks: StreamProcessorCallbacks,
  extraResponseFields?: Record<string, unknown>
): Promise<StreamResult> {
  const { onTextDelta, onTextEnd, sendMessage, emitEvent } = callbacks;
  // Accumulators that become the StreamResult / response_complete payload.
  let fullText = "";
  let fullReasoning = "";
  const allToolCalls: StreamResult["allToolCalls"] = [];
  const allToolResults: StreamResult["allToolResults"] = [];
  const allSources: unknown[] = [];
  const allFiles: unknown[] = [];
  for await (const part of result.fullStream) {
    switch (part.type) {
      // ── Stream lifecycle ──────────────────────────────
      case "start":
        sendMessage({ type: "stream_start" });
        break;
      case "finish":
        // Emit the fully accumulated assistant text once the stream ends.
        emitEvent("text", { role: "assistant", text: fullText });
        sendMessage({
          type: "stream_finish",
          finishReason: part.finishReason,
          usage: part.totalUsage,
        });
        break;
      case "error":
        emitEvent("error", part.error);
        sendMessage({
          type: "stream_error",
          error: String(part.error),
        });
        break;
      case "abort":
        emitEvent("abort", { reason: part.reason });
        sendMessage({
          type: "stream_abort",
          reason: part.reason,
        });
        break;
      // ── Step lifecycle ────────────────────────────────
      case "start-step":
        sendMessage({
          type: "step_start",
          warnings: part.warnings,
        });
        break;
      case "finish-step":
        sendMessage({
          type: "step_finish",
          finishReason: part.finishReason,
          usage: part.usage,
        });
        break;
      // ── Text streaming ────────────────────────────────
      case "text-start":
        sendMessage({ type: "text_start", id: part.id });
        break;
      case "text-delta":
        fullText += part.text;
        // Forward the delta to the optional hook (e.g. streaming speech).
        onTextDelta?.(part.text);
        sendMessage({
          type: "text_delta",
          id: part.id,
          text: part.text,
        });
        break;
      case "text-end":
        onTextEnd?.();
        sendMessage({ type: "text_end", id: part.id });
        break;
      // ── Reasoning streaming ───────────────────────────
      case "reasoning-start":
        sendMessage({ type: "reasoning_start", id: part.id });
        break;
      case "reasoning-delta":
        fullReasoning += part.text;
        sendMessage({
          type: "reasoning_delta",
          id: part.id,
          text: part.text,
        });
        break;
      case "reasoning-end":
        sendMessage({ type: "reasoning_end", id: part.id });
        break;
      // ── Tool input streaming ──────────────────────────
      case "tool-input-start":
        sendMessage({
          type: "tool_input_start",
          id: part.id,
          toolName: part.toolName,
        });
        break;
      case "tool-input-delta":
        sendMessage({
          type: "tool_input_delta",
          id: part.id,
          delta: part.delta,
        });
        break;
      case "tool-input-end":
        sendMessage({ type: "tool_input_end", id: part.id });
        break;
      // ── Tool execution ────────────────────────────────
      case "tool-call":
        allToolCalls.push({
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          input: part.input,
        });
        sendMessage({
          type: "tool_call",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          input: part.input,
        });
        break;
      case "tool-result":
        allToolResults.push({
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          output: part.output,
        });
        sendMessage({
          type: "tool_result",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          result: part.output,
        });
        break;
      case "tool-error":
        // Tool errors are forwarded to the client but not collected.
        sendMessage({
          type: "tool_error",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          error: String(part.error),
        });
        break;
      // ── Sources and files ─────────────────────────────
      case "source":
        // NOTE: the whole part is forwarded here, whereas "file" below
        // forwards only part.file — clients rely on these shapes.
        allSources.push(part);
        sendMessage({
          type: "source",
          source: part,
        });
        break;
      case "file":
        allFiles.push(part.file);
        sendMessage({
          type: "file",
          file: part.file,
        });
        break;
    }
  }
  // Send the complete response
  sendMessage({
    type: "response_complete",
    text: fullText,
    reasoning: fullReasoning || undefined,
    toolCalls: allToolCalls,
    toolResults: allToolResults,
    sources: allSources.length > 0 ? allSources : undefined,
    files: allFiles.length > 0 ? allFiles : undefined,
    ...extraResponseFields,
  });
  return {
    fullText,
    fullReasoning,
    allToolCalls,
    allToolResults,
    allSources,
    allFiles,
  };
}
/**
 * Maps an onChunk part type to its agent event name and a function that
 * reshapes the chunk into the event payload.
 */
const CHUNK_EVENT_MAP: Record<
  string,
  [event: string, toPayload: (chunk: any) => unknown]
> = {
  "text-delta": ["chunk:text_delta", (c) => ({ id: c.id, text: c.text })],
  "reasoning-delta": [
    "chunk:reasoning_delta",
    (c) => ({ id: c.id, text: c.text }),
  ],
  "tool-call": [
    "chunk:tool_call",
    (c) => ({
      toolName: c.toolName,
      toolCallId: c.toolCallId,
      input: c.input,
    }),
  ],
  "tool-result": [
    "chunk:tool_result",
    (c) => ({
      toolName: c.toolName,
      toolCallId: c.toolCallId,
      result: c.output,
    }),
  ],
  "tool-input-start": [
    "chunk:tool_input_start",
    (c) => ({ id: c.id, toolName: c.toolName }),
  ],
  "tool-input-delta": [
    "chunk:tool_input_delta",
    (c) => ({ id: c.id, delta: c.delta }),
  ],
  // The source chunk is forwarded as-is rather than reshaped.
  source: ["chunk:source", (c) => c],
};

/**
 * Handle onChunk callback events and emit them as `chunk:*` agent events.
 * Chunk types without a mapping are silently ignored.
 */
export function handleStreamChunk(
  chunk: any,
  emitEvent: (event: string, data?: unknown) => void
): void {
  const mapping = CHUNK_EVENT_MAP[chunk.type];
  if (mapping) {
    const [event, toPayload] = mapping;
    emitEvent(event, toPayload(chunk));
  }
}

View File

@@ -0,0 +1,142 @@
import { EventEmitter } from "events";
import {
experimental_transcribe as transcribe,
type TranscriptionModel,
} from "ai";
import { DEFAULT_MAX_AUDIO_SIZE } from "../types";
export interface TranscriptionManagerOptions {
  /** AI SDK transcription model (e.g. a Whisper provider instance). */
  transcriptionModel?: TranscriptionModel;
  /** Maximum accepted audio payload in bytes; defaults to DEFAULT_MAX_AUDIO_SIZE. */
  maxAudioInputSize?: number;
}

/**
 * Handles audio transcription using the AI SDK transcription model
 * and validation of incoming audio data.
 *
 * Emits: "transcription", "audio_received", "warning", "error".
 * Every failure path also notifies the client via `sendMessage`.
 */
export class TranscriptionManager extends EventEmitter {
  private transcriptionModel?: TranscriptionModel;
  private maxAudioInputSize: number;

  /** Callback to send messages over the WebSocket (no-op until wired up). */
  public sendMessage: (message: Record<string, unknown>) => void = () => {};

  constructor(options: TranscriptionManagerOptions = {}) {
    super();
    this.transcriptionModel = options.transcriptionModel;
    this.maxAudioInputSize =
      options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
  }

  /** Whether a transcription model was configured. */
  get hasTranscriptionModel(): boolean {
    return !!this.transcriptionModel;
  }

  /**
   * Transcribe audio data to text.
   *
   * @throws when no transcription model is configured or the provider fails.
   */
  async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    if (!this.transcriptionModel) {
      throw new Error("Transcription model not configured");
    }
    console.log(
      `Sending ${audioData.byteLength} bytes to Whisper for transcription`
    );
    try {
      const result = await transcribe({
        model: this.transcriptionModel,
        audio: audioData,
      });
      console.log(
        `Whisper transcription result: "${result.text}", language: ${result.language || "unknown"}`
      );
      this.emit("transcription", {
        text: result.text,
        language: result.language,
      });
      // Send transcription to client for immediate feedback
      this.sendMessage({
        type: "transcription_result",
        text: result.text,
        language: result.language,
      });
      return result.text;
    } catch (error) {
      console.error("Whisper transcription failed:", error);
      throw error;
    }
  }

  /**
   * Process incoming base64-encoded audio: validate, decode, transcribe.
   * Returns the transcribed text, or null if invalid / empty.
   */
  async processAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<string | null> {
    if (!this.transcriptionModel) {
      const error = new Error(
        "Transcription model not configured for audio input"
      );
      this.emit("error", error);
      this.sendMessage({ type: "error", error: error.message });
      return null;
    }
    try {
      const audioBuffer = Buffer.from(base64Audio, "base64");
      // Reject oversized payloads before doing any further work.
      if (audioBuffer.length > this.maxAudioInputSize) {
        const sizeMB = (audioBuffer.length / (1024 * 1024)).toFixed(1);
        const maxMB = (this.maxAudioInputSize / (1024 * 1024)).toFixed(1);
        const message = `Audio input too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`;
        this.emit("error", new Error(message));
        // Fix: previously the client was never told about oversized input,
        // unlike every other failure path in this method.
        this.sendMessage({ type: "transcription_error", error: message });
        return null;
      }
      if (audioBuffer.length === 0) {
        this.emit("warning", "Received empty audio data");
        return null;
      }
      this.emit("audio_received", { size: audioBuffer.length, format });
      console.log(
        `Processing audio input: ${audioBuffer.length} bytes, format: ${format || "unknown"}`
      );
      const transcribedText = await this.transcribeAudio(audioBuffer);
      console.log(`Transcribed text: "${transcribedText}"`);
      if (!transcribedText.trim()) {
        this.emit("warning", "Transcription returned empty text");
        this.sendMessage({
          type: "transcription_error",
          error: "Whisper returned empty text",
        });
        return null;
      }
      return transcribedText;
    } catch (error) {
      console.error("Failed to process audio input:", error);
      this.emit("error", error);
      this.sendMessage({
        type: "transcription_error",
        error: `Transcription failed: ${(error as Error).message || String(error)}`,
      });
      return null;
    }
  }
}

View File

@@ -0,0 +1,133 @@
import { WebSocket } from "ws";
import { EventEmitter } from "events";
/**
* Manages a single WebSocket connection lifecycle.
* Handles connecting, attaching existing sockets, sending messages,
* and clean disconnection.
*/
export class WebSocketManager extends EventEmitter {
private socket?: WebSocket;
private _isConnected = false;
get isConnected(): boolean {
return this._isConnected;
}
get currentSocket(): WebSocket | undefined {
return this.socket;
}
/**
* Connect to a WebSocket server by URL.
*/
connect(url: string): Promise<void> {
// Clean up any existing connection first
if (this.socket) {
this.disconnect();
}
return new Promise((resolve, reject) => {
try {
this.socket = new WebSocket(url);
this.attachListeners();
this.socket.once("open", () => {
this._isConnected = true;
this.emit("connected");
resolve();
});
this.socket.once("error", (error) => {
reject(error);
});
} catch (error) {
reject(error);
}
});
}
/**
* Attach an existing WebSocket (server-side usage).
*/
handleSocket(socket: WebSocket): void {
// Clean up any existing connection first
if (this.socket) {
this.disconnect();
}
this.socket = socket;
this._isConnected = true;
this.attachListeners();
this.emit("connected");
}
/**
* Send a JSON message via WebSocket if connected.
* Gracefully handles send failures (e.g., socket closing mid-send).
*/
send(message: Record<string, unknown>): void {
if (!this.socket || !this._isConnected) return;
try {
if (this.socket.readyState === WebSocket.OPEN) {
this.socket.send(JSON.stringify(message));
} else {
console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
}
} catch (error) {
// Socket may have closed between the readyState check and send()
console.error("Failed to send WebSocket message:", error);
this.emit("error", error);
}
}
/**
* Disconnect and clean up the current socket.
*/
disconnect(): void {
if (!this.socket) return;
try {
this.socket.removeAllListeners();
if (
this.socket.readyState === WebSocket.OPEN ||
this.socket.readyState === WebSocket.CONNECTING
) {
this.socket.close();
}
} catch {
// Ignore close errors — socket may already be dead
}
this.socket = undefined;
this._isConnected = false;
}
/**
* Attach internal event listeners on the current socket.
*/
private attachListeners(): void {
if (!this.socket) return;
this.socket.on("message", (data) => {
try {
const message = JSON.parse(data.toString());
this.emit("message", message);
} catch (err) {
console.error("Failed to parse WebSocket message:", err);
this.emit("error", err);
}
});
this.socket.on("close", () => {
this._isConnected = false;
this.emit("disconnected");
});
this.socket.on("error", (error) => {
console.error("WebSocket error:", error);
this.emit("error", error);
});
}
}

17
src/core/index.ts Normal file
View File

@@ -0,0 +1,17 @@
// Barrel exports for the core building blocks shared by the agents.
export { WebSocketManager } from "./WebSocketManager";
export { SpeechManager, type SpeechManagerOptions } from "./SpeechManager";
export {
  ConversationManager,
  type ConversationManagerOptions,
} from "./ConversationManager";
export {
  TranscriptionManager,
  type TranscriptionManagerOptions,
} from "./TranscriptionManager";
// Stream processing is exposed as standalone functions (no state to hold).
export {
  processFullStream,
  handleStreamChunk,
  type StreamResult,
  type StreamProcessorCallbacks,
} from "./StreamProcessor";
export { InputQueue, type QueueItem } from "./InputQueue";

View File

@@ -1,5 +1,5 @@
// Agents // Agents
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent"; export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent.new";
export { export {
VideoAgent, VideoAgent,
type VideoAgentOptions, type VideoAgentOptions,
@@ -8,7 +8,7 @@ export {
type VideoAgentConfig, type VideoAgentConfig,
type FrameContext, type FrameContext,
type FrameTriggerReason, type FrameTriggerReason,
} from "./VideoAgent"; } from "./VideoAgent.new";
// Shared types // Shared types
export { export {