Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bf4ba8ea77 | ||
|
|
5e7eb469ae | ||
|
|
4dd30b89c0 | ||
|
|
97a3078578 | ||
|
|
990d17abe7 | ||
|
|
c5542fc156 | ||
|
|
bbe354b70b | ||
|
|
6ab04788e1 |
4
dist/VideoAgent.d.ts
vendored
@@ -53,6 +53,10 @@ interface FrameContext {
|
||||
description?: string;
|
||||
}
|
||||
export interface VideoAgentOptions {
|
||||
/**
|
||||
* AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
|
||||
* anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
|
||||
*/
|
||||
model: LanguageModel;
|
||||
transcriptionModel?: TranscriptionModel;
|
||||
speechModel?: SpeechModel;
|
||||
|
||||
2
dist/VideoAgent.d.ts.map
vendored
66
dist/VideoAgent.js
vendored
@@ -71,7 +71,7 @@ Use tools when needed to provide accurate information.`;
|
||||
this.endpoint = options.endpoint;
|
||||
this.voice = options.voice || "alloy";
|
||||
this.speechInstructions = options.speechInstructions;
|
||||
this.outputFormat = options.outputFormat || "mp3";
|
||||
this.outputFormat = options.outputFormat || "opus";
|
||||
this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
|
||||
this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
|
||||
// Generate or use provided session ID
|
||||
@@ -142,7 +142,6 @@ Use tools when needed to provide accurate information.`;
|
||||
this.socket.on("message", async (data) => {
|
||||
try {
|
||||
const message = JSON.parse(data.toString());
|
||||
console.log(`Received WebSocket message of type: ${message.type}`);
|
||||
switch (message.type) {
|
||||
// Handle transcribed text from the client/STT
|
||||
case "transcript":
|
||||
@@ -154,7 +153,6 @@ Use tools when needed to provide accurate information.`;
|
||||
this.interruptCurrentResponse("user_speaking");
|
||||
// Force capture current frame when user speaks
|
||||
this.requestFrameCapture("user_request");
|
||||
console.log(`Processing transcript: "${message.text}"`);
|
||||
await this.enqueueTextInput(message.text);
|
||||
break;
|
||||
// Handle raw audio data that needs transcription
|
||||
@@ -167,8 +165,12 @@ Use tools when needed to provide accurate information.`;
|
||||
this.interruptCurrentResponse("user_speaking");
|
||||
// Force capture current frame when user speaks
|
||||
this.requestFrameCapture("user_request");
|
||||
console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`);
|
||||
try {
|
||||
await this.processAudioInput(message);
|
||||
}
|
||||
catch (audioError) {
|
||||
this.emit("error", audioError);
|
||||
}
|
||||
break;
|
||||
// Handle video frame from client
|
||||
case "video_frame":
|
||||
@@ -176,7 +178,6 @@ Use tools when needed to provide accurate information.`;
|
||||
break;
|
||||
// Handle explicit interrupt request from client
|
||||
case "interrupt":
|
||||
console.log(`Received interrupt request: ${message.reason || "client_request"}`);
|
||||
this.interruptCurrentResponse(message.reason || "client_request");
|
||||
break;
|
||||
// Handle client ready signal
|
||||
@@ -184,22 +185,19 @@ Use tools when needed to provide accurate information.`;
|
||||
this.handleClientReady(message);
|
||||
break;
|
||||
default:
|
||||
console.log(`Unknown message type: ${message.type}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
console.error("Failed to process message:", err);
|
||||
this.emit("error", err);
|
||||
}
|
||||
});
|
||||
this.socket.on("close", () => {
|
||||
console.log("Disconnected");
|
||||
this.isConnected = false;
|
||||
this.cleanupOnDisconnect();
|
||||
this.emit("disconnected");
|
||||
});
|
||||
this.socket.on("error", (error) => {
|
||||
console.error("WebSocket error:", error);
|
||||
this.emit("error", error);
|
||||
});
|
||||
}
|
||||
@@ -207,7 +205,6 @@ Use tools when needed to provide accurate information.`;
|
||||
* Handle client ready signal
|
||||
*/
|
||||
handleClientReady(message) {
|
||||
console.log(`Client ready, capabilities: ${JSON.stringify(message.capabilities || {})}`);
|
||||
// Send session info to client
|
||||
this.sendWebSocketMessage({
|
||||
type: "session_init",
|
||||
@@ -259,10 +256,8 @@ Use tools when needed to provide accurate information.`;
|
||||
sequence: frame.sequence,
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
console.log(`Received frame #${frame.sequence} (${frame.triggerReason}): ${(frameSize / 1024).toFixed(1)}KB, ${frame.image.width}x${frame.image.height}`);
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Failed to handle video frame:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
}
|
||||
@@ -331,13 +326,11 @@ Use tools when needed to provide accurate information.`;
|
||||
if (!this.transcriptionModel) {
|
||||
throw new Error("Transcription model not configured");
|
||||
}
|
||||
console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
|
||||
try {
|
||||
const result = await (0, ai_1.experimental_transcribe)({
|
||||
model: this.transcriptionModel,
|
||||
audio: audioData,
|
||||
});
|
||||
console.log(`Whisper transcription result: "${result.text}", language: ${result.language || "unknown"}`);
|
||||
this.emit("transcription", {
|
||||
text: result.text,
|
||||
language: result.language,
|
||||
@@ -351,7 +344,6 @@ Use tools when needed to provide accurate information.`;
|
||||
return result.text;
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Whisper transcription failed:", error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
@@ -513,17 +505,13 @@ Use tools when needed to provide accurate information.`;
|
||||
this.currentSpeechAbortController = new AbortController();
|
||||
}
|
||||
try {
|
||||
console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`);
|
||||
const audioData = await this.generateSpeechFromText(chunk.text, this.currentSpeechAbortController.signal);
|
||||
console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
|
||||
return audioData;
|
||||
}
|
||||
catch (error) {
|
||||
if (error.name === "AbortError") {
|
||||
console.log(`Audio generation aborted for chunk ${chunk.id}`);
|
||||
return null;
|
||||
}
|
||||
console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
|
||||
this.emit("error", error);
|
||||
return null;
|
||||
}
|
||||
@@ -535,25 +523,21 @@ Use tools when needed to provide accurate information.`;
|
||||
if (this.isSpeaking)
|
||||
return;
|
||||
this.isSpeaking = true;
|
||||
console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
|
||||
this.emit("speech_start", { streaming: true });
|
||||
this.sendWebSocketMessage({ type: "speech_stream_start" });
|
||||
try {
|
||||
while (this.speechChunkQueue.length > 0) {
|
||||
const chunk = this.speechChunkQueue[0];
|
||||
console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`);
|
||||
if (!chunk.audioPromise) {
|
||||
chunk.audioPromise = this.generateChunkAudio(chunk);
|
||||
}
|
||||
const audioData = await chunk.audioPromise;
|
||||
if (!this.isSpeaking) {
|
||||
console.log(`Speech interrupted during chunk #${chunk.id}`);
|
||||
break;
|
||||
}
|
||||
this.speechChunkQueue.shift();
|
||||
if (audioData) {
|
||||
const base64Audio = Buffer.from(audioData).toString("base64");
|
||||
console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`);
|
||||
this.sendWebSocketMessage({
|
||||
type: "audio_chunk",
|
||||
chunkId: chunk.id,
|
||||
@@ -569,14 +553,10 @@ Use tools when needed to provide accurate information.`;
|
||||
uint8Array: audioData,
|
||||
});
|
||||
}
|
||||
else {
|
||||
console.log(`No audio data generated for chunk #${chunk.id}`);
|
||||
}
|
||||
if (this.streamingSpeechConfig.parallelGeneration) {
|
||||
const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
|
||||
const toStart = Math.min(this.streamingSpeechConfig.maxParallelRequests - activeRequests, this.speechChunkQueue.length);
|
||||
if (toStart > 0) {
|
||||
console.log(`Starting parallel generation for ${toStart} more chunks`);
|
||||
for (let i = 0; i < toStart; i++) {
|
||||
const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise);
|
||||
if (nextChunk) {
|
||||
@@ -588,7 +568,6 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Error in speech queue processing:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
finally {
|
||||
@@ -599,7 +578,6 @@ Use tools when needed to provide accurate information.`;
|
||||
this.speechQueueDoneResolve = undefined;
|
||||
this.speechQueueDonePromise = undefined;
|
||||
}
|
||||
console.log(`Speech queue processing complete`);
|
||||
this.sendWebSocketMessage({ type: "speech_stream_end" });
|
||||
this.emit("speech_complete", { streaming: true });
|
||||
}
|
||||
@@ -631,7 +609,12 @@ Use tools when needed to provide accurate information.`;
|
||||
*/
|
||||
async processAudioInput(audioMessage) {
|
||||
if (!this.transcriptionModel) {
|
||||
this.emit("error", new Error("Transcription model not configured for audio input"));
|
||||
const error = new Error("Transcription model not configured for audio input");
|
||||
this.emit("error", error);
|
||||
this.sendWebSocketMessage({
|
||||
type: "error",
|
||||
error: error.message,
|
||||
});
|
||||
return;
|
||||
}
|
||||
try {
|
||||
@@ -649,11 +632,9 @@ Use tools when needed to provide accurate information.`;
|
||||
this.emit("audio_received", {
|
||||
size: audioBuffer.length,
|
||||
format: audioMessage.format,
|
||||
sessionId: audioMessage.sessionId,
|
||||
sessionId: audioMessage.sessionId || this.sessionId,
|
||||
});
|
||||
console.log(`Processing audio input: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`);
|
||||
const transcribedText = await this.transcribeAudio(audioBuffer);
|
||||
console.log(`Transcribed text: "${transcribedText}"`);
|
||||
if (transcribedText.trim()) {
|
||||
await this.enqueueTextInput(transcribedText);
|
||||
}
|
||||
@@ -666,7 +647,6 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Failed to process audio input:", error);
|
||||
this.emit("error", error);
|
||||
this.sendWebSocketMessage({
|
||||
type: "transcription_error",
|
||||
@@ -796,8 +776,9 @@ Use tools when needed to provide accurate information.`;
|
||||
* Drain the input queue, processing one request at a time
|
||||
*/
|
||||
async drainInputQueue() {
|
||||
if (this.processingQueue)
|
||||
if (this.processingQueue) {
|
||||
return;
|
||||
}
|
||||
this.processingQueue = true;
|
||||
try {
|
||||
while (this.inputQueue.length > 0) {
|
||||
@@ -889,7 +870,6 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
},
|
||||
onError: ({ error }) => {
|
||||
console.error("Stream error:", error);
|
||||
this.emit("error", error);
|
||||
},
|
||||
});
|
||||
@@ -960,7 +940,6 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
},
|
||||
onError: ({ error }) => {
|
||||
console.error("Stream error:", error);
|
||||
this.emit("error", error);
|
||||
},
|
||||
});
|
||||
@@ -1203,21 +1182,10 @@ Use tools when needed to provide accurate information.`;
|
||||
return;
|
||||
try {
|
||||
if (this.socket.readyState === ws_1.WebSocket.OPEN) {
|
||||
if (message.type === "audio_chunk" || message.type === "audio") {
|
||||
const { data, ...rest } = message;
|
||||
console.log(`Sending WebSocket message: ${message.type}`, data ? `(${(data.length / 1000).toFixed(1)}KB audio data)` : "", rest);
|
||||
}
|
||||
else {
|
||||
console.log(`Sending WebSocket message: ${message.type}`);
|
||||
}
|
||||
this.socket.send(JSON.stringify(message));
|
||||
}
|
||||
else {
|
||||
console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Failed to send WebSocket message:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
}
|
||||
@@ -1225,14 +1193,12 @@ Use tools when needed to provide accurate information.`;
|
||||
* Start listening for voice/video input
|
||||
*/
|
||||
startListening() {
|
||||
console.log("Starting video agent...");
|
||||
this.emit("listening");
|
||||
}
|
||||
/**
|
||||
* Stop listening for voice/video input
|
||||
*/
|
||||
stopListening() {
|
||||
console.log("Stopping video agent...");
|
||||
this.emit("stopped");
|
||||
}
|
||||
/**
|
||||
|
||||
2
dist/VideoAgent.js.map
vendored
175
dist/VideoAgent.new.d.ts
vendored
Normal file
@@ -0,0 +1,175 @@
|
||||
import { WebSocket } from "ws";
|
||||
import { EventEmitter } from "events";
|
||||
import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
|
||||
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
|
||||
/**
 * Trigger reasons for frame capture.
 *
 * In this package `sendFrame` tags the frames it builds as "user_request", and
 * `requestFrameCapture` forwards the given reason to the client inside a
 * "capture_frame" message.
 */
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
|
||||
/**
 * Video frame data structure sent to/from the client.
 */
interface VideoFrame {
    /** Message discriminator; always "video_frame". */
    type: "video_frame";
    /** Session the frame belongs to. */
    sessionId: string;
    /** Frame counter; `sendFrame` takes it from the agent's incrementing frame sequence. */
    sequence: number;
    /** Capture time; `sendFrame` fills it with `Date.now()` (epoch milliseconds). */
    timestamp: number;
    /** Why the frame was captured. */
    triggerReason: FrameTriggerReason;
    /** Hash of the preceding frame when known; `sendFrame` passes the agent's last frame hash. */
    previousFrameRef?: string;
    image: {
        /** Encoded image payload. NOTE(review): presumably base64 — confirm against the client. */
        data: string;
        /** Image format name; `sendFrame` defaults it to "webp". */
        format: string;
        /** Width; `sendFrame` defaults it to 640. */
        width: number;
        /** Height; `sendFrame` defaults it to 480. */
        height: number;
    };
}
|
||||
/**
 * Audio data structure.
 */
interface AudioData {
    /** Message discriminator; always "audio". */
    type: "audio";
    /** Session the audio belongs to. */
    sessionId: string;
    /**
     * Audio payload as a string. The agent's own `sendAudioBuffer` feeds the same
     * processing path with base64 text.
     */
    data: string;
    /** Audio format name. */
    format: string;
    /** Sample rate. NOTE(review): presumably Hz — confirm with the producer. */
    sampleRate?: number;
    /** Clip duration. NOTE(review): unit is not stated in this file — confirm. */
    duration?: number;
    /** Timestamp of the audio message. */
    timestamp: number;
}
|
||||
/**
 * Backend configuration for video processing.
 *
 * Read with `VideoAgent.getConfig()` and changed with `VideoAgent.updateConfig()`.
 */
interface VideoAgentConfig {
    /** Maximum frames to keep in context buffer for conversation history */
    maxContextFrames: number;
}
|
||||
/**
 * Frame context for maintaining visual conversation history.
 *
 * Entries of this shape are what `VideoAgent.getFrameContext()` returns.
 */
interface FrameContext {
    /** Sequence number of the frame this entry describes. */
    sequence: number;
    /** Timestamp associated with the frame. */
    timestamp: number;
    /** Why the frame was captured. */
    triggerReason: FrameTriggerReason;
    /** Hash identifying the frame content. */
    frameHash: string;
    /** Optional textual description of the frame. */
    description?: string;
}
|
||||
/**
 * Constructor options for {@link VideoAgent}.
 */
export interface VideoAgentOptions {
    /**
     * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
     * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
     */
    model: LanguageModel;
    /** Model handed to the transcription manager. */
    transcriptionModel?: TranscriptionModel;
    /** Model handed to the speech manager. */
    speechModel?: SpeechModel;
    /** Prompt text; a built-in multimodal-assistant prompt is used when omitted or empty. */
    instructions?: string;
    /** Stop condition for `streamText`; defaults to `stepCountIs(5)`. */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Initial tool set; the agent keeps its own shallow copy. */
    tools?: Record<string, Tool>;
    /** Default WebSocket URL for `connect()`; "ws://localhost:8080" is the final fallback. */
    endpoint?: string;
    /** Voice setting passed through to the speech manager. */
    voice?: string;
    /** Speech instructions passed through to the speech manager. */
    speechInstructions?: string;
    /** Audio output format passed through to the speech manager. */
    outputFormat?: string;
    /** Streaming-speech overrides passed through to the speech manager. */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** History overrides passed through to the conversation manager. */
    history?: Partial<HistoryConfig>;
    /** Audio input size limit passed through to the transcription manager. */
    maxAudioInputSize?: number;
    /** Maximum frame input size in bytes (default: 5 MB) */
    maxFrameInputSize?: number;
    /** Maximum frames to keep in context buffer (default: 10) */
    maxContextFrames?: number;
    /** Session ID for this video agent instance */
    sessionId?: string;
}
|
||||
/**
 * Multimodal (voice + video) agent that talks to a client over a WebSocket and
 * re-emits events from its internal managers.
 */
export declare class VideoAgent extends EventEmitter {
    private model;
    private instructions;
    private stopWhen;
    private endpoint?;
    private tools;
    private isDestroyed;
    private _isProcessing;
    private currentStreamAbortController?;
    private ws;
    private speech;
    private conversation;
    private transcription;
    private inputQueue;
    private sessionId;
    private frameSequence;
    private lastFrameTimestamp;
    private lastFrameHash?;
    private frameContextBuffer;
    private currentFrameData?;
    private videoConfig;
    private maxFrameInputSize;
    constructor(options: VideoAgentOptions);
    /** Merge more tools into the agent's tool set; same-named entries are replaced. */
    registerTools(tools: Record<string, Tool>): void;
    /** Transcribe audio via the transcription manager. */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /** Synthesize speech for `text` via the speech manager. */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /** Interrupt speech output only; `reason` defaults to "interrupted". */
    interruptSpeech(reason?: string): void;
    /** Abort the in-flight LLM stream (if any) and interrupt speech; `reason` defaults to "interrupted". */
    interruptCurrentResponse(reason?: string): void;
    /** Connect to `url`, else the configured endpoint, else "ws://localhost:8080". */
    connect(url?: string): Promise<void>;
    /** Hand an existing socket to the WebSocket manager. */
    handleSocket(socket: WebSocket): void;
    /** Queue a text input; rejects with "Text input cannot be empty" for blank text. */
    sendText(text: string): Promise<string>;
    /** Process audio supplied as a string (same path `sendAudioBuffer` feeds with base64). */
    sendAudio(audioData: string): Promise<void>;
    /** Base64-encode a binary audio buffer and process it. */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /**
     * Send a video frame with optional text query for vision analysis.
     * Resolves to an empty string when no query is given.
     */
    sendFrame(frameData: string, query?: string, options?: {
        width?: number;
        height?: number;
        format?: string;
    }): Promise<string>;
    /**
     * Request client to capture and send a frame
     */
    requestFrameCapture(reason: FrameTriggerReason): void;
    /** Return a shallow copy of the current video configuration. */
    getConfig(): VideoAgentConfig;
    /** Merge `config` into the video configuration and emit "config_changed". */
    updateConfig(config: Partial<VideoAgentConfig>): void;
    /** Emit "listening". */
    startListening(): void;
    /** Emit "stopped". */
    stopListening(): void;
    /** Clear the conversation history and the frame context buffer. */
    clearHistory(): void;
    /** Return the conversation history from the conversation manager. */
    getHistory(): ModelMessage[];
    /** Replace the conversation history. */
    setHistory(history: ModelMessage[]): void;
    /** Return a shallow copy of the frame context buffer. */
    getFrameContext(): FrameContext[];
    /** Return this agent's session ID. */
    getSessionId(): string;
    /** Disconnect the WebSocket manager. */
    disconnect(): void;
    /** Mark the agent destroyed, disconnect, clear history/frames/tools and remove all listeners. */
    destroy(): void;
    /** Whether the WebSocket manager reports a connection. */
    get connected(): boolean;
    /** Value of the internal processing flag. */
    get processing(): boolean;
    /** Whether the speech manager reports it is speaking. */
    get speaking(): boolean;
    /** Pending chunk count reported by the speech manager. */
    get pendingSpeechChunks(): number;
    /** Whether `destroy()` has been called. */
    get destroyed(): boolean;
    /** Current value of the frame sequence counter. */
    get currentFrameSequence(): number;
    /** True once current frame data is held. */
    get hasVisualContext(): boolean;
    private handleMessage;
    private handleClientReady;
    private handleAudioInput;
    private handleVideoFrame;
    private addFrameToContext;
    private hashFrame;
    private generateSessionId;
    private enqueueTextInput;
    private enqueueMultimodalInput;
    /**
     * Route queued items to the correct processor.
     */
    private processQueueItem;
    private buildMultimodalContent;
    /**
     * Shared streamText invocation used by both processUserInput and processMultimodalInput.
     */
    private runStream;
    /**
     * Process text-only input (with optional visual context from latest frame).
     */
    private processUserInput;
    /**
     * Process multimodal input (text + explicit video frame).
     */
    private processMultimodalInput;
    private ensureNotDestroyed;
    private cleanupOnDisconnect;
    private bubbleEvents;
}
|
||||
export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
|
||||
export type { StreamingSpeechConfig, HistoryConfig } from "./types";
|
||||
//# sourceMappingURL=VideoAgent.new.d.ts.map
|
||||
1
dist/VideoAgent.new.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"VideoAgent.new.d.ts","sourceRoot":"","sources":["../src/VideoAgent.new.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACH,UAAU,EACV,KAAK,aAAa,EAElB,KAAK,IAAI,EACT,KAAK,YAAY,EACjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACnB,MAAM,IAAI,CAAC;AACZ,OAAO,EACH,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EACrB,MAAM,SAAS,CAAC;AAcjB;;GAEG;AACH,KAAK,kBAAkB,GAAG,cAAc,GAAG,cAAc,GAAG,OAAO,GAAG,SAAS,CAAC;AAEhF;;GAEG;AACH,UAAU,UAAU;IAChB,IAAI,EAAE,aAAa,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,kBAAkB,CAAC;IAClC,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,KAAK,EAAE;QACH,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;KAClB,CAAC;CACL;AAED;;GAEG;AACH,UAAU,SAAS;IACf,IAAI,EAAE,OAAO,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,UAAU,gBAAgB;IACtB,wEAAwE;IACxE,gBAAgB,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,UAAU,YAAY;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,kBAAkB,CAAC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,CAAC,EAAE,MAAM,CAAC;CACxB;AAYD,MAAM,WAAW,iBAAiB;IAC9B;;;OAGG;IACH,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wDAAwD;IACxD,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,6DAA6D;IAC7D,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;CACtB;AAUD,qBAAa,UAAW,SAAQ,YAAY;IACxC,OAAO,CAAC,KAAK,C
AAgB;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,aAAa,CAAS;IAG9B,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,EAAE,CAAmB;IAC7B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAA6B;IAG/C,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,aAAa,CAAK;IAC1B,OAAO,CAAC,kBAAkB,CAAK;IAC/B,OAAO,CAAC,aAAa,CAAC,CAAS;IAC/B,OAAO,CAAC,kBAAkB,CAAsB;IAChD,OAAO,CAAC,gBAAgB,CAAC,CAAS;IAClC,OAAO,CAAC,WAAW,CAAmB;IACtC,OAAO,CAAC,iBAAiB,CAAS;gBAEtB,OAAO,EAAE,iBAAiB;IAmF/B,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAInC,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAIhE,sBAAsB,CAC/B,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GAC1B,OAAO,CAAC,UAAU,CAAC;IAIf,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAIrD,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAQxD,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAM1C,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAK/B,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQvC,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAK3C,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;OAEG;IACU,SAAS,CAClB,SAAS,EAAE,MAAM,EACjB,KAAK,CAAC,EAAE,MAAM,EACd,OAAO,CAAC,EAAE;QAAE,KAAK,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,GAC/D,OAAO,CAAC,MAAM,CAAC;IA4BlB;;OAEG;IACI,mBAAmB,CAAC,MAAM,EAAE,kBAAkB,GAAG,IAAI;IASrD,SAAS,IAAI,gBAAgB;IAI7B,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,gBAAgB,CAAC,GAAG,IAAI;IAK5D,cAAc;IAId,aAAa;IAIb,YAAY;IAKZ,UAAU,IAAI,YAAY,EAAE;IAI5B,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC,eAAe,IAAI,YAAY,EAAE;IAIjC,YAAY,IAAI,MAAM;IAItB,UAAU;IAIV,OAAO;IAYP,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,oBAAoB,IAAI,MAAM,CAEjC;IAED,IAAI,gBAAgB,IAAI,OAAO,CAE9B;YAMa,aAAa;IA4C3B,OAAO,CAAC,iBAAiB;YAYX,gBAAgB;YAchB,gBAAgB;IAgD9B,OAAO,CAAC,iBAAiB;IAOzB,OAAO,CAAC,SAAS;IAU
jB,OAAO,CAAC,iBAAiB;IAUzB,OAAO,CAAC,gBAAgB;IAMxB,OAAO,CAAC,sBAAsB;IAM9B;;OAEG;YACW,gBAAgB;IAa9B,OAAO,CAAC,sBAAsB;IA0B9B;;OAEG;YACW,SAAS;IAqEvB;;OAEG;YACW,gBAAgB;IAsC9B;;OAEG;YACW,sBAAsB;IAuCpC,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,mBAAmB;IAW3B,OAAO,CAAC,YAAY;CAKvB;AAGD,YAAY,EACR,UAAU,EACV,SAAS,EACT,gBAAgB,EAChB,YAAY,EACZ,kBAAkB,GACrB,CAAC;AAGF,YAAY,EAAE,qBAAqB,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC"}
|
||||
571
dist/VideoAgent.new.js
vendored
Normal file
@@ -0,0 +1,571 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.VideoAgent = void 0;
|
||||
const events_1 = require("events");
|
||||
const ai_1 = require("ai");
|
||||
const core_1 = require("./core");
|
||||
/** Default maximum frame input size (5 MB) */
const DEFAULT_MAX_FRAME_SIZE = 5 * 1024 * 1024;
/** Default video agent config; the constructor spreads it into each instance's own config. */
const DEFAULT_VIDEO_AGENT_CONFIG = {
    maxContextFrames: 10,
};
|
||||
// ── VideoAgent class ────────────────────────────────────
|
||||
class VideoAgent extends events_1.EventEmitter {
|
||||
/** Chat model taken from `options.model`. */
model;
/** Prompt text; the constructor falls back to a built-in prompt. */
instructions;
/** Stop condition; the constructor falls back to `stepCountIs(5)`. */
stopWhen;
/** Default URL used by `connect()` when no URL argument is given. */
endpoint;
/** Registered tools, keyed by name. */
tools = {};
/** Set to true by `destroy()`. */
isDestroyed = false;
/** Backing flag for the `processing` getter. */
_isProcessing = false;
// Abort controller for the current LLM stream
currentStreamAbortController;
// ── Managers ─────────────────────────────────────────
ws;
speech;
conversation;
transcription;
inputQueue;
// ── Video-specific state ────────────────────────────
sessionId;
/** Post-incremented by `sendFrame` for every frame it builds. */
frameSequence = 0;
lastFrameTimestamp = 0;
/** Sent as `previousFrameRef` on frames built by `sendFrame`. */
lastFrameHash;
/** Emptied by `clearHistory()` and `destroy()`. */
frameContextBuffer = [];
/** Its presence is what `hasVisualContext` reports. */
currentFrameData;
videoConfig;
maxFrameInputSize;
|
||||
constructor(options) {
|
||||
super();
|
||||
this.model = options.model;
|
||||
this.instructions =
|
||||
options.instructions ||
|
||||
`You are a helpful multimodal AI assistant that can see through the user's camera and hear their voice.
|
||||
When analyzing images, be concise but informative. Describe what you see when asked.
|
||||
Keep responses conversational since they will be spoken aloud.
|
||||
Use tools when needed to provide accurate information.`;
|
||||
this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
|
||||
this.endpoint = options.endpoint;
|
||||
this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
|
||||
this.sessionId = options.sessionId || this.generateSessionId();
|
||||
this.videoConfig = {
|
||||
...DEFAULT_VIDEO_AGENT_CONFIG,
|
||||
maxContextFrames: options.maxContextFrames ?? DEFAULT_VIDEO_AGENT_CONFIG.maxContextFrames,
|
||||
};
|
||||
if (options.tools) {
|
||||
this.tools = { ...options.tools };
|
||||
}
|
||||
// ── Initialize managers ─────────────────────────
|
||||
this.ws = new core_1.WebSocketManager();
|
||||
this.speech = new core_1.SpeechManager({
|
||||
speechModel: options.speechModel,
|
||||
voice: options.voice,
|
||||
speechInstructions: options.speechInstructions,
|
||||
outputFormat: options.outputFormat,
|
||||
streamingSpeech: options.streamingSpeech,
|
||||
});
|
||||
this.conversation = new core_1.ConversationManager({
|
||||
history: options.history,
|
||||
});
|
||||
this.transcription = new core_1.TranscriptionManager({
|
||||
transcriptionModel: options.transcriptionModel,
|
||||
maxAudioInputSize: options.maxAudioInputSize,
|
||||
});
|
||||
this.inputQueue = new core_1.InputQueue();
|
||||
// ── Wire managers to WebSocket send ─────────────
|
||||
const sendMsg = (msg) => this.ws.send(msg);
|
||||
this.speech.sendMessage = sendMsg;
|
||||
this.transcription.sendMessage = sendMsg;
|
||||
// ── Wire input queue processor ──────────────────
|
||||
this.inputQueue.processor = (item) => this.processQueueItem(item);
|
||||
// ── Bubble events from managers ─────────────────
|
||||
this.bubbleEvents(this.ws, ["connected", "error"]);
|
||||
this.bubbleEvents(this.speech, [
|
||||
"speech_start",
|
||||
"speech_complete",
|
||||
"speech_interrupted",
|
||||
"speech_chunk_queued",
|
||||
"audio_chunk",
|
||||
"audio",
|
||||
"error",
|
||||
]);
|
||||
this.bubbleEvents(this.conversation, [
|
||||
"history_cleared",
|
||||
"history_trimmed",
|
||||
]);
|
||||
this.bubbleEvents(this.transcription, [
|
||||
"transcription",
|
||||
"audio_received",
|
||||
"error",
|
||||
"warning",
|
||||
]);
|
||||
// ── Handle WebSocket lifecycle ──────────────────
|
||||
this.ws.on("disconnected", () => {
|
||||
this.cleanupOnDisconnect();
|
||||
this.emit("disconnected");
|
||||
});
|
||||
this.ws.on("message", (message) => this.handleMessage(message));
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Public API
|
||||
// ══════════════════════════════════════════════════════
|
||||
registerTools(tools) {
|
||||
this.tools = { ...this.tools, ...tools };
|
||||
}
|
||||
async transcribeAudio(audioData) {
|
||||
return this.transcription.transcribeAudio(audioData);
|
||||
}
|
||||
async generateSpeechFromText(text, abortSignal) {
|
||||
return this.speech.generateSpeechFromText(text, abortSignal);
|
||||
}
|
||||
interruptSpeech(reason = "interrupted") {
|
||||
this.speech.interruptSpeech(reason);
|
||||
}
|
||||
interruptCurrentResponse(reason = "interrupted") {
|
||||
if (this.currentStreamAbortController) {
|
||||
this.currentStreamAbortController.abort();
|
||||
this.currentStreamAbortController = undefined;
|
||||
}
|
||||
this.speech.interruptSpeech(reason);
|
||||
}
|
||||
async connect(url) {
|
||||
this.ensureNotDestroyed();
|
||||
const wsUrl = url || this.endpoint || "ws://localhost:8080";
|
||||
await this.ws.connect(wsUrl);
|
||||
}
|
||||
handleSocket(socket) {
|
||||
this.ensureNotDestroyed();
|
||||
this.ws.handleSocket(socket);
|
||||
}
|
||||
async sendText(text) {
|
||||
this.ensureNotDestroyed();
|
||||
if (!text || !text.trim()) {
|
||||
throw new Error("Text input cannot be empty");
|
||||
}
|
||||
return this.enqueueTextInput(text);
|
||||
}
|
||||
async sendAudio(audioData) {
|
||||
this.ensureNotDestroyed();
|
||||
await this.handleAudioInput(audioData);
|
||||
}
|
||||
async sendAudioBuffer(audioBuffer) {
|
||||
this.ensureNotDestroyed();
|
||||
const base64Audio = Buffer.from(audioBuffer).toString("base64");
|
||||
await this.handleAudioInput(base64Audio);
|
||||
}
|
||||
/**
|
||||
* Send a video frame with optional text query for vision analysis
|
||||
*/
|
||||
async sendFrame(frameData, query, options) {
|
||||
this.ensureNotDestroyed();
|
||||
const frame = {
|
||||
type: "video_frame",
|
||||
sessionId: this.sessionId,
|
||||
sequence: this.frameSequence++,
|
||||
timestamp: Date.now(),
|
||||
triggerReason: "user_request",
|
||||
previousFrameRef: this.lastFrameHash,
|
||||
image: {
|
||||
data: frameData,
|
||||
format: options?.format || "webp",
|
||||
width: options?.width || 640,
|
||||
height: options?.height || 480,
|
||||
},
|
||||
};
|
||||
// Update local frame state
|
||||
await this.handleVideoFrame(frame);
|
||||
if (query) {
|
||||
return this.enqueueMultimodalInput(query, frame);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
/**
|
||||
* Request client to capture and send a frame
|
||||
*/
|
||||
requestFrameCapture(reason) {
|
||||
this.ws.send({
|
||||
type: "capture_frame",
|
||||
reason,
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
this.emit("frame_requested", { reason });
|
||||
}
|
||||
getConfig() {
|
||||
return { ...this.videoConfig };
|
||||
}
|
||||
updateConfig(config) {
|
||||
this.videoConfig = { ...this.videoConfig, ...config };
|
||||
this.emit("config_changed", this.videoConfig);
|
||||
}
|
||||
startListening() {
|
||||
this.emit("listening");
|
||||
}
|
||||
stopListening() {
|
||||
this.emit("stopped");
|
||||
}
|
||||
clearHistory() {
|
||||
this.conversation.clearHistory();
|
||||
this.frameContextBuffer = [];
|
||||
}
|
||||
getHistory() {
|
||||
return this.conversation.getHistory();
|
||||
}
|
||||
setHistory(history) {
|
||||
this.conversation.setHistory(history);
|
||||
}
|
||||
getFrameContext() {
|
||||
return [...this.frameContextBuffer];
|
||||
}
|
||||
getSessionId() {
|
||||
return this.sessionId;
|
||||
}
|
||||
disconnect() {
|
||||
this.ws.disconnect();
|
||||
}
|
||||
destroy() {
|
||||
this.isDestroyed = true;
|
||||
this.cleanupOnDisconnect();
|
||||
this.ws.disconnect();
|
||||
this.conversation.clearHistory();
|
||||
this.frameContextBuffer = [];
|
||||
this.tools = {};
|
||||
this.removeAllListeners();
|
||||
}
|
||||
// ── Getters ─────────────────────────────────────────
|
||||
get connected() {
|
||||
return this.ws.isConnected;
|
||||
}
|
||||
get processing() {
|
||||
return this._isProcessing;
|
||||
}
|
||||
get speaking() {
|
||||
return this.speech.isSpeaking;
|
||||
}
|
||||
get pendingSpeechChunks() {
|
||||
return this.speech.pendingChunkCount;
|
||||
}
|
||||
get destroyed() {
|
||||
return this.isDestroyed;
|
||||
}
|
||||
get currentFrameSequence() {
|
||||
return this.frameSequence;
|
||||
}
|
||||
get hasVisualContext() {
|
||||
return !!this.currentFrameData;
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — message handling
|
||||
// ══════════════════════════════════════════════════════
|
||||
async handleMessage(message) {
|
||||
try {
|
||||
switch (message.type) {
|
||||
case "transcript":
|
||||
if (typeof message.text !== "string" || !message.text.trim()) {
|
||||
this.emit("warning", "Received empty or invalid transcript message");
|
||||
return;
|
||||
}
|
||||
this.interruptCurrentResponse("user_speaking");
|
||||
this.requestFrameCapture("user_request");
|
||||
await this.enqueueTextInput(message.text);
|
||||
break;
|
||||
case "audio":
|
||||
if (typeof message.data !== "string" || !message.data) {
|
||||
this.emit("warning", "Received empty or invalid audio message");
|
||||
return;
|
||||
}
|
||||
this.interruptCurrentResponse("user_speaking");
|
||||
this.requestFrameCapture("user_request");
|
||||
try {
|
||||
await this.handleAudioInput(message.data, message.format);
|
||||
}
|
||||
catch (audioError) {
|
||||
this.emit("error", audioError);
|
||||
}
|
||||
break;
|
||||
case "video_frame":
|
||||
await this.handleVideoFrame(message);
|
||||
break;
|
||||
case "interrupt":
|
||||
this.interruptCurrentResponse(message.reason || "client_request");
|
||||
break;
|
||||
case "client_ready":
|
||||
this.handleClientReady(message);
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
this.emit("error", err);
|
||||
}
|
||||
}
|
||||
handleClientReady(message) {
|
||||
this.ws.send({
|
||||
type: "session_init",
|
||||
sessionId: this.sessionId,
|
||||
});
|
||||
this.emit("client_ready", message.capabilities);
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — audio
|
||||
// ══════════════════════════════════════════════════════
|
||||
async handleAudioInput(base64Audio, format) {
|
||||
const text = await this.transcription.processAudioInput(base64Audio, format);
|
||||
if (text) {
|
||||
await this.enqueueTextInput(text);
|
||||
}
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — video frames
|
||||
// ══════════════════════════════════════════════════════
|
||||
async handleVideoFrame(frame) {
|
||||
try {
|
||||
if (!frame.image?.data) {
|
||||
this.emit("warning", "Received empty or invalid video frame");
|
||||
return;
|
||||
}
|
||||
const frameSize = Buffer.from(frame.image.data, "base64").length;
|
||||
if (frameSize > this.maxFrameInputSize) {
|
||||
const sizeMB = (frameSize / (1024 * 1024)).toFixed(1);
|
||||
const maxMB = (this.maxFrameInputSize / (1024 * 1024)).toFixed(1);
|
||||
this.emit("error", new Error(`Frame too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`));
|
||||
return;
|
||||
}
|
||||
const frameHash = this.hashFrame(frame.image.data);
|
||||
this.lastFrameTimestamp = frame.timestamp;
|
||||
this.lastFrameHash = frameHash;
|
||||
this.currentFrameData = frame.image.data;
|
||||
this.addFrameToContext({
|
||||
sequence: frame.sequence,
|
||||
timestamp: frame.timestamp,
|
||||
triggerReason: frame.triggerReason,
|
||||
frameHash,
|
||||
});
|
||||
this.emit("frame_received", {
|
||||
sequence: frame.sequence,
|
||||
timestamp: frame.timestamp,
|
||||
triggerReason: frame.triggerReason,
|
||||
size: frameSize,
|
||||
dimensions: { width: frame.image.width, height: frame.image.height },
|
||||
});
|
||||
this.ws.send({
|
||||
type: "frame_ack",
|
||||
sequence: frame.sequence,
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
catch (error) {
|
||||
this.emit("error", error);
|
||||
}
|
||||
}
|
||||
addFrameToContext(context) {
|
||||
this.frameContextBuffer.push(context);
|
||||
if (this.frameContextBuffer.length > this.videoConfig.maxContextFrames) {
|
||||
this.frameContextBuffer.shift();
|
||||
}
|
||||
}
|
||||
hashFrame(data) {
|
||||
let hash = 0;
|
||||
for (let i = 0; i < data.length; i++) {
|
||||
const char = data.charCodeAt(i);
|
||||
hash = ((hash << 5) - hash) + char;
|
||||
hash = hash & hash;
|
||||
}
|
||||
return `frame_${this.frameSequence}_${Math.abs(hash).toString(16)}`;
|
||||
}
|
||||
generateSessionId() {
|
||||
const timestamp = Date.now().toString(36);
|
||||
const randomPart = Math.random().toString(36).substring(2, 10);
|
||||
return `vs_${timestamp}_${randomPart}`;
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — input queue
|
||||
// ══════════════════════════════════════════════════════
|
||||
enqueueTextInput(text) {
|
||||
return new Promise((resolve, reject) => {
|
||||
this.inputQueue.enqueue({ text, resolve, reject });
|
||||
});
|
||||
}
|
||||
enqueueMultimodalInput(text, frame) {
|
||||
return new Promise((resolve, reject) => {
|
||||
this.inputQueue.enqueue({ text, frame, resolve, reject });
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Route queued items to the correct processor.
|
||||
*/
|
||||
async processQueueItem(item) {
|
||||
if (item.frame && item.text) {
|
||||
return this.processMultimodalInput(item.text, item.frame);
|
||||
}
|
||||
else if (item.text) {
|
||||
return this.processUserInput(item.text);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — multimodal content building
|
||||
// ══════════════════════════════════════════════════════
|
||||
buildMultimodalContent(text, frameData) {
|
||||
const content = [];
|
||||
if (this.frameContextBuffer.length > 0) {
|
||||
const contextSummary = `[Visual context: ${this.frameContextBuffer.length} frames captured, latest at ${new Date(this.lastFrameTimestamp).toISOString()}]`;
|
||||
content.push({ type: "text", text: contextSummary });
|
||||
}
|
||||
const imageData = frameData || this.currentFrameData;
|
||||
if (imageData) {
|
||||
content.push({ type: "image", image: imageData });
|
||||
}
|
||||
content.push({ type: "text", text });
|
||||
return content;
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — LLM processing
|
||||
// ══════════════════════════════════════════════════════
|
||||
/**
|
||||
* Shared streamText invocation used by both processUserInput and processMultimodalInput.
|
||||
*/
|
||||
async runStream(messages, abortSignal) {
|
||||
const result = (0, ai_1.streamText)({
|
||||
model: this.model,
|
||||
system: this.instructions,
|
||||
messages,
|
||||
tools: this.tools,
|
||||
stopWhen: this.stopWhen,
|
||||
abortSignal,
|
||||
onChunk: ({ chunk }) => {
|
||||
(0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
|
||||
},
|
||||
onFinish: async (event) => {
|
||||
for (const step of event.steps) {
|
||||
for (const toolResult of step.toolResults) {
|
||||
this.emit("tool_result", {
|
||||
name: toolResult.toolName,
|
||||
toolCallId: toolResult.toolCallId,
|
||||
result: toolResult.output,
|
||||
});
|
||||
}
|
||||
}
|
||||
},
|
||||
onError: ({ error }) => {
|
||||
this.emit("error", error);
|
||||
},
|
||||
});
|
||||
const streamResult = await (0, core_1.processFullStream)(result, {
|
||||
onTextDelta: (delta) => this.speech.processTextDelta(delta),
|
||||
onTextEnd: () => this.speech.flushPendingText(),
|
||||
sendMessage: (msg) => this.ws.send(msg),
|
||||
emitEvent: (event, data) => this.emit(event, data),
|
||||
}, {
|
||||
sessionId: this.sessionId,
|
||||
frameContext: this.frameContextBuffer.length > 0
|
||||
? {
|
||||
frameCount: this.frameContextBuffer.length,
|
||||
lastFrameSequence: this.frameContextBuffer[this.frameContextBuffer.length - 1]
|
||||
?.sequence,
|
||||
}
|
||||
: undefined,
|
||||
});
|
||||
// Add assistant response to history
|
||||
if (streamResult.fullText) {
|
||||
this.conversation.addMessage({
|
||||
role: "assistant",
|
||||
content: streamResult.fullText,
|
||||
});
|
||||
}
|
||||
// Flush remaining speech & wait for queue
|
||||
this.speech.flushPendingText();
|
||||
if (this.speech.queueDonePromise) {
|
||||
await this.speech.queueDonePromise;
|
||||
}
|
||||
return streamResult.fullText;
|
||||
}
|
||||
/**
|
||||
* Process text-only input (with optional visual context from latest frame).
|
||||
*/
|
||||
async processUserInput(text) {
|
||||
this._isProcessing = true;
|
||||
this.currentStreamAbortController = new AbortController();
|
||||
try {
|
||||
this.emit("text", { role: "user", text });
|
||||
const hasVisual = !!this.currentFrameData;
|
||||
let messages;
|
||||
if (hasVisual) {
|
||||
const content = this.buildMultimodalContent(text);
|
||||
this.conversation.addMessage({
|
||||
role: "user",
|
||||
content: [{ type: "text", text: `[Visual context] ${text}` }],
|
||||
});
|
||||
messages = [
|
||||
...this.conversation.getHistoryRef().slice(0, -1),
|
||||
{ role: "user", content },
|
||||
];
|
||||
}
|
||||
else {
|
||||
this.conversation.addMessage({ role: "user", content: text });
|
||||
messages = this.conversation.getHistoryRef();
|
||||
}
|
||||
return await this.runStream(messages, this.currentStreamAbortController.signal);
|
||||
}
|
||||
catch (error) {
|
||||
this.speech.reset();
|
||||
throw error;
|
||||
}
|
||||
finally {
|
||||
this._isProcessing = false;
|
||||
this.currentStreamAbortController = undefined;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Process multimodal input (text + explicit video frame).
|
||||
*/
|
||||
async processMultimodalInput(text, frame) {
|
||||
this._isProcessing = true;
|
||||
this.currentStreamAbortController = new AbortController();
|
||||
try {
|
||||
this.emit("text", { role: "user", text, hasImage: true });
|
||||
const content = this.buildMultimodalContent(text, frame.image.data);
|
||||
this.conversation.addMessage({
|
||||
role: "user",
|
||||
content: [{ type: "text", text: `[Image attached] ${text}` }],
|
||||
});
|
||||
const messages = [
|
||||
...this.conversation.getHistoryRef().slice(0, -1),
|
||||
{ role: "user", content },
|
||||
];
|
||||
return await this.runStream(messages, this.currentStreamAbortController.signal);
|
||||
}
|
||||
catch (error) {
|
||||
this.speech.reset();
|
||||
throw error;
|
||||
}
|
||||
finally {
|
||||
this._isProcessing = false;
|
||||
this.currentStreamAbortController = undefined;
|
||||
}
|
||||
}
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — helpers
|
||||
// ══════════════════════════════════════════════════════
|
||||
ensureNotDestroyed() {
|
||||
if (this.isDestroyed) {
|
||||
throw new Error("VideoAgent has been destroyed and cannot be used");
|
||||
}
|
||||
}
|
||||
cleanupOnDisconnect() {
|
||||
if (this.currentStreamAbortController) {
|
||||
this.currentStreamAbortController.abort();
|
||||
this.currentStreamAbortController = undefined;
|
||||
}
|
||||
this.speech.reset();
|
||||
this._isProcessing = false;
|
||||
this.currentFrameData = undefined;
|
||||
this.inputQueue.rejectAll(new Error("Connection closed"));
|
||||
}
|
||||
bubbleEvents(source, events) {
|
||||
for (const event of events) {
|
||||
source.on(event, (...args) => this.emit(event, ...args));
|
||||
}
|
||||
}
|
||||
}
|
||||
exports.VideoAgent = VideoAgent;
|
||||
//# sourceMappingURL=VideoAgent.new.js.map
|
||||
1
dist/VideoAgent.new.js.map
vendored
Normal file
23
dist/VoiceAgent.d.ts
vendored
@@ -20,6 +20,25 @@ export interface VoiceAgentOptions {
|
||||
/** Maximum audio input size in bytes (default: 10 MB) */
|
||||
maxAudioInputSize?: number;
|
||||
}
|
||||
/**
|
||||
* A single-session voice agent that manages one WebSocket connection at a time.
|
||||
*
|
||||
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
|
||||
* input queue, speech state, and WebSocket. It is designed for **one user per
|
||||
* instance**. To support multiple concurrent users, create a separate
|
||||
* `VoiceAgent` for each connection:
|
||||
*
|
||||
* ```ts
|
||||
* wss.on("connection", (socket) => {
|
||||
* const agent = new VoiceAgent({ model, ... });
|
||||
* agent.handleSocket(socket);
|
||||
* agent.on("disconnected", () => agent.destroy());
|
||||
* });
|
||||
* ```
|
||||
*
|
||||
* Sharing a single instance across multiple users will cause conversation
|
||||
* history cross-contamination, interleaved audio, and unpredictable behavior.
|
||||
*/
|
||||
export declare class VoiceAgent extends EventEmitter {
|
||||
private socket?;
|
||||
private tools;
|
||||
@@ -120,6 +139,10 @@ export declare class VoiceAgent extends EventEmitter {
|
||||
* Attach an existing WebSocket (server-side usage).
|
||||
* Use this when a WS server accepts a connection and you want the
|
||||
* agent to handle messages on that socket.
|
||||
*
|
||||
* **Note:** Calling this while a socket is already attached will cleanly
|
||||
* tear down the previous connection first. Each `VoiceAgent` instance
|
||||
* supports only one socket at a time — create a new agent per user.
|
||||
*/
|
||||
handleSocket(socket: WebSocket): void;
|
||||
/**
|
||||
|
||||
2
dist/VoiceAgent.d.ts.map
vendored
@@ -1 +1 @@
|
||||
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAInB,MAAM,SAAS,CAAC;AAEjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IA8BtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,
SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;OAIG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}
|
||||
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAInB,MAAM,SAAS,CAAC;AAEjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IA8BtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC
;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;;;;;OAQG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}
|
||||
25
dist/VoiceAgent.js
vendored
@@ -5,6 +5,25 @@ const ws_1 = require("ws");
|
||||
const events_1 = require("events");
|
||||
const ai_1 = require("ai");
|
||||
const types_1 = require("./types");
|
||||
/**
|
||||
* A single-session voice agent that manages one WebSocket connection at a time.
|
||||
*
|
||||
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
|
||||
* input queue, speech state, and WebSocket. It is designed for **one user per
|
||||
* instance**. To support multiple concurrent users, create a separate
|
||||
* `VoiceAgent` for each connection:
|
||||
*
|
||||
* ```ts
|
||||
* wss.on("connection", (socket) => {
|
||||
* const agent = new VoiceAgent({ model, ... });
|
||||
* agent.handleSocket(socket);
|
||||
* agent.on("disconnected", () => agent.destroy());
|
||||
* });
|
||||
* ```
|
||||
*
|
||||
* Sharing a single instance across multiple users will cause conversation
|
||||
* history cross-contamination, interleaved audio, and unpredictable behavior.
|
||||
*/
|
||||
class VoiceAgent extends events_1.EventEmitter {
|
||||
socket;
|
||||
tools = {};
|
||||
@@ -50,7 +69,7 @@ class VoiceAgent extends events_1.EventEmitter {
|
||||
this.endpoint = options.endpoint;
|
||||
this.voice = options.voice || "alloy";
|
||||
this.speechInstructions = options.speechInstructions;
|
||||
this.outputFormat = options.outputFormat || "mp3";
|
||||
this.outputFormat = options.outputFormat || "opus";
|
||||
this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
|
||||
if (options.tools) {
|
||||
this.tools = { ...options.tools };
|
||||
@@ -576,6 +595,10 @@ class VoiceAgent extends events_1.EventEmitter {
|
||||
* Attach an existing WebSocket (server-side usage).
|
||||
* Use this when a WS server accepts a connection and you want the
|
||||
* agent to handle messages on that socket.
|
||||
*
|
||||
* **Note:** Calling this while a socket is already attached will cleanly
|
||||
* tear down the previous connection first. Each `VoiceAgent` instance
|
||||
* supports only one socket at a time — create a new agent per user.
|
||||
*/
|
||||
handleSocket(socket) {
|
||||
this.ensureNotDestroyed();
|
||||
|
||||
2
dist/VoiceAgent.js.map
vendored
137
dist/VoiceAgent.new.d.ts
vendored
Normal file
@@ -0,0 +1,137 @@
|
||||
import { WebSocket } from "ws";
|
||||
import { EventEmitter } from "events";
|
||||
import { streamText, type LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
|
||||
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
|
||||
/** Construction options for {@link VoiceAgent}. */
export interface VoiceAgentOptions {
    /** Language model used to generate responses. */
    model: LanguageModel;
    /** Model used to transcribe incoming audio. */
    transcriptionModel?: TranscriptionModel;
    /** Model used to synthesize speech. */
    speechModel?: SpeechModel;
    /** System instructions for the model. */
    instructions?: string;
    /** Stop condition passed to `streamText`. */
    stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
    /** Tools made available to the model, keyed by tool name. */
    tools?: Record<string, Tool>;
    /** WebSocket endpoint — presumably the default URL for `connect()`; confirm against the implementation. */
    endpoint?: string;
    /** Voice identifier for speech generation. */
    voice?: string;
    /** Extra instructions for speech generation. */
    speechInstructions?: string;
    /** Audio output format for generated speech. */
    outputFormat?: string;
    /** Configuration for streaming speech generation */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
    /** Configuration for conversation history memory limits */
    history?: Partial<HistoryConfig>;
    /** Maximum audio input size in bytes (default: 10 MB) */
    maxAudioInputSize?: number;
}
|
||||
/**
 * A single-session voice agent that manages one WebSocket connection at a time.
 *
 * **Important:** Each `VoiceAgent` instance holds its own conversation history,
 * input queue, speech state, and WebSocket. It is designed for **one user per
 * instance**. To support multiple concurrent users, create a separate
 * `VoiceAgent` for each connection:
 *
 * ```ts
 * wss.on("connection", (socket) => {
 *   const agent = new VoiceAgent({ model, ... });
 *   agent.handleSocket(socket);
 *   agent.on("disconnected", () => agent.destroy());
 * });
 * ```
 *
 * Sharing a single instance across multiple users will cause conversation
 * history cross-contamination, interleaved audio, and unpredictable behavior.
 */
export declare class VoiceAgent extends EventEmitter {
    private model;
    private instructions;
    private stopWhen;
    private endpoint?;
    private tools;
    private isDestroyed;
    private _isProcessing;
    private currentStreamAbortController?;
    private ws;
    private speech;
    private conversation;
    private transcription;
    private inputQueue;
    constructor(options: VoiceAgentOptions);
    /** Register tools for the model to use, keyed by tool name. */
    registerTools(tools: Record<string, Tool>): void;
    /** Transcribe audio data to text with the configured transcription model. */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
    /** Generate speech audio from text with the configured speech model. */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /** Interrupt ongoing speech generation and playback (barge-in support). */
    interruptSpeech(reason?: string): void;
    /** Interrupt both the current LLM stream and ongoing speech. */
    interruptCurrentResponse(reason?: string): void;
    /** Connect to a WebSocket server by URL. */
    connect(url?: string): Promise<void>;
    /** Attach an existing WebSocket (server-side usage). */
    handleSocket(socket: WebSocket): void;
    /** Send text input for processing (bypasses transcription). */
    sendText(text: string): Promise<string>;
    /** Send base64 audio data to be transcribed and processed. */
    sendAudio(audioData: string): Promise<void>;
    /** Send a raw audio buffer to be transcribed and processed. */
    sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
    /** Generate speech for the full text at once (non-streaming fallback). */
    generateAndSendSpeechFull(text: string): Promise<void>;
    /** Start listening for voice input */
    startListening(): void;
    /** Stop listening for voice input */
    stopListening(): void;
    /** Clear conversation history */
    clearHistory(): void;
    /** Get current conversation history */
    getHistory(): ModelMessage[];
    /** Set conversation history (useful for restoring sessions) */
    setHistory(history: ModelMessage[]): void;
    /** Disconnect from WebSocket and stop all in-flight work */
    disconnect(): void;
    /** Permanently destroy the agent, releasing all resources. */
    destroy(): void;
    get connected(): boolean;
    get processing(): boolean;
    get speaking(): boolean;
    get pendingSpeechChunks(): number;
    get destroyed(): boolean;
    private handleMessage;
    private handleAudioInput;
    private enqueueInput;
    /**
     * Process user input with streaming text generation.
     * Called serially by the input queue.
     */
    private processUserInput;
    private ensureNotDestroyed;
    /** Clean up all in-flight state when the connection drops. */
    private cleanupOnDisconnect;
    /** Forward select events from a child emitter to this agent. */
    private bubbleEvents;
}
|
||||
//# sourceMappingURL=VoiceAgent.new.d.ts.map
|
||||
1
dist/VoiceAgent.new.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"VoiceAgent.new.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.new.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,KAAK,aAAa,EAElB,KAAK,IAAI,EACT,KAAK,YAAY,EACjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EACL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EACnB,MAAM,SAAS,CAAC;AAYjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAOD;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,aAAa,CAAS;IAG9B,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,EAAE,CAAmB;IAC7B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,aAAa,CAAuB;IAC5C,OAAO,CAAC,UAAU,CAA6B;gBAEnC,OAAO,EAAE,iBAAiB;IAyE/B,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7E;;OAEG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAItB;;OAEG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAI5D;;OAEG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAQrE;;OAEG;IACU,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAMjD;;OAEG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAK5C;;OAEG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;OAEG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,
CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;OAEG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAInE,sCAAsC;IACtC,cAAc;IAKd,qCAAqC;IACrC,aAAa;IAKb,iCAAiC;IACjC,YAAY;IAIZ,uCAAuC;IACvC,UAAU,IAAI,YAAY,EAAE;IAI5B,+DAA+D;IAC/D,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC,4DAA4D;IAC5D,UAAU;IAIV;;OAEG;IACH,OAAO;IAWP,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED,IAAI,SAAS,IAAI,OAAO,CAEvB;YAIa,aAAa;YAoCb,gBAAgB;IAe9B,OAAO,CAAC,YAAY;IAQpB;;;OAGG;YACW,gBAAgB;IAyE9B,OAAO,CAAC,kBAAkB;IAM1B;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAU3B;;OAEG;IACH,OAAO,CAAC,YAAY;CAKrB"}
|
||||
379
dist/VoiceAgent.new.js
vendored
Normal file
@@ -0,0 +1,379 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.VoiceAgent = void 0;
|
||||
const events_1 = require("events");
|
||||
const ai_1 = require("ai");
|
||||
const core_1 = require("./core");
|
||||
/**
 * A single-session voice agent that manages one WebSocket connection at a time.
 *
 * **Important:** Each `VoiceAgent` instance holds its own conversation history,
 * input queue, speech state, and WebSocket. It is designed for **one user per
 * instance**. To support multiple concurrent users, create a separate
 * `VoiceAgent` for each connection:
 *
 * ```ts
 * wss.on("connection", (socket) => {
 *   const agent = new VoiceAgent({ model, ... });
 *   agent.handleSocket(socket);
 *   agent.on("disconnected", () => agent.destroy());
 * });
 * ```
 *
 * Sharing a single instance across multiple users will cause conversation
 * history cross-contamination, interleaved audio, and unpredictable behavior.
 *
 * The agent re-emits selected events from its managers (see the constructor)
 * and emits "text", "tool_result", "listening", "stopped", "warning",
 * "disconnected" and "error" itself.
 */
class VoiceAgent extends events_1.EventEmitter {
    model;
    instructions;
    stopWhen;
    endpoint;
    tools = {};
    isDestroyed = false;
    _isProcessing = false;
    // Abort controller for the LLM stream currently in flight
    // (undefined when idle or after an interrupt).
    currentStreamAbortController;
    // ── Managers ──────────────────────────────────────────
    ws;
    speech;
    conversation;
    transcription;
    inputQueue;
    /**
     * @param options Agent configuration. `model` is passed to `streamText`;
     *   `instructions` defaults to a generic voice-assistant prompt and
     *   `stopWhen` to `stepCountIs(5)`. Speech, transcription and history
     *   options are forwarded to the corresponding managers.
     */
    constructor(options) {
        super();
        this.model = options.model;
        this.instructions =
            options.instructions || "You are a helpful voice assistant.";
        this.stopWhen = options.stopWhen || (0, ai_1.stepCountIs)(5);
        this.endpoint = options.endpoint;
        if (options.tools) {
            // Copy so later registerTools() calls never mutate the caller's object.
            this.tools = { ...options.tools };
        }
        // ── Initialize managers ──────────────────────────────
        this.ws = new core_1.WebSocketManager();
        this.speech = new core_1.SpeechManager({
            speechModel: options.speechModel,
            voice: options.voice,
            speechInstructions: options.speechInstructions,
            outputFormat: options.outputFormat,
            streamingSpeech: options.streamingSpeech,
        });
        this.conversation = new core_1.ConversationManager({
            history: options.history,
        });
        this.transcription = new core_1.TranscriptionManager({
            transcriptionModel: options.transcriptionModel,
            maxAudioInputSize: options.maxAudioInputSize,
        });
        this.inputQueue = new core_1.InputQueue();
        // ── Wire managers to the WebSocket send function ─────
        const sendMsg = (msg) => this.ws.send(msg);
        this.speech.sendMessage = sendMsg;
        this.transcription.sendMessage = sendMsg;
        // ── Wire the input queue processor ───────────────────
        this.inputQueue.processor = (item) => this.processUserInput(item.text);
        // ── Bubble events from managers ──────────────────────
        this.bubbleEvents(this.ws, [
            "connected",
            "error",
        ]);
        this.bubbleEvents(this.speech, [
            "speech_start",
            "speech_complete",
            "speech_interrupted",
            "speech_chunk_queued",
            "audio_chunk",
            "audio",
            "error",
        ]);
        this.bubbleEvents(this.conversation, [
            "history_cleared",
            "history_trimmed",
        ]);
        this.bubbleEvents(this.transcription, [
            "transcription",
            "audio_received",
            "error",
            "warning",
        ]);
        // ── Handle WebSocket lifecycle events ────────────────
        this.ws.on("disconnected", () => {
            // Tear down in-flight work first so listeners observe a quiescent agent.
            this.cleanupOnDisconnect();
            this.emit("disconnected");
        });
        this.ws.on("message", (message) => this.handleMessage(message));
    }
    // ── Public API ────────────────────────────────────────
    /**
     * Merge additional tools into the agent's tool set.
     * Tools with an existing name are replaced.
     */
    registerTools(tools) {
        this.tools = { ...this.tools, ...tools };
    }
    /**
     * Transcribe audio data to text using the configured transcription model.
     */
    async transcribeAudio(audioData) {
        return this.transcription.transcribeAudio(audioData);
    }
    /**
     * Generate speech from text using the configured speech model.
     */
    async generateSpeechFromText(text, abortSignal) {
        return this.speech.generateSpeechFromText(text, abortSignal);
    }
    /**
     * Interrupt ongoing speech generation and playback (barge-in support).
     */
    interruptSpeech(reason = "interrupted") {
        this.speech.interruptSpeech(reason);
    }
    /**
     * Interrupt both the current LLM stream and ongoing speech.
     */
    interruptCurrentResponse(reason = "interrupted") {
        if (this.currentStreamAbortController) {
            this.currentStreamAbortController.abort();
            this.currentStreamAbortController = undefined;
        }
        this.speech.interruptSpeech(reason);
    }
    /**
     * Connect to a WebSocket server by URL.
     * Falls back to the configured endpoint, then to "ws://localhost:8080".
     * @throws Error if the agent has been destroyed.
     */
    async connect(url) {
        this.ensureNotDestroyed();
        const wsUrl = url || this.endpoint || "ws://localhost:8080";
        await this.ws.connect(wsUrl);
    }
    /**
     * Attach an existing WebSocket (server-side usage).
     * @throws Error if the agent has been destroyed.
     */
    handleSocket(socket) {
        this.ensureNotDestroyed();
        this.ws.handleSocket(socket);
    }
    /**
     * Send text input for processing (bypasses transcription).
     * Resolves with the value produced by processUserInput for this input.
     * @throws Error if the agent is destroyed or the text is empty/whitespace.
     */
    async sendText(text) {
        this.ensureNotDestroyed();
        if (!text || !text.trim()) {
            throw new Error("Text input cannot be empty");
        }
        return this.enqueueInput(text);
    }
    /**
     * Send base64 audio data to be transcribed and processed.
     * @throws Error if the agent has been destroyed.
     */
    async sendAudio(audioData) {
        this.ensureNotDestroyed();
        await this.handleAudioInput(audioData);
    }
    /**
     * Send raw audio buffer to be transcribed and processed.
     * The buffer is base64-encoded before being handed to the audio pipeline.
     * @throws Error if the agent has been destroyed.
     */
    async sendAudioBuffer(audioBuffer) {
        this.ensureNotDestroyed();
        const base64Audio = Buffer.from(audioBuffer).toString("base64");
        await this.handleAudioInput(base64Audio);
    }
    /**
     * Generate speech for full text at once (non-streaming fallback).
     */
    async generateAndSendSpeechFull(text) {
        return this.speech.generateAndSendSpeechFull(text);
    }
    /** Start listening for voice input (logs and emits "listening"). */
    startListening() {
        console.log("Starting voice agent...");
        this.emit("listening");
    }
    /** Stop listening for voice input (logs and emits "stopped"). */
    stopListening() {
        console.log("Stopping voice agent...");
        this.emit("stopped");
    }
    /** Clear conversation history */
    clearHistory() {
        this.conversation.clearHistory();
    }
    /** Get current conversation history */
    getHistory() {
        return this.conversation.getHistory();
    }
    /** Set conversation history (useful for restoring sessions) */
    setHistory(history) {
        this.conversation.setHistory(history);
    }
    /** Disconnect from WebSocket and stop all in-flight work */
    disconnect() {
        this.ws.disconnect();
    }
    /**
     * Permanently destroy the agent, releasing all resources.
     *
     * Idempotent: a second call (for example a re-entrant one triggered by a
     * "disconnected" listener that itself calls destroy()) is a no-op.
     */
    destroy() {
        if (this.isDestroyed) {
            return;
        }
        this.isDestroyed = true;
        this.cleanupOnDisconnect();
        this.ws.disconnect();
        this.conversation.clearHistory();
        this.tools = {};
        this.removeAllListeners();
    }
    // ── Getters ───────────────────────────────────────────
    /** Whether the underlying WebSocket manager reports a live connection. */
    get connected() {
        return this.ws.isConnected;
    }
    /** Whether a user input is currently being processed. */
    get processing() {
        return this._isProcessing;
    }
    /** Whether the speech manager is currently speaking. */
    get speaking() {
        return this.speech.isSpeaking;
    }
    /** Number of speech chunks still pending in the speech manager. */
    get pendingSpeechChunks() {
        return this.speech.pendingChunkCount;
    }
    /** Whether destroy() has been called. */
    get destroyed() {
        return this.isDestroyed;
    }
    // ── Private: message handling ─────────────────────────
    /**
     * Dispatch one incoming WebSocket message.
     *
     * Handles "transcript", "audio" and "interrupt" messages; other types are
     * only logged. Failures are logged and re-emitted as "error" — except
     * after destroy(): by then every listener has been removed, so emitting
     * "error" would throw out of this async handler and surface as an
     * unhandled promise rejection (typically for inputs rejected with
     * "Connection closed" during teardown).
     */
    async handleMessage(message) {
        try {
            console.log(`Received WebSocket message of type: ${message.type}`);
            if (message.type === "transcript") {
                if (typeof message.text !== "string" || !message.text.trim()) {
                    this.emit("warning", "Received empty or invalid transcript message");
                    return;
                }
                // New user speech pre-empts whatever the agent is currently saying.
                this.interruptCurrentResponse("user_speaking");
                console.log(`Processing transcript: "${message.text}"`);
                await this.enqueueInput(message.text);
            }
            else if (message.type === "audio") {
                if (typeof message.data !== "string" || !message.data) {
                    this.emit("warning", "Received empty or invalid audio message");
                    return;
                }
                this.interruptCurrentResponse("user_speaking");
                console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`);
                await this.handleAudioInput(message.data, message.format);
            }
            else if (message.type === "interrupt") {
                console.log(`Received interrupt request: ${message.reason || "client_request"}`);
                this.interruptCurrentResponse(message.reason || "client_request");
            }
        }
        catch (err) {
            console.error("Failed to process message:", err);
            if (this.isDestroyed) {
                // No listeners remain; emitting "error" here would throw.
                return;
            }
            this.emit("error", err);
        }
    }
    // ── Private: audio ────────────────────────────────────
    /**
     * Transcribe base64 audio and, if any text results, queue it as user input.
     */
    async handleAudioInput(base64Audio, format) {
        const text = await this.transcription.processAudioInput(base64Audio, format);
        if (text) {
            await this.enqueueInput(text);
        }
    }
    // ── Private: input queue ──────────────────────────────
    /**
     * Queue text for serial processing.
     * The returned promise settles when the queue resolves or rejects the item.
     */
    enqueueInput(text) {
        return new Promise((resolve, reject) => {
            this.inputQueue.enqueue({ text, resolve, reject });
        });
    }
    // ── Private: LLM processing ───────────────────────────
    /**
     * Process user input with streaming text generation.
     * Called serially by the input queue.
     *
     * Records the user message, streams the model response (feeding text
     * deltas to the speech manager), stores the assistant reply, waits for
     * queued speech to finish and returns the full response text.
     * On failure the speech state is reset and the error is rethrown.
     */
    async processUserInput(text) {
        this._isProcessing = true;
        const controller = new AbortController();
        this.currentStreamAbortController = controller;
        const streamAbortSignal = controller.signal;
        try {
            this.emit("text", { role: "user", text });
            this.conversation.addMessage({ role: "user", content: text });
            const result = (0, ai_1.streamText)({
                model: this.model,
                system: this.instructions,
                messages: this.conversation.getHistoryRef(),
                tools: this.tools,
                stopWhen: this.stopWhen,
                abortSignal: streamAbortSignal,
                onChunk: ({ chunk }) => {
                    (0, core_1.handleStreamChunk)(chunk, (event, data) => this.emit(event, data));
                },
                onFinish: async (event) => {
                    // Surface every tool result from every step as an event.
                    for (const step of event.steps) {
                        for (const toolResult of step.toolResults) {
                            this.emit("tool_result", {
                                name: toolResult.toolName,
                                toolCallId: toolResult.toolCallId,
                                result: toolResult.output,
                            });
                        }
                    }
                },
                onError: ({ error }) => {
                    console.error("Stream error:", error);
                    this.emit("error", error);
                },
            });
            const streamResult = await (0, core_1.processFullStream)(result, {
                onTextDelta: (delta) => this.speech.processTextDelta(delta),
                onTextEnd: () => this.speech.flushPendingText(),
                sendMessage: (msg) => this.ws.send(msg),
                emitEvent: (event, data) => this.emit(event, data),
            });
            // Add assistant response to history
            if (streamResult.fullText) {
                this.conversation.addMessage({
                    role: "assistant",
                    content: streamResult.fullText,
                });
            }
            // Flush any remaining speech
            this.speech.flushPendingText();
            // Wait for all speech chunks to complete
            if (this.speech.queueDonePromise) {
                await this.speech.queueDonePromise;
            }
            return streamResult.fullText;
        }
        catch (error) {
            // Clean up speech state on error
            this.speech.reset();
            throw error;
        }
        finally {
            this._isProcessing = false;
            // Only clear the controller this run installed; never wipe one that
            // belongs to a newer run (or was already cleared by an interrupt).
            if (this.currentStreamAbortController === controller) {
                this.currentStreamAbortController = undefined;
            }
        }
    }
    // ── Private: helpers ──────────────────────────────────
    /**
     * Guard for public entry points.
     * @throws Error if destroy() has been called.
     */
    ensureNotDestroyed() {
        if (this.isDestroyed) {
            throw new Error("VoiceAgent has been destroyed and cannot be used");
        }
    }
    /**
     * Clean up all in-flight state when the connection drops: abort the
     * active LLM stream, reset speech, and reject every queued input.
     */
    cleanupOnDisconnect() {
        if (this.currentStreamAbortController) {
            this.currentStreamAbortController.abort();
            this.currentStreamAbortController = undefined;
        }
        this.speech.reset();
        this._isProcessing = false;
        this.inputQueue.rejectAll(new Error("Connection closed"));
    }
    /**
     * Forward select events from a child emitter to this agent.
     */
    bubbleEvents(source, events) {
        for (const event of events) {
            source.on(event, (...args) => this.emit(event, ...args));
        }
    }
}
|
||||
exports.VoiceAgent = VoiceAgent;
|
||||
//# sourceMappingURL=VoiceAgent.new.js.map
|
||||
1
dist/VoiceAgent.new.js.map
vendored
Normal file
46
dist/core/ConversationManager.d.ts
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
import { EventEmitter } from "events";
|
||||
import { type ModelMessage } from "ai";
|
||||
import { type HistoryConfig } from "../types";
|
||||
/** Construction options accepted by `ConversationManager`. */
export interface ConversationManagerOptions {
    /**
     * History limit overrides. Any field left out keeps its default value
     * (the constructor merges this over `DEFAULT_HISTORY_CONFIG`).
     */
    history?: Partial<HistoryConfig>;
}
|
||||
/**
 * Holds the conversation history (`ModelMessage[]`) and keeps it within the
 * configured message-count and total-character limits.
 */
export declare class ConversationManager extends EventEmitter {
    private conversationHistory;
    private historyConfig;
    constructor(options?: ConversationManagerOptions);
    /** Append a message, then trim the history if a limit is exceeded. */
    addMessage(message: ModelMessage): void;
    /** Return a shallow copy of the current history. */
    getHistory(): ModelMessage[];
    /**
     * Return the live history array itself (no copy).
     * Mutations affect the manager — prefer getHistory() unless a copy is too costly.
     */
    getHistoryRef(): ModelMessage[];
    /** Replace the whole history with a copy of the given messages. */
    setHistory(history: ModelMessage[]): void;
    /** Remove every message from the history. */
    clearHistory(): void;
    /** Number of messages currently stored. */
    get length(): number;
    /**
     * Drop the oldest messages until the history satisfies the configured
     * limits again.
     */
    private trimHistory;
}
|
||||
//# sourceMappingURL=ConversationManager.d.ts.map
|
||||
1
dist/core/ConversationManager.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"ConversationManager.d.ts","sourceRoot":"","sources":["../../src/core/ConversationManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAAE,KAAK,YAAY,EAAE,MAAM,IAAI,CAAC;AACvC,OAAO,EAAE,KAAK,aAAa,EAA0B,MAAM,UAAU,CAAC;AAEtE,MAAM,WAAW,0BAA0B;IACzC,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;CAClC;AAED;;;GAGG;AACH,qBAAa,mBAAoB,SAAQ,YAAY;IACnD,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,aAAa,CAAgB;gBAEzB,OAAO,GAAE,0BAA+B;IAQpD;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,GAAG,IAAI;IAKvC;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;;OAGG;IACH,aAAa,IAAI,YAAY,EAAE;IAI/B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE,GAAG,IAAI;IAIzC;;OAEG;IACH,YAAY,IAAI,IAAI;IAKpB;;OAEG;IACH,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED;;;OAGG;IACH,OAAO,CAAC,WAAW;CAgDpB"}
|
||||
106
dist/core/ConversationManager.js
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.ConversationManager = void 0;
|
||||
const events_1 = require("events");
|
||||
const types_1 = require("../types");
|
||||
/**
 * Manages conversation history (ModelMessage[]) with configurable
 * limits on message count and total character size.
 *
 * Emits "history_cleared" when the history is cleared and
 * "history_trimmed" (`{ removedCount, reason }`) whenever old messages
 * are dropped to satisfy a limit.
 */
class ConversationManager extends events_1.EventEmitter {
    conversationHistory = [];
    historyConfig;
    /**
     * @param options Optional settings; `options.history` overrides
     *   individual fields of the default history configuration.
     */
    constructor(options = {}) {
        super();
        this.historyConfig = {
            ...types_1.DEFAULT_HISTORY_CONFIG,
            ...options.history,
        };
    }
    /**
     * Add a message to history and trim if needed.
     */
    addMessage(message) {
        this.conversationHistory.push(message);
        this.trimHistory();
    }
    /**
     * Get a copy of the current history.
     */
    getHistory() {
        return [...this.conversationHistory];
    }
    /**
     * Get a direct reference to the history array.
     * Use with caution — prefer getHistory() for safety.
     */
    getHistoryRef() {
        return this.conversationHistory;
    }
    /**
     * Replace the entire conversation history (the input array is copied).
     * No trimming happens here; limits are applied on the next addMessage().
     */
    setHistory(history) {
        this.conversationHistory = [...history];
    }
    /**
     * Clear all conversation history.
     */
    clearHistory() {
        this.conversationHistory = [];
        this.emit("history_cleared");
    }
    /**
     * Get the number of messages in history.
     */
    get length() {
        return this.conversationHistory.length;
    }
    /**
     * Character length of a message's content. Non-string content is
     * measured by its JSON serialization (0 if it cannot be serialized).
     */
    contentLength(message) {
        const { content } = message;
        if (typeof content === "string") {
            return content.length;
        }
        return (JSON.stringify(content) ?? "").length;
    }
    /**
     * Trim conversation history to stay within configured limits.
     *
     * - Count limit (`maxMessages > 0`): removes the oldest messages, rounding
     *   the number removed up to an even count to keep user/assistant turns
     *   paired — but never removes the newest message.
     * - Size limit (`maxTotalChars > 0`): removes the oldest messages one at a
     *   time until the total fits or only two messages remain.
     */
    trimHistory() {
        const { maxMessages, maxTotalChars } = this.historyConfig;
        // Trim by message count
        if (maxMessages > 0 && this.conversationHistory.length > maxMessages) {
            const excess = this.conversationHistory.length - maxMessages;
            // Round up to even number to preserve turn pairs
            const paired = excess % 2 === 0 ? excess : excess + 1;
            // With maxMessages === 1 the rounding would wipe the whole history,
            // including the message that was just added; always keep the newest.
            const toRemove = Math.min(paired, this.conversationHistory.length - 1);
            this.conversationHistory.splice(0, toRemove);
            this.emit("history_trimmed", {
                removedCount: toRemove,
                reason: "max_messages",
            });
        }
        // Trim by total character count
        if (maxTotalChars > 0) {
            let totalChars = 0;
            for (const msg of this.conversationHistory) {
                totalChars += this.contentLength(msg);
            }
            let removedCount = 0;
            // Always keep at least the two most recent messages.
            while (totalChars > maxTotalChars &&
                this.conversationHistory.length > 2) {
                const removed = this.conversationHistory.shift();
                if (removed) {
                    totalChars -= this.contentLength(removed);
                    removedCount++;
                }
            }
            if (removedCount > 0) {
                this.emit("history_trimmed", {
                    removedCount,
                    reason: "max_total_chars",
                });
            }
        }
    }
}
|
||||
exports.ConversationManager = ConversationManager;
|
||||
//# sourceMappingURL=ConversationManager.js.map
|
||||
1
dist/core/ConversationManager.js.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"ConversationManager.js","sourceRoot":"","sources":["../../src/core/ConversationManager.ts"],"names":[],"mappings":";;;AAAA,mCAAsC;AAEtC,oCAAsE;AAMtE;;;GAGG;AACH,MAAa,mBAAoB,SAAQ,qBAAY;IAC3C,mBAAmB,GAAmB,EAAE,CAAC;IACzC,aAAa,CAAgB;IAErC,YAAY,UAAsC,EAAE;QAClD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,aAAa,GAAG;YACnB,GAAG,8BAAsB;YACzB,GAAG,OAAO,CAAC,OAAO;SACnB,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,OAAqB;QAC9B,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,CAAC,WAAW,EAAE,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,CAAC,GAAG,IAAI,CAAC,mBAAmB,CAAC,CAAC;IACvC,CAAC;IAED;;;OAGG;IACH,aAAa;QACX,OAAO,IAAI,CAAC,mBAAmB,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,OAAuB;QAChC,IAAI,CAAC,mBAAmB,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC;IAC1C,CAAC;IAED;;OAEG;IACH,YAAY;QACV,IAAI,CAAC,mBAAmB,GAAG,EAAE,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC;IACzC,CAAC;IAED;;;OAGG;IACK,WAAW;QACjB,MAAM,EAAE,WAAW,EAAE,aAAa,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC;QAE1D,wBAAwB;QACxB,IAAI,WAAW,GAAG,CAAC,IAAI,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,WAAW,EAAE,CAAC;YACrE,MAAM,MAAM,GAAG,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,WAAW,CAAC;YAC7D,iDAAiD;YACjD,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YACxD,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;YAC7C,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;gBAC3B,YAAY,EAAE,QAAQ;gBACtB,MAAM,EAAE,cAAc;aACvB,CAAC,CAAC;QACL,CAAC;QAED,gCAAgC;QAChC,IAAI,aAAa,GAAG,CAAC,EAAE,CAAC;YACtB,IAAI,UAAU,GAAG,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;gBAC5D,MAAM,OAAO,GACX,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ;oBAC7B,CAAC,CAAC,GAAG,CAAC,OAAO;oBACb,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBAClC,OAAO,GAAG,GAAG,OAAO,CAAC,MAAM,CAAC;YAC9B,CAAC,EAAE,CAAC,CAAC,CAAC;YAEN,IAAI,YAAY,GAAG,CAAC,CAAC;YACrB,OACE,UAAU,GAAG,aAAa;gBAC1B,IAAI,CAAC,mBAAmB,CAAC,MAAM,GAAG,CAAC,EACnC,CAAC;gBACD,MAAM,OAAO,GAAG,IAAI,CAAC,mBAAmB,CAAC,KAAK,EAAE,CAAC;gBACjD,IAAI,OAAO,E
AAE,CAAC;oBACZ,MAAM,OAAO,GACX,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ;wBACjC,CAAC,CAAC,OAAO,CAAC,OAAO;wBACjB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;oBACtC,UAAU,IAAI,OAAO,CAAC,MAAM,CAAC;oBAC7B,YAAY,EAAE,CAAC;gBACjB,CAAC;YACH,CAAC;YACD,IAAI,YAAY,GAAG,CAAC,EAAE,CAAC;gBACrB,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;oBAC3B,YAAY;oBACZ,MAAM,EAAE,iBAAiB;iBAC1B,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;CACF;AA7GD,kDA6GC"}
|
||||
33
dist/core/InputQueue.d.ts
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
/**
 * Settlement callbacks carried by every item placed on an `InputQueue`.
 * The queue calls exactly one of them once the item has been handled.
 *
 * @template T Type of the value the item is resolved with.
 */
export interface QueueItem<T = string> {
    /** Called with the processor's result when the item succeeds. */
    resolve: (v: T) => void;
    /** Called with the failure reason when processing fails or the queue is flushed. */
    reject: (e: unknown) => void;
}
|
||||
/**
 * A generic serial input queue: items are handed to `processor` one at a
 * time, in insertion order.
 *
 * @template T The shape of each queued item (must include resolve/reject).
 */
export declare class InputQueue<T extends QueueItem<any>> {
    private queue;
    private processing;
    /** Callback invoked for each item; its resolved value is passed to `item.resolve`. */
    processor: (item: T) => Promise<any>;
    /** Append an item and start processing if the queue is idle. */
    enqueue(item: T): void;
    /** Reject all pending items (used on disconnect/destroy). */
    rejectAll(reason: Error): void;
    /** Number of items waiting in the queue. */
    get length(): number;
    /** Whether the queue is currently processing an item. */
    get isProcessing(): boolean;
    private drain;
}
|
||||
//# sourceMappingURL=InputQueue.d.ts.map
|
||||
1
dist/core/InputQueue.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"InputQueue.d.ts","sourceRoot":"","sources":["../../src/core/InputQueue.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,SAAS,CAAC,CAAC,GAAG,MAAM;IACnC,OAAO,EAAE,CAAC,CAAC,EAAE,CAAC,KAAK,IAAI,CAAC;IACxB,MAAM,EAAE,CAAC,CAAC,EAAE,OAAO,KAAK,IAAI,CAAC;CAC9B;AAED,qBAAa,UAAU,CAAC,CAAC,SAAS,SAAS,CAAC,GAAG,CAAC;IAC9C,OAAO,CAAC,KAAK,CAAW;IACxB,OAAO,CAAC,UAAU,CAAS;IAE3B,oEAAoE;IAC7D,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,OAAO,CAAC,GAAG,CAAC,CAAkB;IAE7D;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,CAAC,GAAG,IAAI;IAKtB;;OAEG;IACH,SAAS,CAAC,MAAM,EAAE,KAAK,GAAG,IAAI;IAQ9B;;OAEG;IACH,IAAI,MAAM,IAAI,MAAM,CAEnB;IAED;;OAEG;IACH,IAAI,YAAY,IAAI,OAAO,CAE1B;YAIa,KAAK;CAkBpB"}
|
||||
61
dist/core/InputQueue.js
vendored
Normal file
@@ -0,0 +1,61 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.InputQueue = void 0;
|
||||
/**
 * A serial input queue: items are handed to `processor` one at a time, in
 * insertion order. Each item carries `resolve`/`reject` callbacks that are
 * invoked with the processor's result or failure.
 */
class InputQueue {
    queue = [];
    processing = false;
    /** Callback invoked for each item — must return a resolved value */
    processor = async () => "";
    /**
     * Enqueue an item for serial processing.
     * Starts draining immediately unless a drain is already running.
     */
    enqueue(item) {
        this.queue.push(item);
        this.drain();
    }
    /**
     * Reject all pending items (used on disconnect/destroy).
     *
     * Only items still waiting are rejected. An item that is already being
     * processed stays owned by the running drain loop, so the `processing`
     * flag is deliberately left untouched: clearing it here would let the
     * next enqueue() start a second, concurrent drain and break the
     * one-at-a-time guarantee.
     */
    rejectAll(reason) {
        // Detach the pending list first so an item enqueued from inside a
        // reject callback is kept instead of being silently discarded.
        const pending = this.queue;
        this.queue = [];
        for (const item of pending) {
            item.reject(reason);
        }
    }
    /**
     * Number of items waiting in the queue.
     */
    get length() {
        return this.queue.length;
    }
    /**
     * Whether the queue is currently processing an item.
     */
    get isProcessing() {
        return this.processing;
    }
    // ── Private ──────────────────────────────────────────
    /**
     * Process queued items one after another until the queue is empty.
     * Re-entrant calls return immediately while a drain is in progress.
     */
    async drain() {
        if (this.processing)
            return;
        this.processing = true;
        try {
            while (this.queue.length > 0) {
                const item = this.queue.shift();
                try {
                    const result = await this.processor(item);
                    item.resolve(result);
                }
                catch (error) {
                    item.reject(error);
                }
            }
        }
        finally {
            this.processing = false;
        }
    }
}
|
||||
exports.InputQueue = InputQueue;
|
||||
//# sourceMappingURL=InputQueue.js.map
|
||||
1
dist/core/InputQueue.js.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"InputQueue.js","sourceRoot":"","sources":["../../src/core/InputQueue.ts"],"names":[],"mappings":";;;AAUA,MAAa,UAAU;IACb,KAAK,GAAQ,EAAE,CAAC;IAChB,UAAU,GAAG,KAAK,CAAC;IAE3B,oEAAoE;IAC7D,SAAS,GAA8B,KAAK,IAAI,EAAE,CAAC,EAAE,CAAC;IAE7D;;OAEG;IACH,OAAO,CAAC,IAAO;QACb,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtB,IAAI,CAAC,KAAK,EAAE,CAAC;IACf,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,MAAa;QACrB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC9B,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;QAChB,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,IAAI,YAAY;QACd,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;IAED,wDAAwD;IAEhD,KAAK,CAAC,KAAK;QACjB,IAAI,IAAI,CAAC,UAAU;YAAE,OAAO;QAC5B,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;QAEvB,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;gBACjC,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;oBAC1C,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;gBACvB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;gBAAS,CAAC;YACT,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;QAC1B,CAAC;IACH,CAAC;CACF;AA5DD,gCA4DC"}
|
||||
83
dist/core/SpeechManager.d.ts
vendored
Normal file
@@ -0,0 +1,83 @@
|
||||
import { EventEmitter } from "events";
|
||||
import { type SpeechModel } from "ai";
|
||||
import { type StreamingSpeechConfig } from "../types";
|
||||
/** Construction options accepted by `SpeechManager`. */
export interface SpeechManagerOptions {
    /** Text-to-speech model; when omitted, speech generation is unavailable. */
    speechModel?: SpeechModel;
    /** Voice identifier passed to the speech model. */
    voice?: string;
    /** Extra instructions forwarded to the speech model. */
    speechInstructions?: string;
    /** Audio output format requested from the speech model. */
    outputFormat?: string;
    /** Overrides for the default streaming-speech configuration. */
    streamingSpeech?: Partial<StreamingSpeechConfig>;
}
|
||||
/**
 * Text-to-speech coordinator: generates audio from text, splits streamed
 * text into speech chunks, runs TTS requests in parallel, and supports
 * interrupting speech that is in progress.
 */
export declare class SpeechManager extends EventEmitter {
    private speechModel?;
    private voice;
    private speechInstructions?;
    private outputFormat;
    private streamingSpeechConfig;
    private currentSpeechAbortController?;
    private speechChunkQueue;
    private nextChunkId;
    private _isSpeaking;
    private pendingTextBuffer;
    private speechQueueDonePromise?;
    private speechQueueDoneResolve?;
    /** Callback used to send messages over the WebSocket. */
    sendMessage: (message: Record<string, unknown>) => void;
    constructor(options: SpeechManagerOptions);
    /** Whether speech is currently being produced. */
    get isSpeaking(): boolean;
    /** Number of speech chunks currently in the queue. */
    get pendingChunkCount(): number;
    /** Whether a speech model was configured. */
    get hasSpeechModel(): boolean;
    /**
     * Promise that resolves once the speech queue has fully drained,
     * or undefined when nothing is queued.
     */
    get queueDonePromise(): Promise<void> | undefined;
    /** Generate audio for the given text with the configured speech model. */
    generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
    /** Generate speech for the complete text in one request (non-streaming fallback). */
    generateAndSendSpeechFull(text: string): Promise<void>;
    /** Interrupt ongoing speech generation and playback (barge-in support). */
    interruptSpeech(reason?: string): void;
    /**
     * Feed a text delta into the streaming-speech pipeline.
     * Call this as text chunks arrive from the LLM.
     */
    processTextDelta(textDelta: string): void;
    /**
     * Send whatever text is still buffered to speech.
     * Call this when the LLM stream ends.
     */
    flushPendingText(): void;
    /** Reset all speech state (used on disconnect / cleanup). */
    reset(): void;
    /**
     * Extract complete sentences from the text buffer.
     * Returns [extractedSentences, remainingBuffer].
     */
    private extractSentences;
    /** Queue a text chunk for speech generation. */
    private queueSpeechChunk;
    /** Generate audio for a single chunk. */
    private generateChunkAudio;
    /** Process the speech queue and send audio chunks in order. */
    private processSpeechQueue;
}
|
||||
//# sourceMappingURL=SpeechManager.d.ts.map
|
||||
1
dist/core/SpeechManager.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"SpeechManager.d.ts","sourceRoot":"","sources":["../../src/core/SpeechManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAEL,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAE3B,MAAM,UAAU,CAAC;AAElB,MAAM,WAAW,oBAAoB;IACnC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;CAClD;AAED;;;GAGG;AACH,qBAAa,aAAc,SAAQ,YAAY;IAC7C,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,qBAAqB,CAAwB;IAErD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;IAE5C,mDAAmD;IAC5C,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAa;gBAE/D,OAAO,EAAE,oBAAoB;IAYzC,IAAI,UAAU,IAAI,OAAO,CAExB;IAED,IAAI,iBAAiB,IAAI,MAAM,CAE9B;IAED,IAAI,cAAc,IAAI,OAAO,CAE5B;IAED;;;OAGG;IACH,IAAI,gBAAgB,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,SAAS,CAEhD;IAED;;OAEG;IACG,sBAAsB,CAC1B,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;OAEG;IACG,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA4B5D;;OAEG;IACH,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgCrD;;;OAGG;IACH,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;IAazC;;;OAGG;IACH,gBAAgB,IAAI,IAAI;IAOxB;;OAEG;IACH,KAAK,IAAI,IAAI;IAkBb;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA+CxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAwCxB;;OAEG;YACW,kBAAkB;IAiChC;;OAEG;YACW,kBAAkB;CA0GjC"}
|
||||
356
dist/core/SpeechManager.js
vendored
Normal file
@@ -0,0 +1,356 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.SpeechManager = void 0;
|
||||
const events_1 = require("events");
|
||||
const ai_1 = require("ai");
|
||||
const types_1 = require("../types");
|
||||
/**
|
||||
* Manages text-to-speech generation, streaming speech chunking,
|
||||
* parallel TTS requests, and speech interruption.
|
||||
*/
|
||||
class SpeechManager extends events_1.EventEmitter {
|
||||
speechModel;
|
||||
voice;
|
||||
speechInstructions;
|
||||
outputFormat;
|
||||
streamingSpeechConfig;
|
||||
currentSpeechAbortController;
|
||||
speechChunkQueue = [];
|
||||
nextChunkId = 0;
|
||||
_isSpeaking = false;
|
||||
pendingTextBuffer = "";
|
||||
// Promise-based signal for speech queue completion
|
||||
speechQueueDonePromise;
|
||||
speechQueueDoneResolve;
|
||||
/** Callback to send messages over the WebSocket */
|
||||
sendMessage = () => { };
|
||||
constructor(options) {
|
||||
super();
|
||||
this.speechModel = options.speechModel;
|
||||
this.voice = options.voice || "alloy";
|
||||
this.speechInstructions = options.speechInstructions;
|
||||
this.outputFormat = options.outputFormat || "opus";
|
||||
this.streamingSpeechConfig = {
|
||||
...types_1.DEFAULT_STREAMING_SPEECH_CONFIG,
|
||||
...options.streamingSpeech,
|
||||
};
|
||||
}
|
||||
get isSpeaking() {
|
||||
return this._isSpeaking;
|
||||
}
|
||||
get pendingChunkCount() {
|
||||
return this.speechChunkQueue.length;
|
||||
}
|
||||
get hasSpeechModel() {
|
||||
return !!this.speechModel;
|
||||
}
|
||||
/**
|
||||
* Returns a promise that resolves when the speech queue is fully drained.
|
||||
* Returns undefined if there is nothing queued.
|
||||
*/
|
||||
get queueDonePromise() {
|
||||
return this.speechQueueDonePromise;
|
||||
}
|
||||
/**
|
||||
* Generate speech from text using the configured speech model.
|
||||
*/
|
||||
async generateSpeechFromText(text, abortSignal) {
|
||||
if (!this.speechModel) {
|
||||
throw new Error("Speech model not configured");
|
||||
}
|
||||
const result = await (0, ai_1.experimental_generateSpeech)({
|
||||
model: this.speechModel,
|
||||
text,
|
||||
voice: this.voice,
|
||||
instructions: this.speechInstructions,
|
||||
outputFormat: this.outputFormat,
|
||||
abortSignal,
|
||||
});
|
||||
return result.audio.uint8Array;
|
||||
}
|
||||
/**
|
||||
* Generate speech for full text at once (non-streaming fallback).
|
||||
*/
|
||||
async generateAndSendSpeechFull(text) {
|
||||
if (!this.speechModel)
|
||||
return;
|
||||
try {
|
||||
this.emit("speech_start", { text, streaming: false });
|
||||
const audioData = await this.generateSpeechFromText(text);
|
||||
const base64Audio = Buffer.from(audioData).toString("base64");
|
||||
this.sendMessage({
|
||||
type: "audio",
|
||||
data: base64Audio,
|
||||
format: this.outputFormat,
|
||||
});
|
||||
this.emit("audio", {
|
||||
data: base64Audio,
|
||||
format: this.outputFormat,
|
||||
uint8Array: audioData,
|
||||
});
|
||||
this.emit("speech_complete", { text, streaming: false });
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Failed to generate speech:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Interrupt ongoing speech generation and playback (barge-in support).
|
||||
*/
|
||||
interruptSpeech(reason = "interrupted") {
|
||||
if (!this._isSpeaking && this.speechChunkQueue.length === 0) {
|
||||
return;
|
||||
}
|
||||
// Abort any pending speech generation requests
|
||||
if (this.currentSpeechAbortController) {
|
||||
this.currentSpeechAbortController.abort();
|
||||
this.currentSpeechAbortController = undefined;
|
||||
}
|
||||
// Clear the speech queue
|
||||
this.speechChunkQueue = [];
|
||||
this.pendingTextBuffer = "";
|
||||
this._isSpeaking = false;
|
||||
// Resolve any pending speech-done waiters so callers can finish
|
||||
if (this.speechQueueDoneResolve) {
|
||||
this.speechQueueDoneResolve();
|
||||
this.speechQueueDoneResolve = undefined;
|
||||
this.speechQueueDonePromise = undefined;
|
||||
}
|
||||
// Notify clients to stop audio playback
|
||||
this.sendMessage({
|
||||
type: "speech_interrupted",
|
||||
reason,
|
||||
});
|
||||
this.emit("speech_interrupted", { reason });
|
||||
}
|
||||
/**
|
||||
* Process a text delta for streaming speech.
|
||||
* Call this as text chunks arrive from the LLM.
|
||||
*/
|
||||
processTextDelta(textDelta) {
|
||||
if (!this.speechModel)
|
||||
return;
|
||||
this.pendingTextBuffer += textDelta;
|
||||
const [sentences, remaining] = this.extractSentences(this.pendingTextBuffer);
|
||||
this.pendingTextBuffer = remaining;
|
||||
for (const sentence of sentences) {
|
||||
this.queueSpeechChunk(sentence);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Flush any remaining text in the buffer to speech.
|
||||
* Call this when the LLM stream ends.
|
||||
*/
|
||||
flushPendingText() {
|
||||
if (!this.speechModel || !this.pendingTextBuffer.trim())
|
||||
return;
|
||||
this.queueSpeechChunk(this.pendingTextBuffer);
|
||||
this.pendingTextBuffer = "";
|
||||
}
|
||||
/**
|
||||
* Reset all speech state (used on disconnect / cleanup).
|
||||
*/
|
||||
reset() {
|
||||
if (this.currentSpeechAbortController) {
|
||||
this.currentSpeechAbortController.abort();
|
||||
this.currentSpeechAbortController = undefined;
|
||||
}
|
||||
this.speechChunkQueue = [];
|
||||
this.pendingTextBuffer = "";
|
||||
this._isSpeaking = false;
|
||||
if (this.speechQueueDoneResolve) {
|
||||
this.speechQueueDoneResolve();
|
||||
this.speechQueueDoneResolve = undefined;
|
||||
this.speechQueueDonePromise = undefined;
|
||||
}
|
||||
}
|
||||
// ── Private helpers ─────────────────────────────────────────
|
||||
/**
|
||||
* Extract complete sentences from text buffer.
|
||||
* Returns [extractedSentences, remainingBuffer].
|
||||
*/
|
||||
extractSentences(text) {
|
||||
const sentences = [];
|
||||
let remaining = text;
|
||||
// Match sentences ending with . ! ? followed by space or end of string
|
||||
const sentenceEndPattern = /[.!?]+(?:\s+|$)/g;
|
||||
let lastIndex = 0;
|
||||
let match;
|
||||
while ((match = sentenceEndPattern.exec(text)) !== null) {
|
||||
const sentence = text
|
||||
.slice(lastIndex, match.index + match[0].length)
|
||||
.trim();
|
||||
if (sentence.length >= this.streamingSpeechConfig.minChunkSize) {
|
||||
sentences.push(sentence);
|
||||
lastIndex = match.index + match[0].length;
|
||||
}
|
||||
else if (sentences.length > 0) {
|
||||
// Append short sentence to previous one
|
||||
sentences[sentences.length - 1] += " " + sentence;
|
||||
lastIndex = match.index + match[0].length;
|
||||
}
|
||||
}
|
||||
remaining = text.slice(lastIndex);
|
||||
// If remaining text is too long, force split at clause boundaries
|
||||
if (remaining.length > this.streamingSpeechConfig.maxChunkSize) {
|
||||
const clausePattern = /[,;:]\s+/g;
|
||||
let clauseMatch;
|
||||
let splitIndex = 0;
|
||||
while ((clauseMatch = clausePattern.exec(remaining)) !== null) {
|
||||
if (clauseMatch.index >= this.streamingSpeechConfig.minChunkSize) {
|
||||
splitIndex = clauseMatch.index + clauseMatch[0].length;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (splitIndex > 0) {
|
||||
sentences.push(remaining.slice(0, splitIndex).trim());
|
||||
remaining = remaining.slice(splitIndex);
|
||||
}
|
||||
}
|
||||
return [sentences, remaining];
|
||||
}
|
||||
/**
|
||||
* Queue a text chunk for speech generation.
|
||||
*/
|
||||
queueSpeechChunk(text) {
|
||||
if (!this.speechModel || !text.trim())
|
||||
return;
|
||||
// Wrap chunk ID to prevent unbounded growth in very long sessions
|
||||
if (this.nextChunkId >= Number.MAX_SAFE_INTEGER) {
|
||||
this.nextChunkId = 0;
|
||||
}
|
||||
const chunk = {
|
||||
id: this.nextChunkId++,
|
||||
text: text.trim(),
|
||||
};
|
||||
// Create the speech-done promise if not already present
|
||||
if (!this.speechQueueDonePromise) {
|
||||
this.speechQueueDonePromise = new Promise((resolve) => {
|
||||
this.speechQueueDoneResolve = resolve;
|
||||
});
|
||||
}
|
||||
// Start generating audio immediately (parallel generation)
|
||||
if (this.streamingSpeechConfig.parallelGeneration) {
|
||||
const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
|
||||
if (activeRequests < this.streamingSpeechConfig.maxParallelRequests) {
|
||||
chunk.audioPromise = this.generateChunkAudio(chunk);
|
||||
}
|
||||
}
|
||||
this.speechChunkQueue.push(chunk);
|
||||
this.emit("speech_chunk_queued", { id: chunk.id, text: chunk.text });
|
||||
// Start processing queue if not already
|
||||
if (!this._isSpeaking) {
|
||||
this.processSpeechQueue();
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Generate audio for a single chunk.
|
||||
*/
|
||||
async generateChunkAudio(chunk) {
|
||||
if (!this.currentSpeechAbortController) {
|
||||
this.currentSpeechAbortController = new AbortController();
|
||||
}
|
||||
try {
|
||||
console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`);
|
||||
const audioData = await this.generateSpeechFromText(chunk.text, this.currentSpeechAbortController.signal);
|
||||
console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
|
||||
return audioData;
|
||||
}
|
||||
catch (error) {
|
||||
if (error.name === "AbortError") {
|
||||
console.log(`Audio generation aborted for chunk ${chunk.id}`);
|
||||
return null;
|
||||
}
|
||||
console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
|
||||
this.emit("error", error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Process the speech queue and send audio chunks in order.
|
||||
*/
|
||||
async processSpeechQueue() {
|
||||
if (this._isSpeaking)
|
||||
return;
|
||||
this._isSpeaking = true;
|
||||
console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
|
||||
this.emit("speech_start", { streaming: true });
|
||||
this.sendMessage({ type: "speech_stream_start" });
|
||||
try {
|
||||
while (this.speechChunkQueue.length > 0) {
|
||||
const chunk = this.speechChunkQueue[0];
|
||||
console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`);
|
||||
// Ensure audio generation has started
|
||||
if (!chunk.audioPromise) {
|
||||
chunk.audioPromise = this.generateChunkAudio(chunk);
|
||||
}
|
||||
// Wait for this chunk's audio
|
||||
const audioData = await chunk.audioPromise;
|
||||
// Check if we were interrupted while waiting
|
||||
if (!this._isSpeaking) {
|
||||
console.log(`Speech interrupted during chunk #${chunk.id}`);
|
||||
break;
|
||||
}
|
||||
// Remove from queue after processing
|
||||
this.speechChunkQueue.shift();
|
||||
if (audioData) {
|
||||
const base64Audio = Buffer.from(audioData).toString("base64");
|
||||
console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`);
|
||||
// Send audio chunk via WebSocket
|
||||
this.sendMessage({
|
||||
type: "audio_chunk",
|
||||
chunkId: chunk.id,
|
||||
data: base64Audio,
|
||||
format: this.outputFormat,
|
||||
text: chunk.text,
|
||||
});
|
||||
// Emit for local handling
|
||||
this.emit("audio_chunk", {
|
||||
chunkId: chunk.id,
|
||||
data: base64Audio,
|
||||
format: this.outputFormat,
|
||||
text: chunk.text,
|
||||
uint8Array: audioData,
|
||||
});
|
||||
}
|
||||
else {
|
||||
console.log(`No audio data generated for chunk #${chunk.id}`);
|
||||
}
|
||||
// Start generating next chunks in parallel
|
||||
if (this.streamingSpeechConfig.parallelGeneration) {
|
||||
const activeRequests = this.speechChunkQueue.filter((c) => c.audioPromise).length;
|
||||
const toStart = Math.min(this.streamingSpeechConfig.maxParallelRequests - activeRequests, this.speechChunkQueue.length);
|
||||
if (toStart > 0) {
|
||||
console.log(`Starting parallel generation for ${toStart} more chunks`);
|
||||
for (let i = 0; i < toStart; i++) {
|
||||
const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise);
|
||||
if (nextChunk) {
|
||||
nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Error in speech queue processing:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
finally {
|
||||
this._isSpeaking = false;
|
||||
this.currentSpeechAbortController = undefined;
|
||||
// Signal that the speech queue is fully drained
|
||||
if (this.speechQueueDoneResolve) {
|
||||
this.speechQueueDoneResolve();
|
||||
this.speechQueueDoneResolve = undefined;
|
||||
this.speechQueueDonePromise = undefined;
|
||||
}
|
||||
console.log(`Speech queue processing complete`);
|
||||
this.sendMessage({ type: "speech_stream_end" });
|
||||
this.emit("speech_complete", { streaming: true });
|
||||
}
|
||||
}
|
||||
}
|
||||
exports.SpeechManager = SpeechManager;
|
||||
//# sourceMappingURL=SpeechManager.js.map
|
||||
1
dist/core/SpeechManager.js.map
vendored
Normal file
42
dist/core/StreamProcessor.d.ts
vendored
Normal file
@@ -0,0 +1,42 @@
|
||||
import { type streamText } from "ai";
|
||||
/**
 * Everything collected while consuming one complete LLM stream.
 */
export interface StreamResult {
    /** Concatenation of every text delta, in arrival order. */
    fullText: string;
    /** Concatenation of every reasoning delta, in arrival order. */
    fullReasoning: string;
    /** One entry per tool-call part seen on the stream. */
    allToolCalls: {
        toolName: string;
        toolCallId: string;
        input: unknown;
    }[];
    /** One entry per tool-result part seen on the stream. */
    allToolResults: {
        toolName: string;
        toolCallId: string;
        output: unknown;
    }[];
    /** Source parts, stored as received. */
    allSources: unknown[];
    /** The `file` payload of each file part. */
    allFiles: unknown[];
}
|
||||
/**
 * Hooks supplied by the caller of `processFullStream`.
 */
export interface StreamProcessorCallbacks {
    /** Invoked with each text delta as it arrives (for streaming speech, etc.). */
    onTextDelta?: (text: string) => void;
    /** Invoked when a text-end part arrives (flush speech, etc.). */
    onTextEnd?: () => void;
    /** Sends one message over the WebSocket. */
    sendMessage: (message: {
        [key: string]: unknown;
    }) => void;
    /** Emits an event on the agent. */
    emitEvent: (event: string, data?: unknown) => void;
}
|
||||
/**
 * Consumes the `fullStream` of an AI SDK `streamText` call, forwarding
 * events to WebSocket clients and collecting the complete response.
 *
 * Implemented as a standalone function rather than a class because it
 * keeps no state between calls.
 *
 * @param result The value returned by `streamText`.
 * @param callbacks Hooks used to send messages and emit events.
 * @param extraResponseFields Optional extra fields for the final response message.
 * @returns The collected text, reasoning, tool activity, sources and files.
 */
export declare function processFullStream(
    result: ReturnType<typeof streamText>,
    callbacks: StreamProcessorCallbacks,
    extraResponseFields?: Record<string, unknown>,
): Promise<StreamResult>;
|
||||
/**
 * Handles a chunk passed to the onChunk callback by emitting a matching event.
 *
 * @param chunk The chunk received by onChunk.
 * @param emitEvent Emitter used to publish the event.
 */
export declare function handleStreamChunk(
    chunk: any,
    emitEvent: (event: string, data?: unknown) => void,
): void;
|
||||
//# sourceMappingURL=StreamProcessor.d.ts.map
|
||||
1
dist/core/StreamProcessor.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"StreamProcessor.d.ts","sourceRoot":"","sources":["../../src/core/StreamProcessor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,UAAU,EAAE,MAAM,IAAI,CAAC;AAErC;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,aAAa,EAAE,MAAM,CAAC;IACtB,YAAY,EAAE,KAAK,CAAC;QAClB,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,KAAK,EAAE,OAAO,CAAC;KAChB,CAAC,CAAC;IACH,cAAc,EAAE,KAAK,CAAC;QACpB,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC,CAAC;IACH,UAAU,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;IAC3B,QAAQ,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;CAC1B;AAED,MAAM,WAAW,wBAAwB;IACvC,oEAAoE;IACpE,WAAW,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;IACrC,+DAA+D;IAC/D,SAAS,CAAC,EAAE,MAAM,IAAI,CAAC;IACvB,+BAA+B;IAC/B,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAC;IACxD,iCAAiC;IACjC,SAAS,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,OAAO,KAAK,IAAI,CAAC;CACpD;AAED;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,MAAM,EAAE,UAAU,CAAC,OAAO,UAAU,CAAC,EACrC,SAAS,EAAE,wBAAwB,EACnC,mBAAmB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAC5C,OAAO,CAAC,YAAY,CAAC,CAkMvB;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAC/B,KAAK,EAAE,GAAG,EACV,SAAS,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,OAAO,KAAK,IAAI,GACjD,IAAI,CA+CN"}
|
||||
228
dist/core/StreamProcessor.js
vendored
Normal file
@@ -0,0 +1,228 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.processFullStream = processFullStream;
|
||||
exports.handleStreamChunk = handleStreamChunk;
|
||||
/**
|
||||
* Processes the fullStream from an AI SDK `streamText` call,
|
||||
* forwarding events to WebSocket clients and collecting the complete response.
|
||||
*
|
||||
* This is a standalone function (not a class) because it has no persistent state.
|
||||
*/
|
||||
async function processFullStream(result, callbacks, extraResponseFields) {
|
||||
const { onTextDelta, onTextEnd, sendMessage, emitEvent } = callbacks;
|
||||
let fullText = "";
|
||||
let fullReasoning = "";
|
||||
const allToolCalls = [];
|
||||
const allToolResults = [];
|
||||
const allSources = [];
|
||||
const allFiles = [];
|
||||
for await (const part of result.fullStream) {
|
||||
switch (part.type) {
|
||||
// ── Stream lifecycle ──────────────────────────────
|
||||
case "start":
|
||||
sendMessage({ type: "stream_start" });
|
||||
break;
|
||||
case "finish":
|
||||
emitEvent("text", { role: "assistant", text: fullText });
|
||||
sendMessage({
|
||||
type: "stream_finish",
|
||||
finishReason: part.finishReason,
|
||||
usage: part.totalUsage,
|
||||
});
|
||||
break;
|
||||
case "error":
|
||||
emitEvent("error", part.error);
|
||||
sendMessage({
|
||||
type: "stream_error",
|
||||
error: String(part.error),
|
||||
});
|
||||
break;
|
||||
case "abort":
|
||||
emitEvent("abort", { reason: part.reason });
|
||||
sendMessage({
|
||||
type: "stream_abort",
|
||||
reason: part.reason,
|
||||
});
|
||||
break;
|
||||
// ── Step lifecycle ────────────────────────────────
|
||||
case "start-step":
|
||||
sendMessage({
|
||||
type: "step_start",
|
||||
warnings: part.warnings,
|
||||
});
|
||||
break;
|
||||
case "finish-step":
|
||||
sendMessage({
|
||||
type: "step_finish",
|
||||
finishReason: part.finishReason,
|
||||
usage: part.usage,
|
||||
});
|
||||
break;
|
||||
// ── Text streaming ────────────────────────────────
|
||||
case "text-start":
|
||||
sendMessage({ type: "text_start", id: part.id });
|
||||
break;
|
||||
case "text-delta":
|
||||
fullText += part.text;
|
||||
onTextDelta?.(part.text);
|
||||
sendMessage({
|
||||
type: "text_delta",
|
||||
id: part.id,
|
||||
text: part.text,
|
||||
});
|
||||
break;
|
||||
case "text-end":
|
||||
onTextEnd?.();
|
||||
sendMessage({ type: "text_end", id: part.id });
|
||||
break;
|
||||
// ── Reasoning streaming ───────────────────────────
|
||||
case "reasoning-start":
|
||||
sendMessage({ type: "reasoning_start", id: part.id });
|
||||
break;
|
||||
case "reasoning-delta":
|
||||
fullReasoning += part.text;
|
||||
sendMessage({
|
||||
type: "reasoning_delta",
|
||||
id: part.id,
|
||||
text: part.text,
|
||||
});
|
||||
break;
|
||||
case "reasoning-end":
|
||||
sendMessage({ type: "reasoning_end", id: part.id });
|
||||
break;
|
||||
// ── Tool input streaming ──────────────────────────
|
||||
case "tool-input-start":
|
||||
sendMessage({
|
||||
type: "tool_input_start",
|
||||
id: part.id,
|
||||
toolName: part.toolName,
|
||||
});
|
||||
break;
|
||||
case "tool-input-delta":
|
||||
sendMessage({
|
||||
type: "tool_input_delta",
|
||||
id: part.id,
|
||||
delta: part.delta,
|
||||
});
|
||||
break;
|
||||
case "tool-input-end":
|
||||
sendMessage({ type: "tool_input_end", id: part.id });
|
||||
break;
|
||||
// ── Tool execution ────────────────────────────────
|
||||
case "tool-call":
|
||||
allToolCalls.push({
|
||||
toolName: part.toolName,
|
||||
toolCallId: part.toolCallId,
|
||||
input: part.input,
|
||||
});
|
||||
sendMessage({
|
||||
type: "tool_call",
|
||||
toolName: part.toolName,
|
||||
toolCallId: part.toolCallId,
|
||||
input: part.input,
|
||||
});
|
||||
break;
|
||||
case "tool-result":
|
||||
allToolResults.push({
|
||||
toolName: part.toolName,
|
||||
toolCallId: part.toolCallId,
|
||||
output: part.output,
|
||||
});
|
||||
sendMessage({
|
||||
type: "tool_result",
|
||||
toolName: part.toolName,
|
||||
toolCallId: part.toolCallId,
|
||||
result: part.output,
|
||||
});
|
||||
break;
|
||||
case "tool-error":
|
||||
sendMessage({
|
||||
type: "tool_error",
|
||||
toolName: part.toolName,
|
||||
toolCallId: part.toolCallId,
|
||||
error: String(part.error),
|
||||
});
|
||||
break;
|
||||
// ── Sources and files ─────────────────────────────
|
||||
case "source":
|
||||
allSources.push(part);
|
||||
sendMessage({
|
||||
type: "source",
|
||||
source: part,
|
||||
});
|
||||
break;
|
||||
case "file":
|
||||
allFiles.push(part.file);
|
||||
sendMessage({
|
||||
type: "file",
|
||||
file: part.file,
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Send the complete response
|
||||
sendMessage({
|
||||
type: "response_complete",
|
||||
text: fullText,
|
||||
reasoning: fullReasoning || undefined,
|
||||
toolCalls: allToolCalls,
|
||||
toolResults: allToolResults,
|
||||
sources: allSources.length > 0 ? allSources : undefined,
|
||||
files: allFiles.length > 0 ? allFiles : undefined,
|
||||
...extraResponseFields,
|
||||
});
|
||||
return {
|
||||
fullText,
|
||||
fullReasoning,
|
||||
allToolCalls,
|
||||
allToolResults,
|
||||
allSources,
|
||||
allFiles,
|
||||
};
|
||||
}
|
||||
/**
|
||||
* Handle onChunk callback events and emit them.
|
||||
*/
|
||||
function handleStreamChunk(chunk, emitEvent) {
|
||||
switch (chunk.type) {
|
||||
case "text-delta":
|
||||
emitEvent("chunk:text_delta", { id: chunk.id, text: chunk.text });
|
||||
break;
|
||||
case "reasoning-delta":
|
||||
emitEvent("chunk:reasoning_delta", {
|
||||
id: chunk.id,
|
||||
text: chunk.text,
|
||||
});
|
||||
break;
|
||||
case "tool-call":
|
||||
emitEvent("chunk:tool_call", {
|
||||
toolName: chunk.toolName,
|
||||
toolCallId: chunk.toolCallId,
|
||||
input: chunk.input,
|
||||
});
|
||||
break;
|
||||
case "tool-result":
|
||||
emitEvent("chunk:tool_result", {
|
||||
toolName: chunk.toolName,
|
||||
toolCallId: chunk.toolCallId,
|
||||
result: chunk.output,
|
||||
});
|
||||
break;
|
||||
case "tool-input-start":
|
||||
emitEvent("chunk:tool_input_start", {
|
||||
id: chunk.id,
|
||||
toolName: chunk.toolName,
|
||||
});
|
||||
break;
|
||||
case "tool-input-delta":
|
||||
emitEvent("chunk:tool_input_delta", {
|
||||
id: chunk.id,
|
||||
delta: chunk.delta,
|
||||
});
|
||||
break;
|
||||
case "source":
|
||||
emitEvent("chunk:source", chunk);
|
||||
break;
|
||||
}
|
||||
}
|
||||
//# sourceMappingURL=StreamProcessor.js.map
|
||||
1
dist/core/StreamProcessor.js.map
vendored
Normal file
28
dist/core/TranscriptionManager.d.ts
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
import { EventEmitter } from "events";
|
||||
import { type TranscriptionModel } from "ai";
|
||||
/**
 * Construction options for `TranscriptionManager`. Both fields are optional.
 */
export interface TranscriptionManagerOptions {
    /** AI SDK transcription model used to transcribe audio. */
    transcriptionModel?: TranscriptionModel;
    /** Upper bound on the size of incoming audio. */
    maxAudioInputSize?: number;
}
|
||||
/**
 * Validates incoming audio data and transcribes it with the AI SDK
 * transcription model.
 */
export declare class TranscriptionManager extends EventEmitter {
    private transcriptionModel?;
    private maxAudioInputSize;

    /** Callback used to send messages over the WebSocket. */
    sendMessage: (message: Record<string, unknown>) => void;

    constructor(options?: TranscriptionManagerOptions);

    get hasTranscriptionModel(): boolean;

    /**
     * Transcribes audio data to text.
     *
     * @param audioData Audio bytes to transcribe.
     * @returns The transcribed text.
     */
    transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;

    /**
     * Validates, decodes and transcribes base64-encoded audio.
     *
     * @param base64Audio Base64-encoded audio payload.
     * @param format Optional format label for the payload.
     * @returns The transcribed text, or null if the input is invalid or empty.
     */
    processAudioInput(base64Audio: string, format?: string): Promise<string | null>;
}
|
||||
//# sourceMappingURL=TranscriptionManager.d.ts.map
|
||||
1
dist/core/TranscriptionManager.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TranscriptionManager.d.ts","sourceRoot":"","sources":["../../src/core/TranscriptionManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAEL,KAAK,kBAAkB,EACxB,MAAM,IAAI,CAAC;AAGZ,MAAM,WAAW,2BAA2B;IAC1C,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;GAGG;AACH,qBAAa,oBAAqB,SAAQ,YAAY;IACpD,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,iBAAiB,CAAS;IAElC,mDAAmD;IAC5C,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,IAAI,CAAY;gBAE9D,OAAO,GAAE,2BAAgC;IAOrD,IAAI,qBAAqB,IAAI,OAAO,CAEnC;IAED;;OAEG;IACG,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAsCtE;;;OAGG;IACG,iBAAiB,CACrB,WAAW,EAAE,MAAM,EACnB,MAAM,CAAC,EAAE,MAAM,GACd,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;CA2D1B"}
|
||||
106
dist/core/TranscriptionManager.js
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.TranscriptionManager = void 0;
|
||||
const events_1 = require("events");
|
||||
const ai_1 = require("ai");
|
||||
const types_1 = require("../types");
|
||||
/**
|
||||
* Handles audio transcription using the AI SDK transcription model
|
||||
* and validation of incoming audio data.
|
||||
*/
|
||||
class TranscriptionManager extends events_1.EventEmitter {
|
||||
transcriptionModel;
|
||||
maxAudioInputSize;
|
||||
/** Callback to send messages over the WebSocket */
|
||||
sendMessage = () => { };
|
||||
constructor(options = {}) {
|
||||
super();
|
||||
this.transcriptionModel = options.transcriptionModel;
|
||||
this.maxAudioInputSize =
|
||||
options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
|
||||
}
|
||||
get hasTranscriptionModel() {
|
||||
return !!this.transcriptionModel;
|
||||
}
|
||||
/**
|
||||
* Transcribe audio data to text.
|
||||
*/
|
||||
async transcribeAudio(audioData) {
|
||||
if (!this.transcriptionModel) {
|
||||
throw new Error("Transcription model not configured");
|
||||
}
|
||||
console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
|
||||
try {
|
||||
const result = await (0, ai_1.experimental_transcribe)({
|
||||
model: this.transcriptionModel,
|
||||
audio: audioData,
|
||||
});
|
||||
console.log(`Whisper transcription result: "${result.text}", language: ${result.language || "unknown"}`);
|
||||
this.emit("transcription", {
|
||||
text: result.text,
|
||||
language: result.language,
|
||||
});
|
||||
// Send transcription to client for immediate feedback
|
||||
this.sendMessage({
|
||||
type: "transcription_result",
|
||||
text: result.text,
|
||||
language: result.language,
|
||||
});
|
||||
return result.text;
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Whisper transcription failed:", error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Process incoming base64-encoded audio: validate, decode, transcribe.
|
||||
* Returns the transcribed text, or null if invalid / empty.
|
||||
*/
|
||||
async processAudioInput(base64Audio, format) {
|
||||
if (!this.transcriptionModel) {
|
||||
const error = new Error("Transcription model not configured for audio input");
|
||||
this.emit("error", error);
|
||||
this.sendMessage({ type: "error", error: error.message });
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
const audioBuffer = Buffer.from(base64Audio, "base64");
|
||||
// Validate audio size
|
||||
if (audioBuffer.length > this.maxAudioInputSize) {
|
||||
const sizeMB = (audioBuffer.length / (1024 * 1024)).toFixed(1);
|
||||
const maxMB = (this.maxAudioInputSize / (1024 * 1024)).toFixed(1);
|
||||
this.emit("error", new Error(`Audio input too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`));
|
||||
return null;
|
||||
}
|
||||
if (audioBuffer.length === 0) {
|
||||
this.emit("warning", "Received empty audio data");
|
||||
return null;
|
||||
}
|
||||
this.emit("audio_received", { size: audioBuffer.length, format });
|
||||
console.log(`Processing audio input: ${audioBuffer.length} bytes, format: ${format || "unknown"}`);
|
||||
const transcribedText = await this.transcribeAudio(audioBuffer);
|
||||
console.log(`Transcribed text: "${transcribedText}"`);
|
||||
if (!transcribedText.trim()) {
|
||||
this.emit("warning", "Transcription returned empty text");
|
||||
this.sendMessage({
|
||||
type: "transcription_error",
|
||||
error: "Whisper returned empty text",
|
||||
});
|
||||
return null;
|
||||
}
|
||||
return transcribedText;
|
||||
}
|
||||
catch (error) {
|
||||
console.error("Failed to process audio input:", error);
|
||||
this.emit("error", error);
|
||||
this.sendMessage({
|
||||
type: "transcription_error",
|
||||
error: `Transcription failed: ${error.message || String(error)}`,
|
||||
});
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
exports.TranscriptionManager = TranscriptionManager;
|
||||
//# sourceMappingURL=TranscriptionManager.js.map
|
||||
1
dist/core/TranscriptionManager.js.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"TranscriptionManager.js","sourceRoot":"","sources":["../../src/core/TranscriptionManager.ts"],"names":[],"mappings":";;;AAAA,mCAAsC;AACtC,2BAGY;AACZ,oCAAkD;AAOlD;;;GAGG;AACH,MAAa,oBAAqB,SAAQ,qBAAY;IAC5C,kBAAkB,CAAsB;IACxC,iBAAiB,CAAS;IAElC,mDAAmD;IAC5C,WAAW,GAA+C,GAAG,EAAE,GAAE,CAAC,CAAC;IAE1E,YAAY,UAAuC,EAAE;QACnD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,kBAAkB,GAAG,OAAO,CAAC,kBAAkB,CAAC;QACrD,IAAI,CAAC,iBAAiB;YACpB,OAAO,CAAC,iBAAiB,IAAI,8BAAsB,CAAC;IACxD,CAAC;IAED,IAAI,qBAAqB;QACvB,OAAO,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC;IACnC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,eAAe,CAAC,SAA8B;QAClD,IAAI,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,CAAC,GAAG,CACT,WAAW,SAAS,CAAC,UAAU,qCAAqC,CACrE,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAA,4BAAU,EAAC;gBAC9B,KAAK,EAAE,IAAI,CAAC,kBAAkB;gBAC9B,KAAK,EAAE,SAAS;aACjB,CAAC,CAAC;YAEH,OAAO,CAAC,GAAG,CACT,kCAAkC,MAAM,CAAC,IAAI,gBAAgB,MAAM,CAAC,QAAQ,IAAI,SAAS,EAAE,CAC5F,CAAC;YAEF,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE;gBACzB,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC1B,CAAC,CAAC;YAEH,sDAAsD;YACtD,IAAI,CAAC,WAAW,CAAC;gBACf,IAAI,EAAE,sBAAsB;gBAC5B,IAAI,EAAE,MAAM,CAAC,IAAI;gBACjB,QAAQ,EAAE,MAAM,CAAC,QAAQ;aAC1B,CAAC,CAAC;YAEH,OAAO,MAAM,CAAC,IAAI,CAAC;QACrB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,+BAA+B,EAAE,KAAK,CAAC,CAAC;YACtD,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,iBAAiB,CACrB,WAAmB,EACnB,MAAe;QAEf,IAAI,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,KAAK,CACrB,oDAAoD,CACrD,CAAC;YACF,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC1B,IAAI,CAAC,WAAW,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;YAC1D,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;YAEvD,sBAAsB;YACtB,IAAI,WAAW,CAAC,MAAM,GAAG,IAAI,CAAC,iBAAiB,EAAE,CAAC;gBAChD,MAAM,MAAM,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBAC/D,MAAM,KAAK,GAAG,CAAC,IA
AI,CAAC,iBAAiB,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBAClE,IAAI,CAAC,IAAI,CACP,OAAO,EACP,IAAI,KAAK,CACP,0BAA0B,MAAM,0BAA0B,KAAK,KAAK,CACrE,CACF,CAAC;gBACF,OAAO,IAAI,CAAC;YACd,CAAC;YAED,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC7B,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,2BAA2B,CAAC,CAAC;gBAClD,OAAO,IAAI,CAAC;YACd,CAAC;YAED,IAAI,CAAC,IAAI,CAAC,gBAAgB,EAAE,EAAE,IAAI,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;YAClE,OAAO,CAAC,GAAG,CACT,2BAA2B,WAAW,CAAC,MAAM,mBAAmB,MAAM,IAAI,SAAS,EAAE,CACtF,CAAC;YAEF,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,WAAW,CAAC,CAAC;YAChE,OAAO,CAAC,GAAG,CAAC,sBAAsB,eAAe,GAAG,CAAC,CAAC;YAEtD,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,EAAE,CAAC;gBAC5B,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,mCAAmC,CAAC,CAAC;gBAC1D,IAAI,CAAC,WAAW,CAAC;oBACf,IAAI,EAAE,qBAAqB;oBAC3B,KAAK,EAAE,6BAA6B;iBACrC,CAAC,CAAC;gBACH,OAAO,IAAI,CAAC;YACd,CAAC;YAED,OAAO,eAAe,CAAC;QACzB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,gCAAgC,EAAE,KAAK,CAAC,CAAC;YACvD,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC1B,IAAI,CAAC,WAAW,CAAC;gBACf,IAAI,EAAE,qBAAqB;gBAC3B,KAAK,EAAE,yBAA0B,KAAe,CAAC,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,EAAE;aAC5E,CAAC,CAAC;YACH,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;CACF;AA7HD,oDA6HC"}
|
||||
35
dist/core/WebSocketManager.d.ts
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
import { WebSocket } from "ws";
|
||||
import { EventEmitter } from "events";
|
||||
/**
 * Owns the lifecycle of a single WebSocket connection: connecting by URL,
 * attaching an existing socket, sending messages, and disconnecting cleanly.
 */
export declare class WebSocketManager extends EventEmitter {
    private socket?;
    private _isConnected;

    get isConnected(): boolean;
    get currentSocket(): WebSocket | undefined;

    /**
     * Connects to a WebSocket server.
     *
     * @param url Address of the server to connect to.
     */
    connect(url: string): Promise<void>;

    /**
     * Attaches an existing WebSocket (server-side usage).
     *
     * @param socket The socket to attach.
     */
    handleSocket(socket: WebSocket): void;

    /**
     * Sends a JSON message via the WebSocket if connected.
     * Send failures (e.g., the socket closing mid-send) are handled gracefully.
     *
     * @param message The message to send.
     */
    send(message: Record<string, unknown>): void;

    /**
     * Disconnects and cleans up the current socket.
     */
    disconnect(): void;

    /**
     * Attaches internal event listeners on the current socket.
     */
    private attachListeners;
}
|
||||
//# sourceMappingURL=WebSocketManager.d.ts.map
|
||||
1
dist/core/WebSocketManager.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"WebSocketManager.d.ts","sourceRoot":"","sources":["../../src/core/WebSocketManager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AAEtC;;;;GAIG;AACH,qBAAa,gBAAiB,SAAQ,YAAY;IAChD,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,YAAY,CAAS;IAE7B,IAAI,WAAW,IAAI,OAAO,CAEzB;IAED,IAAI,aAAa,IAAI,SAAS,GAAG,SAAS,CAEzC;IAED;;OAEG;IACH,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA0BnC;;OAEG;IACH,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAYrC;;;OAGG;IACH,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI;IAgB5C;;OAEG;IACH,UAAU,IAAI,IAAI;IAmBlB;;OAEG;IACH,OAAO,CAAC,eAAe;CAuBxB"}
|
||||
126
dist/core/WebSocketManager.js
vendored
Normal file
@@ -0,0 +1,126 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.WebSocketManager = void 0;
|
||||
const ws_1 = require("ws");
|
||||
const events_1 = require("events");
|
||||
/**
|
||||
* Manages a single WebSocket connection lifecycle.
|
||||
* Handles connecting, attaching existing sockets, sending messages,
|
||||
* and clean disconnection.
|
||||
*/
|
||||
class WebSocketManager extends events_1.EventEmitter {
|
||||
socket;
|
||||
_isConnected = false;
|
||||
get isConnected() {
|
||||
return this._isConnected;
|
||||
}
|
||||
get currentSocket() {
|
||||
return this.socket;
|
||||
}
|
||||
/**
|
||||
* Connect to a WebSocket server by URL.
|
||||
*/
|
||||
connect(url) {
|
||||
// Clean up any existing connection first
|
||||
if (this.socket) {
|
||||
this.disconnect();
|
||||
}
|
||||
return new Promise((resolve, reject) => {
|
||||
try {
|
||||
this.socket = new ws_1.WebSocket(url);
|
||||
this.attachListeners();
|
||||
this.socket.once("open", () => {
|
||||
this._isConnected = true;
|
||||
this.emit("connected");
|
||||
resolve();
|
||||
});
|
||||
this.socket.once("error", (error) => {
|
||||
reject(error);
|
||||
});
|
||||
}
|
||||
catch (error) {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
}
|
||||
/**
|
||||
* Attach an existing WebSocket (server-side usage).
|
||||
*/
|
||||
handleSocket(socket) {
|
||||
// Clean up any existing connection first
|
||||
if (this.socket) {
|
||||
this.disconnect();
|
||||
}
|
||||
this.socket = socket;
|
||||
this._isConnected = true;
|
||||
this.attachListeners();
|
||||
this.emit("connected");
|
||||
}
|
||||
/**
|
||||
* Send a JSON message via WebSocket if connected.
|
||||
* Gracefully handles send failures (e.g., socket closing mid-send).
|
||||
*/
|
||||
send(message) {
|
||||
if (!this.socket || !this._isConnected)
|
||||
return;
|
||||
try {
|
||||
if (this.socket.readyState === ws_1.WebSocket.OPEN) {
|
||||
this.socket.send(JSON.stringify(message));
|
||||
}
|
||||
else {
|
||||
console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
|
||||
}
|
||||
}
|
||||
catch (error) {
|
||||
// Socket may have closed between the readyState check and send()
|
||||
console.error("Failed to send WebSocket message:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Disconnect and clean up the current socket.
|
||||
*/
|
||||
disconnect() {
|
||||
if (!this.socket)
|
||||
return;
|
||||
try {
|
||||
this.socket.removeAllListeners();
|
||||
if (this.socket.readyState === ws_1.WebSocket.OPEN ||
|
||||
this.socket.readyState === ws_1.WebSocket.CONNECTING) {
|
||||
this.socket.close();
|
||||
}
|
||||
}
|
||||
catch {
|
||||
// Ignore close errors — socket may already be dead
|
||||
}
|
||||
this.socket = undefined;
|
||||
this._isConnected = false;
|
||||
}
|
||||
/**
|
||||
* Attach internal event listeners on the current socket.
|
||||
*/
|
||||
attachListeners() {
|
||||
if (!this.socket)
|
||||
return;
|
||||
this.socket.on("message", (data) => {
|
||||
try {
|
||||
const message = JSON.parse(data.toString());
|
||||
this.emit("message", message);
|
||||
}
|
||||
catch (err) {
|
||||
console.error("Failed to parse WebSocket message:", err);
|
||||
this.emit("error", err);
|
||||
}
|
||||
});
|
||||
this.socket.on("close", () => {
|
||||
this._isConnected = false;
|
||||
this.emit("disconnected");
|
||||
});
|
||||
this.socket.on("error", (error) => {
|
||||
console.error("WebSocket error:", error);
|
||||
this.emit("error", error);
|
||||
});
|
||||
}
|
||||
}
|
||||
exports.WebSocketManager = WebSocketManager;
|
||||
//# sourceMappingURL=WebSocketManager.js.map
|
||||
1
dist/core/WebSocketManager.js.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"WebSocketManager.js","sourceRoot":"","sources":["../../src/core/WebSocketManager.ts"],"names":[],"mappings":";;;AAAA,2BAA+B;AAC/B,mCAAsC;AAEtC;;;;GAIG;AACH,MAAa,gBAAiB,SAAQ,qBAAY;IACxC,MAAM,CAAa;IACnB,YAAY,GAAG,KAAK,CAAC;IAE7B,IAAI,WAAW;QACb,OAAO,IAAI,CAAC,YAAY,CAAC;IAC3B,CAAC;IAED,IAAI,aAAa;QACf,OAAO,IAAI,CAAC,MAAM,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,GAAW;QACjB,yCAAyC;QACzC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC;QAED,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,IAAI,CAAC;gBACH,IAAI,CAAC,MAAM,GAAG,IAAI,cAAS,CAAC,GAAG,CAAC,CAAC;gBACjC,IAAI,CAAC,eAAe,EAAE,CAAC;gBAEvB,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE;oBAC5B,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;oBACzB,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;oBACvB,OAAO,EAAE,CAAC;gBACZ,CAAC,CAAC,CAAC;gBAEH,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;oBAClC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAChB,CAAC,CAAC,CAAC;YACL,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,CAAC,KAAK,CAAC,CAAC;YAChB,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,MAAiB;QAC5B,yCAAyC;QACzC,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChB,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC;QAED,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;QACzB,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACzB,CAAC;IAED;;;OAGG;IACH,IAAI,CAAC,OAAgC;QACnC,IAAI,CAAC,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,YAAY;YAAE,OAAO;QAE/C,IAAI,CAAC;YACH,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU,KAAK,cAAS,CAAC,IAAI,EAAE,CAAC;gBAC9C,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;YAC5C,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,sCAAsC,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/E,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,iEAAiE;YACjE,OAAO,CAAC,KAAK,CAAC,mCAAmC,EAAE,KAAK,CAAC,CAAC;YAC1D,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5B,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,OAAO;QAEzB,IAAI,CAAC;YACH,IAAI,CAAC,MAAM,CAAC,kBAAkB,EAAE,CAAC;YACjC,IACE,IAAI,CAAC,MAAM,C
AAC,UAAU,KAAK,cAAS,CAAC,IAAI;gBACzC,IAAI,CAAC,MAAM,CAAC,UAAU,KAAK,cAAS,CAAC,UAAU,EAC/C,CAAC;gBACD,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;YACtB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,mDAAmD;QACrD,CAAC;QAED,IAAI,CAAC,MAAM,GAAG,SAAS,CAAC;QACxB,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,eAAe;QACrB,IAAI,CAAC,IAAI,CAAC,MAAM;YAAE,OAAO;QAEzB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,EAAE;YACjC,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAC5C,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YAChC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,CAAC,KAAK,CAAC,oCAAoC,EAAE,GAAG,CAAC,CAAC;gBACzD,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YAC3B,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;YAC1B,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YAChC,OAAO,CAAC,KAAK,CAAC,kBAAkB,EAAE,KAAK,CAAC,CAAC;YACzC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAC5B,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AA5HD,4CA4HC"}
|
||||
7
dist/core/index.d.ts
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
export { WebSocketManager } from "./WebSocketManager";
|
||||
export { SpeechManager, type SpeechManagerOptions } from "./SpeechManager";
|
||||
export { ConversationManager, type ConversationManagerOptions, } from "./ConversationManager";
|
||||
export { TranscriptionManager, type TranscriptionManagerOptions, } from "./TranscriptionManager";
|
||||
export { processFullStream, handleStreamChunk, type StreamResult, type StreamProcessorCallbacks, } from "./StreamProcessor";
|
||||
export { InputQueue, type QueueItem } from "./InputQueue";
|
||||
//# sourceMappingURL=index.d.ts.map
|
||||
1
dist/core/index.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,KAAK,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAC3E,OAAO,EACL,mBAAmB,EACnB,KAAK,0BAA0B,GAChC,MAAM,uBAAuB,CAAC;AAC/B,OAAO,EACL,oBAAoB,EACpB,KAAK,2BAA2B,GACjC,MAAM,wBAAwB,CAAC;AAChC,OAAO,EACL,iBAAiB,EACjB,iBAAiB,EACjB,KAAK,YAAY,EACjB,KAAK,wBAAwB,GAC9B,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,UAAU,EAAE,KAAK,SAAS,EAAE,MAAM,cAAc,CAAC"}
|
||||
17
dist/core/index.js
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
"use strict";
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.InputQueue = exports.handleStreamChunk = exports.processFullStream = exports.TranscriptionManager = exports.ConversationManager = exports.SpeechManager = exports.WebSocketManager = void 0;
|
||||
var WebSocketManager_1 = require("./WebSocketManager");
|
||||
Object.defineProperty(exports, "WebSocketManager", { enumerable: true, get: function () { return WebSocketManager_1.WebSocketManager; } });
|
||||
var SpeechManager_1 = require("./SpeechManager");
|
||||
Object.defineProperty(exports, "SpeechManager", { enumerable: true, get: function () { return SpeechManager_1.SpeechManager; } });
|
||||
var ConversationManager_1 = require("./ConversationManager");
|
||||
Object.defineProperty(exports, "ConversationManager", { enumerable: true, get: function () { return ConversationManager_1.ConversationManager; } });
|
||||
var TranscriptionManager_1 = require("./TranscriptionManager");
|
||||
Object.defineProperty(exports, "TranscriptionManager", { enumerable: true, get: function () { return TranscriptionManager_1.TranscriptionManager; } });
|
||||
var StreamProcessor_1 = require("./StreamProcessor");
|
||||
Object.defineProperty(exports, "processFullStream", { enumerable: true, get: function () { return StreamProcessor_1.processFullStream; } });
|
||||
Object.defineProperty(exports, "handleStreamChunk", { enumerable: true, get: function () { return StreamProcessor_1.handleStreamChunk; } });
|
||||
var InputQueue_1 = require("./InputQueue");
|
||||
Object.defineProperty(exports, "InputQueue", { enumerable: true, get: function () { return InputQueue_1.InputQueue; } });
|
||||
//# sourceMappingURL=index.js.map
|
||||
1
dist/core/index.js.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/core/index.ts"],"names":[],"mappings":";;;AAAA,uDAAsD;AAA7C,oHAAA,gBAAgB,OAAA;AACzB,iDAA2E;AAAlE,8GAAA,aAAa,OAAA;AACtB,6DAG+B;AAF7B,0HAAA,mBAAmB,OAAA;AAGrB,+DAGgC;AAF9B,4HAAA,oBAAoB,OAAA;AAGtB,qDAK2B;AAJzB,oHAAA,iBAAiB,OAAA;AACjB,oHAAA,iBAAiB,OAAA;AAInB,2CAA0D;AAAjD,wGAAA,UAAU,OAAA"}
|
||||
4
dist/index.d.ts
vendored
@@ -1,4 +1,4 @@
|
||||
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
|
||||
export { VideoAgent, type VideoAgentOptions, type VideoFrame, type AudioData, type VideoAgentConfig, type FrameContext, type FrameTriggerReason, } from "./VideoAgent";
|
||||
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent.new";
|
||||
export { VideoAgent, type VideoAgentOptions, type VideoFrame, type AudioData, type VideoAgentConfig, type FrameContext, type FrameTriggerReason, } from "./VideoAgent.new";
|
||||
export { type SpeechChunk, type StreamingSpeechConfig, type HistoryConfig, type StopWhenCondition, DEFAULT_STREAMING_SPEECH_CONFIG, DEFAULT_HISTORY_CONFIG, DEFAULT_MAX_AUDIO_SIZE, } from "./types";
|
||||
//# sourceMappingURL=index.d.ts.map
|
||||
2
dist/index.d.ts.map
vendored
@@ -1 +1 @@
|
||||
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAClE,OAAO,EACH,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,KAAK,kBAAkB,GAC1B,MAAM,cAAc,CAAC;AAGtB,OAAO,EACH,KAAK,WAAW,EAChB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,+BAA+B,EAC/B,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,SAAS,CAAC"}
|
||||
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AACtE,OAAO,EACH,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,KAAK,kBAAkB,GAC1B,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EACH,KAAK,WAAW,EAChB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,+BAA+B,EAC/B,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,SAAS,CAAC"}
|
||||
8
dist/index.js
vendored
@@ -2,10 +2,10 @@
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = exports.VideoAgent = exports.VoiceAgent = void 0;
|
||||
// Agents
|
||||
var VoiceAgent_1 = require("./VoiceAgent");
|
||||
Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_1.VoiceAgent; } });
|
||||
var VideoAgent_1 = require("./VideoAgent");
|
||||
Object.defineProperty(exports, "VideoAgent", { enumerable: true, get: function () { return VideoAgent_1.VideoAgent; } });
|
||||
var VoiceAgent_new_1 = require("./VoiceAgent.new");
|
||||
Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_new_1.VoiceAgent; } });
|
||||
var VideoAgent_new_1 = require("./VideoAgent.new");
|
||||
Object.defineProperty(exports, "VideoAgent", { enumerable: true, get: function () { return VideoAgent_new_1.VideoAgent; } });
|
||||
// Shared types
|
||||
var types_1 = require("./types");
|
||||
Object.defineProperty(exports, "DEFAULT_STREAMING_SPEECH_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_STREAMING_SPEECH_CONFIG; } });
|
||||
|
||||
2
dist/index.js.map
vendored
@@ -1 +1 @@
|
||||
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,SAAS;AACT,2CAAkE;AAAzD,wGAAA,UAAU,OAAA;AACnB,2CAQsB;AAPlB,wGAAA,UAAU,OAAA;AASd,eAAe;AACf,iCAQiB;AAHb,wHAAA,+BAA+B,OAAA;AAC/B,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA"}
|
||||
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,SAAS;AACT,mDAAsE;AAA7D,4GAAA,UAAU,OAAA;AACnB,mDAQ0B;AAPtB,4GAAA,UAAU,OAAA;AASd,eAAe;AACf,iCAQiB;AAHb,wHAAA,+BAA+B,OAAA;AAC/B,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA"}
|
||||
BIN
example/frames/frame_00000_2026-02-23T10-41-46-424Z.webp
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
example/frames/frame_00001_2026-02-23T10-41-50-271Z.webp
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
example/frames/frame_00002_2026-02-23T10-41-51-387Z.webp
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
example/frames/frame_00003_2026-02-23T10-41-56-374Z.webp
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
example/frames/frame_00004_2026-02-23T10-42-01-379Z.webp
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
example/frames/frame_00005_2026-02-23T10-42-06-375Z.webp
Normal file
|
After Width: | Height: | Size: 9.7 KiB |
BIN
example/frames/frame_00006_2026-02-23T10-42-07-405Z.webp
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
example/frames/frame_00007_2026-02-23T10-42-11-278Z.webp
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
example/frames/frame_00008_2026-02-23T10-42-11-381Z.webp
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
example/frames/frame_00009_2026-02-23T10-42-16-395Z.webp
Normal file
|
After Width: | Height: | Size: 9.7 KiB |
BIN
example/frames/frame_00010_2026-02-23T10-42-21-373Z.webp
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
example/frames/frame_00011_2026-02-23T10-42-26-364Z.webp
Normal file
|
After Width: | Height: | Size: 10 KiB |
@@ -2,12 +2,12 @@ const http = require('http');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
const PORT = 3000;
|
||||
const PORT = 3102;
|
||||
|
||||
// Create a simple HTTP server to serve the voice client HTML
|
||||
const server = http.createServer((req, res) => {
|
||||
if (req.url === '/' || req.url === '/index.html') {
|
||||
const htmlPath = path.join(__dirname, 'voice-client.html');
|
||||
const htmlPath = path.join(__dirname, 'video-client.html');
|
||||
fs.readFile(htmlPath, (err, data) => {
|
||||
if (err) {
|
||||
res.writeHead(500);
|
||||
|
||||
998
example/video-client.html
Normal file
@@ -0,0 +1,998 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Video + Voice Agent Client</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: system-ui, sans-serif;
|
||||
max-width: 1000px;
|
||||
margin: 20px auto;
|
||||
padding: 0 16px;
|
||||
background: #f9fafb;
|
||||
color: #111827;
|
||||
}
|
||||
|
||||
h1 {
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.subtitle {
|
||||
color: #6b7280;
|
||||
font-size: 0.95rem;
|
||||
margin-bottom: 24px;
|
||||
}
|
||||
|
||||
.card {
|
||||
background: white;
|
||||
border: 1px solid #e5e7eb;
|
||||
border-radius: 12px;
|
||||
padding: 20px;
|
||||
margin-bottom: 20px;
|
||||
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
|
||||
}
|
||||
|
||||
.row {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 12px;
|
||||
align-items: center;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
video {
|
||||
width: 100%;
|
||||
max-width: 520px;
|
||||
border-radius: 10px;
|
||||
background: #000;
|
||||
aspect-ratio: 4 / 3;
|
||||
}
|
||||
|
||||
button {
|
||||
padding: 10px 16px;
|
||||
border-radius: 8px;
|
||||
border: 1px solid #d1d5db;
|
||||
background: white;
|
||||
cursor: pointer;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
button.primary {
|
||||
background: #2563eb;
|
||||
color: white;
|
||||
border-color: #2563eb;
|
||||
}
|
||||
|
||||
button.danger {
|
||||
background: #dc2626;
|
||||
color: white;
|
||||
border-color: #dc2626;
|
||||
}
|
||||
|
||||
button:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.status {
|
||||
font-weight: 600;
|
||||
margin: 8px 0;
|
||||
font-size: 0.95rem;
|
||||
}
|
||||
|
||||
.dot {
|
||||
display: inline-block;
|
||||
width: 10px;
|
||||
height: 10px;
|
||||
border-radius: 50%;
|
||||
margin-right: 8px;
|
||||
}
|
||||
|
||||
.dot.disconnected {
|
||||
background: #9ca3af;
|
||||
}
|
||||
|
||||
.dot.connected {
|
||||
background: #22c55e;
|
||||
}
|
||||
|
||||
.dot.listening {
|
||||
background: #f59e0b;
|
||||
animation: pulse 1.5s infinite;
|
||||
}
|
||||
|
||||
.dot.speaking {
|
||||
background: #3b82f6;
|
||||
animation: pulse 1.2s infinite;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
|
||||
0%,
|
||||
100% {
|
||||
opacity: 1
|
||||
}
|
||||
|
||||
50% {
|
||||
opacity: 0.6
|
||||
}
|
||||
}
|
||||
|
||||
#transcript,
|
||||
#assistant,
|
||||
#reasoning,
|
||||
#tools {
|
||||
min-height: 48px;
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
background: #f3f4f6;
|
||||
border-left: 4px solid #9ca3af;
|
||||
margin-bottom: 16px;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
|
||||
#transcript {
|
||||
border-left-color: #2563eb;
|
||||
}
|
||||
|
||||
#assistant {
|
||||
border-left-color: #22c55e;
|
||||
}
|
||||
|
||||
#reasoning {
|
||||
border-left-color: #f59e0b;
|
||||
font-style: italic;
|
||||
color: #4b5563;
|
||||
}
|
||||
|
||||
#tools {
|
||||
border-left-color: #8b5cf6;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
#log {
|
||||
background: #0f172a;
|
||||
color: #e2e8f0;
|
||||
font-family: 'SF Mono', monospace;
|
||||
font-size: 0.82rem;
|
||||
padding: 12px;
|
||||
border-radius: 8px;
|
||||
max-height: 240px;
|
||||
overflow-y: auto;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
|
||||
.hidden {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* ── Mic selector & level meter ── */
|
||||
#micRow {
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
|
||||
#micSelect {
|
||||
flex: 1;
|
||||
min-width: 180px;
|
||||
padding: 6px 8px;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #d1d5db;
|
||||
}
|
||||
|
||||
#refreshMicsBtn {
|
||||
padding: 6px 12px;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.meter-wrap {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
|
||||
.meter-wrap label {
|
||||
font-size: 0.85rem;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
#levelMeter {
|
||||
flex: 1;
|
||||
height: 14px;
|
||||
border-radius: 7px;
|
||||
background: #e5e7eb;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
#levelBar {
|
||||
height: 100%;
|
||||
width: 0%;
|
||||
border-radius: 7px;
|
||||
background: #22c55e;
|
||||
transition: width 60ms linear;
|
||||
}
|
||||
|
||||
#levelBar.hot {
|
||||
background: #ef4444;
|
||||
}
|
||||
|
||||
#rmsValue {
|
||||
font-family: monospace;
|
||||
font-size: 0.8rem;
|
||||
width: 56px;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
/* ── Push-to-talk ── */
|
||||
#pttBtn {
|
||||
padding: 10px 20px;
|
||||
font-size: 1rem;
|
||||
font-weight: 600;
|
||||
border-radius: 10px;
|
||||
border: 2px solid #2563eb;
|
||||
background: #eff6ff;
|
||||
color: #2563eb;
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
touch-action: none;
|
||||
}
|
||||
|
||||
#pttBtn:active,
|
||||
#pttBtn.active {
|
||||
background: #dc2626;
|
||||
color: white;
|
||||
border-color: #dc2626;
|
||||
}
|
||||
|
||||
#pttBtn:disabled {
|
||||
opacity: 0.4;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<h1>📹 Video + Voice Agent</h1>
|
||||
<p class="subtitle">Webcam + microphone → multimodal AI (vision + speech)</p>
|
||||
|
||||
<div class="card">
|
||||
<video id="localVideo" autoplay playsinline muted></video>
|
||||
<canvas id="frameCanvas" style="display:none"></canvas>
|
||||
|
||||
<div class="row" style="margin-top:16px">
|
||||
<input type="text" id="wsEndpoint" value="ws://localhost:8081" style="flex:1; min-width:260px" />
|
||||
<button id="connectBtn" class="primary">Connect</button>
|
||||
<button id="disconnectBtn" disabled>Disconnect</button>
|
||||
</div>
|
||||
|
||||
<!-- ── Mic selector ── -->
|
||||
<div class="row" id="micRow">
|
||||
<label>Microphone:</label>
|
||||
<select id="micSelect">
|
||||
<option value="">-- click Refresh --</option>
|
||||
</select>
|
||||
<button id="refreshMicsBtn">🔄 Refresh</button>
|
||||
</div>
|
||||
|
||||
<!-- ── Live level meter ── -->
|
||||
<div class="meter-wrap">
|
||||
<label>Mic level:</label>
|
||||
<div id="levelMeter">
|
||||
<div id="levelBar"></div>
|
||||
</div>
|
||||
<span id="rmsValue">0.000</span>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<label>Input mode:</label>
|
||||
<select id="inputMode">
|
||||
<option value="browser-stt">Browser STT</option>
|
||||
<option value="server-whisper">Server Whisper (VAD)</option>
|
||||
<option value="push-to-talk" selected>Push-to-Talk</option>
|
||||
</select>
|
||||
<label>Frames:</label>
|
||||
<select id="frameInterval">
|
||||
<option value="3000">every 3s</option>
|
||||
<option value="5000" selected>every 5s</option>
|
||||
<option value="10000">every 10s</option>
|
||||
<option value="0">manual only</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<button id="startMediaBtn" disabled>📹🎤 Start Camera + Mic</button>
|
||||
<button id="stopMediaBtn" disabled>⏹ Stop</button>
|
||||
<button id="captureBtn" disabled>Capture Frame Now</button>
|
||||
<button id="pttBtn" disabled>🎙 Hold to Talk</button>
|
||||
<button id="interruptBtn" class="danger" disabled>✋ Interrupt</button>
|
||||
</div>
|
||||
|
||||
<div class="status" id="status">
|
||||
<span class="dot disconnected"></span>Disconnected
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>👤 You said</h3>
|
||||
<div id="transcript">—</div>
|
||||
|
||||
<h3>🤖 Assistant</h3>
|
||||
<div id="assistant"></div>
|
||||
|
||||
<div id="reasoningSection" class="hidden">
|
||||
<h3>💭 Reasoning</h3>
|
||||
<div id="reasoning"></div>
|
||||
</div>
|
||||
|
||||
<div id="toolsSection" class="hidden">
|
||||
<h3>🛠️ Tools</h3>
|
||||
<div id="tools"></div>
|
||||
</div>
|
||||
|
||||
<h3>📜 Log</h3>
|
||||
<div id="log"></div>
|
||||
|
||||
<script>
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// State & Elements
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
const els = {
|
||||
wsEndpoint: document.getElementById('wsEndpoint'),
|
||||
connectBtn: document.getElementById('connectBtn'),
|
||||
disconnectBtn: document.getElementById('disconnectBtn'),
|
||||
inputMode: document.getElementById('inputMode'),
|
||||
frameInterval: document.getElementById('frameInterval'),
|
||||
startMediaBtn: document.getElementById('startMediaBtn'),
|
||||
stopMediaBtn: document.getElementById('stopMediaBtn'),
|
||||
captureBtn: document.getElementById('captureBtn'),
|
||||
pttBtn: document.getElementById('pttBtn'),
|
||||
interruptBtn: document.getElementById('interruptBtn'),
|
||||
status: document.getElementById('status'),
|
||||
transcript: document.getElementById('transcript'),
|
||||
assistant: document.getElementById('assistant'),
|
||||
reasoningSec: document.getElementById('reasoningSection'),
|
||||
reasoning: document.getElementById('reasoning'),
|
||||
toolsSec: document.getElementById('toolsSection'),
|
||||
tools: document.getElementById('tools'),
|
||||
log: document.getElementById('log'),
|
||||
video: document.getElementById('localVideo'),
|
||||
canvas: document.getElementById('frameCanvas'),
|
||||
micSelect: document.getElementById('micSelect'),
|
||||
refreshMicsBtn: document.getElementById('refreshMicsBtn'),
|
||||
levelBar: document.getElementById('levelBar'),
|
||||
rmsValue: document.getElementById('rmsValue'),
|
||||
};
|
||||
|
||||
let ws = null;
|
||||
let localStream = null;
|
||||
let audioOnlyStream = null; // NOTE(review): presumably an audio-only MediaStream kept separately from localStream — confirm against its usages
|
||||
let mediaRecorder = null;
|
||||
let audioChunks = [];
|
||||
let frameTimer = null;
|
||||
let audioQueue = [];
|
||||
let isPlaying = false;
|
||||
let currentSource = null;
|
||||
|
||||
// Level-meter / VAD audio nodes (use browser-native sample rate)
|
||||
let meterCtx = null; // AudioContext for the meter (always running when media is on)
|
||||
let meterAnalyser = null;
|
||||
let meterSource = null;
|
||||
let meterRafId = null;
|
||||
|
||||
// VAD-specific
|
||||
let silenceStart = null;
|
||||
let recordingStartTime = null;
|
||||
const SPEECH_THRESHOLD = 0.015;
|
||||
const SILENCE_THRESHOLD = 0.008;
|
||||
const SILENCE_DURATION = 1400; // ms
|
||||
const MIN_RECORDING_TIME = 600; // ms
|
||||
|
||||
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
|
||||
let recognition = null;
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Helpers
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
function log(...args) {
|
||||
const time = new Date().toLocaleTimeString([], { hour12: false });
|
||||
const line = `[${time}] ${args.join(' ')}\n`;
|
||||
els.log.textContent += line;
|
||||
els.log.scrollTop = els.log.scrollHeight;
|
||||
}
|
||||
|
||||
function setStatus(text, state = 'disconnected') {
|
||||
els.status.innerHTML = `<span class="dot ${state}"></span>${text}`;
|
||||
}
|
||||
|
||||
function enable(...btns) {
|
||||
btns.forEach(b => { if (els[b]) els[b].disabled = false; });
|
||||
}
|
||||
function disable(...btns) {
|
||||
btns.forEach(b => { if (els[b]) els[b].disabled = true; });
|
||||
}
|
||||
|
||||
function resetUI() {
|
||||
els.assistant.textContent = '';
|
||||
els.reasoning.textContent = '';
|
||||
els.tools.textContent = '';
|
||||
els.reasoningSec.classList.add('hidden');
|
||||
els.toolsSec.classList.add('hidden');
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Mic enumeration
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
async function refreshMics() {
|
||||
try {
|
||||
// Need a temporary stream to get labelled device list
|
||||
const tmp = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
tmp.getTracks().forEach(t => t.stop());
|
||||
|
||||
const devices = await navigator.mediaDevices.enumerateDevices();
|
||||
const mics = devices.filter(d => d.kind === 'audioinput');
|
||||
els.micSelect.innerHTML = '';
|
||||
mics.forEach((m, i) => {
|
||||
const opt = document.createElement('option');
|
||||
opt.value = m.deviceId;
|
||||
opt.textContent = m.label || `Microphone ${i + 1}`;
|
||||
els.micSelect.appendChild(opt);
|
||||
});
|
||||
log(`Found ${mics.length} microphone(s)`);
|
||||
} catch (err) {
|
||||
log('Mic enumeration failed:', err.message);
|
||||
}
|
||||
}
|
||||
|
||||
els.refreshMicsBtn.onclick = refreshMics;
|
||||
// Auto-populate on page load
|
||||
refreshMics();
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Live audio level meter (always-on when media is active)
|
||||
// Uses AnalyserNode + rAF – no ScriptProcessorNode needed.
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
function startLevelMeter(stream) {
|
||||
// Use the browser's native sample rate (NO custom sampleRate!)
|
||||
meterCtx = new (window.AudioContext || window.webkitAudioContext)();
|
||||
meterSource = meterCtx.createMediaStreamSource(stream);
|
||||
meterAnalyser = meterCtx.createAnalyser();
|
||||
meterAnalyser.fftSize = 1024;
|
||||
meterSource.connect(meterAnalyser);
|
||||
// Do NOT connect to destination – we don't want to hear ourselves
|
||||
|
||||
const buf = new Float32Array(meterAnalyser.fftSize);
|
||||
|
||||
function tick() {
|
||||
meterAnalyser.getFloatTimeDomainData(buf);
|
||||
let sum = 0;
|
||||
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
|
||||
const rms = Math.sqrt(sum / buf.length);
|
||||
|
||||
// Update UI
|
||||
const pct = Math.min(rms / 0.15, 1) * 100; // 0.15 is "loud"
|
||||
els.levelBar.style.width = pct + '%';
|
||||
els.levelBar.classList.toggle('hot', rms > SPEECH_THRESHOLD);
|
||||
els.rmsValue.textContent = rms.toFixed(4);
|
||||
|
||||
// If VAD mode is active, drive it from here
|
||||
if (els.inputMode.value === 'server-whisper') {
|
||||
vadTick(rms);
|
||||
}
|
||||
|
||||
meterRafId = requestAnimationFrame(tick);
|
||||
}
|
||||
tick();
|
||||
log(`Level meter started (sampleRate=${meterCtx.sampleRate})`);
|
||||
}
|
||||
|
||||
function stopLevelMeter() {
|
||||
if (meterRafId) { cancelAnimationFrame(meterRafId); meterRafId = null; }
|
||||
if (meterSource) { meterSource.disconnect(); meterSource = null; }
|
||||
if (meterAnalyser) { meterAnalyser.disconnect(); meterAnalyser = null; }
|
||||
if (meterCtx) { meterCtx.close(); meterCtx = null; }
|
||||
els.levelBar.style.width = '0%';
|
||||
els.rmsValue.textContent = '0.000';
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Frame capture & send
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Capture the current webcam frame and send it to the server as a
 * `video_frame` message.
 *
 * Nothing is drawn or encoded unless the video has dimensions and the
 * WebSocket is open. The image format reported to the server is read from the
 * data URL actually produced by the canvas, because `toDataURL` may fall back
 * to another type when the requested `image/webp` encoder is unavailable.
 *
 * @param reason Why the frame is being captured (sent as `triggerReason`).
 */
function captureFrame(reason = 'timer') {
  // No decoded video yet – nothing to capture.
  if (!els.video.videoWidth) return;

  // Skip the draw + encode work when there is nobody to send the frame to.
  if (ws?.readyState !== WebSocket.OPEN) return;

  const ctx = els.canvas.getContext('2d');
  if (!ctx) {
    log('Frame capture skipped: 2D canvas context unavailable');
    return;
  }

  els.canvas.width = els.video.videoWidth;
  els.canvas.height = els.video.videoHeight;
  ctx.drawImage(els.video, 0, 0);

  const dataUrl = els.canvas.toDataURL('image/webp', 0.78);
  const comma = dataUrl.indexOf(',');
  const header = comma >= 0 ? dataUrl.slice(0, comma) : '';
  const base64 = comma >= 0 ? dataUrl.slice(comma + 1) : '';
  if (!base64) {
    log('Frame capture skipped: canvas produced no image data');
    return;
  }

  // Report the format the browser really encoded (e.g. "png" on fallback).
  const mimeMatch = /^data:image\/([^;,]+)/i.exec(header);
  const format = mimeMatch ? mimeMatch[1].toLowerCase() : 'webp';

  const now = Date.now();
  ws.send(JSON.stringify({
    type: 'video_frame',
    sessionId: 'client-main',
    sequence: now,
    timestamp: now,
    triggerReason: reason,
    image: {
      data: base64,
      format,
      width: els.canvas.width,
      height: els.canvas.height
    }
  }));
  log(`Frame sent (${(base64.length / 1000).toFixed(1)} kB) — ${reason}`);
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Audio playback queue
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Play the next queued audio chunk, if nothing is currently playing.
 *
 * Each chunk is decoded in its own AudioContext, which is closed when playback
 * ends, when decoding fails, or when playback was interrupted while the chunk
 * was still being decoded. When a chunk ends (or fails) the next one in
 * `audioQueue` is started.
 */
async function playNext() {
  if (isPlaying || audioQueue.length === 0) return;
  isPlaying = true;

  const { bytes, format } = audioQueue.shift();
  let ctx = null;

  // Closing can reject if the context is already closed; that is harmless here.
  const closeCtx = () => {
    if (ctx) ctx.close().catch(() => {});
  };

  try {
    ctx = new (window.AudioContext || window.webkitAudioContext)();
    const buffer = await ctx.decodeAudioData(
      bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.length)
    );

    // An interrupt during decoding resets `isPlaying`; drop this stale chunk
    // instead of playing audio the user already cancelled.
    if (!isPlaying) {
      closeCtx();
      return;
    }

    const source = ctx.createBufferSource();
    source.buffer = buffer;
    source.connect(ctx.destination);
    currentSource = source;
    source.onended = () => {
      if (currentSource === source) currentSource = null;
      isPlaying = false;
      closeCtx();
      playNext();
    };
    source.start(0);
    log(`Playing audio chunk (${bytes.length} bytes, ${format})`);
  } catch (err) {
    console.error('Audio decode/play error:', err);
    // Release the context – browsers cap the number of live AudioContexts.
    closeCtx();
    isPlaying = false;
    playNext();
  }
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// WebSocket
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Open a WebSocket to the endpoint typed into the UI and wire its lifecycle
 * handlers (open / close / error / message) to the page state.
 *
 * Does nothing if the endpoint is empty or a socket is already open or
 * connecting. An endpoint the WebSocket constructor rejects is reported
 * instead of thrown.
 */
function connect() {
  const url = els.wsEndpoint.value.trim();
  if (!url) return log('No endpoint');

  // Avoid leaking a live socket when connect is triggered twice.
  if (ws && (ws.readyState === WebSocket.CONNECTING || ws.readyState === WebSocket.OPEN)) {
    return log('Already connected or connecting');
  }

  setStatus('Connecting...', 'disconnected');

  let socket;
  try {
    socket = new WebSocket(url);
  } catch (err) {
    log('Invalid WebSocket endpoint:', err?.message ?? err);
    setStatus('Error', 'disconnected');
    return;
  }
  ws = socket;

  socket.onopen = () => {
    setStatus('Connected', 'connected');
    enable('startMediaBtn', 'interruptBtn', 'captureBtn');
    disable('connectBtn');
    enable('disconnectBtn');
    log(`Connected to ${url}`);
  };

  socket.onclose = () => {
    // Stop media FIRST: stopAllMedia() touches the status and the start
    // button, so the disconnected UI state must be applied after it.
    stopAllMedia();
    setStatus('Disconnected', 'disconnected');
    disable('startMediaBtn', 'stopMediaBtn', 'captureBtn', 'interruptBtn', 'pttBtn');
    enable('connectBtn');
    disable('disconnectBtn');
    log('Disconnected');
    // Only clear the global if it still refers to this socket.
    if (ws === socket) ws = null;
  };

  socket.onerror = (e) => {
    log('WebSocket error', e);
    setStatus('Error', 'disconnected');
  };

  socket.onmessage = (event) => {
    try {
      const msg = JSON.parse(event.data);
      handleMessage(msg);
    } catch (err) {
      log('Parse error:', err);
    }
  };
}
|
||||
|
||||
/**
 * Close the WebSocket (when one exists) and stop camera / microphone capture.
 */
function disconnect() {
  ws?.close();
  stopAllMedia();
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Media (camera + mic)
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Acquire camera + microphone, show the preview, start the level meter,
 * schedule periodic frame capture and start the selected input mode.
 *
 * If anything fails after the stream was acquired, everything that was
 * started is torn down again so no tracks, timers or meters are left running.
 */
async function startMedia() {
  try {
    const audioConstraint = els.micSelect.value
      ? { deviceId: { exact: els.micSelect.value } }
      : true;

    localStream = await navigator.mediaDevices.getUserMedia({
      video: { width: { ideal: 640 }, height: { ideal: 480 } },
      audio: audioConstraint,
    });

    // Audio-only view of the same tracks, used by the recorder.
    audioOnlyStream = new MediaStream(localStream.getAudioTracks());

    // Log which mic was actually selected
    const audioTrack = localStream.getAudioTracks()[0];
    log(`Mic active: "${audioTrack?.label || 'unknown'}"`);

    els.video.srcObject = localStream;
    await els.video.play();

    enable('stopMediaBtn', 'pttBtn');
    disable('startMediaBtn');

    // Start the always-on level meter
    startLevelMeter(localStream);

    // Periodic frames (never stack a second timer on top of an old one)
    if (frameTimer) {
      clearInterval(frameTimer);
      frameTimer = null;
    }
    const intervalMs = Number(els.frameInterval.value);
    if (intervalMs > 0) {
      frameTimer = setInterval(() => captureFrame('timer'), intervalMs);
      log(`Frame capture every ${intervalMs / 1000}s`);
    }

    // Start the selected input mode
    const mode = els.inputMode.value;
    if (mode === 'browser-stt') {
      startBrowserSTT();
    }
    // VAD and push-to-talk don't need extra init – they're driven by
    // the level-meter tick and button events respectively.

    setStatus('Listening...', 'listening');
    log(`Camera + Mic started, input mode: ${mode}`);
  } catch (err) {
    log('Failed to start media:', err?.message ?? err);
    // Release the devices and undo any partial start-up.
    if (localStream) stopAllMedia();
  }
}
|
||||
|
||||
/**
 * Stop everything media-related: frame timer, level meter, camera/mic tracks,
 * any in-flight recording, browser speech recognition, and VAD bookkeeping.
 *
 * An in-flight recording is discarded (its handlers are detached before it is
 * stopped) so it cannot be sent or flip the status after teardown. The status
 * line and start button reflect whether the WebSocket is still open.
 */
function stopAllMedia() {
  if (frameTimer) {
    clearInterval(frameTimer);
    frameTimer = null;
  }

  stopLevelMeter();

  if (localStream) {
    localStream.getTracks().forEach(t => t.stop());
    audioOnlyStream = null;
    localStream = null;
  }
  els.video.srcObject = null;

  if (mediaRecorder) {
    // Detach first: the stop/dataavailable events fire asynchronously and
    // would otherwise run after the state below has been reset.
    mediaRecorder.ondataavailable = null;
    mediaRecorder.onstop = null;
    if (mediaRecorder.state === 'recording') mediaRecorder.stop();
  }
  mediaRecorder = null;

  if (recognition) recognition.stop();
  recognition = null;

  silenceStart = null;
  recordingStartTime = null;
  audioChunks = [];

  disable('stopMediaBtn', 'pttBtn');

  // Media can only be restarted while the socket is open.
  if (ws?.readyState === WebSocket.OPEN) {
    enable('startMediaBtn');
    setStatus('Connected', 'connected');
  } else {
    disable('startMediaBtn');
    setStatus('Disconnected', 'disconnected');
  }
  log('Media stopped');
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Shared: record a segment from localStream and send it
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Pick the first recording MIME type, in order of preference, that
 * `MediaRecorder.isTypeSupported` accepts.
 *
 * @returns The supported MIME type, or '' to let the browser pick its default.
 */
function chosenMimeType() {
  const preferred = [
    'audio/webm;codecs=opus',
    'audio/webm',
    'audio/ogg;codecs=opus',
    'audio/mp4',
  ];
  const supported = preferred.find(candidate => MediaRecorder.isTypeSupported(candidate));
  return supported === undefined ? '' : supported;
}
|
||||
|
||||
/**
 * Start recording a segment from the audio-only stream. When the recorder
 * stops, the collected chunks are base64-encoded and sent to the server as an
 * `audio` message (segments under 800 bytes are skipped).
 *
 * No-op if a recording is already running or there is no audio stream.
 * Failures to create or start the MediaRecorder are logged instead of thrown,
 * so callers driven from the level-meter loop keep running.
 */
function startRecording() {
  if (mediaRecorder?.state === 'recording') return;
  if (!audioOnlyStream) { log('No audio stream!'); return; }

  const mimeType = chosenMimeType();
  const opts = mimeType ? { mimeType } : undefined;

  let recorder;
  try {
    recorder = new MediaRecorder(audioOnlyStream, opts);
  } catch (err) {
    log('Could not create MediaRecorder:', err?.message ?? err);
    return;
  }

  audioChunks = [];
  recordingStartTime = Date.now();
  silenceStart = null;
  mediaRecorder = recorder;

  recorder.ondataavailable = e => {
    if (e.data.size > 0) audioChunks.push(e.data);
  };

  recorder.onstop = async () => {
    // Use this recorder, not the global, which may be reset or replaced.
    const usedMime = recorder.mimeType || mimeType || 'audio/webm';
    if (audioChunks.length === 0) {
      log('No audio chunks recorded');
      setStatus('Listening...', 'listening');
      return;
    }

    const blob = new Blob(audioChunks, { type: usedMime });
    if (blob.size < 800) {
      log(`Audio too short (${blob.size} bytes), skipping`);
      setStatus('Listening...', 'listening');
      return;
    }

    // Build the binary string in slices rather than one byte at a time.
    const bytes = new Uint8Array(await blob.arrayBuffer());
    const SLICE = 0x8000;
    let binary = '';
    for (let i = 0; i < bytes.length; i += SLICE) {
      binary += String.fromCharCode.apply(null, bytes.subarray(i, i + SLICE));
    }
    const base64 = btoa(binary);

    if (ws?.readyState === WebSocket.OPEN) {
      ws.send(JSON.stringify({ type: 'audio', data: base64, format: usedMime }));
      log(`Sent audio (${(base64.length / 1000).toFixed(1)} kB, ${usedMime})`);
      els.transcript.textContent = 'Transcribing...';
    } else {
      log('WS not connected, audio dropped');
    }

    setStatus('Listening...', 'listening');
  };

  try {
    recorder.start(100); // timeslice 100ms
  } catch (err) {
    log('Could not start recording:', err?.message ?? err);
    mediaRecorder = null;
    recordingStartTime = null;
    return;
  }
  setStatus('🔴 Recording...', 'speaking');
  log('Recording started');
}
|
||||
|
||||
/**
 * Stop the active recording, if any, and reset the VAD timing state.
 * The recorder's own stop handler takes care of sending the audio.
 */
function stopRecording() {
  const isRecording = mediaRecorder?.state === 'recording';
  if (!isRecording) return;

  mediaRecorder.stop();
  silenceStart = null;
  recordingStartTime = null;
  setStatus('Processing...', 'connected');
  log('Recording stopped, sending...');
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// VAD (driven from the level-meter rAF loop)
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One voice-activity-detection step, called with the current RMS level.
 *
 * - Above SPEECH_THRESHOLD: clear the silence timer and start recording if
 *   not already recording.
 * - Below SILENCE_THRESHOLD while recording: start the silence timer, and
 *   once silence has lasted longer than SILENCE_DURATION and the recording is
 *   longer than MIN_RECORDING_TIME, stop the recording.
 * - Levels between the two thresholds leave the state untouched.
 *
 * @param rms Current RMS level of the microphone signal.
 */
function vadTick(rms) {
  const recording = mediaRecorder?.state === 'recording';

  if (rms > SPEECH_THRESHOLD) {
    silenceStart = null;
    if (!recording) startRecording();
    return;
  }

  const isSilent = rms < SILENCE_THRESHOLD;
  if (!isSilent || !recording) return;

  if (!silenceStart) {
    silenceStart = Date.now();
    return;
  }

  const silentLongEnough = Date.now() - silenceStart > SILENCE_DURATION;
  if (!silentLongEnough) return;

  const recordedLongEnough =
    recordingStartTime && (Date.now() - recordingStartTime) > MIN_RECORDING_TIME;
  if (recordedLongEnough) {
    log('Silence → stopping');
    stopRecording();
  }
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Push-to-Talk
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Push-to-talk pressed: highlight the button and begin recording.
 * Ignored while no media stream is active.
 */
function pttDown() {
  if (localStream) {
    els.pttBtn.classList.add('active');
    startRecording();
  }
}
|
||||
|
||||
/**
 * Push-to-talk released: remove the button highlight and stop recording.
 */
function pttUp() {
  const { classList } = els.pttBtn;
  classList.remove('active');
  stopRecording();
}
|
||||
|
||||
// Pointer / touch bindings for the push-to-talk button.
els.pttBtn.addEventListener('mousedown', pttDown);
els.pttBtn.addEventListener('mouseup', pttUp);
els.pttBtn.addEventListener('mouseleave', pttUp);
els.pttBtn.addEventListener('touchstart', e => { e.preventDefault(); pttDown(); });
els.pttBtn.addEventListener('touchend', e => { e.preventDefault(); pttUp(); });
// A cancelled touch never fires touchend; release here so recording cannot get stuck.
els.pttBtn.addEventListener('touchcancel', pttUp);

// Spacebar push-to-talk (only when mode is push-to-talk)
let spaceHeld = false;
document.addEventListener('keydown', e => {
  if (e.code !== 'Space') return;

  // While space is held, swallow auto-repeat events so the page doesn't scroll.
  if (spaceHeld) {
    e.preventDefault();
    return;
  }

  if (els.inputMode.value === 'push-to-talk'
      && localStream && !e.target.matches('input, textarea, select')) {
    e.preventDefault();
    spaceHeld = true;
    pttDown();
  }
});
document.addEventListener('keyup', e => {
  if (e.code === 'Space' && spaceHeld) {
    e.preventDefault();
    spaceHeld = false;
    pttUp();
  }
});
// If the window loses focus while space is down, keyup is never delivered.
window.addEventListener('blur', () => {
  if (spaceHeld) {
    spaceHeld = false;
    pttUp();
  }
});
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Browser STT
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Start continuous browser speech recognition (Web Speech API).
 *
 * Interim results are shown in the transcript element. Each result that
 * becomes final is sent to the server exactly once: only the results from
 * `event.resultIndex` onward are examined, because in continuous mode the
 * results list keeps every earlier (already final) utterance.
 */
function startBrowserSTT() {
  if (!SpeechRecognition) { log('Web Speech API not supported'); return; }
  recognition = new SpeechRecognition();
  recognition.continuous = true;
  recognition.interimResults = true;
  recognition.lang = 'en-US';

  recognition.onresult = e => {
    let interim = '';
    for (let i = e.resultIndex; i < e.results.length; i++) {
      const result = e.results[i];
      const text = result[0].transcript;
      if (result.isFinal) {
        const finalText = text.trim();
        if (finalText) {
          els.transcript.textContent = finalText;
          sendTranscript(finalText);
        }
      } else {
        interim += text;
      }
    }
    if (interim) els.transcript.textContent = interim;
  };

  recognition.onerror = e => log('STT error:', e.error);
  recognition.start();
  log('Browser STT started');
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Sending transcript / interrupt
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Send recognised text to the server as a `transcript` message and reset the
 * response UI. Silently does nothing when the socket is not open.
 *
 * @param text Transcript text to send.
 */
function sendTranscript(text) {
  const socketOpen = ws && ws.readyState === WebSocket.OPEN;
  if (!socketOpen) return;

  const payload = JSON.stringify({ type: 'transcript', text });
  ws.send(payload);
  log(`Sent transcript: ${text}`);
  resetUI();
}
|
||||
|
||||
/**
 * User-initiated interrupt: notify the server (when connected), drop all
 * queued audio and stop whatever is currently playing.
 */
function interrupt() {
  const socketOpen = ws?.readyState === WebSocket.OPEN;
  if (socketOpen) {
    const payload = JSON.stringify({ type: 'interrupt', reason: 'user_button' });
    ws.send(payload);
    log('Interrupt sent');
  }

  // Local playback is cleared whether or not the server could be told.
  audioQueue = [];
  if (currentSource) {
    currentSource.stop();
    currentSource = null;
  }
  isPlaying = false;
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Server → Client messages
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Dispatch one parsed server message to the matching UI / playback action.
 *
 * Text coming from the server (tool names, tool inputs and results) is
 * inserted with `textContent`, never as HTML, so it cannot inject markup or
 * script into the page.
 *
 * @param msg Parsed JSON message; `msg.type` selects the handler.
 */
function handleMessage(msg) {
  switch (msg.type) {
    case 'transcription_result':
      els.transcript.textContent = msg.text || '(empty)';
      log(`Transcription: ${msg.text}`);
      break;

    case 'text_delta':
      els.assistant.textContent += msg.text || '';
      break;

    case 'reasoning_delta':
      els.reasoningSec.classList.remove('hidden');
      els.reasoning.textContent += msg.text || '';
      break;

    case 'tool_call':
    case 'tool_result': {
      els.toolsSec.classList.remove('hidden');
      // `??` keeps legitimate falsy results (0, false, '') visible.
      const payload = JSON.stringify(msg.result ?? msg.input ?? {});
      const row = document.createElement('div');
      row.textContent = `${msg.type}: ${msg.toolName || '?'} → ${payload}`;
      els.tools.appendChild(row);
      break;
    }

    case 'audio_chunk':
    case 'audio': {
      if (typeof msg.data !== 'string') {
        log(`[${msg.type}] message without audio data`);
        break;
      }
      const bytes = Uint8Array.from(atob(msg.data), c => c.charCodeAt(0));
      audioQueue.push({ bytes, format: msg.format || 'mp3' });
      playNext();
      break;
    }

    case 'speech_interrupted':
      audioQueue = [];
      if (currentSource) currentSource.stop();
      isPlaying = false;
      log(`Speech interrupted: ${msg.reason || '?'}`);
      break;

    case 'response_complete':
      log('Response complete');
      break;

    case 'capture_frame':
      log(`Server requested frame: ${msg.reason}`);
      captureFrame(msg.reason || 'server_request');
      break;

    case 'frame_ack':
      break; // silent

    case 'session_init':
      log(`Session: ${msg.sessionId}`);
      break;

    case 'stream_start':
      resetUI();
      break;

    case 'stream_finish':
      log(`Stream finished: ${msg.finishReason}`);
      break;

    case 'speech_stream_start':
      break;

    case 'speech_stream_end':
      log('Speech done');
      break;

    case 'error':
      log(`ERROR: ${msg.error}`);
      console.error('Server error:', msg.error);
      break;

    case 'transcription_error':
      log(`Transcription error: ${msg.error}`);
      els.transcript.textContent = `Error: ${msg.error}`;
      break;

    default: {
      // Verbose stream/step events are intentionally not logged.
      const verbose = msg.type?.includes('stream') || msg.type?.includes('step');
      if (!verbose) log(`[${msg.type}]`);
    }
  }
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Event listeners
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
|
||||
// Button wiring
els.connectBtn.onclick = connect;
els.disconnectBtn.onclick = disconnect;
els.startMediaBtn.onclick = startMedia;
els.stopMediaBtn.onclick = stopAllMedia;
els.captureBtn.onclick = () => captureFrame('manual');
els.interruptBtn.onclick = interrupt;

// Re-arm periodic frame capture when the interval changes. This applies
// whenever media is running – including when it was started with the interval
// switched off – and never while media is stopped.
els.frameInterval.onchange = () => {
  if (!localStream) return;

  if (frameTimer) {
    clearInterval(frameTimer);
    frameTimer = null;
  }
  const ms = Number(els.frameInterval.value);
  if (ms > 0) frameTimer = setInterval(() => captureFrame('timer'), ms);
};

// Enter in the endpoint field connects, but only when no socket exists yet
// and not on key auto-repeat.
document.getElementById('wsEndpoint').addEventListener('keydown', e => {
  if (e.key === 'Enter' && !e.repeat && !ws) connect();
});
|
||||
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
161
example/ws-server-video.ts
Normal file
@@ -0,0 +1,161 @@
|
||||
// ws-server-video.ts
|
||||
import "dotenv/config";
|
||||
import { WebSocketServer } from "ws";
|
||||
import { VideoAgent } from "../src/VideoAgent.new"; // adjust path
|
||||
import { tool } from "ai";
|
||||
import { z } from "zod";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { mkdirSync, writeFileSync } from "fs";
|
||||
import { join, dirname } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
|
||||
// ── Frame saving ────────────────────────────────────────────────────────
|
||||
// Directory of this module: use `import.meta.dirname` when the runtime
// provides it as a string, otherwise derive it from `import.meta.url`.
const __dirname =
  typeof import.meta.dirname === "string"
    ? import.meta.dirname
    : dirname(fileURLToPath(import.meta.url));

// Received frames are written to ./frames next to this file; the directory is
// created (including parents) at start-up.
const FRAMES_DIR = join(__dirname, "frames");
mkdirSync(FRAMES_DIR, { recursive: true });
console.log(`[video-ws] Saving received frames to ${FRAMES_DIR}/`);

// Running index used to number saved frame files.
let frameCounter = 0;
|
||||
|
||||
/**
 * Write one received video frame to FRAMES_DIR and log a one-line summary.
 *
 * The message comes straight from the client, so nothing in it is trusted:
 * the file extension is taken from a fixed allow-list (unknown formats are
 * saved as `.bin`) so `image.format` can never inject path segments, and an
 * unusable timestamp falls back to the current time instead of throwing.
 *
 * @param msg Parsed `video_frame` message; `image.data` is base64.
 */
function saveFrame(msg: {
  sequence?: number;
  timestamp?: number;
  triggerReason?: string;
  image: { data: string; format?: string; width?: number; height?: number };
}) {
  const idx = frameCounter++;

  // Allow-list of extensions; a Map avoids prototype-key lookups.
  const extensions = new Map<string, string>([
    ["jpeg", "jpg"],
    ["jpg", "jpg"],
    ["png", "png"],
    ["webp", "webp"],
  ]);
  const rawFormat = msg.image.format;
  const format = typeof rawFormat === "string" && rawFormat ? rawFormat.toLowerCase() : "webp";
  const ext = extensions.get(format) ?? "bin";

  // `toISOString()` throws on an invalid date, so validate first.
  const requested = new Date(msg.timestamp ?? Date.now());
  const when = Number.isNaN(requested.getTime()) ? new Date() : requested;
  const ts = when.toISOString().replace(/[:.]/g, "-");

  const filename = `frame_${String(idx).padStart(5, "0")}_${ts}.${ext}`;
  const filepath = join(FRAMES_DIR, filename);

  const buf = Buffer.from(msg.image.data, "base64");
  writeFileSync(filepath, buf);

  console.log(
    `[frames] Saved ${filename} (${(buf.length / 1024).toFixed(1)} kB` +
      `${msg.image.width ? `, ${msg.image.width}×${msg.image.height}` : ""}` +
      `, ${msg.triggerReason ?? "unknown"})`
  );
}
|
||||
|
||||
// Where to listen: VIDEO_WS_ENDPOINT when set, otherwise ws://localhost:8081.
const endpoint = process.env.VIDEO_WS_ENDPOINT || "ws://localhost:8081";
const url = new URL(endpoint);
// Missing pieces of the URL fall back to localhost:8081.
const host = url.hostname || "localhost";
const port = Number(url.port || 8081);
|
||||
|
||||
|
||||
// ── Tools (same as demo.ts) ────────────────────────────────────────────
|
||||
/**
 * Demo tool: returns a randomly generated weather report for a location
 * (temperature in the range 62–82, one of four fixed conditions).
 */
const weatherTool = tool({
  description: "Get the weather in a location",
  inputSchema: z.object({
    location: z.string().describe("The location to get the weather for"),
  }),
  execute: async ({ location }) => {
    const possibleConditions = ["sunny", "cloudy", "rainy", "partly cloudy"];
    const temperature = 72 + Math.floor(Math.random() * 21) - 10;
    const conditions = possibleConditions[Math.floor(Math.random() * 4)];
    return { location, temperature, conditions };
  },
});
|
||||
|
||||
/**
 * Demo tool: reports the server's current local time and time zone.
 */
const timeTool = tool({
  description: "Get the current time",
  inputSchema: z.object({}),
  execute: async () => {
    const time = new Date().toLocaleTimeString();
    const { timeZone } = Intl.DateTimeFormat().resolvedOptions();
    return { time, timezone: timeZone };
  },
});
|
||||
// WebSocket server bound to the host/port parsed from the endpoint.
const wss = new WebSocketServer({ port, host });

// Announce the address once the server is accepting connections.
wss.on("listening", () => {
  const banner = [
    `[video-ws] listening on ${endpoint}`,
    `[video-ws] Open video-client.html and connect → ${endpoint}`,
  ];
  for (const line of banner) {
    console.log(line);
  }
});
|
||||
|
||||
// One VideoAgent per client connection. The agent owns the conversation;
// this handler only adds logging and saves incoming frames to disk.
wss.on("connection", (socket) => {
  console.log("[video-ws] ✓ client connected");

  const agent = new VideoAgent({
    model: openai("gpt-4o"), // or gpt-4o-mini, claude-3.5-sonnet, gemini-1.5-flash…
    transcriptionModel: openai.transcription("whisper-1"),
    speechModel: openai.speech("gpt-4o-mini-tts"),
    instructions: `You are a helpful video+voice assistant.
You can SEE what the user is showing via webcam.
Describe what you see when it helps answer the question.
Keep spoken answers concise and natural.`,
    voice: "echo",
    streamingSpeech: {
      minChunkSize: 25,
      maxChunkSize: 140,
      parallelGeneration: true,
      maxParallelRequests: 3,
    },
    tools: { getWeather: weatherTool, getTime: timeTool },
    // Tune these depending on your budget & latency goals
    maxContextFrames: 6, // very important — each frame ≈ 100–400 tokens
    maxFrameInputSize: 2_500_000, // ~2.5 MB
  });

  // Reuse most of the same event logging you have in ws-server.ts
  agent.on("text", (data: { role: string; text: string }) => {
    console.log(`[video] Text (${data.role}): ${data.text?.substring(0, 100)}...`);
  });
  agent.on("chunk:text_delta", (data: { id: string; text: string }) => {
    process.stdout.write(data.text || "");
  });
  agent.on("frame_received", ({ sequence, size, dimensions, triggerReason }) => {
    console.log(`[video] Frame #${sequence} (${triggerReason}) ${size / 1024 | 0} kB ${dimensions.width}×${dimensions.height}`);
  });
  agent.on("frame_requested", ({ reason }) => console.log(`[video] Requested frame: ${reason}`));

  // Audio and transcription events
  agent.on("audio_received", ({ size, format }) => {
    console.log(`[video] Audio received: ${size} bytes, format: ${format}`);
  });
  agent.on("transcription", ({ text, language }) => {
    console.log(`[video] Transcription: "${text}" (${language || "unknown"})`);
  });

  // Speech events
  agent.on("speech_start", () => console.log(`[video] Speech started`));
  agent.on("speech_complete", () => console.log(`[video] Speech complete`));
  agent.on("audio_chunk", ({ chunkId, text }) => {
    console.log(`[video] Audio chunk #${chunkId}: "${text?.substring(0, 50)}..."`);
  });

  // Error handling
  agent.on("error", (error: Error) => {
    console.error(`[video] ERROR:`, error);
  });
  agent.on("warning", (warning: string) => {
    console.warn(`[video] WARNING:`, warning);
  });

  agent.on("disconnected", () => {
    agent.destroy();
    console.log("[video-ws] ✗ client disconnected (agent destroyed)");
  });

  // ── Intercept raw messages to save frames to disk ────────────────────
  socket.on("message", (raw) => {
    let msg;
    try {
      msg = JSON.parse(raw.toString());
    } catch {
      // not JSON — ignore, agent will handle binary etc.
      return;
    }

    if (msg?.type === "video_frame" && msg.image?.data) {
      // Keep save failures (disk full, bad payload) visible instead of
      // letting them pass for a JSON parse error.
      try {
        saveFrame(msg);
      } catch (err) {
        console.error("[frames] Failed to save frame:", err);
      }
    }
  });

  // The crucial line — same as VoiceAgent
  agent.handleSocket(socket);
});
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "voice-agent-ai-sdk",
|
||||
"version": "0.1.0",
|
||||
"version": "1.0.1",
|
||||
"description": "Voice AI Agent with ai-sdk",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
@@ -15,6 +15,7 @@
|
||||
"demo": "tsx example/demo.ts",
|
||||
"ws:server": "tsx example/ws-server.ts",
|
||||
"client": "node example/serve-client.js",
|
||||
"ws:video": "tsx example/ws-server-video.ts",
|
||||
"prepublishOnly": "pnpm build"
|
||||
},
|
||||
"keywords": [
|
||||
|
||||
818
src/VideoAgent.new.ts
Normal file
@@ -0,0 +1,818 @@
|
||||
import { WebSocket } from "ws";
|
||||
import { EventEmitter } from "events";
|
||||
import {
|
||||
streamText,
|
||||
type LanguageModel,
|
||||
stepCountIs,
|
||||
type Tool,
|
||||
type ModelMessage,
|
||||
type TranscriptionModel,
|
||||
type SpeechModel,
|
||||
} from "ai";
|
||||
import {
|
||||
type StreamingSpeechConfig,
|
||||
type HistoryConfig,
|
||||
} from "./types";
|
||||
import {
|
||||
WebSocketManager,
|
||||
SpeechManager,
|
||||
ConversationManager,
|
||||
TranscriptionManager,
|
||||
InputQueue,
|
||||
type QueueItem,
|
||||
processFullStream,
|
||||
handleStreamChunk,
|
||||
} from "./core";
|
||||
|
||||
// ── Video-specific types ────────────────────────────────
|
||||
|
||||
/**
 * Why a video frame was captured.
 */
type FrameTriggerReason =
  | "scene_change"
  | "user_request"
  | "timer"
  | "initial";
|
||||
|
||||
/**
 * A single video frame message exchanged with the client.
 */
interface VideoFrame {
  /** Message discriminator; always "video_frame". */
  type: "video_frame";
  /** Session the frame belongs to. */
  sessionId: string;
  /** Frame sequence number. */
  sequence: number;
  /** Capture time as an epoch-milliseconds value (e.g. from `Date.now()`). */
  timestamp: number;
  /** What caused this frame to be captured. */
  triggerReason: FrameTriggerReason;
  /** Hash of the previous frame, when one is known. */
  previousFrameRef?: string;
  /** The encoded image and its dimensions. */
  image: {
    /** Encoded image bytes as a string (the example client sends base64). */
    data: string;
    /** Image format label, e.g. "webp". */
    format: string;
    width: number;
    height: number;
  };
}
|
||||
|
||||
/**
 * An audio message payload.
 */
interface AudioData {
  /** Message discriminator; always "audio". */
  type: "audio";
  /** Session the audio belongs to. */
  sessionId: string;
  /** Encoded audio as a string (the example client sends base64). */
  data: string;
  /** Audio format label (the example client sends a MIME type). */
  format: string;
  /** Sample rate of the audio, when known. */
  sampleRate?: number;
  /** Duration of the audio, when known. */
  duration?: number;
  /** Timestamp associated with the audio. */
  timestamp: number;
}
|
||||
|
||||
/**
 * Backend settings that control video processing.
 */
interface VideoAgentConfig {
  /** Upper bound on the number of frames kept in the context buffer for conversation history. */
  maxContextFrames: number;
}
|
||||
|
||||
/**
 * Metadata remembered about a frame to maintain visual conversation history.
 */
interface FrameContext {
  /** Sequence number of the frame. */
  sequence: number;
  /** Timestamp of the frame. */
  timestamp: number;
  /** What caused the frame to be captured. */
  triggerReason: FrameTriggerReason;
  /** Hash identifying the frame. */
  frameHash: string;
  /** Optional description of the frame. */
  description?: string;
}
|
||||
|
||||
/** Frame input size limit used when none is configured: 5 MB (5 × 1024 × 1024 bytes). */
const DEFAULT_MAX_FRAME_SIZE = 5 * 1024 ** 2;
|
||||
|
||||
/** Video agent settings used for anything the caller does not override. */
const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = { maxContextFrames: 10 };
|
||||
|
||||
// ── Options & queue item ────────────────────────────────
|
||||
|
||||
/**
 * Construction options for {@link VideoAgent}.
 */
export interface VideoAgentOptions {
  /**
   * AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
   * anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
   */
  model: LanguageModel;
  /** Model handed to the transcription manager. */
  transcriptionModel?: TranscriptionModel;
  /** Model handed to the speech manager. */
  speechModel?: SpeechModel;
  /** System instructions; a built-in multimodal-assistant prompt is used when omitted. */
  instructions?: string;
  /** Stop condition for the LLM stream (default: `stepCountIs(5)`). */
  stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  /** Tools made available to the model. */
  tools?: Record<string, Tool>;
  /** WebSocket URL used by `connect()` when no URL argument is given. */
  endpoint?: string;
  /** Voice passed to the speech manager. */
  voice?: string;
  /** Speech instructions passed to the speech manager. */
  speechInstructions?: string;
  /** Audio output format passed to the speech manager. */
  outputFormat?: string;
  /** Streaming-speech settings passed to the speech manager. */
  streamingSpeech?: Partial<StreamingSpeechConfig>;
  /** History settings passed to the conversation manager. */
  history?: Partial<HistoryConfig>;
  /** Audio input size limit passed to the transcription manager. */
  maxAudioInputSize?: number;
  /** Maximum frame input size in bytes (default: 5 MB) */
  maxFrameInputSize?: number;
  /** Maximum frames to keep in context buffer (default: 10) */
  maxContextFrames?: number;
  /** Session ID for this video agent instance; one is generated when omitted. */
  sessionId?: string;
}
|
||||
|
||||
/**
 * An entry in the video agent's input queue: optional text and/or an
 * optional video frame, on top of the generic queue-item fields.
 */
interface VideoInputItem extends QueueItem<string> {
  /** Text part of the input, if any. */
  text?: string;
  /** Video frame part of the input, if any. */
  frame?: VideoFrame;
}
|
||||
|
||||
// ── VideoAgent class ────────────────────────────────────
|
||||
|
||||
export class VideoAgent extends EventEmitter {
|
||||
private model: LanguageModel;
|
||||
private instructions: string;
|
||||
private stopWhen: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
|
||||
private endpoint?: string;
|
||||
private tools: Record<string, Tool> = {};
|
||||
private isDestroyed = false;
|
||||
private _isProcessing = false;
|
||||
|
||||
// Abort controller for the current LLM stream
|
||||
private currentStreamAbortController?: AbortController;
|
||||
|
||||
// ── Managers ─────────────────────────────────────────
|
||||
private ws: WebSocketManager;
|
||||
private speech: SpeechManager;
|
||||
private conversation: ConversationManager;
|
||||
private transcription: TranscriptionManager;
|
||||
private inputQueue: InputQueue<VideoInputItem>;
|
||||
|
||||
// ── Video-specific state ────────────────────────────
|
||||
private sessionId: string;
|
||||
private frameSequence = 0;
|
||||
private lastFrameTimestamp = 0;
|
||||
private lastFrameHash?: string;
|
||||
private frameContextBuffer: FrameContext[] = [];
|
||||
private currentFrameData?: string;
|
||||
private videoConfig: VideoAgentConfig;
|
||||
private maxFrameInputSize: number;
|
||||
|
||||
/**
 * Build the agent: resolve options against their defaults, create the
 * WebSocket / speech / conversation / transcription managers and the input
 * queue, route manager output through the WebSocket, re-emit manager events
 * on this agent, and hook up WebSocket message and disconnect handling.
 */
constructor(options: VideoAgentOptions) {
  super();

  const fallbackInstructions = `You are a helpful multimodal AI assistant that can see through the user's camera and hear their voice.
When analyzing images, be concise but informative. Describe what you see when asked.
Keep responses conversational since they will be spoken aloud.
Use tools when needed to provide accurate information.`;

  // ── Plain option values ─────────────────────────
  this.model = options.model;
  this.instructions = options.instructions || fallbackInstructions;
  this.stopWhen = options.stopWhen || stepCountIs(5);
  this.endpoint = options.endpoint;
  this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
  this.sessionId = options.sessionId || this.generateSessionId();

  const maxContextFrames =
    options.maxContextFrames ?? DEFAULT_VIDEO_AGENT_CONFIG.maxContextFrames;
  this.videoConfig = { ...DEFAULT_VIDEO_AGENT_CONFIG, maxContextFrames };

  // Copy so later changes to the caller's object don't leak in.
  if (options.tools) {
    this.tools = { ...options.tools };
  }

  // ── Managers ────────────────────────────────────
  this.ws = new WebSocketManager();

  const { speechModel, voice, speechInstructions, outputFormat, streamingSpeech } = options;
  this.speech = new SpeechManager({
    speechModel,
    voice,
    speechInstructions,
    outputFormat,
    streamingSpeech,
  });

  this.conversation = new ConversationManager({ history: options.history });

  const { transcriptionModel, maxAudioInputSize } = options;
  this.transcription = new TranscriptionManager({
    transcriptionModel,
    maxAudioInputSize,
  });

  this.inputQueue = new InputQueue<VideoInputItem>();

  // ── Manager output goes out over the WebSocket ──
  const sendMsg = (msg: Record<string, unknown>) => this.ws.send(msg);
  this.speech.sendMessage = sendMsg;
  this.transcription.sendMessage = sendMsg;

  // ── Queue items are handled by this agent ───────
  this.inputQueue.processor = (item) => this.processQueueItem(item);

  // ── Re-emit manager events on the agent ─────────
  const wsEvents = ["connected", "error"];
  const speechEvents = [
    "speech_start",
    "speech_complete",
    "speech_interrupted",
    "speech_chunk_queued",
    "audio_chunk",
    "audio",
    "error",
  ];
  const conversationEvents = ["history_cleared", "history_trimmed"];
  const transcriptionEvents = ["transcription", "audio_received", "error", "warning"];

  this.bubbleEvents(this.ws, wsEvents);
  this.bubbleEvents(this.speech, speechEvents);
  this.bubbleEvents(this.conversation, conversationEvents);
  this.bubbleEvents(this.transcription, transcriptionEvents);

  // ── WebSocket lifecycle ─────────────────────────
  this.ws.on("disconnected", () => {
    this.cleanupOnDisconnect();
    this.emit("disconnected");
  });

  this.ws.on("message", (message: any) => this.handleMessage(message));
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Public API
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Add tools to the agent. A tool registered under an existing name replaces
 * the earlier one.
 */
public registerTools(tools: Record<string, Tool>) {
  const merged: Record<string, Tool> = { ...this.tools, ...tools };
  this.tools = merged;
}
|
||||
|
||||
/**
 * Transcribe audio by delegating to the transcription manager.
 *
 * @returns The transcription manager's result.
 */
public async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
  const transcript = await this.transcription.transcribeAudio(audioData);
  return transcript;
}
|
||||
|
||||
/**
 * Synthesize speech for `text` by delegating to the speech manager.
 *
 * @param text Text to speak.
 * @param abortSignal Optional signal forwarded to the speech manager.
 * @returns The speech manager's audio bytes.
 */
public async generateSpeechFromText(
  text: string,
  abortSignal?: AbortSignal
): Promise<Uint8Array> {
  const audio = await this.speech.generateSpeechFromText(text, abortSignal);
  return audio;
}
|
||||
|
||||
/**
 * Interrupt speech output only; the speech manager receives `reason`.
 * An in-flight LLM stream is not aborted by this method.
 */
public interruptSpeech(reason: string = "interrupted"): void {
  const { speech } = this;
  speech.interruptSpeech(reason);
}
|
||||
|
||||
/**
 * Interrupt the whole current response: abort the in-flight LLM stream
 * (if there is one) and then interrupt speech output.
 */
public interruptCurrentResponse(reason: string = "interrupted"): void {
  // Abort first, then forget the controller.
  this.currentStreamAbortController?.abort();
  this.currentStreamAbortController = undefined;

  this.speech.interruptSpeech(reason);
}
|
||||
|
||||
/**
 * Connect to a WebSocket server. The URL is, in order of preference: the
 * `url` argument, the configured endpoint, or "ws://localhost:8080".
 */
public async connect(url?: string): Promise<void> {
  this.ensureNotDestroyed();

  const fallbackUrl = "ws://localhost:8080";
  const target = url || this.endpoint || fallbackUrl;
  await this.ws.connect(target);
}
|
||||
|
||||
/**
 * Attach an already-open WebSocket (e.g. one accepted by a server) by
 * handing it to the WebSocket manager.
 */
public handleSocket(socket: WebSocket): void {
  this.ensureNotDestroyed();
  const { ws } = this;
  ws.handleSocket(socket);
}
|
||||
|
||||
/**
 * Queue a text input for processing.
 *
 * @param text User text; must contain at least one non-whitespace character.
 * @returns The promise returned by the input queue for this item.
 * @throws Error if the agent has been destroyed or `text` is empty/blank.
 */
public async sendText(text: string): Promise<string> {
  this.ensureNotDestroyed();

  const trimmed = text ? text.trim() : "";
  if (!trimmed) {
    throw new Error("Text input cannot be empty");
  }

  // The untrimmed text is what gets queued.
  return this.enqueueTextInput(text);
}
|
||||
|
||||
/**
 * Submit audio for processing via `handleAudioInput`.
 *
 * @param audioData Audio payload passed on as the base64 argument.
 * @throws Error if the agent has been destroyed.
 */
public async sendAudio(audioData: string): Promise<void> {
  this.ensureNotDestroyed();
  await this.handleAudioInput(audioData);
}
|
||||
|
||||
/**
 * Submit raw audio bytes for processing.
 *
 * The bytes are base64-encoded and then handled exactly like `sendAudio` input.
 *
 * @param audioBuffer Raw audio bytes.
 * @throws Error if the agent has been destroyed.
 */
public async sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void> {
  this.ensureNotDestroyed();

  const encoded = Buffer.from(audioBuffer).toString("base64");
  await this.handleAudioInput(encoded);
}
|
||||
|
||||
/**
 * Send a video frame with an optional text query for vision analysis.
 *
 * A `VideoFrame` is built with trigger reason "user_request", the next local
 * sequence number, and the previous frame hash as `previousFrameRef`. It is
 * first passed through `handleVideoFrame`; if `query` is non-empty the query
 * and frame are then queued as a multimodal input.
 *
 * NOTE(review): `handleVideoFrame` reports rejected frames (empty or too
 * large) through events and returns nothing, so the query is queued with
 * this frame even when the frame was rejected — confirm this is intended.
 *
 * @param frameData Image data for the frame (measured as base64 by `handleVideoFrame`).
 * @param query Optional question about the frame.
 * @param options Optional image metadata; falls back to "webp", 640 and 480.
 * @returns The queued response promise's result, or "" when no query is given.
 * @throws Error if the agent has been destroyed.
 */
public async sendFrame(
  frameData: string,
  query?: string,
  options?: { width?: number; height?: number; format?: string }
): Promise<string> {
  this.ensureNotDestroyed();

  const image = {
    data: frameData,
    format: options?.format || "webp",
    width: options?.width || 640,
    height: options?.height || 480,
  };

  const frame: VideoFrame = {
    type: "video_frame",
    sessionId: this.sessionId,
    sequence: this.frameSequence++,
    timestamp: Date.now(),
    triggerReason: "user_request",
    previousFrameRef: this.lastFrameHash,
    image,
  };

  // Record the frame locally (hash, context buffer, ack) before any query runs.
  await this.handleVideoFrame(frame);

  if (!query) {
    return "";
  }
  return this.enqueueMultimodalInput(query, frame);
}
|
||||
|
||||
/**
 * Ask the client to capture and send a frame.
 *
 * Sends a `capture_frame` message through the WebSocket manager and then
 * emits `frame_requested` locally.
 *
 * @param reason Why the frame is being requested.
 */
public requestFrameCapture(reason: FrameTriggerReason): void {
  const request = {
    type: "capture_frame",
    reason,
    timestamp: Date.now(),
  };

  this.ws.send(request);
  this.emit("frame_requested", { reason });
}
|
||||
|
||||
/**
 * Return a shallow copy of the current video configuration, so callers
 * cannot mutate the agent's own config object.
 */
public getConfig(): VideoAgentConfig {
  const snapshot = { ...this.videoConfig };
  return snapshot;
}
|
||||
|
||||
/**
 * Overlay `config` onto the current video configuration and emit
 * `config_changed` with the resulting config object.
 *
 * @param config Fields to override; keys not present keep their current value.
 */
public updateConfig(config: Partial<VideoAgentConfig>): void {
  const merged = { ...this.videoConfig, ...config };
  this.videoConfig = merged;
  this.emit("config_changed", this.videoConfig);
}
|
||||
|
||||
/** Emit the `listening` event. No other work is done here. */
startListening() {
  this.emit("listening");
}
|
||||
|
||||
/** Emit the `stopped` event. No other work is done here. */
stopListening() {
  this.emit("stopped");
}
|
||||
|
||||
/**
 * Clear the conversation history and then drop all buffered frame context
 * (the buffer is replaced with a new empty array).
 */
clearHistory() {
  this.conversation.clearHistory();
  this.frameContextBuffer = [];
}
|
||||
|
||||
/** Return the conversation history as provided by the conversation manager. */
getHistory(): ModelMessage[] {
  const history = this.conversation.getHistory();
  return history;
}
|
||||
|
||||
/**
 * Replace the conversation history via the conversation manager.
 *
 * @param history Messages to install as the new history.
 */
setHistory(history: ModelMessage[]) {
  this.conversation.setHistory(history);
}
|
||||
|
||||
/**
 * Return a shallow copy of the buffered frame context entries, so callers
 * cannot reorder or truncate the agent's own buffer.
 */
getFrameContext(): FrameContext[] {
  return this.frameContextBuffer.slice();
}
|
||||
|
||||
/** Return this agent's session identifier. */
getSessionId(): string {
  return this.sessionId;
}
|
||||
|
||||
/** Ask the WebSocket manager to disconnect. The agent itself stays usable. */
disconnect() {
  this.ws.disconnect();
}
|
||||
|
||||
/**
 * Permanently shut the agent down.
 *
 * After this call the guarded public methods throw (see `ensureNotDestroyed`).
 * The steps run in a fixed order: flag, in-flight cleanup, socket, stored
 * state, and finally listeners.
 */
destroy() {
  // Flag first so guarded entry points reject while teardown is running.
  this.isDestroyed = true;

  // Abort in-flight work and reject queued inputs, then drop the socket.
  this.cleanupOnDisconnect();
  this.ws.disconnect();

  // Release stored state.
  this.conversation.clearHistory();
  this.frameContextBuffer = [];
  this.tools = {};

  // Listeners go last so events emitted by the steps above are still delivered.
  this.removeAllListeners();
}
|
||||
|
||||
// ── Getters ─────────────────────────────────────────
|
||||
|
||||
/** The WebSocket manager's `isConnected` value. */
get connected(): boolean {
  return this.ws.isConnected;
}
|
||||
|
||||
/** True while `processUserInput` / `processMultimodalInput` is running. */
get processing(): boolean {
  return this._isProcessing;
}
|
||||
|
||||
/** The speech manager's `isSpeaking` value. */
get speaking(): boolean {
  return this.speech.isSpeaking;
}
|
||||
|
||||
/** The speech manager's `pendingChunkCount` value. */
get pendingSpeechChunks(): number {
  return this.speech.pendingChunkCount;
}
|
||||
|
||||
/** True once `destroy()` has been called. */
get destroyed(): boolean {
  return this.isDestroyed;
}
|
||||
|
||||
/** The local frame sequence counter (the number `sendFrame` will assign next). */
get currentFrameSequence(): number {
  return this.frameSequence;
}
|
||||
|
||||
/** True when frame data is currently stored as the latest frame. */
get hasVisualContext(): boolean {
  return Boolean(this.currentFrameData);
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — message handling
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Dispatch one incoming WebSocket message by its `type`.
 *
 * Handled types: "transcript", "audio", "video_frame", "interrupt" and
 * "client_ready"; any other type is ignored. Anything thrown while handling
 * a message is emitted as an `error` event instead of propagating.
 *
 * @param message Message object received from the WebSocket manager.
 */
private async handleMessage(message: any): Promise<void> {
  try {
    const kind = message.type;

    if (kind === "transcript") {
      const text = message.text;
      const hasText = typeof text === "string" && text.trim();
      if (!hasText) {
        this.emit("warning", "Received empty or invalid transcript message");
        return;
      }
      // The user is talking: stop the current response and ask for a fresh frame.
      this.interruptCurrentResponse("user_speaking");
      this.requestFrameCapture("user_request");
      await this.enqueueTextInput(text);
      return;
    }

    if (kind === "audio") {
      const data = message.data;
      const hasData = typeof data === "string" && data;
      if (!hasData) {
        this.emit("warning", "Received empty or invalid audio message");
        return;
      }
      this.interruptCurrentResponse("user_speaking");
      this.requestFrameCapture("user_request");
      try {
        await this.handleAudioInput(data, message.format);
      } catch (audioError) {
        this.emit("error", audioError);
      }
      return;
    }

    if (kind === "video_frame") {
      await this.handleVideoFrame(message);
      return;
    }

    if (kind === "interrupt") {
      this.interruptCurrentResponse(message.reason || "client_request");
      return;
    }

    if (kind === "client_ready") {
      this.handleClientReady(message);
    }
  } catch (err) {
    this.emit("error", err);
  }
}
|
||||
|
||||
/**
 * Answer a `client_ready` message: send `session_init` carrying this
 * agent's session id, then emit `client_ready` with the client's
 * reported capabilities.
 *
 * @param message The `client_ready` message; only `capabilities` is read.
 */
private handleClientReady(message: any): void {
  const sessionInit = {
    type: "session_init",
    sessionId: this.sessionId,
  };

  this.ws.send(sessionInit);
  this.emit("client_ready", message.capabilities);
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — audio
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Run audio through the transcription manager and queue the resulting text.
 *
 * Nothing is queued when the transcription manager returns a falsy value.
 *
 * @param base64Audio Audio payload handed to the transcription manager.
 * @param format Optional format hint forwarded unchanged.
 */
private async handleAudioInput(
  base64Audio: string,
  format?: string
): Promise<void> {
  const transcript = await this.transcription.processAudioInput(base64Audio, format);
  if (!transcript) {
    return;
  }
  await this.enqueueTextInput(transcript);
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — video frames
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Validate an incoming frame and record it as the latest visual context.
 *
 * - A frame without image data produces a `warning` event and is dropped.
 * - A frame whose base64-decoded size exceeds `maxFrameInputSize` produces an
 *   `error` event and is dropped.
 * - Otherwise the frame becomes the current frame, is appended to the context
 *   buffer, `frame_received` is emitted and a `frame_ack` is sent.
 *
 * Any exception is emitted as an `error` event; this method never rejects.
 *
 * @param frame Frame built by `sendFrame` or received from the client.
 */
private async handleVideoFrame(frame: VideoFrame): Promise<void> {
  try {
    const encoded = frame.image?.data;
    if (!encoded) {
      this.emit("warning", "Received empty or invalid video frame");
      return;
    }

    // The limit applies to the decoded byte size, not the base64 string length.
    const byteLength = Buffer.from(encoded, "base64").length;
    if (byteLength > this.maxFrameInputSize) {
      const toMB = (bytes: number) => (bytes / (1024 * 1024)).toFixed(1);
      const sizeMB = toMB(byteLength);
      const maxMB = toMB(this.maxFrameInputSize);
      this.emit(
        "error",
        new Error(`Frame too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`)
      );
      return;
    }

    const { sequence, timestamp, triggerReason } = frame;
    const frameHash = this.hashFrame(encoded);

    // Latest-frame state read by later requests.
    this.lastFrameTimestamp = timestamp;
    this.lastFrameHash = frameHash;
    this.currentFrameData = encoded;

    this.addFrameToContext({ sequence, timestamp, triggerReason, frameHash });

    this.emit("frame_received", {
      sequence,
      timestamp,
      triggerReason,
      size: byteLength,
      dimensions: { width: frame.image.width, height: frame.image.height },
    });

    this.ws.send({
      type: "frame_ack",
      sequence,
      timestamp: Date.now(),
    });
  } catch (error) {
    this.emit("error", error);
  }
}
|
||||
|
||||
/**
 * Append a frame context entry, evicting the oldest entry when the buffer
 * grows past `videoConfig.maxContextFrames`.
 *
 * @param context Metadata describing the frame that was just accepted.
 */
private addFrameToContext(context: FrameContext): void {
  const buffer = this.frameContextBuffer;
  buffer.push(context);

  const overLimit = buffer.length > this.videoConfig.maxContextFrames;
  if (overLimit) {
    buffer.shift();
  }
}
|
||||
|
||||
/**
 * Build a short identifier for a frame's data.
 *
 * Folds every UTF-16 code unit of `data` into a 32-bit rolling hash
 * (`hash * 31 + code`, wrapped to a signed 32-bit integer). The result is
 * formatted as `frame_<current frame sequence>_<abs(hash) in hex>`.
 * This is a cheap fingerprint, not a cryptographic digest.
 *
 * @param data Frame data string to fingerprint.
 */
private hashFrame(data: string): string {
  let hash = 0;
  for (let index = 0; index < data.length; index++) {
    // Math.imul keeps the multiply in 32-bit space; `| 0` wraps the sum.
    hash = (Math.imul(hash, 31) + data.charCodeAt(index)) | 0;
  }

  const hex = Math.abs(hash).toString(16);
  return `frame_${this.frameSequence}_${hex}`;
}
|
||||
|
||||
/**
 * Create a session id of the form `vs_<time>_<random>`.
 *
 * `<time>` is the current epoch milliseconds in base 36 and `<random>` is up
 * to 8 base-36 characters taken from `Math.random()`, which is not a
 * cryptographically secure source.
 */
private generateSessionId(): string {
  const stamp = Date.now().toString(36);
  // Skip the leading "0." of the base-36 fraction.
  const suffix = Math.random().toString(36).slice(2, 10);
  return ["vs", stamp, suffix].join("_");
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — input queue
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Queue a text-only input.
 *
 * The returned promise is settled through the `resolve` / `reject` callbacks
 * handed to the input queue together with the text.
 *
 * @param text User text to process.
 */
private enqueueTextInput(text: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const item = { text, resolve, reject };
    this.inputQueue.enqueue(item);
  });
}
|
||||
|
||||
/**
 * Queue a text input together with an explicit video frame.
 *
 * The returned promise is settled through the `resolve` / `reject` callbacks
 * handed to the input queue together with the item.
 *
 * @param text User text to process.
 * @param frame Frame to attach to the request.
 */
private enqueueMultimodalInput(text: string, frame: VideoFrame): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const item = { text, frame, resolve, reject };
    this.inputQueue.enqueue(item);
  });
}
|
||||
|
||||
/**
 * Route a queued item to the matching processor.
 *
 * Items without text resolve to "". Items with text go to
 * `processMultimodalInput` when a frame is attached and to
 * `processUserInput` otherwise.
 *
 * @param item Queued input item.
 */
private async processQueueItem(item: VideoInputItem): Promise<string> {
  const { text, frame } = item;
  if (!text) {
    return "";
  }
  return frame
    ? this.processMultimodalInput(text, frame)
    : this.processUserInput(text);
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — multimodal content building
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Build the content parts for a user message that may carry an image.
 *
 * Parts are emitted in this order:
 * 1. a text summary of the buffered frame context (only when frames exist),
 * 2. an image part holding `frameData`, or the current frame when `frameData`
 *    is empty (omitted when neither is available),
 * 3. the user's text.
 *
 * @param text User text for the request.
 * @param frameData Optional explicit frame data; falls back to the latest frame.
 * @returns Content parts to use as the `content` of a user message.
 */
private buildMultimodalContent(
  text: string,
  frameData?: string
): Array<{ type: "text"; text: string } | { type: "image"; image: string }> {
  const content: Array<
    { type: "text"; text: string } | { type: "image"; image: string }
  > = [];

  const frameCount = this.frameContextBuffer.length;
  if (frameCount > 0) {
    // The timestamp is copied verbatim from incoming frames, so it may be
    // missing or invalid. `toISOString()` throws RangeError on an invalid
    // Date, which would fail every request that has visual context.
    const latest = new Date(this.lastFrameTimestamp);
    const latestLabel = Number.isNaN(latest.getTime())
      ? "unknown time"
      : latest.toISOString();
    const contextSummary = `[Visual context: ${frameCount} frames captured, latest at ${latestLabel}]`;
    content.push({ type: "text", text: contextSummary });
  }

  const imageData = frameData || this.currentFrameData;
  if (imageData) {
    content.push({ type: "image", image: imageData });
  }

  content.push({ type: "text", text });
  return content;
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — LLM processing
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Shared `streamText` invocation used by both `processUserInput` and
 * `processMultimodalInput`.
 *
 * Starts the model stream, forwards chunks and tool results as events, feeds
 * text deltas to the speech manager, appends the assistant's full text to
 * the conversation history (when non-empty) and waits for the speech queue
 * to finish before returning.
 *
 * @param messages Messages to send to the model.
 * @param abortSignal Signal used to cancel the stream.
 * @returns The full text reported by `processFullStream`.
 */
private async runStream(
  messages: ModelMessage[],
  abortSignal: AbortSignal
): Promise<string> {
  const stream = streamText({
    model: this.model,
    system: this.instructions,
    messages,
    tools: this.tools,
    stopWhen: this.stopWhen,
    abortSignal,
    onChunk: ({ chunk }) => {
      handleStreamChunk(chunk, (event, data) => this.emit(event, data));
    },
    onFinish: async ({ steps }) => {
      // Emit every tool result of every step, in step order.
      const toolResults = steps.flatMap((step) => step.toolResults);
      for (const toolResult of toolResults) {
        this.emit("tool_result", {
          name: toolResult.toolName,
          toolCallId: toolResult.toolCallId,
          result: toolResult.output,
        });
      }
    },
    onError: ({ error }) => {
      this.emit("error", error);
    },
  });

  // Snapshot of the frame context at the time the stream is consumed.
  const frameCount = this.frameContextBuffer.length;
  const frameContext =
    frameCount > 0
      ? {
          frameCount,
          lastFrameSequence: this.frameContextBuffer[frameCount - 1]?.sequence,
        }
      : undefined;

  const { fullText } = await processFullStream(
    stream,
    {
      onTextDelta: (delta) => this.speech.processTextDelta(delta),
      onTextEnd: () => this.speech.flushPendingText(),
      sendMessage: (msg) => this.ws.send(msg),
      emitEvent: (event, data) => this.emit(event, data),
    },
    {
      sessionId: this.sessionId,
      frameContext,
    }
  );

  // Add assistant response to history
  if (fullText) {
    this.conversation.addMessage({
      role: "assistant",
      content: fullText,
    });
  }

  // Flush remaining speech & wait for queue
  this.speech.flushPendingText();
  const speechDone = this.speech.queueDonePromise;
  if (speechDone) {
    await speechDone;
  }

  return fullText;
}
|
||||
|
||||
/**
 * Process text-only input (with optional visual context from latest frame).
 *
 * When a current frame exists, the model receives the text together with
 * that frame, while the conversation history only stores a text marker
 * (`[Visual context] <text>`) rather than the image data.
 *
 * @param text User text to process.
 * @returns The assistant's full response text.
 * @throws Rethrows any error from the stream after resetting speech state.
 */
private async processUserInput(text: string): Promise<string> {
  this._isProcessing = true;

  // Keep the controller in a local. `interruptCurrentResponse()` sets the
  // field to undefined, and listeners of the synchronous events emitted
  // below may call it; re-reading the field for `.signal` would then throw a
  // TypeError instead of cleanly passing an already-aborted signal.
  const controller = new AbortController();
  this.currentStreamAbortController = controller;

  try {
    this.emit("text", { role: "user", text });

    const hasVisual = !!this.currentFrameData;
    let messages: ModelMessage[];

    if (hasVisual) {
      const content = this.buildMultimodalContent(text);
      this.conversation.addMessage({
        role: "user",
        content: [{ type: "text", text: `[Visual context] ${text}` }],
      });
      // Swap the stored text-only entry for the image-bearing one in this request only.
      messages = [
        ...this.conversation.getHistoryRef().slice(0, -1),
        { role: "user", content },
      ];
    } else {
      this.conversation.addMessage({ role: "user", content: text });
      messages = this.conversation.getHistoryRef();
    }

    return await this.runStream(messages, controller.signal);
  } catch (error) {
    this.speech.reset();
    throw error;
  } finally {
    this._isProcessing = false;
    // Only clear the field if it still refers to this request's controller.
    if (this.currentStreamAbortController === controller) {
      this.currentStreamAbortController = undefined;
    }
  }
}
|
||||
|
||||
/**
 * Process multimodal input (text + explicit video frame).
 *
 * The model receives the text together with the frame's image data, while
 * the conversation history only stores a text marker
 * (`[Image attached] <text>`) rather than the image data.
 *
 * @param text User text to process.
 * @param frame Frame whose image data is attached to the request.
 * @returns The assistant's full response text.
 * @throws Rethrows any error from the stream after resetting speech state.
 */
private async processMultimodalInput(
  text: string,
  frame: VideoFrame
): Promise<string> {
  this._isProcessing = true;

  // Keep the controller in a local. `interruptCurrentResponse()` sets the
  // field to undefined, and listeners of the synchronous events emitted
  // below may call it; re-reading the field for `.signal` would then throw a
  // TypeError instead of cleanly passing an already-aborted signal.
  const controller = new AbortController();
  this.currentStreamAbortController = controller;

  try {
    this.emit("text", { role: "user", text, hasImage: true });

    const content = this.buildMultimodalContent(text, frame.image.data);

    this.conversation.addMessage({
      role: "user",
      content: [{ type: "text", text: `[Image attached] ${text}` }],
    });

    // Swap the stored text-only entry for the image-bearing one in this request only.
    const messages: ModelMessage[] = [
      ...this.conversation.getHistoryRef().slice(0, -1),
      { role: "user", content },
    ];

    return await this.runStream(messages, controller.signal);
  } catch (error) {
    this.speech.reset();
    throw error;
  } finally {
    this._isProcessing = false;
    // Only clear the field if it still refers to this request's controller.
    if (this.currentStreamAbortController === controller) {
      this.currentStreamAbortController = undefined;
    }
  }
}
|
||||
|
||||
// ══════════════════════════════════════════════════════
|
||||
// Private — helpers
|
||||
// ══════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Guard for public entry points.
 *
 * @throws Error when `destroy()` has already been called on this agent.
 */
private ensureNotDestroyed(): void {
  if (!this.isDestroyed) {
    return;
  }
  throw new Error("VideoAgent has been destroyed and cannot be used");
}
|
||||
|
||||
/**
 * Reset per-connection state after the socket closes (also used by `destroy`).
 *
 * Aborts the active LLM stream, resets speech state, clears the processing
 * flag and the current frame, and rejects every queued input with
 * "Connection closed".
 */
private cleanupOnDisconnect(): void {
  // Abort first, then drop the reference so the controller is never aborted twice.
  this.currentStreamAbortController?.abort();
  this.currentStreamAbortController = undefined;

  this.speech.reset();
  this._isProcessing = false;
  this.currentFrameData = undefined;

  const closed = new Error("Connection closed");
  this.inputQueue.rejectAll(closed);
}
|
||||
|
||||
/**
 * Re-emit the listed events of `source` on this agent, with the same event
 * name and arguments.
 *
 * @param source Emitter whose events should be forwarded.
 * @param events Names of the events to forward.
 */
private bubbleEvents(source: EventEmitter, events: string[]): void {
  events.forEach((event) => {
    source.on(event, (...args: any[]) => this.emit(event, ...args));
  });
}
|
||||
}
|
||||
|
||||
// Export types for external use
|
||||
export type {
|
||||
VideoFrame,
|
||||
AudioData,
|
||||
VideoAgentConfig,
|
||||
FrameContext,
|
||||
FrameTriggerReason,
|
||||
};
|
||||
|
||||
// Re-export shared types
|
||||
export type { StreamingSpeechConfig, HistoryConfig } from "./types";
|
||||
@@ -84,6 +84,10 @@ const DEFAULT_VIDEO_AGENT_CONFIG: VideoAgentConfig = {
|
||||
};
|
||||
|
||||
export interface VideoAgentOptions {
|
||||
/**
|
||||
* AI SDK Model for chat. Must be a vision-enabled model (e.g., openai('gpt-4o'),
|
||||
* anthropic('claude-3.5-sonnet'), google('gemini-1.5-pro')) to process video frames.
|
||||
*/
|
||||
model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
|
||||
transcriptionModel?: TranscriptionModel; // AI SDK Transcription Model (e.g., openai.transcription('whisper-1'))
|
||||
speechModel?: SpeechModel; // AI SDK Speech Model (e.g., openai.speech('gpt-4o-mini-tts'))
|
||||
@@ -183,7 +187,7 @@ Use tools when needed to provide accurate information.`;
|
||||
this.endpoint = options.endpoint;
|
||||
this.voice = options.voice || "alloy";
|
||||
this.speechInstructions = options.speechInstructions;
|
||||
this.outputFormat = options.outputFormat || "mp3";
|
||||
this.outputFormat = options.outputFormat || "opus";
|
||||
this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
|
||||
this.maxFrameInputSize = options.maxFrameInputSize ?? DEFAULT_MAX_FRAME_SIZE;
|
||||
|
||||
@@ -265,7 +269,6 @@ Use tools when needed to provide accurate information.`;
|
||||
this.socket.on("message", async (data) => {
|
||||
try {
|
||||
const message = JSON.parse(data.toString());
|
||||
console.log(`Received WebSocket message of type: ${message.type}`);
|
||||
|
||||
switch (message.type) {
|
||||
// Handle transcribed text from the client/STT
|
||||
@@ -278,7 +281,6 @@ Use tools when needed to provide accurate information.`;
|
||||
this.interruptCurrentResponse("user_speaking");
|
||||
// Force capture current frame when user speaks
|
||||
this.requestFrameCapture("user_request");
|
||||
console.log(`Processing transcript: "${message.text}"`);
|
||||
await this.enqueueTextInput(message.text);
|
||||
break;
|
||||
|
||||
@@ -292,10 +294,11 @@ Use tools when needed to provide accurate information.`;
|
||||
this.interruptCurrentResponse("user_speaking");
|
||||
// Force capture current frame when user speaks
|
||||
this.requestFrameCapture("user_request");
|
||||
console.log(
|
||||
`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`
|
||||
);
|
||||
try {
|
||||
await this.processAudioInput(message);
|
||||
} catch (audioError) {
|
||||
this.emit("error", audioError);
|
||||
}
|
||||
break;
|
||||
|
||||
// Handle video frame from client
|
||||
@@ -305,7 +308,6 @@ Use tools when needed to provide accurate information.`;
|
||||
|
||||
// Handle explicit interrupt request from client
|
||||
case "interrupt":
|
||||
console.log(`Received interrupt request: ${message.reason || "client_request"}`);
|
||||
this.interruptCurrentResponse(message.reason || "client_request");
|
||||
break;
|
||||
|
||||
@@ -315,23 +317,20 @@ Use tools when needed to provide accurate information.`;
|
||||
break;
|
||||
|
||||
default:
|
||||
console.log(`Unknown message type: ${message.type}`);
|
||||
break;
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("Failed to process message:", err);
|
||||
this.emit("error", err);
|
||||
}
|
||||
});
|
||||
|
||||
this.socket.on("close", () => {
|
||||
console.log("Disconnected");
|
||||
this.isConnected = false;
|
||||
this.cleanupOnDisconnect();
|
||||
this.emit("disconnected");
|
||||
});
|
||||
|
||||
this.socket.on("error", (error) => {
|
||||
console.error("WebSocket error:", error);
|
||||
this.emit("error", error);
|
||||
});
|
||||
}
|
||||
@@ -340,8 +339,6 @@ Use tools when needed to provide accurate information.`;
|
||||
* Handle client ready signal
|
||||
*/
|
||||
private handleClientReady(message: any): void {
|
||||
console.log(`Client ready, capabilities: ${JSON.stringify(message.capabilities || {})}`);
|
||||
|
||||
// Send session info to client
|
||||
this.sendWebSocketMessage({
|
||||
type: "session_init",
|
||||
@@ -403,12 +400,7 @@ Use tools when needed to provide accurate information.`;
|
||||
sequence: frame.sequence,
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
|
||||
console.log(
|
||||
`Received frame #${frame.sequence} (${frame.triggerReason}): ${(frameSize / 1024).toFixed(1)}KB, ${frame.image.width}x${frame.image.height}`
|
||||
);
|
||||
} catch (error) {
|
||||
console.error("Failed to handle video frame:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
}
|
||||
@@ -485,18 +477,12 @@ Use tools when needed to provide accurate information.`;
|
||||
throw new Error("Transcription model not configured");
|
||||
}
|
||||
|
||||
console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);
|
||||
|
||||
try {
|
||||
const result = await transcribe({
|
||||
model: this.transcriptionModel,
|
||||
audio: audioData,
|
||||
});
|
||||
|
||||
console.log(
|
||||
`Whisper transcription result: "${result.text}", language: ${result.language || "unknown"}`
|
||||
);
|
||||
|
||||
this.emit("transcription", {
|
||||
text: result.text,
|
||||
language: result.language,
|
||||
@@ -511,7 +497,6 @@ Use tools when needed to provide accurate information.`;
|
||||
|
||||
return result.text;
|
||||
} catch (error) {
|
||||
console.error("Whisper transcription failed:", error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
@@ -707,21 +692,15 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
|
||||
try {
|
||||
console.log(
|
||||
`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? "..." : ""}"`
|
||||
);
|
||||
const audioData = await this.generateSpeechFromText(
|
||||
chunk.text,
|
||||
this.currentSpeechAbortController.signal
|
||||
);
|
||||
console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
|
||||
return audioData;
|
||||
} catch (error) {
|
||||
if ((error as Error).name === "AbortError") {
|
||||
console.log(`Audio generation aborted for chunk ${chunk.id}`);
|
||||
return null;
|
||||
}
|
||||
console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
|
||||
this.emit("error", error);
|
||||
return null;
|
||||
}
|
||||
@@ -734,7 +713,6 @@ Use tools when needed to provide accurate information.`;
|
||||
if (this.isSpeaking) return;
|
||||
this.isSpeaking = true;
|
||||
|
||||
console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
|
||||
this.emit("speech_start", { streaming: true });
|
||||
this.sendWebSocketMessage({ type: "speech_stream_start" });
|
||||
|
||||
@@ -742,10 +720,6 @@ Use tools when needed to provide accurate information.`;
|
||||
while (this.speechChunkQueue.length > 0) {
|
||||
const chunk = this.speechChunkQueue[0];
|
||||
|
||||
console.log(
|
||||
`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`
|
||||
);
|
||||
|
||||
if (!chunk.audioPromise) {
|
||||
chunk.audioPromise = this.generateChunkAudio(chunk);
|
||||
}
|
||||
@@ -753,7 +727,6 @@ Use tools when needed to provide accurate information.`;
|
||||
const audioData = await chunk.audioPromise;
|
||||
|
||||
if (!this.isSpeaking) {
|
||||
console.log(`Speech interrupted during chunk #${chunk.id}`);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -761,9 +734,6 @@ Use tools when needed to provide accurate information.`;
|
||||
|
||||
if (audioData) {
|
||||
const base64Audio = Buffer.from(audioData).toString("base64");
|
||||
console.log(
|
||||
`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`
|
||||
);
|
||||
|
||||
this.sendWebSocketMessage({
|
||||
type: "audio_chunk",
|
||||
@@ -780,8 +750,6 @@ Use tools when needed to provide accurate information.`;
|
||||
text: chunk.text,
|
||||
uint8Array: audioData,
|
||||
});
|
||||
} else {
|
||||
console.log(`No audio data generated for chunk #${chunk.id}`);
|
||||
}
|
||||
|
||||
if (this.streamingSpeechConfig.parallelGeneration) {
|
||||
@@ -792,7 +760,6 @@ Use tools when needed to provide accurate information.`;
|
||||
);
|
||||
|
||||
if (toStart > 0) {
|
||||
console.log(`Starting parallel generation for ${toStart} more chunks`);
|
||||
for (let i = 0; i < toStart; i++) {
|
||||
const nextChunk = this.speechChunkQueue.find((c) => !c.audioPromise);
|
||||
if (nextChunk) {
|
||||
@@ -803,7 +770,6 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error in speech queue processing:", error);
|
||||
this.emit("error", error);
|
||||
} finally {
|
||||
this.isSpeaking = false;
|
||||
@@ -815,7 +781,6 @@ Use tools when needed to provide accurate information.`;
|
||||
this.speechQueueDonePromise = undefined;
|
||||
}
|
||||
|
||||
console.log(`Speech queue processing complete`);
|
||||
this.sendWebSocketMessage({ type: "speech_stream_end" });
|
||||
this.emit("speech_complete", { streaming: true });
|
||||
}
|
||||
@@ -850,9 +815,14 @@ Use tools when needed to provide accurate information.`;
|
||||
/**
|
||||
* Process incoming audio data: transcribe and generate response
|
||||
*/
|
||||
private async processAudioInput(audioMessage: AudioData): Promise<void> {
|
||||
private async processAudioInput(audioMessage: AudioData | { type: string; data: string; format?: string; sessionId?: string }): Promise<void> {
|
||||
if (!this.transcriptionModel) {
|
||||
this.emit("error", new Error("Transcription model not configured for audio input"));
|
||||
const error = new Error("Transcription model not configured for audio input");
|
||||
this.emit("error", error);
|
||||
this.sendWebSocketMessage({
|
||||
type: "error",
|
||||
error: error.message,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -877,15 +847,11 @@ Use tools when needed to provide accurate information.`;
|
||||
this.emit("audio_received", {
|
||||
size: audioBuffer.length,
|
||||
format: audioMessage.format,
|
||||
sessionId: audioMessage.sessionId,
|
||||
sessionId: audioMessage.sessionId || this.sessionId,
|
||||
});
|
||||
|
||||
console.log(
|
||||
`Processing audio input: ${audioBuffer.length} bytes, format: ${audioMessage.format || "unknown"}`
|
||||
);
|
||||
|
||||
const transcribedText = await this.transcribeAudio(audioBuffer);
|
||||
console.log(`Transcribed text: "${transcribedText}"`);
|
||||
|
||||
if (transcribedText.trim()) {
|
||||
await this.enqueueTextInput(transcribedText);
|
||||
@@ -897,7 +863,6 @@ Use tools when needed to provide accurate information.`;
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to process audio input:", error);
|
||||
this.emit("error", error);
|
||||
this.sendWebSocketMessage({
|
||||
type: "transcription_error",
|
||||
@@ -1049,7 +1014,9 @@ Use tools when needed to provide accurate information.`;
|
||||
* Drain the input queue, processing one request at a time
|
||||
*/
|
||||
private async drainInputQueue(): Promise<void> {
|
||||
if (this.processingQueue) return;
|
||||
if (this.processingQueue) {
|
||||
return;
|
||||
}
|
||||
this.processingQueue = true;
|
||||
|
||||
try {
|
||||
@@ -1151,7 +1118,6 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
},
|
||||
onError: ({ error }) => {
|
||||
console.error("Stream error:", error);
|
||||
this.emit("error", error);
|
||||
},
|
||||
});
|
||||
@@ -1229,7 +1195,6 @@ Use tools when needed to provide accurate information.`;
|
||||
}
|
||||
},
|
||||
onError: ({ error }) => {
|
||||
console.error("Stream error:", error);
|
||||
this.emit("error", error);
|
||||
},
|
||||
});
|
||||
@@ -1513,23 +1478,9 @@ Use tools when needed to provide accurate information.`;
|
||||
|
||||
try {
|
||||
if (this.socket.readyState === WebSocket.OPEN) {
|
||||
if (message.type === "audio_chunk" || message.type === "audio") {
|
||||
const { data, ...rest } = message as any;
|
||||
console.log(
|
||||
`Sending WebSocket message: ${message.type}`,
|
||||
data ? `(${(data.length / 1000).toFixed(1)}KB audio data)` : "",
|
||||
rest
|
||||
);
|
||||
} else {
|
||||
console.log(`Sending WebSocket message: ${message.type}`);
|
||||
}
|
||||
|
||||
this.socket.send(JSON.stringify(message));
|
||||
} else {
|
||||
console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to send WebSocket message:", error);
|
||||
this.emit("error", error);
|
||||
}
|
||||
}
|
||||
@@ -1538,7 +1489,6 @@ Use tools when needed to provide accurate information.`;
|
||||
* Start listening for voice/video input
|
||||
*/
|
||||
startListening() {
|
||||
console.log("Starting video agent...");
|
||||
this.emit("listening");
|
||||
}
|
||||
|
||||
@@ -1546,7 +1496,6 @@ Use tools when needed to provide accurate information.`;
|
||||
* Stop listening for voice/video input
|
||||
*/
|
||||
stopListening() {
|
||||
console.log("Stopping video agent...");
|
||||
this.emit("stopped");
|
||||
}
|
||||
|
||||
|
||||
484
src/VoiceAgent.new.ts
Normal file
@@ -0,0 +1,484 @@
|
||||
import { WebSocket } from "ws";
|
||||
import { EventEmitter } from "events";
|
||||
import {
|
||||
streamText,
|
||||
type LanguageModel,
|
||||
stepCountIs,
|
||||
type Tool,
|
||||
type ModelMessage,
|
||||
type TranscriptionModel,
|
||||
type SpeechModel,
|
||||
} from "ai";
|
||||
import {
|
||||
type StreamingSpeechConfig,
|
||||
type HistoryConfig,
|
||||
} from "./types";
|
||||
import {
|
||||
WebSocketManager,
|
||||
SpeechManager,
|
||||
ConversationManager,
|
||||
TranscriptionManager,
|
||||
InputQueue,
|
||||
type QueueItem,
|
||||
processFullStream,
|
||||
handleStreamChunk,
|
||||
} from "./core";
|
||||
|
||||
/** Construction options for `VoiceAgent`. Only `model` is required. */
export interface VoiceAgentOptions {
|
||||
/** Language model stored by the agent for chat responses. */
model: LanguageModel;
|
||||
/** Transcription model handed to the agent's TranscriptionManager. */
transcriptionModel?: TranscriptionModel;
|
||||
/** Speech model handed to the agent's SpeechManager. */
speechModel?: SpeechModel;
|
||||
/** System instructions; defaults to "You are a helpful voice assistant." */
instructions?: string;
|
||||
/** Stop condition for the model stream; defaults to `stepCountIs(5)`. */
stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
|
||||
/** Initial tools; shallow-copied into the agent's tool map. */
tools?: Record<string, Tool>;
|
||||
/** Stored on the agent; presumably the default WebSocket URL — confirm against `connect`. */
endpoint?: string;
|
||||
/** Voice setting forwarded to the SpeechManager. */
voice?: string;
|
||||
/** Speech instructions forwarded to the SpeechManager. */
speechInstructions?: string;
|
||||
/** Audio output format forwarded to the SpeechManager. */
outputFormat?: string;
|
||||
/** Configuration for streaming speech generation */
|
||||
streamingSpeech?: Partial<StreamingSpeechConfig>;
|
||||
/** Configuration for conversation history memory limits */
|
||||
history?: Partial<HistoryConfig>;
|
||||
/** Maximum audio input size in bytes (default: 10 MB) */
|
||||
maxAudioInputSize?: number;
|
||||
}
|
||||
|
||||
/** Shape of items in the voice agent's input queue */
|
||||
interface VoiceInputItem extends QueueItem<string> {
|
||||
/** User text; the queue processor passes it to `processUserInput`. */
text: string;
|
||||
}
|
||||
|
||||
/**
 * A single-session voice agent that manages one WebSocket connection at a time.
 *
 * **Important:** Each `VoiceAgent` instance holds its own conversation history,
 * input queue, speech state, and WebSocket. It is designed for **one user per
 * instance**. To support multiple concurrent users, create a separate
 * `VoiceAgent` for each connection:
 *
 * ```ts
 * wss.on("connection", (socket) => {
 *   const agent = new VoiceAgent({ model, ... });
 *   agent.handleSocket(socket);
 *   agent.on("disconnected", () => agent.destroy());
 * });
 * ```
 *
 * Sharing a single instance across multiple users will cause conversation
 * history cross-contamination, interleaved audio, and unpredictable behavior.
 *
 * Events emitted directly by this class: `text`, `tool_result`, `listening`,
 * `stopped`, `warning`, `error`, `disconnected`. Selected events from the
 * WebSocket, speech, conversation and transcription managers are re-emitted
 * unchanged (see the constructor).
 */
export class VoiceAgent extends EventEmitter {
  private model: LanguageModel;
  private instructions: string;
  private stopWhen: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
  private endpoint?: string;
  private tools: Record<string, Tool> = {};
  private isDestroyed = false;
  private _isProcessing = false;

  /** Abort controller for the LLM stream currently in flight, if any. */
  private currentStreamAbortController?: AbortController;

  // ── Managers ──────────────────────────────────────────
  private ws: WebSocketManager;
  private speech: SpeechManager;
  private conversation: ConversationManager;
  private transcription: TranscriptionManager;
  private inputQueue: InputQueue<VoiceInputItem>;

  /**
   * Build the agent, create its managers and wire them together.
   *
   * @param options Models, prompt, tools and per-manager configuration.
   */
  constructor(options: VoiceAgentOptions) {
    super();
    this.model = options.model;
    this.instructions =
      options.instructions || "You are a helpful voice assistant.";
    this.stopWhen = options.stopWhen || stepCountIs(5);
    this.endpoint = options.endpoint;
    if (options.tools) {
      // Copy so later registerTools() calls never mutate the caller's object.
      this.tools = { ...options.tools };
    }

    // ── Initialize managers ──────────────────────────────
    this.ws = new WebSocketManager();
    this.speech = new SpeechManager({
      speechModel: options.speechModel,
      voice: options.voice,
      speechInstructions: options.speechInstructions,
      outputFormat: options.outputFormat,
      streamingSpeech: options.streamingSpeech,
    });
    this.conversation = new ConversationManager({
      history: options.history,
    });
    this.transcription = new TranscriptionManager({
      transcriptionModel: options.transcriptionModel,
      maxAudioInputSize: options.maxAudioInputSize,
    });
    this.inputQueue = new InputQueue<VoiceInputItem>();

    // ── Wire managers to the WebSocket send function ─────
    const sendMsg = (msg: Record<string, unknown>) => this.ws.send(msg);
    this.speech.sendMessage = sendMsg;
    this.transcription.sendMessage = sendMsg;

    // ── Wire the input queue processor ───────────────────
    this.inputQueue.processor = (item) => this.processUserInput(item.text);

    // ── Bubble events from managers ──────────────────────
    this.bubbleEvents(this.ws, ["connected", "error"]);
    this.bubbleEvents(this.speech, [
      "speech_start",
      "speech_complete",
      "speech_interrupted",
      "speech_chunk_queued",
      "audio_chunk",
      "audio",
      "error",
    ]);
    this.bubbleEvents(this.conversation, [
      "history_cleared",
      "history_trimmed",
    ]);
    this.bubbleEvents(this.transcription, [
      "transcription",
      "audio_received",
      "error",
      "warning",
    ]);

    // ── Handle WebSocket lifecycle events ────────────────
    this.ws.on("disconnected", () => {
      this.cleanupOnDisconnect();
      this.emit("disconnected");
    });

    this.ws.on("message", (message: any) => {
      // handleMessage reports its own failures; nothing to await here.
      void this.handleMessage(message);
    });
  }

  // ── Public API ────────────────────────────────────────

  /**
   * Merge additional tools into the set offered to the model.
   * Tools with an already-registered name are replaced.
   */
  public registerTools(tools: Record<string, Tool>) {
    this.tools = { ...this.tools, ...tools };
  }

  /**
   * Transcribe audio data to text using the configured transcription model.
   */
  public async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    return this.transcription.transcribeAudio(audioData);
  }

  /**
   * Generate speech from text using the configured speech model.
   *
   * @param text        Text to synthesize.
   * @param abortSignal Optional signal forwarded to the speech manager.
   */
  public async generateSpeechFromText(
    text: string,
    abortSignal?: AbortSignal
  ): Promise<Uint8Array> {
    return this.speech.generateSpeechFromText(text, abortSignal);
  }

  /**
   * Interrupt ongoing speech generation and playback (barge-in support).
   */
  public interruptSpeech(reason: string = "interrupted"): void {
    this.speech.interruptSpeech(reason);
  }

  /**
   * Interrupt both the current LLM stream and ongoing speech.
   */
  public interruptCurrentResponse(reason: string = "interrupted"): void {
    const controller = this.currentStreamAbortController;
    if (controller) {
      controller.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.interruptSpeech(reason);
  }

  /**
   * Connect to a WebSocket server by URL.
   *
   * @param url Target URL; falls back to the configured `endpoint`, then to
   *            `ws://localhost:8080`.
   * @throws Error if the agent has been destroyed.
   */
  public async connect(url?: string): Promise<void> {
    this.ensureNotDestroyed();
    const wsUrl = url || this.endpoint || "ws://localhost:8080";
    await this.ws.connect(wsUrl);
  }

  /**
   * Attach an existing WebSocket (server-side usage).
   *
   * @throws Error if the agent has been destroyed.
   */
  public handleSocket(socket: WebSocket): void {
    this.ensureNotDestroyed();
    this.ws.handleSocket(socket);
  }

  /**
   * Send text input for processing (bypasses transcription).
   *
   * @returns The value produced by processing the input (the assistant's
   *          full reply text).
   * @throws Error if the agent has been destroyed or `text` is blank.
   */
  public async sendText(text: string): Promise<string> {
    this.ensureNotDestroyed();
    if (!text || !text.trim()) {
      throw new Error("Text input cannot be empty");
    }
    return this.enqueueInput(text);
  }

  /**
   * Send base64 audio data to be transcribed and processed.
   *
   * @throws Error if the agent has been destroyed.
   */
  public async sendAudio(audioData: string): Promise<void> {
    this.ensureNotDestroyed();
    await this.handleAudioInput(audioData);
  }

  /**
   * Send raw audio buffer to be transcribed and processed.
   *
   * @throws Error if the agent has been destroyed.
   */
  public async sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void> {
    this.ensureNotDestroyed();
    const base64Audio = Buffer.from(audioBuffer).toString("base64");
    await this.handleAudioInput(base64Audio);
  }

  /**
   * Generate speech for full text at once (non-streaming fallback).
   */
  public async generateAndSendSpeechFull(text: string): Promise<void> {
    return this.speech.generateAndSendSpeechFull(text);
  }

  /** Start listening for voice input */
  startListening() {
    console.log("Starting voice agent...");
    this.emit("listening");
  }

  /** Stop listening for voice input */
  stopListening() {
    console.log("Stopping voice agent...");
    this.emit("stopped");
  }

  /** Clear conversation history */
  clearHistory() {
    this.conversation.clearHistory();
  }

  /** Get current conversation history */
  getHistory(): ModelMessage[] {
    return this.conversation.getHistory();
  }

  /** Set conversation history (useful for restoring sessions) */
  setHistory(history: ModelMessage[]) {
    this.conversation.setHistory(history);
  }

  /** Disconnect from WebSocket and stop all in-flight work */
  disconnect() {
    this.ws.disconnect();
  }

  /**
   * Permanently destroy the agent, releasing all resources.
   * Calling it more than once is a no-op.
   */
  destroy() {
    if (this.isDestroyed) return;
    this.isDestroyed = true;
    this.cleanupOnDisconnect();
    this.ws.disconnect();
    this.conversation.clearHistory();
    this.tools = {};
    // Must stay last: the steps above may still notify listeners.
    this.removeAllListeners();
  }

  // ── Getters ───────────────────────────────────────────

  /** Whether the WebSocket manager reports an open connection. */
  get connected(): boolean {
    return this.ws.isConnected;
  }

  /** Whether a user input is currently being processed by the LLM. */
  get processing(): boolean {
    return this._isProcessing;
  }

  /** Whether the speech manager is currently speaking. */
  get speaking(): boolean {
    return this.speech.isSpeaking;
  }

  /** Number of speech chunks the speech manager still has queued. */
  get pendingSpeechChunks(): number {
    return this.speech.pendingChunkCount;
  }

  /** Whether `destroy()` has been called. */
  get destroyed(): boolean {
    return this.isDestroyed;
  }

  // ── Private: message handling ─────────────────────────

  /**
   * Dispatch one decoded WebSocket message. Handles `transcript`, `audio`
   * and `interrupt`; any other type is ignored. Failures are logged and
   * reported through the `error` event instead of being thrown.
   */
  private async handleMessage(message: any): Promise<void> {
    try {
      console.log(`Received WebSocket message of type: ${message.type}`);

      switch (message.type) {
        case "transcript": {
          if (typeof message.text !== "string" || !message.text.trim()) {
            this.emit("warning", "Received empty or invalid transcript message");
            return;
          }
          // New user speech pre-empts whatever the agent is currently saying.
          this.interruptCurrentResponse("user_speaking");
          console.log(`Processing transcript: "${message.text}"`);
          await this.enqueueInput(message.text);
          break;
        }
        case "audio": {
          if (typeof message.data !== "string" || !message.data) {
            this.emit("warning", "Received empty or invalid audio message");
            return;
          }
          this.interruptCurrentResponse("user_speaking");
          console.log(
            `Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || "unknown"}`
          );
          await this.handleAudioInput(message.data, message.format);
          break;
        }
        case "interrupt": {
          console.log(
            `Received interrupt request: ${message.reason || "client_request"}`
          );
          this.interruptCurrentResponse(message.reason || "client_request");
          break;
        }
      }
    } catch (err) {
      console.error("Failed to process message:", err);
      this.emitError(err);
    }
  }

  // ── Private: audio ────────────────────────────────────

  /**
   * Transcribe base64 audio and, when text comes back, queue it as user input.
   */
  private async handleAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<void> {
    const text = await this.transcription.processAudioInput(
      base64Audio,
      format
    );
    if (text) {
      await this.enqueueInput(text);
    }
  }

  // ── Private: input queue ──────────────────────────────

  /**
   * Queue user text for serial processing.
   *
   * @returns A promise settled by the queue once the item has been processed
   *          or rejected.
   */
  private enqueueInput(text: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      this.inputQueue.enqueue({ text, resolve, reject });
    });
  }

  // ── Private: LLM processing ───────────────────────────

  /**
   * Process user input with streaming text generation.
   * Called serially by the input queue.
   *
   * @returns The full assistant reply text.
   */
  private async processUserInput(text: string): Promise<string> {
    this._isProcessing = true;
    const controller = new AbortController();
    this.currentStreamAbortController = controller;

    try {
      this.emit("text", { role: "user", text });

      this.conversation.addMessage({ role: "user", content: text });

      const result = streamText({
        model: this.model,
        system: this.instructions,
        messages: this.conversation.getHistoryRef(),
        tools: this.tools,
        stopWhen: this.stopWhen,
        abortSignal: controller.signal,
        onChunk: ({ chunk }) => {
          handleStreamChunk(chunk, (event, data) => this.emit(event, data));
        },
        onFinish: async (event) => {
          for (const step of event.steps) {
            for (const toolResult of step.toolResults) {
              this.emit("tool_result", {
                name: toolResult.toolName,
                toolCallId: toolResult.toolCallId,
                result: toolResult.output,
              });
            }
          }
        },
        onError: ({ error }) => {
          console.error("Stream error:", error);
          this.emitError(error);
        },
      });

      const streamResult = await processFullStream(result, {
        onTextDelta: (delta) => this.speech.processTextDelta(delta),
        onTextEnd: () => this.speech.flushPendingText(),
        sendMessage: (msg) => this.ws.send(msg),
        emitEvent: (event, data) => this.emit(event, data),
      });

      // Add assistant response to history
      if (streamResult.fullText) {
        this.conversation.addMessage({
          role: "assistant",
          content: streamResult.fullText,
        });
      }

      // Flush any remaining speech
      this.speech.flushPendingText();

      // Wait for all speech chunks to complete
      if (this.speech.queueDonePromise) {
        await this.speech.queueDonePromise;
      }

      return streamResult.fullText;
    } catch (error) {
      // Clean up speech state on error
      this.speech.reset();
      throw error;
    } finally {
      this._isProcessing = false;
      // Only clear the controller this call installed, never a newer one.
      if (this.currentStreamAbortController === controller) {
        this.currentStreamAbortController = undefined;
      }
    }
  }

  // ── Private: helpers ──────────────────────────────────

  /** @throws Error if `destroy()` has already been called. */
  private ensureNotDestroyed(): void {
    if (this.isDestroyed) {
      throw new Error("VoiceAgent has been destroyed and cannot be used");
    }
  }

  /**
   * Emit `error`, except when the agent has been destroyed and no listener
   * is left. EventEmitter throws on an unhandled `error` event, and late
   * async failures after `destroy()` would otherwise surface as unhandled
   * promise rejections.
   */
  private emitError(error: unknown): void {
    if (this.isDestroyed && this.listenerCount("error") === 0) return;
    this.emit("error", error);
  }

  /**
   * Clean up all in-flight state when the connection drops.
   */
  private cleanupOnDisconnect(): void {
    const controller = this.currentStreamAbortController;
    if (controller) {
      controller.abort();
      this.currentStreamAbortController = undefined;
    }
    this.speech.reset();
    this._isProcessing = false;
    this.inputQueue.rejectAll(new Error("Connection closed"));
  }

  /**
   * Forward select events from a child emitter to this agent.
   * A bubbled `error` is dropped once the agent is destroyed and has no
   * `error` listener left, because emitting it would throw.
   */
  private bubbleEvents(source: EventEmitter, events: string[]): void {
    for (const event of events) {
      source.on(event, (...args: any[]) => {
        if (
          event === "error" &&
          this.isDestroyed &&
          this.listenerCount("error") === 0
        ) {
          return;
        }
        this.emit(event, ...args);
      });
    }
  }
}
|
||||
@@ -39,6 +39,25 @@ export interface VoiceAgentOptions {
|
||||
maxAudioInputSize?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* A single-session voice agent that manages one WebSocket connection at a time.
|
||||
*
|
||||
* **Important:** Each `VoiceAgent` instance holds its own conversation history,
|
||||
* input queue, speech state, and WebSocket. It is designed for **one user per
|
||||
* instance**. To support multiple concurrent users, create a separate
|
||||
* `VoiceAgent` for each connection:
|
||||
*
|
||||
* ```ts
|
||||
* wss.on("connection", (socket) => {
|
||||
* const agent = new VoiceAgent({ model, ... });
|
||||
* agent.handleSocket(socket);
|
||||
* agent.on("disconnected", () => agent.destroy());
|
||||
* });
|
||||
* ```
|
||||
*
|
||||
* Sharing a single instance across multiple users will cause conversation
|
||||
* history cross-contamination, interleaved audio, and unpredictable behavior.
|
||||
*/
|
||||
export class VoiceAgent extends EventEmitter {
|
||||
private socket?: WebSocket;
|
||||
private tools: Record<string, Tool> = {};
|
||||
@@ -90,7 +109,7 @@ export class VoiceAgent extends EventEmitter {
|
||||
this.endpoint = options.endpoint;
|
||||
this.voice = options.voice || "alloy";
|
||||
this.speechInstructions = options.speechInstructions;
|
||||
this.outputFormat = options.outputFormat || "mp3";
|
||||
this.outputFormat = options.outputFormat || "opus";
|
||||
this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
|
||||
if (options.tools) {
|
||||
this.tools = { ...options.tools };
|
||||
@@ -695,6 +714,10 @@ export class VoiceAgent extends EventEmitter {
|
||||
* Attach an existing WebSocket (server-side usage).
|
||||
* Use this when a WS server accepts a connection and you want the
|
||||
* agent to handle messages on that socket.
|
||||
*
|
||||
* **Note:** Calling this while a socket is already attached will cleanly
|
||||
* tear down the previous connection first. Each `VoiceAgent` instance
|
||||
* supports only one socket at a time — create a new agent per user.
|
||||
*/
|
||||
public handleSocket(socket: WebSocket): void {
|
||||
this.ensureNotDestroyed();
|
||||
|
||||
122
src/core/ConversationManager.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
import { EventEmitter } from "events";
|
||||
import { type ModelMessage } from "ai";
|
||||
import { type HistoryConfig, DEFAULT_HISTORY_CONFIG } from "../types";
|
||||
|
||||
export interface ConversationManagerOptions {
  /** Overrides merged on top of `DEFAULT_HISTORY_CONFIG`. */
  history?: Partial<HistoryConfig>;
}
|
||||
|
||||
/**
 * Manages conversation history (ModelMessage[]) with configurable
 * limits on message count and total character size.
 *
 * Emits `history_cleared` when the history is cleared and `history_trimmed`
 * (`{ removedCount, reason }`) whenever old messages are dropped.
 */
export class ConversationManager extends EventEmitter {
  private conversationHistory: ModelMessage[] = [];
  private historyConfig: HistoryConfig;

  /**
   * @param options Optional overrides merged on top of `DEFAULT_HISTORY_CONFIG`.
   */
  constructor(options: ConversationManagerOptions = {}) {
    super();
    this.historyConfig = {
      ...DEFAULT_HISTORY_CONFIG,
      ...options.history,
    };
  }

  /**
   * Add a message to history and trim if needed.
   */
  addMessage(message: ModelMessage): void {
    this.conversationHistory.push(message);
    this.trimHistory();
  }

  /**
   * Get a copy of the current history.
   */
  getHistory(): ModelMessage[] {
    return [...this.conversationHistory];
  }

  /**
   * Get a direct reference to the history array.
   * Use with caution — prefer getHistory() for safety.
   */
  getHistoryRef(): ModelMessage[] {
    return this.conversationHistory;
  }

  /**
   * Replace the entire conversation history with a copy of `history`.
   * Limits are not applied here; they are enforced on the next `addMessage`.
   */
  setHistory(history: ModelMessage[]): void {
    this.conversationHistory = [...history];
  }

  /**
   * Clear all conversation history.
   */
  clearHistory(): void {
    this.conversationHistory = [];
    this.emit("history_cleared");
  }

  /**
   * Get the number of messages in history.
   */
  get length(): number {
    return this.conversationHistory.length;
  }

  /**
   * Size of one message for the character budget: the string length of its
   * content, or of its JSON serialization when the content is not a string.
   */
  private static contentLength(message: ModelMessage): number {
    const { content } = message;
    const text =
      typeof content === "string" ? content : (JSON.stringify(content) ?? "");
    return text.length;
  }

  /**
   * Trim conversation history to stay within configured limits.
   *
   * - Count limit (`maxMessages > 0`): drops the oldest messages, rounding the
   *   number up to an even value to keep user/assistant turns paired, but
   *   never drops the newest message.
   * - Character limit (`maxTotalChars > 0`): drops the oldest messages one at
   *   a time until the budget is met or only two messages remain.
   */
  private trimHistory(): void {
    const { maxMessages, maxTotalChars } = this.historyConfig;
    const history = this.conversationHistory;

    // Trim by message count
    if (maxMessages > 0 && history.length > maxMessages) {
      const excess = history.length - maxMessages;
      // Round up to even number to preserve turn pairs...
      const evenExcess = excess + (excess % 2);
      // ...but always keep the most recent message (matters for tiny limits,
      // where rounding up would otherwise wipe the message just added).
      const toRemove = Math.min(evenExcess, history.length - 1);
      history.splice(0, toRemove);
      this.emit("history_trimmed", {
        removedCount: toRemove,
        reason: "max_messages",
      });
    }

    // Trim by total character count
    if (maxTotalChars > 0) {
      let totalChars = 0;
      for (const message of history) {
        totalChars += ConversationManager.contentLength(message);
      }

      let removedCount = 0;
      while (totalChars > maxTotalChars && history.length > 2) {
        const removed = history.shift();
        if (removed) {
          totalChars -= ConversationManager.contentLength(removed);
          removedCount++;
        }
      }

      if (removedCount > 0) {
        this.emit("history_trimmed", {
          removedCount,
          reason: "max_total_chars",
        });
      }
    }
  }
}
|
||||
71
src/core/InputQueue.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
/**
 * Promise callbacks every queued item must carry so the queue can settle it.
 *
 * @template T Type of the value the item is resolved with.
 */
export interface QueueItem<T = string> {
  resolve: (v: T) => void;
  reject: (e: unknown) => void;
}

/**
 * A generic serial input queue that ensures only one processor runs at a time.
 *
 * @template T The shape of each queued item (must include resolve/reject)
 */
export class InputQueue<T extends QueueItem<any>> {
  private queue: T[] = [];
  private processing = false;

  /** Callback invoked for each item — must return a resolved value */
  public processor: (item: T) => Promise<any> = async () => "";

  /**
   * Enqueue an item for serial processing.
   * Processing starts immediately unless another item is already in flight.
   */
  enqueue(item: T): void {
    this.queue.push(item);
    void this.drain();
  }

  /**
   * Reject all pending items (used on disconnect/destroy).
   *
   * Only items still waiting are rejected. An item already handed to the
   * processor is settled by the processor's outcome. The `processing` flag is
   * deliberately left alone: it is owned by the drain loop, and clearing it
   * while an item is in flight would let the next `enqueue` start a second,
   * concurrent drain loop.
   */
  rejectAll(reason: Error): void {
    // Detach first so a reject callback that re-enqueues is not wiped.
    const pending = this.queue;
    this.queue = [];
    for (const item of pending) {
      item.reject(reason);
    }
  }

  /**
   * Number of items waiting in the queue.
   */
  get length(): number {
    return this.queue.length;
  }

  /**
   * Whether the queue is currently processing an item.
   */
  get isProcessing(): boolean {
    return this.processing;
  }

  // ── Private ──────────────────────────────────────────

  /**
   * Process queued items one at a time until the queue is empty.
   * Each item is resolved with the processor's result or rejected with its
   * error. A call made while another drain is running returns immediately,
   * and the running loop picks up newly queued items.
   */
  private async drain(): Promise<void> {
    if (this.processing) return;
    this.processing = true;

    try {
      let item = this.queue.shift();
      while (item !== undefined) {
        try {
          item.resolve(await this.processor(item));
        } catch (error) {
          item.reject(error);
        }
        item = this.queue.shift();
      }
    } finally {
      this.processing = false;
    }
  }
}
|
||||
453
src/core/SpeechManager.ts
Normal file
@@ -0,0 +1,453 @@
|
||||
import { EventEmitter } from "events";
|
||||
import {
|
||||
experimental_generateSpeech as generateSpeech,
|
||||
type SpeechModel,
|
||||
} from "ai";
|
||||
import {
|
||||
type SpeechChunk,
|
||||
type StreamingSpeechConfig,
|
||||
DEFAULT_STREAMING_SPEECH_CONFIG,
|
||||
} from "../types";
|
||||
|
||||
export interface SpeechManagerOptions {
  /** Text-to-speech model; without it the manager produces no audio. */
  speechModel?: SpeechModel;
  /** Voice identifier passed to the speech model (the manager defaults to "alloy"). */
  voice?: string;
  /** Extra instructions passed to the speech model with every request. */
  speechInstructions?: string;
  /** Audio output format (the manager defaults to "opus"). */
  outputFormat?: string;
  /** Overrides merged on top of `DEFAULT_STREAMING_SPEECH_CONFIG`. */
  streamingSpeech?: Partial<StreamingSpeechConfig>;
}
|
||||
|
||||
/**
|
||||
* Manages text-to-speech generation, streaming speech chunking,
|
||||
* parallel TTS requests, and speech interruption.
|
||||
*/
|
||||
export class SpeechManager extends EventEmitter {
|
||||
private speechModel?: SpeechModel;
|
||||
private voice: string;
|
||||
private speechInstructions?: string;
|
||||
private outputFormat: string;
|
||||
private streamingSpeechConfig: StreamingSpeechConfig;
|
||||
|
||||
private currentSpeechAbortController?: AbortController;
|
||||
private speechChunkQueue: SpeechChunk[] = [];
|
||||
private nextChunkId = 0;
|
||||
private _isSpeaking = false;
|
||||
private pendingTextBuffer = "";
|
||||
|
||||
// Promise-based signal for speech queue completion
|
||||
private speechQueueDonePromise?: Promise<void>;
|
||||
private speechQueueDoneResolve?: () => void;
|
||||
|
||||
/** Callback to send messages over the WebSocket */
|
||||
public sendMessage: (message: Record<string, unknown>) => void = () => { };
|
||||
|
||||
/**
 * Store the speech settings, applying defaults for voice ("alloy"),
 * output format ("opus") and the streaming configuration.
 */
constructor(options: SpeechManagerOptions) {
  super();
  const {
    speechModel,
    voice,
    speechInstructions,
    outputFormat,
    streamingSpeech,
  } = options;

  this.speechModel = speechModel;
  this.voice = voice || "alloy";
  this.speechInstructions = speechInstructions;
  this.outputFormat = outputFormat || "opus";
  // Caller-supplied streaming options win over the defaults.
  this.streamingSpeechConfig = {
    ...DEFAULT_STREAMING_SPEECH_CONFIG,
    ...streamingSpeech,
  };
}
|
||||
|
||||
/** Whether the speech queue is currently being processed. */
get isSpeaking(): boolean {
  return this._isSpeaking;
}

/** Number of chunks currently held in the speech queue. */
get pendingChunkCount(): number {
  return this.speechChunkQueue.length;
}

/** Whether a speech model was configured. */
get hasSpeechModel(): boolean {
  return Boolean(this.speechModel);
}

/**
 * Promise that settles once the speech queue has been drained, reset or
 * interrupted; `undefined` while no such promise is pending.
 */
get queueDonePromise(): Promise<void> | undefined {
  return this.speechQueueDonePromise;
}
|
||||
|
||||
/**
 * Synthesize `text` with the configured speech model, voice, instructions
 * and output format.
 *
 * @param text        Text to convert to audio.
 * @param abortSignal Optional signal forwarded to the speech request.
 * @returns The generated audio bytes.
 * @throws Error when no speech model is configured.
 */
async generateSpeechFromText(
  text: string,
  abortSignal?: AbortSignal
): Promise<Uint8Array> {
  const model = this.speechModel;
  if (!model) {
    throw new Error("Speech model not configured");
  }

  const { audio } = await generateSpeech({
    model,
    text,
    voice: this.voice,
    instructions: this.speechInstructions,
    outputFormat: this.outputFormat,
    abortSignal,
  });

  return audio.uint8Array;
}
|
||||
|
||||
/**
 * Non-streaming fallback: synthesize the whole text in one request, send it
 * as a single `audio` message and emit `speech_start`, `audio` and
 * `speech_complete`. Does nothing without a speech model. Failures are
 * logged and emitted as `error` rather than thrown.
 */
async generateAndSendSpeechFull(text: string): Promise<void> {
  if (!this.speechModel) return;

  try {
    this.emit("speech_start", { text, streaming: false });

    const uint8Array = await this.generateSpeechFromText(text);
    const data = Buffer.from(uint8Array).toString("base64");
    const format = this.outputFormat;

    // Over the wire first, then to local listeners (which also get raw bytes).
    this.sendMessage({ type: "audio", data, format });
    this.emit("audio", { data, format, uint8Array });

    this.emit("speech_complete", { text, streaming: false });
  } catch (error) {
    console.error("Failed to generate speech:", error);
    this.emit("error", error);
  }
}
|
||||
|
||||
/**
 * Interrupt ongoing speech generation and playback (barge-in support).
 *
 * Text buffered for a not-yet-complete sentence is always discarded, so it
 * cannot be flushed and spoken after the interruption. When speech is
 * actually in progress (speaking, or chunks queued) all speech state is
 * reset, the client is told to stop playback via a `speech_interrupted`
 * message, and a `speech_interrupted` event is emitted.
 *
 * @param reason Label included in the message and the event.
 */
interruptSpeech(reason: string = "interrupted"): void {
  const speechInProgress =
    this._isSpeaking || this.speechChunkQueue.length > 0;

  if (!speechInProgress) {
    // Nothing audible to stop, but drop any half-buffered sentence.
    this.pendingTextBuffer = "";
    return;
  }

  // Aborts pending TTS requests, clears queue and buffer, and resolves
  // anyone waiting on the queue-done promise.
  this.reset();

  // Notify clients to stop audio playback
  this.sendMessage({
    type: "speech_interrupted",
    reason,
  });

  this.emit("speech_interrupted", { reason });
}
|
||||
|
||||
/**
 * Feed one streamed text fragment into the sentence buffer and queue every
 * sentence that became complete for speech generation. No-op without a
 * speech model.
 */
processTextDelta(textDelta: string): void {
  if (!this.speechModel) return;

  this.pendingTextBuffer = `${this.pendingTextBuffer}${textDelta}`;

  const [readySentences, leftover] = this.extractSentences(
    this.pendingTextBuffer
  );
  this.pendingTextBuffer = leftover;

  readySentences.forEach((sentence) => {
    this.queueSpeechChunk(sentence);
  });
}
|
||||
|
||||
/**
 * Queue whatever text is still buffered (an unterminated final sentence)
 * for speech and empty the buffer. Call once the LLM stream has ended.
 * No-op without a speech model or when the buffer is blank.
 */
flushPendingText(): void {
  if (!this.speechModel) return;

  const leftover = this.pendingTextBuffer;
  if (leftover.trim() === "") return;

  this.queueSpeechChunk(leftover);
  this.pendingTextBuffer = "";
}
|
||||
|
||||
/**
 * Drop all speech state (used on disconnect / cleanup): abort pending
 * speech requests, empty the chunk queue and text buffer, mark the manager
 * as not speaking, and release anyone awaiting the queue-done promise.
 */
reset(): void {
  this.currentSpeechAbortController?.abort();
  this.currentSpeechAbortController = undefined;

  this.speechChunkQueue = [];
  this.pendingTextBuffer = "";
  this._isSpeaking = false;

  const releaseWaiters = this.speechQueueDoneResolve;
  if (releaseWaiters) {
    releaseWaiters();
    this.speechQueueDoneResolve = undefined;
    this.speechQueueDonePromise = undefined;
  }
}
|
||||
|
||||
// ── Private helpers ─────────────────────────────────────────
|
||||
|
||||
/**
 * Split the buffer into speakable sentences plus an unfinished remainder.
 *
 * A sentence ends at one or more of `.`, `!`, `?` followed by whitespace or
 * the end of the buffer. Sentences shorter than `minChunkSize` are appended
 * to the previous extracted sentence, or left in the buffer when there is
 * none. If the remainder exceeds `maxChunkSize`, it is split once at the
 * first `,` / `;` / `:` boundary located at or after `minChunkSize`.
 *
 * @returns `[extractedSentences, remainingBuffer]`
 */
private extractSentences(text: string): [string[], string] {
  const { minChunkSize, maxChunkSize } = this.streamingSpeechConfig;
  const extracted: string[] = [];

  // Match sentences ending with . ! ? followed by space or end of string
  const sentenceEnd = /[.!?]+(?:\s+|$)/g;
  let consumedUpTo = 0;

  for (
    let hit = sentenceEnd.exec(text);
    hit !== null;
    hit = sentenceEnd.exec(text)
  ) {
    const endOfSentence = hit.index + hit[0].length;
    const candidate = text.slice(consumedUpTo, endOfSentence).trim();

    if (candidate.length >= minChunkSize) {
      extracted.push(candidate);
      consumedUpTo = endOfSentence;
    } else if (extracted.length > 0) {
      // Too short on its own: glue it onto the previous sentence.
      extracted[extracted.length - 1] += " " + candidate;
      consumedUpTo = endOfSentence;
    }
    // Otherwise leave it unconsumed so it merges with the next sentence.
  }

  let leftover = text.slice(consumedUpTo);

  // Over-long remainder: force one split at a clause boundary.
  if (leftover.length > maxChunkSize) {
    const clauseEnd = /[,;:]\s+/g;
    let cutAt = 0;

    for (
      let hit = clauseEnd.exec(leftover);
      hit !== null;
      hit = clauseEnd.exec(leftover)
    ) {
      if (hit.index >= minChunkSize) {
        cutAt = hit.index + hit[0].length;
        break;
      }
    }

    if (cutAt > 0) {
      extracted.push(leftover.slice(0, cutAt).trim());
      leftover = leftover.slice(cutAt);
    }
  }

  return [extracted, leftover];
}
|
||||
|
||||
/**
 * Append a text chunk to the speech queue.
 *
 * Assigns the next chunk id, makes sure a queue-done promise exists, starts
 * audio generation right away when parallel generation is enabled and fewer
 * than `maxParallelRequests` queued chunks already have a request, emits
 * `speech_chunk_queued`, and kicks off queue processing if it is idle.
 * Blank text, or a missing speech model, makes this a no-op.
 */
private queueSpeechChunk(text: string): void {
  const spokenText = text.trim();
  if (!this.speechModel || !spokenText) return;

  // Wrap chunk ID to prevent unbounded growth in very long sessions
  if (this.nextChunkId >= Number.MAX_SAFE_INTEGER) {
    this.nextChunkId = 0;
  }
  const id = this.nextChunkId;
  this.nextChunkId = id + 1;

  const chunk: SpeechChunk = { id, text: spokenText };

  // One shared "queue drained" promise per burst of chunks.
  if (!this.speechQueueDonePromise) {
    this.speechQueueDonePromise = new Promise<void>((done) => {
      this.speechQueueDoneResolve = done;
    });
  }

  const { parallelGeneration, maxParallelRequests } =
    this.streamingSpeechConfig;
  if (parallelGeneration) {
    let requestsInQueue = 0;
    for (const queued of this.speechChunkQueue) {
      if (queued.audioPromise) requestsInQueue++;
    }
    if (requestsInQueue < maxParallelRequests) {
      // Start generating now so audio is ready when its turn comes.
      chunk.audioPromise = this.generateChunkAudio(chunk);
    }
  }

  this.speechChunkQueue.push(chunk);
  this.emit("speech_chunk_queued", { id: chunk.id, text: chunk.text });

  if (!this._isSpeaking) {
    this.processSpeechQueue();
  }
}
|
||||
|
||||
/**
 * Generate audio for one queued chunk, sharing a single abort controller
 * across all chunk requests (created here on first use).
 *
 * @returns The audio bytes, or `null` when the request was aborted or failed.
 *          Failures other than aborts are also emitted as `error`.
 */
private async generateChunkAudio(
  chunk: SpeechChunk
): Promise<Uint8Array | null> {
  if (!this.currentSpeechAbortController) {
    this.currentSpeechAbortController = new AbortController();
  }
  const { signal } = this.currentSpeechAbortController;

  // Log at most the first 50 characters of the chunk text.
  const ellipsis = chunk.text.length > 50 ? "..." : "";
  const preview = `${chunk.text.substring(0, 50)}${ellipsis}`;

  try {
    console.log(`Generating audio for chunk ${chunk.id}: "${preview}"`);
    const audio = await this.generateSpeechFromText(chunk.text, signal);
    console.log(
      `Generated audio for chunk ${chunk.id}: ${audio.length} bytes`
    );
    return audio;
  } catch (error) {
    const wasAborted = (error as Error).name === "AbortError";
    if (wasAborted) {
      console.log(`Audio generation aborted for chunk ${chunk.id}`);
      return null;
    }

    console.error(
      `Failed to generate audio for chunk ${chunk.id}:`,
      error
    );
    this.emit("error", error);
    return null;
  }
}
|
||||
|
||||
/**
 * Drain the speech chunk queue, delivering audio strictly in queue order.
 *
 * Does nothing if a drain is already running. Otherwise it announces the
 * start of the stream, then repeatedly awaits the audio of the chunk at the
 * head of the queue, forwards it over the WebSocket and as an "audio_chunk"
 * event, and (when parallel generation is enabled) kicks off generation for
 * chunks further back in the queue. The loop stops early if `_isSpeaking` is
 * cleared while an audio promise is being awaited.
 *
 * On exit (normal, interrupted, or after an error) the speaking flag and
 * abort controller are reset, any pending "queue done" promise is resolved,
 * and the end of the stream is announced.
 */
private async processSpeechQueue(): Promise<void> {
  if (this._isSpeaking) {
    return;
  }
  this._isSpeaking = true;

  console.log(
    `Starting speech queue processing with ${this.speechChunkQueue.length} chunks`
  );
  this.emit("speech_start", { streaming: true });
  this.sendMessage({ type: "speech_stream_start" });

  try {
    // The queue is re-read from `this` on every access: it may change while
    // we are suspended on an audio promise.
    while (this.speechChunkQueue.length > 0) {
      const head = this.speechChunkQueue[0];

      console.log(
        `Processing speech chunk #${head.id} (${this.speechChunkQueue.length - 1} remaining)`
      );

      // Lazily start generation if nothing started it earlier.
      if (!head.audioPromise) {
        head.audioPromise = this.generateChunkAudio(head);
      }

      const audio = await head.audioPromise;

      // An interruption while we were waiting clears the speaking flag.
      if (!this._isSpeaking) {
        console.log(`Speech interrupted during chunk #${head.id}`);
        break;
      }

      // The head chunk is now handled; drop it before sending.
      this.speechChunkQueue.shift();

      if (!audio) {
        console.log(`No audio data generated for chunk #${head.id}`);
      } else {
        const encoded = Buffer.from(audio).toString("base64");
        console.log(
          `Sending audio chunk #${head.id} (${audio.length} bytes, ${this.outputFormat})`
        );

        const payload = {
          chunkId: head.id,
          data: encoded,
          format: this.outputFormat,
          text: head.text,
        };

        // Remote delivery over the WebSocket.
        this.sendMessage({ type: "audio_chunk", ...payload });

        // Local delivery, additionally carrying the raw bytes.
        this.emit("audio_chunk", { ...payload, uint8Array: audio });
      }

      // Top up generation for chunks that have not been started yet.
      if (this.streamingSpeechConfig.parallelGeneration) {
        const alreadyStarted = this.speechChunkQueue.filter(
          (queued) => queued.audioPromise
        ).length;
        const slots = Math.min(
          this.streamingSpeechConfig.maxParallelRequests - alreadyStarted,
          this.speechChunkQueue.length
        );

        if (slots > 0) {
          console.log(
            `Starting parallel generation for ${slots} more chunks`
          );
          for (let started = 0; started < slots; started++) {
            const candidate = this.speechChunkQueue.find(
              (queued) => !queued.audioPromise
            );
            if (!candidate) {
              // Nothing left to start; further iterations would be no-ops.
              break;
            }
            candidate.audioPromise = this.generateChunkAudio(candidate);
          }
        }
      }
    }
  } catch (error) {
    console.error("Error in speech queue processing:", error);
    this.emit("error", error);
  } finally {
    this._isSpeaking = false;
    this.currentSpeechAbortController = undefined;

    // Wake up anyone waiting for the queue to finish.
    const notifyDone = this.speechQueueDoneResolve;
    if (notifyDone) {
      notifyDone();
      this.speechQueueDoneResolve = undefined;
      this.speechQueueDonePromise = undefined;
    }

    console.log(`Speech queue processing complete`);
    this.sendMessage({ type: "speech_stream_end" });
    this.emit("speech_complete", { streaming: true });
  }
}
|
||||
}
|
||||
293
src/core/StreamProcessor.ts
Normal file
@@ -0,0 +1,293 @@
|
||||
import { type streamText } from "ai";
|
||||
|
||||
/**
 * Everything accumulated while consuming one complete LLM stream.
 */
export interface StreamResult {
  /** Concatenation of every text delta, in arrival order. */
  fullText: string;
  /** Concatenation of every reasoning delta, in arrival order. */
  fullReasoning: string;
  /** One entry per "tool-call" part seen in the stream. */
  allToolCalls: {
    toolName: string;
    toolCallId: string;
    input: unknown;
  }[];
  /** One entry per "tool-result" part seen in the stream. */
  allToolResults: {
    toolName: string;
    toolCallId: string;
    output: unknown;
  }[];
  /** The "source" parts seen in the stream. */
  allSources: unknown[];
  /** The `file` payload of each "file" part seen in the stream. */
  allFiles: unknown[];
}
|
||||
|
||||
/**
 * Hooks supplied by the caller of the stream processor.
 */
export interface StreamProcessorCallbacks {
  /** Delivers one outgoing message to the WebSocket client. */
  sendMessage: (message: Record<string, unknown>) => void;
  /** Raises an event on the owning agent. */
  emitEvent: (event: string, data?: unknown) => void;
  /** Optional: invoked with each text delta as it arrives (e.g. to feed streaming speech). */
  onTextDelta?: (text: string) => void;
  /** Optional: invoked when a text-end part arrives (e.g. to flush buffered speech). */
  onTextEnd?: () => void;
}
|
||||
|
||||
/**
 * Consumes the `fullStream` of an AI SDK `streamText` call, forwarding each
 * part to the WebSocket client via `callbacks.sendMessage` and collecting the
 * complete response.
 *
 * This is a standalone function (not a class) because it keeps no state
 * between calls.
 *
 * @param result - The value returned by `streamText`; only `fullStream` is read.
 * @param callbacks - Hooks for sending messages, emitting agent events, and
 *   (optionally) reacting to text deltas / text end.
 * @param extraResponseFields - Extra fields spread into the final
 *   "response_complete" message. They are spread last, so they override any
 *   built-in field with the same key.
 * @returns The accumulated text, reasoning, tool calls/results, sources and files.
 */
export async function processFullStream(
  result: ReturnType<typeof streamText>,
  callbacks: StreamProcessorCallbacks,
  extraResponseFields?: Record<string, unknown>
): Promise<StreamResult> {
  const { onTextDelta, onTextEnd, sendMessage, emitEvent } = callbacks;

  let fullText = "";
  let fullReasoning = "";
  const allToolCalls: StreamResult["allToolCalls"] = [];
  const allToolResults: StreamResult["allToolResults"] = [];
  const allSources: unknown[] = [];
  const allFiles: unknown[] = [];

  for await (const part of result.fullStream) {
    switch (part.type) {
      // ── Stream lifecycle ──────────────────────────────
      case "start":
        sendMessage({ type: "stream_start" });
        break;

      case "finish":
        // The assistant text is emitted once, when the stream finishes.
        emitEvent("text", { role: "assistant", text: fullText });
        sendMessage({
          type: "stream_finish",
          finishReason: part.finishReason,
          usage: part.totalUsage,
        });
        break;

      case "error":
        // The raw error goes to local listeners; the client gets a readable string.
        emitEvent("error", part.error);
        sendMessage({
          type: "stream_error",
          error: describeStreamError(part.error),
        });
        break;

      case "abort":
        emitEvent("abort", { reason: part.reason });
        sendMessage({
          type: "stream_abort",
          reason: part.reason,
        });
        break;

      // ── Step lifecycle ────────────────────────────────
      case "start-step":
        sendMessage({
          type: "step_start",
          warnings: part.warnings,
        });
        break;

      case "finish-step":
        sendMessage({
          type: "step_finish",
          finishReason: part.finishReason,
          usage: part.usage,
        });
        break;

      // ── Text streaming ────────────────────────────────
      case "text-start":
        sendMessage({ type: "text_start", id: part.id });
        break;

      case "text-delta":
        fullText += part.text;
        onTextDelta?.(part.text);
        sendMessage({
          type: "text_delta",
          id: part.id,
          text: part.text,
        });
        break;

      case "text-end":
        onTextEnd?.();
        sendMessage({ type: "text_end", id: part.id });
        break;

      // ── Reasoning streaming ───────────────────────────
      case "reasoning-start":
        sendMessage({ type: "reasoning_start", id: part.id });
        break;

      case "reasoning-delta":
        fullReasoning += part.text;
        sendMessage({
          type: "reasoning_delta",
          id: part.id,
          text: part.text,
        });
        break;

      case "reasoning-end":
        sendMessage({ type: "reasoning_end", id: part.id });
        break;

      // ── Tool input streaming ──────────────────────────
      case "tool-input-start":
        sendMessage({
          type: "tool_input_start",
          id: part.id,
          toolName: part.toolName,
        });
        break;

      case "tool-input-delta":
        sendMessage({
          type: "tool_input_delta",
          id: part.id,
          delta: part.delta,
        });
        break;

      case "tool-input-end":
        sendMessage({ type: "tool_input_end", id: part.id });
        break;

      // ── Tool execution ────────────────────────────────
      case "tool-call":
        allToolCalls.push({
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          input: part.input,
        });
        sendMessage({
          type: "tool_call",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          input: part.input,
        });
        break;

      case "tool-result":
        allToolResults.push({
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          output: part.output,
        });
        // On the wire the tool output is carried under the key `result`.
        sendMessage({
          type: "tool_result",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          result: part.output,
        });
        break;

      case "tool-error":
        sendMessage({
          type: "tool_error",
          toolName: part.toolName,
          toolCallId: part.toolCallId,
          error: describeStreamError(part.error),
        });
        break;

      // ── Sources and files ─────────────────────────────
      case "source":
        // The whole part is kept for sources.
        allSources.push(part);
        sendMessage({
          type: "source",
          source: part,
        });
        break;

      case "file":
        // Only the `file` payload is kept for files.
        allFiles.push(part.file);
        sendMessage({
          type: "file",
          file: part.file,
        });
        break;
    }
  }

  // Send the complete response. Empty reasoning/sources/files are sent as
  // `undefined` rather than as empty values.
  sendMessage({
    type: "response_complete",
    text: fullText,
    reasoning: fullReasoning || undefined,
    toolCalls: allToolCalls,
    toolResults: allToolResults,
    sources: allSources.length > 0 ? allSources : undefined,
    files: allFiles.length > 0 ? allFiles : undefined,
    ...extraResponseFields,
  });

  return {
    fullText,
    fullReasoning,
    allToolCalls,
    allToolResults,
    allSources,
    allFiles,
  };
}

/**
 * Render a stream error as a string for the client.
 *
 * Keeps the plain `String(error)` form whenever it is informative (Error
 * instances, strings, numbers, objects with a custom `toString`). For plain
 * objects, which would otherwise become "[object Object]", the JSON encoding
 * is used instead. Falls back to "[object Object]" if the value can be
 * neither stringified nor JSON-encoded (e.g. circular structures).
 */
function describeStreamError(error: unknown): string {
  const opaque = "[object Object]";

  let text: string;
  try {
    text = String(error);
  } catch {
    // e.g. objects without a usable toString/valueOf
    text = opaque;
  }
  if (text !== opaque) {
    return text;
  }

  try {
    return JSON.stringify(error) ?? text;
  } catch {
    return text;
  }
}
|
||||
|
||||
/**
 * Translate a single `onChunk` callback payload into a namespaced
 * `chunk:*` event. Chunk types that are not listed here are ignored.
 *
 * @param chunk - The chunk object; only `type` and the fields relevant to
 *   that type are read.
 * @param emitEvent - Receives the event name and its payload.
 */
export function handleStreamChunk(
  chunk: any,
  emitEvent: (event: string, data?: unknown) => void
): void {
  const kind = chunk.type;

  if (kind === "text-delta") {
    emitEvent("chunk:text_delta", { id: chunk.id, text: chunk.text });
    return;
  }

  if (kind === "reasoning-delta") {
    emitEvent("chunk:reasoning_delta", { id: chunk.id, text: chunk.text });
    return;
  }

  if (kind === "tool-call") {
    const { toolName, toolCallId, input } = chunk;
    emitEvent("chunk:tool_call", { toolName, toolCallId, input });
    return;
  }

  if (kind === "tool-result") {
    // The chunk's `output` is exposed to listeners under the key `result`.
    emitEvent("chunk:tool_result", {
      toolName: chunk.toolName,
      toolCallId: chunk.toolCallId,
      result: chunk.output,
    });
    return;
  }

  if (kind === "tool-input-start") {
    emitEvent("chunk:tool_input_start", {
      id: chunk.id,
      toolName: chunk.toolName,
    });
    return;
  }

  if (kind === "tool-input-delta") {
    emitEvent("chunk:tool_input_delta", { id: chunk.id, delta: chunk.delta });
    return;
  }

  if (kind === "source") {
    // Sources are forwarded as-is, without repackaging.
    emitEvent("chunk:source", chunk);
  }
}
|
||||
142
src/core/TranscriptionManager.ts
Normal file
@@ -0,0 +1,142 @@
|
||||
import { EventEmitter } from "events";
|
||||
import {
|
||||
experimental_transcribe as transcribe,
|
||||
type TranscriptionModel,
|
||||
} from "ai";
|
||||
import { DEFAULT_MAX_AUDIO_SIZE } from "../types";
|
||||
|
||||
/**
 * Construction options for the transcription manager. Both are optional.
 */
export interface TranscriptionManagerOptions {
  /** Model used to transcribe audio; without it audio input is rejected. */
  transcriptionModel?: TranscriptionModel;
  /** Largest accepted decoded audio payload, in bytes. */
  maxAudioInputSize?: number;
}
|
||||
|
||||
/**
 * Validates incoming audio and turns it into text with the AI SDK
 * transcription model.
 *
 * Events emitted by this class: "transcription", "audio_received",
 * "warning" and "error".
 */
export class TranscriptionManager extends EventEmitter {
  private transcriptionModel?: TranscriptionModel;
  private maxAudioInputSize: number;

  /** Hook used to push messages to the WebSocket client; a no-op until replaced. */
  public sendMessage: (message: Record<string, unknown>) => void = () => {};

  /**
   * @param options - Optional model and size limit; the limit falls back to
   *   `DEFAULT_MAX_AUDIO_SIZE` when not given.
   */
  constructor(options: TranscriptionManagerOptions = {}) {
    super();
    const { transcriptionModel, maxAudioInputSize } = options;
    this.transcriptionModel = transcriptionModel;
    this.maxAudioInputSize = maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
  }

  /** Whether a transcription model has been configured. */
  get hasTranscriptionModel(): boolean {
    return Boolean(this.transcriptionModel);
  }

  /**
   * Transcribe raw audio bytes to text.
   *
   * On success the result is emitted as a "transcription" event and sent to
   * the client as a "transcription_result" message.
   *
   * @param audioData - The audio bytes to transcribe.
   * @returns The transcribed text.
   * @throws If no model is configured, or if transcription (or a listener
   *   invoked while reporting it) fails; the failure is logged and rethrown.
   */
  async transcribeAudio(audioData: Buffer | Uint8Array): Promise<string> {
    const model = this.transcriptionModel;
    if (!model) {
      throw new Error("Transcription model not configured");
    }

    console.log(
      `Sending ${audioData.byteLength} bytes to Whisper for transcription`
    );

    try {
      const { text, language } = await transcribe({
        model,
        audio: audioData,
      });

      console.log(
        `Whisper transcription result: "${text}", language: ${language || "unknown"}`
      );

      this.emit("transcription", { text, language });

      // Give the client immediate feedback about what was heard.
      this.sendMessage({ type: "transcription_result", text, language });

      return text;
    } catch (error) {
      console.error("Whisper transcription failed:", error);
      throw error;
    }
  }

  /**
   * Decode base64 audio, validate it, and transcribe it.
   *
   * @param base64Audio - The audio payload, base64-encoded.
   * @param format - Optional format label; only used for logging and the
   *   "audio_received" event.
   * @returns The transcribed text, or `null` when no model is configured,
   *   the audio is empty or exceeds the size limit, the transcription is
   *   blank, or processing fails.
   */
  async processAudioInput(
    base64Audio: string,
    format?: string
  ): Promise<string | null> {
    if (!this.transcriptionModel) {
      const notConfigured = new Error(
        "Transcription model not configured for audio input"
      );
      this.emit("error", notConfigured);
      this.sendMessage({ type: "error", error: notConfigured.message });
      return null;
    }

    try {
      const decoded = Buffer.from(base64Audio, "base64");
      const byteCount = decoded.length;

      // Reject payloads above the configured limit.
      if (byteCount > this.maxAudioInputSize) {
        const inMegabytes = (bytes: number): string =>
          (bytes / (1024 * 1024)).toFixed(1);
        const sizeMB = inMegabytes(byteCount);
        const maxMB = inMegabytes(this.maxAudioInputSize);
        this.emit(
          "error",
          new Error(
            `Audio input too large (${sizeMB} MB). Maximum allowed: ${maxMB} MB`
          )
        );
        return null;
      }

      if (byteCount === 0) {
        this.emit("warning", "Received empty audio data");
        return null;
      }

      this.emit("audio_received", { size: byteCount, format });
      console.log(
        `Processing audio input: ${byteCount} bytes, format: ${format || "unknown"}`
      );

      const transcript = await this.transcribeAudio(decoded);
      console.log(`Transcribed text: "${transcript}"`);

      // A whitespace-only transcript is treated as "nothing was said".
      if (transcript.trim()) {
        return transcript;
      }

      this.emit("warning", "Transcription returned empty text");
      this.sendMessage({
        type: "transcription_error",
        error: "Whisper returned empty text",
      });
      return null;
    } catch (error) {
      console.error("Failed to process audio input:", error);
      this.emit("error", error);
      this.sendMessage({
        type: "transcription_error",
        error: `Transcription failed: ${(error as Error).message || String(error)}`,
      });
      return null;
    }
  }
}
|
||||
133
src/core/WebSocketManager.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
import { WebSocket } from "ws";
|
||||
import { EventEmitter } from "events";
|
||||
|
||||
/**
 * Manages the lifecycle of a single WebSocket connection: connecting by URL,
 * adopting an already-open socket, sending JSON messages, and disconnecting.
 *
 * Events emitted by this class:
 * - "connected": after `connect()` opens or `handleSocket()` adopts a socket
 * - "message": each incoming frame, parsed as JSON
 * - "disconnected": when the socket reports "close"
 * - "error": socket errors, JSON parse failures and send failures
 */
export class WebSocketManager extends EventEmitter {
  private socket?: WebSocket;
  private _isConnected = false;

  /** Whether the manager currently considers the socket connected. */
  get isConnected(): boolean {
    return this._isConnected;
  }

  /** The socket currently managed, if any. */
  get currentSocket(): WebSocket | undefined {
    return this.socket;
  }

  /**
   * Connect to a WebSocket server by URL, replacing any existing socket.
   *
   * @param url - The server URL.
   * @returns A promise that resolves once the socket opens and rejects on
   *   the socket's first "error" (or if constructing the socket throws).
   *   NOTE(review): `disconnect()` removes these listeners, so a call made
   *   while still connecting leaves this promise unsettled.
   */
  connect(url: string): Promise<void> {
    // Clean up any existing connection first
    if (this.socket) {
      this.disconnect();
    }

    return new Promise((resolve, reject) => {
      try {
        const socket = new WebSocket(url);
        this.socket = socket;
        this.attachListeners();

        socket.once("open", () => {
          this._isConnected = true;
          this.emit("connected");
          resolve();
        });

        socket.once("error", (error) => {
          reject(error);
        });
      } catch (error) {
        reject(error);
      }
    });
  }

  /**
   * Attach an existing WebSocket (server-side usage), replacing any
   * existing socket. The socket is treated as connected immediately.
   *
   * @param socket - The socket to manage.
   */
  handleSocket(socket: WebSocket): void {
    // Clean up any existing connection first
    if (this.socket) {
      this.disconnect();
    }

    this.socket = socket;
    this._isConnected = true;
    this.attachListeners();
    this.emit("connected");
  }

  /**
   * Send a JSON message via WebSocket if connected.
   *
   * Silently does nothing when there is no socket or it is not connected,
   * warns when the socket is not OPEN, and reports send failures as an
   * "error" event instead of throwing.
   *
   * @param message - The object to serialize with `JSON.stringify`.
   */
  send(message: Record<string, unknown>): void {
    if (!this.socket || !this._isConnected) return;

    try {
      if (this.socket.readyState === WebSocket.OPEN) {
        this.socket.send(JSON.stringify(message));
      } else {
        console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
      }
    } catch (error) {
      // Socket may have closed between the readyState check and send()
      console.error("Failed to send WebSocket message:", error);
      this.emit("error", error);
    }
  }

  /**
   * Disconnect and clean up the current socket.
   *
   * All listeners are removed before closing, so no "disconnected" event is
   * emitted for an explicit disconnect.
   */
  disconnect(): void {
    const socket = this.socket;
    if (!socket) return;

    try {
      socket.removeAllListeners();

      // Keep one no-op "error" listener on the discarded socket. `ws` reports
      // an error asynchronously when close() interrupts a handshake, and a
      // closing socket can still fail later; with no listener at all that
      // event would surface as an uncaught exception outside this try/catch.
      socket.on("error", () => {});

      if (
        socket.readyState === WebSocket.OPEN ||
        socket.readyState === WebSocket.CONNECTING
      ) {
        socket.close();
      }
    } catch {
      // Ignore close errors — socket may already be dead
    }

    this.socket = undefined;
    this._isConnected = false;
  }

  /**
   * Attach internal event listeners on the current socket: parse and
   * forward messages, track closure, and forward errors.
   */
  private attachListeners(): void {
    if (!this.socket) return;

    this.socket.on("message", (data) => {
      try {
        const message = JSON.parse(data.toString());
        this.emit("message", message);
      } catch (err) {
        // Also reached if a "message" listener throws while handling it.
        console.error("Failed to parse WebSocket message:", err);
        this.emit("error", err);
      }
    });

    this.socket.on("close", () => {
      this._isConnected = false;
      this.emit("disconnected");
    });

    this.socket.on("error", (error) => {
      console.error("WebSocket error:", error);
      this.emit("error", error);
    });
  }
}
|
||||
17
src/core/index.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
export { WebSocketManager } from "./WebSocketManager";
|
||||
export { SpeechManager, type SpeechManagerOptions } from "./SpeechManager";
|
||||
export {
|
||||
ConversationManager,
|
||||
type ConversationManagerOptions,
|
||||
} from "./ConversationManager";
|
||||
export {
|
||||
TranscriptionManager,
|
||||
type TranscriptionManagerOptions,
|
||||
} from "./TranscriptionManager";
|
||||
export {
|
||||
processFullStream,
|
||||
handleStreamChunk,
|
||||
type StreamResult,
|
||||
type StreamProcessorCallbacks,
|
||||
} from "./StreamProcessor";
|
||||
export { InputQueue, type QueueItem } from "./InputQueue";
|
||||
@@ -1,5 +1,5 @@
|
||||
// Agents
|
||||
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
|
||||
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent.new";
|
||||
export {
|
||||
VideoAgent,
|
||||
type VideoAgentOptions,
|
||||
@@ -8,7 +8,7 @@ export {
|
||||
type VideoAgentConfig,
|
||||
type FrameContext,
|
||||
type FrameTriggerReason,
|
||||
} from "./VideoAgent";
|
||||
} from "./VideoAgent.new";
|
||||
|
||||
// Shared types
|
||||
export {
|
||||
|
||||