mirror of
https://github.com/Bijit-Mondal/VoiceAgent.git
synced 2026-03-02 18:36:39 +00:00
Refactor VoiceAgent: Extract types and default configurations into separate types.ts file; remove unused StreamBuffer file
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -5,3 +5,5 @@ node_modules
|
|||||||
.marscode
|
.marscode
|
||||||
|
|
||||||
# dist
|
# dist
|
||||||
|
|
||||||
|
HOW_*.md
|
||||||
339
dist/VideoAgent.d.ts
vendored
Normal file
339
dist/VideoAgent.d.ts
vendored
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
import { WebSocket } from "ws";
|
||||||
|
import { EventEmitter } from "events";
|
||||||
|
import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
|
||||||
|
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
|
||||||
|
/**
|
||||||
|
* Trigger reasons for frame capture
|
||||||
|
*/
|
||||||
|
type FrameTriggerReason = "scene_change" | "user_request" | "timer" | "initial";
|
||||||
|
/**
|
||||||
|
* Video frame data structure sent to/from the client
|
||||||
|
*/
|
||||||
|
interface VideoFrame {
|
||||||
|
type: "video_frame";
|
||||||
|
sessionId: string;
|
||||||
|
sequence: number;
|
||||||
|
timestamp: number;
|
||||||
|
triggerReason: FrameTriggerReason;
|
||||||
|
previousFrameRef?: string;
|
||||||
|
image: {
|
||||||
|
data: string;
|
||||||
|
format: string;
|
||||||
|
width: number;
|
||||||
|
height: number;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Audio data structure
|
||||||
|
*/
|
||||||
|
interface AudioData {
|
||||||
|
type: "audio";
|
||||||
|
sessionId: string;
|
||||||
|
data: string;
|
||||||
|
format: string;
|
||||||
|
sampleRate?: number;
|
||||||
|
duration?: number;
|
||||||
|
timestamp: number;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Backend configuration for video processing
|
||||||
|
*/
|
||||||
|
interface VideoAgentConfig {
|
||||||
|
/** Maximum frames to keep in context buffer for conversation history */
|
||||||
|
maxContextFrames: number;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Frame context for maintaining visual conversation history
|
||||||
|
*/
|
||||||
|
interface FrameContext {
|
||||||
|
sequence: number;
|
||||||
|
timestamp: number;
|
||||||
|
triggerReason: FrameTriggerReason;
|
||||||
|
frameHash: string;
|
||||||
|
description?: string;
|
||||||
|
}
|
||||||
|
export interface VideoAgentOptions {
|
||||||
|
model: LanguageModel;
|
||||||
|
transcriptionModel?: TranscriptionModel;
|
||||||
|
speechModel?: SpeechModel;
|
||||||
|
instructions?: string;
|
||||||
|
stopWhen?: NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
|
||||||
|
tools?: Record<string, Tool>;
|
||||||
|
endpoint?: string;
|
||||||
|
voice?: string;
|
||||||
|
speechInstructions?: string;
|
||||||
|
outputFormat?: string;
|
||||||
|
/** Configuration for streaming speech generation */
|
||||||
|
streamingSpeech?: Partial<StreamingSpeechConfig>;
|
||||||
|
/** Configuration for conversation history memory limits */
|
||||||
|
history?: Partial<HistoryConfig>;
|
||||||
|
/** Maximum audio input size in bytes (default: 10 MB) */
|
||||||
|
maxAudioInputSize?: number;
|
||||||
|
/** Maximum frame input size in bytes (default: 5 MB) */
|
||||||
|
maxFrameInputSize?: number;
|
||||||
|
/** Maximum frames to keep in context buffer (default: 10) */
|
||||||
|
maxContextFrames?: number;
|
||||||
|
/** Session ID for this video agent instance */
|
||||||
|
sessionId?: string;
|
||||||
|
}
|
||||||
|
export declare class VideoAgent extends EventEmitter {
|
||||||
|
private socket?;
|
||||||
|
private tools;
|
||||||
|
private model;
|
||||||
|
private transcriptionModel?;
|
||||||
|
private speechModel?;
|
||||||
|
private instructions;
|
||||||
|
private stopWhen;
|
||||||
|
private endpoint?;
|
||||||
|
private isConnected;
|
||||||
|
private conversationHistory;
|
||||||
|
private voice;
|
||||||
|
private speechInstructions?;
|
||||||
|
private outputFormat;
|
||||||
|
private isProcessing;
|
||||||
|
private isDestroyed;
|
||||||
|
private sessionId;
|
||||||
|
private frameSequence;
|
||||||
|
private lastFrameTimestamp;
|
||||||
|
private lastFrameHash?;
|
||||||
|
private frameContextBuffer;
|
||||||
|
private currentFrameData?;
|
||||||
|
private videoConfig;
|
||||||
|
private inputQueue;
|
||||||
|
private processingQueue;
|
||||||
|
private currentStreamAbortController?;
|
||||||
|
private historyConfig;
|
||||||
|
private maxAudioInputSize;
|
||||||
|
private maxFrameInputSize;
|
||||||
|
private streamingSpeechConfig;
|
||||||
|
private currentSpeechAbortController?;
|
||||||
|
private speechChunkQueue;
|
||||||
|
private nextChunkId;
|
||||||
|
private isSpeaking;
|
||||||
|
private pendingTextBuffer;
|
||||||
|
private speechQueueDonePromise?;
|
||||||
|
private speechQueueDoneResolve?;
|
||||||
|
constructor(options: VideoAgentOptions);
|
||||||
|
/**
|
||||||
|
* Generate a unique session ID
|
||||||
|
*/
|
||||||
|
private generateSessionId;
|
||||||
|
/**
|
||||||
|
* Simple hash function for frame comparison
|
||||||
|
*/
|
||||||
|
private hashFrame;
|
||||||
|
/**
|
||||||
|
* Ensure the agent has not been destroyed. Throws if it has.
|
||||||
|
*/
|
||||||
|
private ensureNotDestroyed;
|
||||||
|
/**
|
||||||
|
* Get current video agent configuration
|
||||||
|
*/
|
||||||
|
getConfig(): VideoAgentConfig;
|
||||||
|
/**
|
||||||
|
* Update video agent configuration
|
||||||
|
*/
|
||||||
|
updateConfig(config: Partial<VideoAgentConfig>): void;
|
||||||
|
private setupListeners;
|
||||||
|
/**
|
||||||
|
* Handle client ready signal
|
||||||
|
*/
|
||||||
|
private handleClientReady;
|
||||||
|
/**
|
||||||
|
* Handle incoming video frame
|
||||||
|
*/
|
||||||
|
private handleVideoFrame;
|
||||||
|
/**
|
||||||
|
* Add frame to context buffer
|
||||||
|
*/
|
||||||
|
private addFrameToContext;
|
||||||
|
/**
|
||||||
|
* Request client to capture and send a frame
|
||||||
|
*/
|
||||||
|
requestFrameCapture(reason: FrameTriggerReason): void;
|
||||||
|
/**
|
||||||
|
* Clean up all in-flight state when the connection drops.
|
||||||
|
*/
|
||||||
|
private cleanupOnDisconnect;
|
||||||
|
registerTools(tools: Record<string, Tool>): void;
|
||||||
|
/**
|
||||||
|
* Transcribe audio data to text using the configured transcription model
|
||||||
|
*/
|
||||||
|
transcribeAudio(audioData: Buffer | Uint8Array): Promise<string>;
|
||||||
|
/**
|
||||||
|
* Generate speech from text using the configured speech model
|
||||||
|
*/
|
||||||
|
generateSpeechFromText(text: string, abortSignal?: AbortSignal): Promise<Uint8Array>;
|
||||||
|
/**
|
||||||
|
* Interrupt ongoing speech generation and playback
|
||||||
|
*/
|
||||||
|
interruptSpeech(reason?: string): void;
|
||||||
|
/**
|
||||||
|
* Interrupt both the current LLM stream and ongoing speech
|
||||||
|
*/
|
||||||
|
interruptCurrentResponse(reason?: string): void;
|
||||||
|
/**
|
||||||
|
* Extract complete sentences from text buffer
|
||||||
|
*/
|
||||||
|
private extractSentences;
|
||||||
|
/**
|
||||||
|
* Trim conversation history to stay within configured limits
|
||||||
|
*/
|
||||||
|
private trimHistory;
|
||||||
|
/**
|
||||||
|
* Queue a text chunk for speech generation
|
||||||
|
*/
|
||||||
|
private queueSpeechChunk;
|
||||||
|
/**
|
||||||
|
* Generate audio for a single chunk
|
||||||
|
*/
|
||||||
|
private generateChunkAudio;
|
||||||
|
/**
|
||||||
|
* Process the speech queue and send audio chunks in order
|
||||||
|
*/
|
||||||
|
private processSpeechQueue;
|
||||||
|
/**
|
||||||
|
* Process text delta for streaming speech
|
||||||
|
*/
|
||||||
|
private processTextForStreamingSpeech;
|
||||||
|
/**
|
||||||
|
* Flush any remaining text in the buffer to speech
|
||||||
|
*/
|
||||||
|
private flushStreamingSpeech;
|
||||||
|
/**
|
||||||
|
* Process incoming audio data: transcribe and generate response
|
||||||
|
*/
|
||||||
|
private processAudioInput;
|
||||||
|
connect(url?: string): Promise<void>;
|
||||||
|
/**
|
||||||
|
* Attach an existing WebSocket (server-side usage)
|
||||||
|
*/
|
||||||
|
handleSocket(socket: WebSocket): void;
|
||||||
|
/**
|
||||||
|
* Send text input for processing (bypasses transcription)
|
||||||
|
*/
|
||||||
|
sendText(text: string): Promise<string>;
|
||||||
|
/**
|
||||||
|
* Send audio data to be transcribed and processed
|
||||||
|
*/
|
||||||
|
sendAudio(audioData: string): Promise<void>;
|
||||||
|
/**
|
||||||
|
* Send raw audio buffer to be transcribed and processed
|
||||||
|
*/
|
||||||
|
sendAudioBuffer(audioBuffer: Buffer | Uint8Array): Promise<void>;
|
||||||
|
/**
|
||||||
|
* Send a video frame with optional text query for vision analysis
|
||||||
|
*/
|
||||||
|
sendFrame(frameData: string, query?: string, options?: {
|
||||||
|
width?: number;
|
||||||
|
height?: number;
|
||||||
|
format?: string;
|
||||||
|
}): Promise<string>;
|
||||||
|
/**
|
||||||
|
* Enqueue a text input for serial processing
|
||||||
|
*/
|
||||||
|
private enqueueTextInput;
|
||||||
|
/**
|
||||||
|
* Enqueue a multimodal input (text + frame) for serial processing
|
||||||
|
*/
|
||||||
|
private enqueueMultimodalInput;
|
||||||
|
/**
|
||||||
|
* Drain the input queue, processing one request at a time
|
||||||
|
*/
|
||||||
|
private drainInputQueue;
|
||||||
|
/**
|
||||||
|
* Build the message content array for multimodal input
|
||||||
|
*/
|
||||||
|
private buildMultimodalContent;
|
||||||
|
/**
|
||||||
|
* Process multimodal input (text + video frame)
|
||||||
|
*/
|
||||||
|
private processMultimodalInput;
|
||||||
|
/**
|
||||||
|
* Process user input with streaming text generation
|
||||||
|
*/
|
||||||
|
private processUserInput;
|
||||||
|
/**
|
||||||
|
* Handle individual stream chunks
|
||||||
|
*/
|
||||||
|
private handleStreamChunk;
|
||||||
|
/**
|
||||||
|
* Process the full stream result and return the response text
|
||||||
|
*/
|
||||||
|
private processStreamResult;
|
||||||
|
/**
|
||||||
|
* Send a message via WebSocket if connected
|
||||||
|
*/
|
||||||
|
private sendWebSocketMessage;
|
||||||
|
/**
|
||||||
|
* Start listening for voice/video input
|
||||||
|
*/
|
||||||
|
startListening(): void;
|
||||||
|
/**
|
||||||
|
* Stop listening for voice/video input
|
||||||
|
*/
|
||||||
|
stopListening(): void;
|
||||||
|
/**
|
||||||
|
* Clear conversation history
|
||||||
|
*/
|
||||||
|
clearHistory(): void;
|
||||||
|
/**
|
||||||
|
* Get current conversation history
|
||||||
|
*/
|
||||||
|
getHistory(): ModelMessage[];
|
||||||
|
/**
|
||||||
|
* Set conversation history
|
||||||
|
*/
|
||||||
|
setHistory(history: ModelMessage[]): void;
|
||||||
|
/**
|
||||||
|
* Get frame context buffer
|
||||||
|
*/
|
||||||
|
getFrameContext(): FrameContext[];
|
||||||
|
/**
|
||||||
|
* Get session ID
|
||||||
|
*/
|
||||||
|
getSessionId(): string;
|
||||||
|
/**
|
||||||
|
* Internal helper to close and clean up the current socket
|
||||||
|
*/
|
||||||
|
private disconnectSocket;
|
||||||
|
/**
|
||||||
|
* Disconnect from WebSocket and stop all in-flight work
|
||||||
|
*/
|
||||||
|
disconnect(): void;
|
||||||
|
/**
|
||||||
|
* Permanently destroy the agent, releasing all resources
|
||||||
|
*/
|
||||||
|
destroy(): void;
|
||||||
|
/**
|
||||||
|
* Check if agent is connected to WebSocket
|
||||||
|
*/
|
||||||
|
get connected(): boolean;
|
||||||
|
/**
|
||||||
|
* Check if agent is currently processing a request
|
||||||
|
*/
|
||||||
|
get processing(): boolean;
|
||||||
|
/**
|
||||||
|
* Check if agent is currently speaking
|
||||||
|
*/
|
||||||
|
get speaking(): boolean;
|
||||||
|
/**
|
||||||
|
* Get the number of pending speech chunks in the queue
|
||||||
|
*/
|
||||||
|
get pendingSpeechChunks(): number;
|
||||||
|
/**
|
||||||
|
* Check if agent has been permanently destroyed
|
||||||
|
*/
|
||||||
|
get destroyed(): boolean;
|
||||||
|
/**
|
||||||
|
* Get current frame sequence number
|
||||||
|
*/
|
||||||
|
get currentFrameSequence(): number;
|
||||||
|
/**
|
||||||
|
* Check if there is visual context available
|
||||||
|
*/
|
||||||
|
get hasVisualContext(): boolean;
|
||||||
|
}
|
||||||
|
export type { VideoFrame, AudioData, VideoAgentConfig, FrameContext, FrameTriggerReason, };
|
||||||
|
export type { StreamingSpeechConfig, HistoryConfig } from "./types";
|
||||||
|
//# sourceMappingURL=VideoAgent.d.ts.map
|
||||||
1
dist/VideoAgent.d.ts.map
vendored
Normal file
1
dist/VideoAgent.d.ts.map
vendored
Normal file
File diff suppressed because one or more lines are too long
1351
dist/VideoAgent.js
vendored
Normal file
1351
dist/VideoAgent.js
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1
dist/VideoAgent.js.map
vendored
Normal file
1
dist/VideoAgent.js.map
vendored
Normal file
File diff suppressed because one or more lines are too long
24
dist/VoiceAgent.d.ts
vendored
24
dist/VoiceAgent.d.ts
vendored
@@ -1,28 +1,7 @@
|
|||||||
import { WebSocket } from "ws";
|
import { WebSocket } from "ws";
|
||||||
import { EventEmitter } from "events";
|
import { EventEmitter } from "events";
|
||||||
import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
|
import { streamText, LanguageModel, type Tool, type ModelMessage, type TranscriptionModel, type SpeechModel } from "ai";
|
||||||
/**
|
import { type StreamingSpeechConfig, type HistoryConfig } from "./types";
|
||||||
* Configuration for streaming speech behavior
|
|
||||||
*/
|
|
||||||
interface StreamingSpeechConfig {
|
|
||||||
/** Minimum characters before generating speech for a chunk */
|
|
||||||
minChunkSize: number;
|
|
||||||
/** Maximum characters per chunk (will split at sentence boundary before this) */
|
|
||||||
maxChunkSize: number;
|
|
||||||
/** Whether to enable parallel TTS generation */
|
|
||||||
parallelGeneration: boolean;
|
|
||||||
/** Maximum number of parallel TTS requests */
|
|
||||||
maxParallelRequests: number;
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Configuration for conversation history memory management
|
|
||||||
*/
|
|
||||||
interface HistoryConfig {
|
|
||||||
/** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
|
||||||
maxMessages: number;
|
|
||||||
/** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
|
||||||
maxTotalChars: number;
|
|
||||||
}
|
|
||||||
export interface VoiceAgentOptions {
|
export interface VoiceAgentOptions {
|
||||||
model: LanguageModel;
|
model: LanguageModel;
|
||||||
transcriptionModel?: TranscriptionModel;
|
transcriptionModel?: TranscriptionModel;
|
||||||
@@ -238,5 +217,4 @@ export declare class VoiceAgent extends EventEmitter {
|
|||||||
*/
|
*/
|
||||||
get destroyed(): boolean;
|
get destroyed(): boolean;
|
||||||
}
|
}
|
||||||
export {};
|
|
||||||
//# sourceMappingURL=VoiceAgent.d.ts.map
|
//# sourceMappingURL=VoiceAgent.d.ts.map
|
||||||
2
dist/VoiceAgent.d.ts.map
vendored
2
dist/VoiceAgent.d.ts.map
vendored
@@ -1 +1 @@
|
|||||||
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AAWZ;;GAEG;AACH,UAAU,qBAAqB;IAC7B,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8CAA8C;IAC9C,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,UAAU,aAAa;IACrB,yHAAyH;IACzH,WAAW,EAAE,MAAM,CAAC;IACpB,6HAA6H;IAC7H,aAAa,EAAE,MAAM,CAAC;CACvB;AAKD,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IAkCtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;OAIG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}
|
{"version":3,"file":"VoiceAgent.d.ts","sourceRoot":"","sources":["../src/VoiceAgent.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAC/B,OAAO,EAAE,YAAY,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EACL,UAAU,EACV,aAAa,EAEb,KAAK,IAAI,EACT,KAAK,YAAY,EAGjB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EACjB,MAAM,IAAI,CAAC;AACZ,OAAO,EAEL,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAInB,MAAM,SAAS,CAAC;AAEjB,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IACxC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;IACrE,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oDAAoD;IACpD,eAAe,CAAC,EAAE,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACjD,2DAA2D;IAC3D,OAAO,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IACjC,yDAAyD;IACzD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,qBAAa,UAAW,SAAQ,YAAY;IAC1C,OAAO,CAAC,MAAM,CAAC,CAAY;IAC3B,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,KAAK,CAAgB;IAC7B,OAAO,CAAC,kBAAkB,CAAC,CAAqB;IAChD,OAAO,CAAC,WAAW,CAAC,CAAc;IAClC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,QAAQ,CAA4D;IAC5E,OAAO,CAAC,QAAQ,CAAC,CAAS;IAC1B,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,mBAAmB,CAAsB;IACjD,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,kBAAkB,CAAC,CAAS;IACpC,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,WAAW,CAAS;IAG5B,OAAO,CAAC,UAAU,CAA2F;IAC7G,OAAO,CAAC,eAAe,CAAS;IAGhC,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IAGvD,OAAO,CAAC,aAAa,CAAgB;IACrC,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,4BAA4B,CAAC,CAAkB;IACvD,OAAO,CAAC,gBAAgB,CAAqB;IAC7C,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,iBAAiB,CAAM;IAG/B,OAAO,CAAC,sBAAsB,CAAC,CAAgB;IAC/C,OAAO,CAAC,sBAAsB,CAAC,CAAa;gBAEhC,OAAO,EAAE,iBAAiB;IA8BtC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAM1B,OAAO,CAAC,cAAc;IAuDtB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA8BpB,aAAa,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC;IAIhD;;OAEG;IACU,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC;IAuC7E;;;OAGG;IACU,sBAAsB,CACjC,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,WAAW,GACxB,OAAO,CAAC,UAAU,CAAC;IAiBtB;;;OAGG;IACI,eAAe,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAgC5D;;;OAGG;IACI,wBAAwB,CAAC,MAAM,GAAE,MAAsB,GAAG,IAAI;IAUrE;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IA8CxB;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmCnB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAsCxB;;OAEG;YACW,kBAAkB;IAwBhC;;OAEG;YACW,kBAAkB;IA+FhC;;;OAGG;IACH,OAAO,CAAC,6BAA6B;IAarC;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;YACW,iBAAiB;IAiDlB,OAAO,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BjD;;;;OAIG;IACI,YAAY,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI;IAc5C;;;OAGG;IACU,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAQpD;;;OAGG;IACU,SAAS,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAKxD;;OAEG;IACU,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7E;;;;OAIG;IACH,OAAO,CAAC,YAAY;IAOpB;;OAEG;YACW,eAAe;IAmB7B;;;;;OAKG;YACW,gBAAgB;IAuT9B;;;OAGG;IACU,yBAAyB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IA8BnE;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IA2B5B;;OAEG;IACH,cAAc;IAKd;;OAEG;IACH,aAAa;IAKb;;OAEG;IACH,YAAY;IAKZ;;OAEG;IACH,UAAU,IAAI,YAAY,EAAE;IAI5B;;OAEG;IACH,UAAU,CAAC,OAAO,EAAE,YAAY,EAAE;IAIlC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,UAAU;IAIV;;;OAGG;IACH,OAAO;IAQP;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;IAED;;OAEG;IACH,IAAI,UAAU,IAAI,OAAO,CAExB;IAED;;OAEG;IACH,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED;;OAEG;IACH,IAAI,mBAAmB,IAAI,MAAM,CAEhC;IAED;;OAEG;IACH,IAAI,SAAS,IAAI,OAAO,CAEvB;CACF"}
|
||||||
13
dist/VoiceAgent.js
vendored
13
dist/VoiceAgent.js
vendored
@@ -4,8 +4,7 @@ exports.VoiceAgent = void 0;
|
|||||||
const ws_1 = require("ws");
|
const ws_1 = require("ws");
|
||||||
const events_1 = require("events");
|
const events_1 = require("events");
|
||||||
const ai_1 = require("ai");
|
const ai_1 = require("ai");
|
||||||
/** Default maximum audio input size (10 MB) */
|
const types_1 = require("./types");
|
||||||
const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
|
|
||||||
class VoiceAgent extends events_1.EventEmitter {
|
class VoiceAgent extends events_1.EventEmitter {
|
||||||
socket;
|
socket;
|
||||||
tools = {};
|
tools = {};
|
||||||
@@ -52,22 +51,18 @@ class VoiceAgent extends events_1.EventEmitter {
|
|||||||
this.voice = options.voice || "alloy";
|
this.voice = options.voice || "alloy";
|
||||||
this.speechInstructions = options.speechInstructions;
|
this.speechInstructions = options.speechInstructions;
|
||||||
this.outputFormat = options.outputFormat || "mp3";
|
this.outputFormat = options.outputFormat || "mp3";
|
||||||
this.maxAudioInputSize = options.maxAudioInputSize ?? DEFAULT_MAX_AUDIO_SIZE;
|
this.maxAudioInputSize = options.maxAudioInputSize ?? types_1.DEFAULT_MAX_AUDIO_SIZE;
|
||||||
if (options.tools) {
|
if (options.tools) {
|
||||||
this.tools = { ...options.tools };
|
this.tools = { ...options.tools };
|
||||||
}
|
}
|
||||||
// Initialize streaming speech config with defaults
|
// Initialize streaming speech config with defaults
|
||||||
this.streamingSpeechConfig = {
|
this.streamingSpeechConfig = {
|
||||||
minChunkSize: 50,
|
...types_1.DEFAULT_STREAMING_SPEECH_CONFIG,
|
||||||
maxChunkSize: 200,
|
|
||||||
parallelGeneration: true,
|
|
||||||
maxParallelRequests: 3,
|
|
||||||
...options.streamingSpeech,
|
...options.streamingSpeech,
|
||||||
};
|
};
|
||||||
// Initialize history config with defaults
|
// Initialize history config with defaults
|
||||||
this.historyConfig = {
|
this.historyConfig = {
|
||||||
maxMessages: 100,
|
...types_1.DEFAULT_HISTORY_CONFIG,
|
||||||
maxTotalChars: 0, // unlimited by default
|
|
||||||
...options.history,
|
...options.history,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
2
dist/VoiceAgent.js.map
vendored
2
dist/VoiceAgent.js.map
vendored
File diff suppressed because one or more lines are too long
2
dist/index.d.ts
vendored
2
dist/index.d.ts
vendored
@@ -1,2 +1,4 @@
|
|||||||
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
|
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
|
||||||
|
export { VideoAgent, type VideoAgentOptions, type VideoFrame, type AudioData, type VideoAgentConfig, type FrameContext, type FrameTriggerReason, } from "./VideoAgent";
|
||||||
|
export { type SpeechChunk, type StreamingSpeechConfig, type HistoryConfig, type StopWhenCondition, DEFAULT_STREAMING_SPEECH_CONFIG, DEFAULT_HISTORY_CONFIG, DEFAULT_MAX_AUDIO_SIZE, } from "./types";
|
||||||
//# sourceMappingURL=index.d.ts.map
|
//# sourceMappingURL=index.d.ts.map
|
||||||
2
dist/index.d.ts.map
vendored
2
dist/index.d.ts.map
vendored
@@ -1 +1 @@
|
|||||||
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,cAAc,CAAC"}
|
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,KAAK,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAClE,OAAO,EACH,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,UAAU,EACf,KAAK,SAAS,EACd,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,KAAK,kBAAkB,GAC1B,MAAM,cAAc,CAAC;AAGtB,OAAO,EACH,KAAK,WAAW,EAChB,KAAK,qBAAqB,EAC1B,KAAK,aAAa,EAClB,KAAK,iBAAiB,EACtB,+BAA+B,EAC/B,sBAAsB,EACtB,sBAAsB,GACzB,MAAM,SAAS,CAAC"}
|
||||||
10
dist/index.js
vendored
10
dist/index.js
vendored
@@ -1,6 +1,14 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
Object.defineProperty(exports, "__esModule", { value: true });
|
Object.defineProperty(exports, "__esModule", { value: true });
|
||||||
exports.VoiceAgent = void 0;
|
exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = exports.VideoAgent = exports.VoiceAgent = void 0;
|
||||||
|
// Agents
|
||||||
var VoiceAgent_1 = require("./VoiceAgent");
|
var VoiceAgent_1 = require("./VoiceAgent");
|
||||||
Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_1.VoiceAgent; } });
|
Object.defineProperty(exports, "VoiceAgent", { enumerable: true, get: function () { return VoiceAgent_1.VoiceAgent; } });
|
||||||
|
var VideoAgent_1 = require("./VideoAgent");
|
||||||
|
Object.defineProperty(exports, "VideoAgent", { enumerable: true, get: function () { return VideoAgent_1.VideoAgent; } });
|
||||||
|
// Shared types
|
||||||
|
var types_1 = require("./types");
|
||||||
|
Object.defineProperty(exports, "DEFAULT_STREAMING_SPEECH_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_STREAMING_SPEECH_CONFIG; } });
|
||||||
|
Object.defineProperty(exports, "DEFAULT_HISTORY_CONFIG", { enumerable: true, get: function () { return types_1.DEFAULT_HISTORY_CONFIG; } });
|
||||||
|
Object.defineProperty(exports, "DEFAULT_MAX_AUDIO_SIZE", { enumerable: true, get: function () { return types_1.DEFAULT_MAX_AUDIO_SIZE; } });
|
||||||
//# sourceMappingURL=index.js.map
|
//# sourceMappingURL=index.js.map
|
||||||
2
dist/index.js.map
vendored
2
dist/index.js.map
vendored
@@ -1 +1 @@
|
|||||||
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,2CAAkE;AAAzD,wGAAA,UAAU,OAAA"}
|
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,SAAS;AACT,2CAAkE;AAAzD,wGAAA,UAAU,OAAA;AACnB,2CAQsB;AAPlB,wGAAA,UAAU,OAAA;AASd,eAAe;AACf,iCAQiB;AAHb,wHAAA,+BAA+B,OAAA;AAC/B,+GAAA,sBAAsB,OAAA;AACtB,+GAAA,sBAAsB,OAAA"}
|
||||||
46
dist/types.d.ts
vendored
Normal file
46
dist/types.d.ts
vendored
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
import type { streamText } from "ai";
|
||||||
|
/**
|
||||||
|
* Represents a chunk of text to be converted to speech
|
||||||
|
*/
|
||||||
|
export interface SpeechChunk {
|
||||||
|
id: number;
|
||||||
|
text: string;
|
||||||
|
audioPromise?: Promise<Uint8Array | null>;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Configuration for streaming speech behavior
|
||||||
|
*/
|
||||||
|
export interface StreamingSpeechConfig {
|
||||||
|
/** Minimum characters before generating speech for a chunk */
|
||||||
|
minChunkSize: number;
|
||||||
|
/** Maximum characters per chunk (will split at sentence boundary before this) */
|
||||||
|
maxChunkSize: number;
|
||||||
|
/** Whether to enable parallel TTS generation */
|
||||||
|
parallelGeneration: boolean;
|
||||||
|
/** Maximum number of parallel TTS requests */
|
||||||
|
maxParallelRequests: number;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Configuration for conversation history memory management
|
||||||
|
*/
|
||||||
|
export interface HistoryConfig {
|
||||||
|
/** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
||||||
|
maxMessages: number;
|
||||||
|
/** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
||||||
|
maxTotalChars: number;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Default streaming speech configuration
|
||||||
|
*/
|
||||||
|
export declare const DEFAULT_STREAMING_SPEECH_CONFIG: StreamingSpeechConfig;
|
||||||
|
/**
|
||||||
|
* Default history configuration
|
||||||
|
*/
|
||||||
|
export declare const DEFAULT_HISTORY_CONFIG: HistoryConfig;
|
||||||
|
/** Default maximum audio input size (10 MB) */
|
||||||
|
export declare const DEFAULT_MAX_AUDIO_SIZE: number;
|
||||||
|
/**
|
||||||
|
* Default stop condition type from streamText
|
||||||
|
*/
|
||||||
|
export type StopWhenCondition = NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
|
||||||
|
//# sourceMappingURL=types.d.ts.map
|
||||||
1
dist/types.d.ts.map
vendored
Normal file
1
dist/types.d.ts.map
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAErC;;GAEG;AACH,MAAM,WAAW,WAAW;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,CAAC,EAAE,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC;CAC7C;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IAClC,8DAA8D;IAC9D,YAAY,EAAE,MAAM,CAAC;IACrB,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,gDAAgD;IAChD,kBAAkB,EAAE,OAAO,CAAC;IAC5B,8CAA8C;IAC9C,mBAAmB,EAAE,MAAM,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC1B,yHAAyH;IACzH,WAAW,EAAE,MAAM,CAAC;IACpB,6HAA6H;IAC7H,aAAa,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,eAAO,MAAM,+BAA+B,EAAE,qBAK7C,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,sBAAsB,EAAE,aAGpC,CAAC;AAEF,+CAA+C;AAC/C,eAAO,MAAM,sBAAsB,QAAmB,CAAC;AAEvD;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,WAAW,CAAC,UAAU,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC"}
|
||||||
22
dist/types.js
vendored
Normal file
22
dist/types.js
vendored
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
"use strict";
|
||||||
|
Object.defineProperty(exports, "__esModule", { value: true });
|
||||||
|
exports.DEFAULT_MAX_AUDIO_SIZE = exports.DEFAULT_HISTORY_CONFIG = exports.DEFAULT_STREAMING_SPEECH_CONFIG = void 0;
|
||||||
|
/**
|
||||||
|
* Default streaming speech configuration
|
||||||
|
*/
|
||||||
|
exports.DEFAULT_STREAMING_SPEECH_CONFIG = {
|
||||||
|
minChunkSize: 50,
|
||||||
|
maxChunkSize: 200,
|
||||||
|
parallelGeneration: true,
|
||||||
|
maxParallelRequests: 3,
|
||||||
|
};
|
||||||
|
/**
|
||||||
|
* Default history configuration
|
||||||
|
*/
|
||||||
|
exports.DEFAULT_HISTORY_CONFIG = {
|
||||||
|
maxMessages: 100,
|
||||||
|
maxTotalChars: 0, // unlimited by default
|
||||||
|
};
|
||||||
|
/** Default maximum audio input size (10 MB) */
|
||||||
|
exports.DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
|
||||||
|
//# sourceMappingURL=types.js.map
|
||||||
1
dist/types.js.map
vendored
Normal file
1
dist/types.js.map
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";;;AAmCA;;GAEG;AACU,QAAA,+BAA+B,GAA0B;IAClE,YAAY,EAAE,EAAE;IAChB,YAAY,EAAE,GAAG;IACjB,kBAAkB,EAAE,IAAI;IACxB,mBAAmB,EAAE,CAAC;CACzB,CAAC;AAEF;;GAEG;AACU,QAAA,sBAAsB,GAAkB;IACjD,WAAW,EAAE,GAAG;IAChB,aAAa,EAAE,CAAC,EAAE,uBAAuB;CAC5C,CAAC;AAEF,+CAA+C;AAClC,QAAA,sBAAsB,GAAG,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC"}
|
||||||
@@ -258,7 +258,7 @@
|
|||||||
<!-- Connection -->
|
<!-- Connection -->
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<input type="text" id="endpoint" value="ws://localhost:8080" placeholder="WebSocket endpoint" />
|
<input type="text" id="endpoint" value="ws://localhost:8081/ws/voice" placeholder="WebSocket endpoint" />
|
||||||
<button id="connectBtn" class="primary">Connect</button>
|
<button id="connectBtn" class="primary">Connect</button>
|
||||||
<button id="disconnectBtn" disabled>Disconnect</button>
|
<button id="disconnectBtn" disabled>Disconnect</button>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
1692
src/VideoAgent.ts
Normal file
1692
src/VideoAgent.ts
Normal file
File diff suppressed because it is too large
Load Diff
@@ -11,42 +11,14 @@ import {
|
|||||||
type TranscriptionModel,
|
type TranscriptionModel,
|
||||||
type SpeechModel,
|
type SpeechModel,
|
||||||
} from "ai";
|
} from "ai";
|
||||||
|
import {
|
||||||
/**
|
type SpeechChunk,
|
||||||
* Represents a chunk of text to be converted to speech
|
type StreamingSpeechConfig,
|
||||||
*/
|
type HistoryConfig,
|
||||||
interface SpeechChunk {
|
DEFAULT_STREAMING_SPEECH_CONFIG,
|
||||||
id: number;
|
DEFAULT_HISTORY_CONFIG,
|
||||||
text: string;
|
DEFAULT_MAX_AUDIO_SIZE,
|
||||||
audioPromise?: Promise<Uint8Array | null>;
|
} from "./types";
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configuration for streaming speech behavior
|
|
||||||
*/
|
|
||||||
interface StreamingSpeechConfig {
|
|
||||||
/** Minimum characters before generating speech for a chunk */
|
|
||||||
minChunkSize: number;
|
|
||||||
/** Maximum characters per chunk (will split at sentence boundary before this) */
|
|
||||||
maxChunkSize: number;
|
|
||||||
/** Whether to enable parallel TTS generation */
|
|
||||||
parallelGeneration: boolean;
|
|
||||||
/** Maximum number of parallel TTS requests */
|
|
||||||
maxParallelRequests: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configuration for conversation history memory management
|
|
||||||
*/
|
|
||||||
interface HistoryConfig {
|
|
||||||
/** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
|
||||||
maxMessages: number;
|
|
||||||
/** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
|
||||||
maxTotalChars: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Default maximum audio input size (10 MB) */
|
|
||||||
const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
|
|
||||||
|
|
||||||
export interface VoiceAgentOptions {
|
export interface VoiceAgentOptions {
|
||||||
model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
|
model: LanguageModel; // AI SDK Model for chat (e.g., openai('gpt-4o'))
|
||||||
@@ -126,17 +98,13 @@ export class VoiceAgent extends EventEmitter {
|
|||||||
|
|
||||||
// Initialize streaming speech config with defaults
|
// Initialize streaming speech config with defaults
|
||||||
this.streamingSpeechConfig = {
|
this.streamingSpeechConfig = {
|
||||||
minChunkSize: 50,
|
...DEFAULT_STREAMING_SPEECH_CONFIG,
|
||||||
maxChunkSize: 200,
|
|
||||||
parallelGeneration: true,
|
|
||||||
maxParallelRequests: 3,
|
|
||||||
...options.streamingSpeech,
|
...options.streamingSpeech,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Initialize history config with defaults
|
// Initialize history config with defaults
|
||||||
this.historyConfig = {
|
this.historyConfig = {
|
||||||
maxMessages: 100,
|
...DEFAULT_HISTORY_CONFIG,
|
||||||
maxTotalChars: 0, // unlimited by default
|
|
||||||
...options.history,
|
...options.history,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
21
src/index.ts
21
src/index.ts
@@ -1 +1,22 @@
|
|||||||
|
// Agents
|
||||||
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
|
export { VoiceAgent, type VoiceAgentOptions } from "./VoiceAgent";
|
||||||
|
export {
|
||||||
|
VideoAgent,
|
||||||
|
type VideoAgentOptions,
|
||||||
|
type VideoFrame,
|
||||||
|
type AudioData,
|
||||||
|
type VideoAgentConfig,
|
||||||
|
type FrameContext,
|
||||||
|
type FrameTriggerReason,
|
||||||
|
} from "./VideoAgent";
|
||||||
|
|
||||||
|
// Shared types
|
||||||
|
export {
|
||||||
|
type SpeechChunk,
|
||||||
|
type StreamingSpeechConfig,
|
||||||
|
type HistoryConfig,
|
||||||
|
type StopWhenCondition,
|
||||||
|
DEFAULT_STREAMING_SPEECH_CONFIG,
|
||||||
|
DEFAULT_HISTORY_CONFIG,
|
||||||
|
DEFAULT_MAX_AUDIO_SIZE,
|
||||||
|
} from "./types";
|
||||||
|
|||||||
60
src/types.ts
Normal file
60
src/types.ts
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
import type { streamText } from "ai";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a chunk of text to be converted to speech
|
||||||
|
*/
|
||||||
|
export interface SpeechChunk {
|
||||||
|
id: number;
|
||||||
|
text: string;
|
||||||
|
audioPromise?: Promise<Uint8Array | null>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Configuration for streaming speech behavior
|
||||||
|
*/
|
||||||
|
export interface StreamingSpeechConfig {
|
||||||
|
/** Minimum characters before generating speech for a chunk */
|
||||||
|
minChunkSize: number;
|
||||||
|
/** Maximum characters per chunk (will split at sentence boundary before this) */
|
||||||
|
maxChunkSize: number;
|
||||||
|
/** Whether to enable parallel TTS generation */
|
||||||
|
parallelGeneration: boolean;
|
||||||
|
/** Maximum number of parallel TTS requests */
|
||||||
|
maxParallelRequests: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Configuration for conversation history memory management
|
||||||
|
*/
|
||||||
|
export interface HistoryConfig {
|
||||||
|
/** Maximum number of messages to keep in history. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
||||||
|
maxMessages: number;
|
||||||
|
/** Maximum total character count across all messages. When exceeded, oldest messages are trimmed. Set to 0 for unlimited. */
|
||||||
|
maxTotalChars: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default streaming speech configuration
|
||||||
|
*/
|
||||||
|
export const DEFAULT_STREAMING_SPEECH_CONFIG: StreamingSpeechConfig = {
|
||||||
|
minChunkSize: 50,
|
||||||
|
maxChunkSize: 200,
|
||||||
|
parallelGeneration: true,
|
||||||
|
maxParallelRequests: 3,
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default history configuration
|
||||||
|
*/
|
||||||
|
export const DEFAULT_HISTORY_CONFIG: HistoryConfig = {
|
||||||
|
maxMessages: 100,
|
||||||
|
maxTotalChars: 0, // unlimited by default
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Default maximum audio input size (10 MB) */
|
||||||
|
export const DEFAULT_MAX_AUDIO_SIZE = 10 * 1024 * 1024;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default stop condition type from streamText
|
||||||
|
*/
|
||||||
|
export type StopWhenCondition = NonNullable<Parameters<typeof streamText>[0]["stopWhen"]>;
|
||||||
Reference in New Issue
Block a user