feat: add serve-client and start-test-environment scripts, enhance voice-client with debugging info

2026-03-02 18:36:39 +00:00 · 2026-02-14 14:20:07 +05:30
parent 8e8dd9d9f6
commit 637d57fb41
6 changed files with 330 additions and 53 deletions
--- a/src/VoiceAgent.ts
+++ b/src/VoiceAgent.ts
@@ -156,6 +156,7 @@ export class VoiceAgent extends EventEmitter {
    this.socket.on("message", async (data) => {
      try {
        const message = JSON.parse(data.toString());
+        console.log(`Received WebSocket message of type: ${message.type}`);

        // Handle transcribed text from the client/STT
        if (message.type === "transcript") {
@@ -165,6 +166,7 @@ export class VoiceAgent extends EventEmitter {
          }
          // Interrupt ongoing speech when user starts speaking (barge-in)
          this.interruptCurrentResponse("user_speaking");
+          console.log(`Processing transcript: "${message.text}"`);
          await this.enqueueInput(message.text);
        }
        // Handle raw audio data that needs transcription
@@ -175,10 +177,12 @@ export class VoiceAgent extends EventEmitter {
          }
          // Interrupt ongoing speech when user starts speaking (barge-in)
          this.interruptCurrentResponse("user_speaking");
-          await this.processAudioInput(message.data);
+          console.log(`Received audio data (${message.data.length / 1000}KB) for processing, format: ${message.format || 'unknown'}`);
+          await this.processAudioInput(message.data, message.format);
        }
        // Handle explicit interrupt request from client
        else if (message.type === "interrupt") {
+          console.log(`Received interrupt request: ${message.reason || "client_request"}`);
          this.interruptCurrentResponse(message.reason || "client_request");
        }
      } catch (err) {
@@ -246,17 +250,38 @@ export class VoiceAgent extends EventEmitter {
      throw new Error("Transcription model not configured");
    }

-    const result = await transcribe({
-      model: this.transcriptionModel,
-      audio: audioData,
-    });
+    console.log(`Sending ${audioData.byteLength} bytes to Whisper for transcription`);

-    this.emit("transcription", {
-      text: result.text,
-      language: result.language,
-    });
+    try {
+      // Note: The AI SDK transcribe function only accepts these parameters
+      // We can't directly pass language or temperature to it
+      const result = await transcribe({
+        model: this.transcriptionModel,
+        audio: audioData,
+        // If we need to pass additional options to OpenAI Whisper,
+        // we would need to do it via providerOptions if supported
+      });

-    return result.text;
+      console.log(`Whisper transcription result: "${result.text}", language: ${result.language || 'unknown'}`);
+
+      this.emit("transcription", {
+        text: result.text,
+        language: result.language,
+      });
+
+      // Also send transcription to client for immediate feedback
+      this.sendWebSocketMessage({
+        type: "transcription_result",
+        text: result.text,
+        language: result.language,
+      });
+
+      return result.text;
+    } catch (error) {
+      console.error("Whisper transcription failed:", error);
+      // Re-throw to be handled by the caller
+      throw error;
+    }
  }

  /**
@@ -472,13 +497,16 @@ export class VoiceAgent extends EventEmitter {
    }

    try {
+      console.log(`Generating audio for chunk ${chunk.id}: "${chunk.text.substring(0, 50)}${chunk.text.length > 50 ? '...' : ''}"`);
      const audioData = await this.generateSpeechFromText(
        chunk.text,
        this.currentSpeechAbortController.signal
      );
+      console.log(`Generated audio for chunk ${chunk.id}: ${audioData.length} bytes`);
      return audioData;
    } catch (error) {
      if ((error as Error).name === "AbortError") {
+        console.log(`Audio generation aborted for chunk ${chunk.id}`);
        return null; // Cancelled, don't report as error
      }
      console.error(`Failed to generate audio for chunk ${chunk.id}:`, error);
@@ -494,6 +522,7 @@ export class VoiceAgent extends EventEmitter {
    if (this.isSpeaking) return;
    this.isSpeaking = true;

+    console.log(`Starting speech queue processing with ${this.speechChunkQueue.length} chunks`);
    this.emit("speech_start", { streaming: true });
    this.sendWebSocketMessage({ type: "speech_stream_start" });

@@ -501,6 +530,8 @@ export class VoiceAgent extends EventEmitter {
      while (this.speechChunkQueue.length > 0) {
        const chunk = this.speechChunkQueue[0];

+        console.log(`Processing speech chunk #${chunk.id} (${this.speechChunkQueue.length - 1} remaining)`);
+
        // Ensure audio generation has started
        if (!chunk.audioPromise) {
          chunk.audioPromise = this.generateChunkAudio(chunk);
@@ -510,13 +541,17 @@ export class VoiceAgent extends EventEmitter {
        const audioData = await chunk.audioPromise;

        // Check if we were interrupted while waiting
-        if (!this.isSpeaking) break;
+        if (!this.isSpeaking) {
+          console.log(`Speech interrupted during chunk #${chunk.id}`);
+          break;
+        }

        // Remove from queue after processing
        this.speechChunkQueue.shift();

        if (audioData) {
          const base64Audio = Buffer.from(audioData).toString("base64");
+          console.log(`Sending audio chunk #${chunk.id} (${audioData.length} bytes, ${this.outputFormat})`);

          // Send audio chunk via WebSocket
          this.sendWebSocketMessage({
@@ -535,6 +570,8 @@ export class VoiceAgent extends EventEmitter {
            text: chunk.text,
            uint8Array: audioData,
          });
+        } else {
+          console.log(`No audio data generated for chunk #${chunk.id}`);
        }

        // Start generating next chunks in parallel
@@ -545,14 +582,20 @@ export class VoiceAgent extends EventEmitter {
            this.speechChunkQueue.length
          );

-          for (let i = 0; i < toStart; i++) {
-            const nextChunk = this.speechChunkQueue.find(c => !c.audioPromise);
-            if (nextChunk) {
-              nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
+          if (toStart > 0) {
+            console.log(`Starting parallel generation for ${toStart} more chunks`);
+            for (let i = 0; i < toStart; i++) {
+              const nextChunk = this.speechChunkQueue.find(c => !c.audioPromise);
+              if (nextChunk) {
+                nextChunk.audioPromise = this.generateChunkAudio(nextChunk);
+              }
            }
          }
        }
      }
+    } catch (error) {
+      console.error("Error in speech queue processing:", error);
+      this.emit("error", error);
    } finally {
      this.isSpeaking = false;
      this.currentSpeechAbortController = undefined;
@@ -564,6 +607,7 @@ export class VoiceAgent extends EventEmitter {
        this.speechQueueDonePromise = undefined;
      }

+      console.log(`Speech queue processing complete`);
      this.sendWebSocketMessage({ type: "speech_stream_end" });
      this.emit("speech_complete", { streaming: true });
    }
@@ -600,7 +644,7 @@ export class VoiceAgent extends EventEmitter {
  /**
   * Process incoming audio data: transcribe and generate response
   */
-  private async processAudioInput(base64Audio: string): Promise<void> {
+  private async processAudioInput(base64Audio: string, format?: string): Promise<void> {
    if (!this.transcriptionModel) {
      this.emit("error", new Error("Transcription model not configured for audio input"));
      return;
@@ -624,16 +668,28 @@ export class VoiceAgent extends EventEmitter {
        return;
      }

-      this.emit("audio_received", { size: audioBuffer.length });
+      this.emit("audio_received", { size: audioBuffer.length, format });
+      console.log(`Processing audio input: ${audioBuffer.length} bytes, format: ${format || 'unknown'}`);

      const transcribedText = await this.transcribeAudio(audioBuffer);
+      console.log(`Transcribed text: "${transcribedText}"`);

      if (transcribedText.trim()) {
        await this.enqueueInput(transcribedText);
+      } else {
+        this.emit("warning", "Transcription returned empty text");
+        this.sendWebSocketMessage({
+          type: "transcription_error",
+          error: "Whisper returned empty text"
+        });
      }
    } catch (error) {
      console.error("Failed to process audio input:", error);
      this.emit("error", error);
+      this.sendWebSocketMessage({
+        type: "transcription_error",
+        error: `Transcription failed: ${(error as Error).message || String(error)}`
+      });
    }
  }

@@ -1110,7 +1166,20 @@ export class VoiceAgent extends EventEmitter {

    try {
      if (this.socket.readyState === WebSocket.OPEN) {
+        // Skip logging huge audio data for better readability
+        if (message.type === "audio_chunk" || message.type === "audio") {
+          const { data, ...rest } = message as any;
+          console.log(`Sending WebSocket message: ${message.type}`,
+            data ? `(${(data.length / 1000).toFixed(1)}KB audio data)` : "",
+            rest
+          );
+        } else {
+          console.log(`Sending WebSocket message: ${message.type}`);
+        }
+
        this.socket.send(JSON.stringify(message));
+      } else {
+        console.warn(`Cannot send message, socket state: ${this.socket.readyState}`);
      }
    } catch (error) {
      // Socket may have closed between the readyState check and send()