// sarthi_lab/src/services/stt.js
// STT service — uses whisper.cpp via Electron IPC, falls back to Web Speech API
/**
 * Convert captured Float32 PCM chunks into a 16-bit mono WAV Blob.
 * whisper.cpp accepts wav/mp3/ogg/flac; MediaRecorder's webm is rejected,
 * hence the manual encoding here.
 *
 * @param {Float32Array[]} chunks - raw PCM buffers in capture order
 * @param {number} sampleRate - sample rate of the captured audio in Hz
 * @returns {Blob} audio/wav blob (44-byte RIFF header + PCM16 data)
 */
function encodeWAV(chunks, sampleRate) {
  // Flatten all captured chunks into one contiguous Float32 buffer.
  let total = 0;
  for (const chunk of chunks) total += chunk.length;
  const merged = new Float32Array(total);
  let pos = 0;
  for (const chunk of chunks) {
    merged.set(chunk, pos);
    pos += chunk.length;
  }

  // Quantize [-1, 1] floats to signed 16-bit samples, clamping overshoot.
  const pcm16 = Int16Array.from(merged, (sample) =>
    Math.max(-32768, Math.min(32767, Math.round(sample * 32767)))
  );

  // Build the canonical 44-byte RIFF/WAVE header (all fields little-endian).
  const header = new ArrayBuffer(44);
  const view = new DataView(header);
  const writeTag = (offset, tag) => {
    for (let i = 0; i < tag.length; i++) view.setUint8(offset + i, tag.charCodeAt(i));
  };
  writeTag(0, 'RIFF');
  view.setUint32(4, 36 + pcm16.byteLength, true); // RIFF size = file size - 8
  writeTag(8, 'WAVE');
  writeTag(12, 'fmt ');
  view.setUint32(16, 16, true); // fmt chunk size
  view.setUint16(20, 1, true); // audio format: uncompressed PCM
  view.setUint16(22, 1, true); // channel count: mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true); // byte rate = rate * block align
  view.setUint16(32, 2, true); // block align = channels * bytes per sample
  view.setUint16(34, 16, true); // bits per sample
  writeTag(36, 'data');
  view.setUint32(40, pcm16.byteLength, true);

  return new Blob([header, pcm16.buffer], { type: 'audio/wav' });
}
/**
 * Begin capturing microphone audio through the Web Audio API so we end up
 * with raw PCM suitable for WAV encoding (MediaRecorder would produce webm,
 * which whisper rejects).
 *
 * @returns {Promise<{stop: () => Promise<Blob>}>} recorder handle whose
 *   stop() tears down the audio graph and resolves to the recorded WAV Blob.
 */
export async function startRecording() {
  const sampleRate = 16000;
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const audioCtx = new AudioContext({ sampleRate });
  const micSource = audioCtx.createMediaStreamSource(stream);

  // ScriptProcessorNode is deprecated in favor of AudioWorklet, but it
  // remains dependable inside Electron and needs no separate worklet file.
  const tap = audioCtx.createScriptProcessor(4096, 1, 1);

  const captured = [];
  tap.onaudioprocess = (event) => {
    // Copy the channel data — the engine reuses the underlying buffer.
    captured.push(new Float32Array(event.inputBuffer.getChannelData(0)));
  };

  micSource.connect(tap);
  tap.connect(audioCtx.destination); // keeps the processing graph running

  return {
    stop: async () => {
      tap.disconnect();
      micSource.disconnect();
      await audioCtx.close();
      stream.getTracks().forEach((track) => track.stop());
      return encodeWAV(captured, sampleRate);
    },
  };
}
/**
 * Transcribe an audio blob to text.
 *
 * In Electron: the blob is base64-encoded and sent to whisper.cpp over IPC
 * for offline transcription.
 * In browser: the Web Speech API is the fallback. NOTE — Web Speech listens
 * to the microphone in real time, so calling this from a browser without
 * Electron starts a NEW listening session rather than processing the
 * provided blob. This is intentional: browsers cannot re-process a recorded
 * blob with the Web Speech API.
 *
 * @param {Blob|null|undefined} audioBlob - recorded audio (WAV from startRecording)
 * @returns {Promise<string>} transcript text; '' for a missing blob, on a
 *   whisper error, or when a Web Speech session ends without a result
 * @throws {Error} when neither Electron IPC nor Web Speech API is available
 */
export async function transcribeAudio(audioBlob) {
  if (!audioBlob) return '';

  // ── Electron path: send blob to whisper.cpp ──────────────────────────────
  if (typeof window !== 'undefined' && window.electronAPI?.stt) {
    try {
      const base64Audio = await new Promise((resolve, reject) => {
        const reader = new FileReader();
        // reader.result is a data: URL — strip the "data:...;base64," prefix.
        reader.onload = () => resolve(reader.result.split(',')[1]);
        // Reject with a real Error, not the raw ProgressEvent.
        reader.onerror = () => reject(new Error('Failed to read audio blob'));
        reader.readAsDataURL(audioBlob);
      });
      const result = await window.electronAPI.stt.transcribe(base64Audio);
      if (result?.error && !result?.text) {
        // whisper reported a hard error (binary missing, etc.) — return empty
        console.warn('Whisper STT error:', result.error);
        return '';
      }
      return result?.text ?? '';
    } catch (err) {
      console.warn('Whisper STT IPC error:', err);
      return '';
    }
  }

  // ── Browser fallback: Web Speech API (live mic) ──────────────────────────
  const SpeechRecognition =
    (typeof window !== 'undefined') &&
    (window.SpeechRecognition || window.webkitSpeechRecognition);
  if (SpeechRecognition) {
    return new Promise((resolve, reject) => {
      const recognition = new SpeechRecognition();
      recognition.continuous = false;
      recognition.interimResults = false;
      recognition.lang = 'en-US';
      let settled = false;
      recognition.onresult = (event) => {
        settled = true;
        resolve(event.results[0][0].transcript);
      };
      recognition.onerror = (event) => {
        settled = true;
        // Wrap the SpeechRecognitionErrorEvent in a proper Error for callers.
        reject(new Error(`Speech recognition error: ${event?.error ?? 'unknown'}`));
      };
      // Guard: a session can end with neither a result nor an error (e.g.
      // aborted) — without this the returned promise would hang forever.
      recognition.onend = () => {
        if (!settled) resolve('');
      };
      recognition.start();
    });
  }

  throw new Error('No STT available');
}
/**
 * Report whether any speech-to-text backend is reachable from this context:
 * the Electron whisper.cpp bridge or the browser's Web Speech API.
 *
 * @returns {boolean} true when at least one STT backend exists
 */
export function isSTTAvailable() {
  if (typeof window === 'undefined') return false;
  const hasWhisperBridge = Boolean(window.electronAPI?.stt);
  const hasWebSpeech = Boolean(window.SpeechRecognition || window.webkitSpeechRecognition);
  return hasWhisperBridge || hasWebSpeech;
}