// sarthi_lab/src/services/stt.js
// STT service — uses whisper.cpp via Electron IPC, falls back to Web Speech API
/**
 * Convert captured Float32 PCM chunks into a 16-bit mono WAV Blob.
 * whisper.cpp accepts wav/mp3/ogg/flac; MediaRecorder's webm is rejected,
 * hence the manual encoding here.
 *
 * @param {Float32Array[]} chunks - raw PCM buffers in capture order
 * @param {number} sampleRate - sample rate of the captured audio in Hz
 * @returns {Blob} audio/wav blob (44-byte RIFF header + PCM16 data)
 */
function encodeWAV(chunks, sampleRate) {
  // Flatten all captured chunks into one contiguous Float32 buffer.
  let total = 0;
  for (const chunk of chunks) total += chunk.length;
  const merged = new Float32Array(total);
  let pos = 0;
  for (const chunk of chunks) {
    merged.set(chunk, pos);
    pos += chunk.length;
  }

  // Quantize [-1, 1] floats to signed 16-bit samples, clamping overshoot.
  const pcm16 = Int16Array.from(merged, (sample) =>
    Math.max(-32768, Math.min(32767, Math.round(sample * 32767)))
  );

  // Build the canonical 44-byte RIFF/WAVE header (all fields little-endian).
  const header = new ArrayBuffer(44);
  const view = new DataView(header);
  const writeTag = (offset, tag) => {
    for (let i = 0; i < tag.length; i++) view.setUint8(offset + i, tag.charCodeAt(i));
  };
  writeTag(0, 'RIFF');
  view.setUint32(4, 36 + pcm16.byteLength, true); // RIFF size = file size - 8
  writeTag(8, 'WAVE');
  writeTag(12, 'fmt ');
  view.setUint32(16, 16, true); // fmt chunk size
  view.setUint16(20, 1, true); // audio format: uncompressed PCM
  view.setUint16(22, 1, true); // channel count: mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true); // byte rate = rate * block align
  view.setUint16(32, 2, true); // block align = channels * bytes per sample
  view.setUint16(34, 16, true); // bits per sample
  writeTag(36, 'data');
  view.setUint32(40, pcm16.byteLength, true);

  return new Blob([header, pcm16.buffer], { type: 'audio/wav' });
}
/**
 * Begin capturing microphone audio through the Web Audio API so we end up
 * with raw PCM suitable for WAV encoding (MediaRecorder would produce webm,
 * which whisper rejects).
 *
 * @returns {Promise<{stop: () => Promise<Blob>}>} recorder handle whose
 *   stop() tears down the audio graph and resolves to the recorded WAV Blob.
 */
export async function startRecording() {
  const sampleRate = 16000;
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const audioCtx = new AudioContext({ sampleRate });
  const micSource = audioCtx.createMediaStreamSource(stream);

  // ScriptProcessorNode is deprecated in favor of AudioWorklet, but it
  // remains dependable inside Electron and needs no separate worklet file.
  const tap = audioCtx.createScriptProcessor(4096, 1, 1);

  const captured = [];
  tap.onaudioprocess = (event) => {
    // Copy the channel data — the engine reuses the underlying buffer.
    captured.push(new Float32Array(event.inputBuffer.getChannelData(0)));
  };

  micSource.connect(tap);
  tap.connect(audioCtx.destination); // keeps the processing graph running

  return {
    stop: async () => {
      tap.disconnect();
      micSource.disconnect();
      await audioCtx.close();
      stream.getTracks().forEach((track) => track.stop());
      return encodeWAV(captured, sampleRate);
    },
  };
}
/**
 * Transcribe an audio blob to text.
 *
 * In Electron: the blob is base64-encoded and sent to whisper.cpp over IPC
 * for offline transcription.
 * In browser: the Web Speech API is the fallback. NOTE — Web Speech listens
 * to the microphone in real time, so calling this from a browser without
 * Electron starts a NEW listening session rather than processing the
 * provided blob. This is intentional: browsers cannot re-process a recorded
 * blob with the Web Speech API.
 *
 * @param {Blob|null|undefined} audioBlob - recorded audio (WAV from startRecording)
 * @returns {Promise<string>} transcript text; '' for a missing blob, on a
 *   whisper error, or when a Web Speech session ends without a result
 * @throws {Error} when neither Electron IPC nor Web Speech API is available
 */
export async function transcribeAudio(audioBlob) {
  if (!audioBlob) return '';

  // ── Electron path: send blob to whisper.cpp ──────────────────────────────
  if (typeof window !== 'undefined' && window.electronAPI?.stt) {
    try {
      const base64Audio = await new Promise((resolve, reject) => {
        const reader = new FileReader();
        // reader.result is a data: URL — strip the "data:...;base64," prefix.
        reader.onload = () => resolve(reader.result.split(',')[1]);
        // Reject with a real Error, not the raw ProgressEvent.
        reader.onerror = () => reject(new Error('Failed to read audio blob'));
        reader.readAsDataURL(audioBlob);
      });
      const result = await window.electronAPI.stt.transcribe(base64Audio);
      if (result?.error && !result?.text) {
        // whisper reported a hard error (binary missing, etc.) — return empty
        console.warn('Whisper STT error:', result.error);
        return '';
      }
      return result?.text ?? '';
    } catch (err) {
      console.warn('Whisper STT IPC error:', err);
      return '';
    }
  }

  // ── Browser fallback: Web Speech API (live mic) ──────────────────────────
  const SpeechRecognition =
    (typeof window !== 'undefined') &&
    (window.SpeechRecognition || window.webkitSpeechRecognition);
  if (SpeechRecognition) {
    return new Promise((resolve, reject) => {
      const recognition = new SpeechRecognition();
      recognition.continuous = false;
      recognition.interimResults = false;
      recognition.lang = 'en-US';
      let settled = false;
      recognition.onresult = (event) => {
        settled = true;
        resolve(event.results[0][0].transcript);
      };
      recognition.onerror = (event) => {
        settled = true;
        // Wrap the SpeechRecognitionErrorEvent in a proper Error for callers.
        reject(new Error(`Speech recognition error: ${event?.error ?? 'unknown'}`));
      };
      // Guard: a session can end with neither a result nor an error (e.g.
      // aborted) — without this the returned promise would hang forever.
      recognition.onend = () => {
        if (!settled) resolve('');
      };
      recognition.start();
    });
  }

  throw new Error('No STT available');
}
/**
 * Report whether any speech-to-text backend is reachable from this context:
 * the Electron whisper.cpp bridge or the browser's Web Speech API.
 *
 * @returns {boolean} true when at least one STT backend exists
 */
export function isSTTAvailable() {
  if (typeof window === 'undefined') return false;
  const hasWhisperBridge = Boolean(window.electronAPI?.stt);
  const hasWebSpeech = Boolean(window.SpeechRecognition || window.webkitSpeechRecognition);
  return hasWhisperBridge || hasWebSpeech;
}