Initial commit: Sarthi Lab desktop application
This commit is contained in:
131
src/services/stt.js
Normal file
131
src/services/stt.js
Normal file
@@ -0,0 +1,131 @@
|
||||
// STT service — uses whisper.cpp via Electron IPC, falls back to Web Speech API
|
||||
|
||||
/**
 * Encode raw PCM Float32 chunks captured from AudioContext into a WAV Blob.
 * Whisper requires wav/mp3/ogg/flac — MediaRecorder produces webm which whisper rejects.
 *
 * @param {Float32Array[]} chunks - Mono PCM sample chunks in the range [-1, 1].
 * @param {number} sampleRate - Sample rate in Hz (e.g. 16000 for whisper).
 * @returns {Blob} A complete 16-bit mono PCM WAV file (`audio/wav`).
 */
function encodeWAV(chunks, sampleRate) {
  // Concatenate all chunks into one contiguous Float32 buffer.
  const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
  const samples = new Float32Array(totalLength);
  let offset = 0;
  for (const c of chunks) { samples.set(c, offset); offset += c.length; }

  const dataLength = samples.length * 2; // 16-bit PCM = 2 bytes per sample
  const buffer = new ArrayBuffer(44 + dataLength);
  const v = new DataView(buffer);
  const s = (off, str) => { for (let i = 0; i < str.length; i++) v.setUint8(off + i, str.charCodeAt(i)); };

  // 44-byte RIFF/WAVE header; all multi-byte fields are little-endian per spec.
  s(0, 'RIFF'); v.setUint32(4, 36 + dataLength, true);
  s(8, 'WAVE'); s(12, 'fmt ');
  v.setUint32(16, 16, true); // fmt chunk size
  v.setUint16(20, 1, true); // audio format: PCM
  v.setUint16(22, 1, true); // channels: mono
  v.setUint32(24, sampleRate, true);
  v.setUint32(28, sampleRate * 2, true); // byte rate = sampleRate * blockAlign
  v.setUint16(32, 2, true); // block align = channels * bytesPerSample
  v.setUint16(34, 16, true); // bits per sample
  s(36, 'data'); v.setUint32(40, dataLength, true);

  // Convert floats to clamped 16-bit PCM, written explicitly little-endian.
  // Fix: the previous code handed Int16Array.buffer to the Blob, which uses
  // platform byte order — invalid WAV output on big-endian hosts. WAV
  // mandates little-endian sample data.
  for (let i = 0; i < samples.length; i++) {
    const pcm = Math.max(-32768, Math.min(32767, Math.round(samples[i] * 32767)));
    v.setInt16(44 + i * 2, pcm, true);
  }

  return new Blob([buffer], { type: 'audio/wav' });
}
|
||||
|
||||
/**
 * Start recording from the microphone using AudioContext (produces proper WAV).
 * Returns a recorder handle with a stop() method that resolves to a WAV Blob.
 *
 * @returns {Promise<{stop: () => Promise<Blob>}>} Recorder handle; call
 *   stop() once to tear down the audio graph and obtain the WAV blob.
 */
export async function startRecording() {
  const sampleRate = 16000;
  const micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const audioCtx = new AudioContext({ sampleRate });
  const micSource = audioCtx.createMediaStreamSource(micStream);

  // ScriptProcessor is deprecated but still works reliably in Electron
  const capture = audioCtx.createScriptProcessor(4096, 1, 1);
  const recorded = [];

  // Copy each buffer — the engine reuses the underlying channel data
  // between callbacks, so storing the view directly would alias.
  capture.onaudioprocess = (event) => {
    recorded.push(new Float32Array(event.inputBuffer.getChannelData(0)));
  };

  micSource.connect(capture);
  capture.connect(audioCtx.destination);

  const stop = async () => {
    capture.disconnect();
    micSource.disconnect();
    await audioCtx.close();
    for (const track of micStream.getTracks()) track.stop();
    return encodeWAV(recorded, sampleRate);
  };

  return { stop };
}
|
||||
|
||||
/**
 * Transcribe an audio blob.
 *
 * In Electron: sends blob to whisper.cpp for offline transcription.
 * In browser: Web Speech API is the primary STT (live mic, not blob-based).
 * NOTE — Web Speech listens to the microphone in real time, so
 * calling this from a browser without Electron will start a new
 * listening session rather than processing the provided blob.
 * This is intentional: browsers cannot re-process a recorded blob
 * with Web Speech API.
 *
 * @param {Blob|null|undefined} audioBlob - Recorded audio to transcribe.
 * @returns {Promise<string>} Transcribed text ('' when nothing was recognized
 *   or whisper reported an error).
 * @throws {Error} 'No STT available' when neither backend exists.
 */
export async function transcribeAudio(audioBlob) {
  if (!audioBlob) return '';

  // ── Electron path: send blob to whisper.cpp ──────────────────────────────
  if (typeof window !== 'undefined' && window.electronAPI?.stt) {
    try {
      // FileReader is callback-based; adapt it to a Promise so we can await.
      const reader = new FileReader();
      const base64Promise = new Promise((resolve, reject) => {
        // result is a data: URL — strip the "data:...;base64," prefix.
        reader.onload = () => resolve(reader.result.split(',')[1]);
        reader.onerror = reject;
      });
      reader.readAsDataURL(audioBlob);

      const base64Audio = await base64Promise;
      const result = await window.electronAPI.stt.transcribe(base64Audio);

      if (result?.error && !result?.text) {
        // whisper reported a hard error (binary missing, etc.) — return empty
        console.warn('Whisper STT error:', result.error);
        return '';
      }

      return result?.text ?? '';
    } catch (err) {
      console.warn('Whisper STT IPC error:', err);
      return '';
    }
  }

  // ── Browser fallback: Web Speech API (live mic) ──────────────────────────
  const SpeechRecognition =
    (typeof window !== 'undefined') &&
    (window.SpeechRecognition || window.webkitSpeechRecognition);

  if (SpeechRecognition) {
    return new Promise((resolve, reject) => {
      const recognition = new SpeechRecognition();
      recognition.continuous = false;
      recognition.interimResults = false;
      recognition.lang = 'en-US';
      recognition.onresult = (event) => resolve(event.results[0][0].transcript);
      recognition.onerror = (err) => reject(err);
      // Fix: if recognition ends with neither a result nor an error (silence,
      // user abort), onresult/onerror never fire and the promise used to hang
      // forever. onend always fires last; settling twice is a harmless no-op.
      recognition.onend = () => resolve('');
      recognition.start();
    });
  }

  throw new Error('No STT available');
}
|
||||
|
||||
/**
 * Whether any STT backend can be used in the current environment:
 * Electron whisper IPC, or the browser's Web Speech API.
 *
 * @returns {boolean} true when transcribeAudio() has a usable backend.
 */
export function isSTTAvailable() {
  if (typeof window === 'undefined') return false;
  const hasWhisper = Boolean(window.electronAPI?.stt);
  const hasWebSpeech = Boolean(window.SpeechRecognition || window.webkitSpeechRecognition);
  return hasWhisper || hasWebSpeech;
}
|
||||
Reference in New Issue
Block a user