// STT service — uses whisper.cpp via Electron IPC, falls back to Web Speech API
|
|
|
|
/**
 * Encode raw PCM Float32 chunks captured from AudioContext into a WAV Blob.
 * Whisper requires wav/mp3/ogg/flac — MediaRecorder produces webm which whisper rejects.
 */
function encodeWAV(chunks, sampleRate) {
  // Flatten the captured Float32 chunks into one contiguous buffer.
  let sampleCount = 0;
  for (const chunk of chunks) sampleCount += chunk.length;
  const merged = new Float32Array(sampleCount);
  let writePos = 0;
  for (const chunk of chunks) {
    merged.set(chunk, writePos);
    writePos += chunk.length;
  }

  // Convert float samples in [-1, 1] to signed 16-bit PCM, clamping overshoot.
  const pcm = Int16Array.from(merged, (sample) =>
    Math.max(-32768, Math.min(32767, Math.round(sample * 32767)))
  );

  // Canonical 44-byte RIFF/WAVE header for 16-bit mono PCM.
  const header = new ArrayBuffer(44);
  const view = new DataView(header);
  const writeAscii = (offset, text) => {
    for (let i = 0; i < text.length; i++) view.setUint8(offset + i, text.charCodeAt(i));
  };

  writeAscii(0, 'RIFF');
  view.setUint32(4, 36 + pcm.byteLength, true);   // RIFF chunk size
  writeAscii(8, 'WAVE');
  writeAscii(12, 'fmt ');
  view.setUint32(16, 16, true);                   // fmt sub-chunk size
  view.setUint16(20, 1, true);                    // audio format: PCM
  view.setUint16(22, 1, true);                    // channels: mono
  view.setUint32(24, sampleRate, true);           // sample rate
  view.setUint32(28, sampleRate * 2, true);       // byte rate = rate * block align
  view.setUint16(32, 2, true);                    // block align = channels * 2 bytes
  view.setUint16(34, 16, true);                   // bits per sample
  writeAscii(36, 'data');
  view.setUint32(40, pcm.byteLength, true);       // data sub-chunk size

  return new Blob([header, pcm.buffer], { type: 'audio/wav' });
}
|
|
|
|
/**
 * Start recording from the microphone using AudioContext (produces proper WAV).
 * Returns a recorder handle with a stop() method that resolves to a WAV Blob.
 *
 * @returns {Promise<{stop: () => Promise<Blob>}>} handle; stop() tears down the
 *   audio graph, releases the mic, and resolves to a 16-bit mono WAV Blob.
 */
export async function startRecording() {
  const requestedRate = 16000; // whisper.cpp's preferred input rate
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

  let ctx;
  try {
    ctx = new AudioContext({ sampleRate: requestedRate });
  } catch (err) {
    // FIX: release the already-acquired mic if context creation fails,
    // otherwise the tab/app keeps the microphone indicator on forever.
    stream.getTracks().forEach((t) => t.stop());
    throw err;
  }

  const source = ctx.createMediaStreamSource(stream);
  // ScriptProcessor is deprecated but still works reliably in Electron
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  const chunks = [];

  processor.onaudioprocess = (e) => {
    // Copy — the underlying buffer is reused by the audio thread.
    chunks.push(new Float32Array(e.inputBuffer.getChannelData(0)));
  };

  source.connect(processor);
  processor.connect(ctx.destination);

  return {
    stop: async () => {
      processor.disconnect();
      source.disconnect();
      // FIX: encode at the context's ACTUAL rate. Some environments ignore the
      // requested sampleRate option; writing the wrong rate into the WAV header
      // pitch-shifts the audio and degrades whisper transcription.
      const actualRate = ctx.sampleRate;
      await ctx.close();
      stream.getTracks().forEach((t) => t.stop());
      return encodeWAV(chunks, actualRate);
    },
  };
}
|
|
|
|
/**
 * Transcribe an audio blob.
 *
 * In Electron: sends blob to whisper.cpp for offline transcription.
 * In browser: Web Speech API is the primary STT (live mic, not blob-based).
 * NOTE — Web Speech listens to the microphone in real time, so
 * calling this from a browser without Electron will start a new
 * listening session rather than processing the provided blob.
 * This is intentional: browsers cannot re-process a recorded blob
 * with Web Speech API.
 *
 * @param {Blob} audioBlob - Recorded audio (WAV Blob from startRecording()).
 * @returns {Promise<string>} Transcript text; '' for empty input or whisper errors.
 * @throws {Error} 'No STT available' when neither backend exists.
 */
export async function transcribeAudio(audioBlob) {
  if (!audioBlob) return '';

  // ── Electron path: send blob to whisper.cpp ──────────────────────────────
  if (typeof window !== 'undefined' && window.electronAPI?.stt) {
    try {
      const reader = new FileReader();
      const base64Promise = new Promise((resolve, reject) => {
        // result is a data: URL — strip the "data:...;base64," prefix.
        reader.onload = () => resolve(reader.result.split(',')[1]);
        reader.onerror = reject;
      });
      reader.readAsDataURL(audioBlob);

      const base64Audio = await base64Promise;
      const result = await window.electronAPI.stt.transcribe(base64Audio);

      if (result?.error && !result?.text) {
        // whisper reported a hard error (binary missing, etc.) — return empty
        console.warn('Whisper STT error:', result.error);
        return '';
      }

      return result?.text ?? '';
    } catch (err) {
      console.warn('Whisper STT IPC error:', err);
      return '';
    }
  }

  // ── Browser fallback: Web Speech API (live mic) ──────────────────────────
  const SpeechRecognition =
    (typeof window !== 'undefined') &&
    (window.SpeechRecognition || window.webkitSpeechRecognition);

  if (SpeechRecognition) {
    return new Promise((resolve, reject) => {
      const recognition = new SpeechRecognition();
      recognition.continuous = false;
      recognition.interimResults = false;
      recognition.lang = 'en-US';
      recognition.onresult = (event) => resolve(event.results[0][0].transcript);
      recognition.onerror = (err) => reject(err);
      // FIX: recognition can end without firing onresult OR onerror (e.g. the
      // session is aborted externally); without this the returned promise
      // hangs forever. Resolving after onresult/onerror has already settled
      // the promise is a harmless no-op.
      recognition.onend = () => resolve('');
      recognition.start();
    });
  }

  throw new Error('No STT available');
}
|
|
|
|
/**
 * Report whether any speech-to-text backend is reachable:
 * whisper.cpp over Electron IPC, or the browser's Web Speech API.
 */
export function isSTTAvailable() {
  if (typeof window === 'undefined') return false;
  const hasWhisper = Boolean(window.electronAPI?.stt);
  const hasWebSpeech = Boolean(window.SpeechRecognition || window.webkitSpeechRecognition);
  return hasWhisper || hasWebSpeech;
}
|