All checks were successful
Build AppImage / build (push) Successful in 8m39s
Problem: Wenn auf NixOS der WebKit-Audio-Stack unvollständig ist
(fehlendes gst-plugin-pipewire, pipewire-pulse, Policy-Datei oder
whisper-cli/piper Binary), hängt die App fest ohne Fehlermeldung —
weder getUserMedia noch die Backend-Prozesse reagieren.
Frontend (VoicePanel.svelte):
- Preflight: prüft, ob navigator.mediaDevices überhaupt existiert
- getUserMedia via Promise.race gegen 8s-Timeout (sonst hängt es ewig)
- Kategorisierte Fehler: NotAllowedError → Berechtigung,
NotFoundError → keine Hardware, NotReadableError → PipeWire-Problem
- Neuer 'connecting'-State mit 🔌-Icon — User sieht, dass etwas passiert
- AudioContext-Konstruktor in try/catch
Backend (voice.rs):
- ffmpeg: 20s-Timeout + spezifische Fehlermeldung bei fehlendem Binary
- whisper-cli: 60s-Timeout, kein stilles Hängen mehr
- piper-tts: 30s-Timeout, Spawn-Fehler benennt NixOS-Wrapper
- Temp-Dateien werden bei Timeout auch aufgeräumt
406 lines
14 KiB
Rust
406 lines
14 KiB
Rust
// Claude Desktop — Voice Interface
|
|
// Phase 2.2: Lokales Whisper (whisper-cli) + Piper-TTS — komplett offline, kostenlos
|
|
// Gesprächs-Modus: Kontinuierlich zuhören, unterbrechen, antworten
|
|
|
|
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64};
|
|
use serde::{Deserialize, Serialize};
|
|
use std::process::Stdio;
|
|
use std::time::Duration;
|
|
use tokio::process::Command as TokioCommand;
|
|
use tokio::io::AsyncWriteExt;
|
|
|
|
/// Upper bound for one whisper-cli transcription run. One minute is ample for
/// roughly 10 s of audio on ggml-base; when the process hangs, callers stop
/// waiting after this long instead of letting the UI freeze.
const WHISPER_TIMEOUT: Duration = Duration::from_secs(60);

/// Upper bound for one Piper synthesis run. Piper is faster than Whisper, so
/// 30 s is already very generous for about 600 characters of text.
const PIPER_TIMEOUT: Duration = Duration::from_secs(30);

/// Upper bound for the ffmpeg conversion, which should finish within seconds.
const FFMPEG_TIMEOUT: Duration = Duration::from_secs(20);
|
|
|
|
/// Returns the whisper-cli executable to launch (the local binaries are
/// provided via shell.nix).
///
/// `WHISPER_CPP_PATH` overrides the default. An unset, non-Unicode, empty or
/// whitespace-only value falls back to `whisper-cli`, because an empty program
/// name can never be spawned and would only produce a confusing error later.
fn whisper_binary() -> String {
    std::env::var("WHISPER_CPP_PATH")
        .ok()
        .filter(|path| !path.trim().is_empty())
        .unwrap_or_else(|| "whisper-cli".to_string())
}
|
|
|
|
/// Returns the Piper executable to launch.
///
/// `PIPER_TTS_PATH` overrides the default. An unset, non-Unicode, empty or
/// whitespace-only value falls back to `piper`, because an empty program name
/// can never be spawned and would only produce a confusing error later.
fn piper_binary() -> String {
    std::env::var("PIPER_TTS_PATH")
        .ok()
        .filter(|path| !path.trim().is_empty())
        .unwrap_or_else(|| "piper".to_string())
}
|
|
|
|
/// Resolves the Whisper model file.
///
/// `WHISPER_MODEL`, when set, is returned unchanged. Otherwise the first
/// existing candidate wins, searched in this order: next to the executable
/// (`../models`, then `models`), below `$HOME` (`.local/share/claude-desktop/models`,
/// then `.claude-desktop/models`), and finally `models/` relative to the
/// current working directory. When nothing exists the relative fallback
/// `models/ggml-base.bin` is returned.
fn whisper_model_path() -> String {
    const CWD_FALLBACK: &str = "models/ggml-base.bin";

    if let Ok(explicit) = std::env::var("WHISPER_MODEL") {
        return explicit;
    }

    let exe_dir = std::env::current_exe()
        .ok()
        .and_then(|exe| exe.parent().map(std::path::Path::to_path_buf));
    let home_dir = std::env::var("HOME").ok().map(std::path::PathBuf::from);

    let mut candidates: Vec<std::path::PathBuf> = Vec::with_capacity(5);
    if let Some(dir) = &exe_dir {
        // Relative to the binary (dev mode).
        candidates.push(dir.join("../models/ggml-base.bin"));
        candidates.push(dir.join("models/ggml-base.bin"));
    }
    if let Some(home) = &home_dir {
        // Per-user data locations (AppImage + Nix wrapper).
        candidates.push(home.join(".local/share/claude-desktop/models/ggml-base.bin"));
        candidates.push(home.join(".claude-desktop/models/ggml-base.bin"));
    }
    // Current working directory as the last resort.
    candidates.push(std::path::PathBuf::from(CWD_FALLBACK));

    for candidate in candidates {
        if candidate.exists() {
            return candidate.to_string_lossy().to_string();
        }
    }
    CWD_FALLBACK.to_string()
}
|
|
|
|
/// Known Piper voices as `(voice id, model file name)` pairs.
/// The first entry doubles as the fallback for unknown voice ids.
const PIPER_VOICES: &[(&str, &str)] = &[
    ("kerstin", "de_DE-kerstin-low.onnx"),
    ("thorsten-high", "de_DE-thorsten-high.onnx"),
    ("thorsten", "de_DE-thorsten-medium.onnx"),
    ("eva", "de_DE-eva_k-x_low.onnx"),
    ("ramona", "de_DE-ramona-low.onnx"),
];

/// Voice used when the caller does not ask for one: Kerstin (female, German).
const DEFAULT_VOICE: &str = "kerstin";
|
|
|
|
fn piper_model_path() -> String {
|
|
piper_model_for_voice(None)
|
|
}
|
|
|
|
/// Modell-Pfad für eine bestimmte Stimme (oder Default)
|
|
fn piper_model_for_voice(voice: Option<&str>) -> String {
|
|
// Env-Override hat Vorrang
|
|
if let Ok(path) = std::env::var("PIPER_MODEL") {
|
|
return path;
|
|
}
|
|
|
|
let voice_id = voice.unwrap_or(DEFAULT_VOICE);
|
|
let filename = PIPER_VOICES.iter()
|
|
.find(|(id, _)| *id == voice_id)
|
|
.map(|(_, f)| *f)
|
|
.unwrap_or(PIPER_VOICES[0].1); // Fallback auf Kerstin
|
|
|
|
let exe_dir = std::env::current_exe()
|
|
.ok()
|
|
.and_then(|p| p.parent().map(|p| p.to_path_buf()));
|
|
|
|
let home_dir = std::env::var("HOME").ok()
|
|
.map(std::path::PathBuf::from);
|
|
|
|
let candidates = vec![
|
|
// Relativ zum Binary (Dev-Modus)
|
|
exe_dir.as_ref().map(|d| d.join("../models").join(filename)),
|
|
exe_dir.as_ref().map(|d| d.join("models").join(filename)),
|
|
// XDG Data Home / Home-Verzeichnis (AppImage + Nix-Wrapper)
|
|
home_dir.as_ref().map(|d| d.join(".local/share/claude-desktop/models").join(filename)),
|
|
home_dir.as_ref().map(|d| d.join(".claude-desktop/models").join(filename)),
|
|
// CWD Fallback
|
|
Some(std::path::PathBuf::from("models").join(filename)),
|
|
];
|
|
|
|
candidates.into_iter()
|
|
.flatten()
|
|
.find(|p| p.exists())
|
|
.map(|p| p.to_string_lossy().to_string())
|
|
.unwrap_or_else(|| format!("models/{}", filename))
|
|
}
|
|
|
|
/// Voice-System Status
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct VoiceStatus {
|
|
pub whisper_available: bool,
|
|
pub piper_available: bool,
|
|
pub whisper_model: String,
|
|
pub piper_model: String,
|
|
pub openai_available: bool,
|
|
}
|
|
|
|
/// Prüft ob die lokalen Voice-Tools verfügbar sind
|
|
#[tauri::command]
|
|
pub async fn check_voice_availability() -> Result<VoiceStatus, String> {
|
|
// whisper-cli prüfen
|
|
let whisper_ok = TokioCommand::new(&whisper_binary())
|
|
.arg("--help")
|
|
.stdout(Stdio::null())
|
|
.stderr(Stdio::null())
|
|
.status()
|
|
.await
|
|
.map(|s| s.success())
|
|
.unwrap_or(false);
|
|
|
|
// Modell-Datei prüfen
|
|
let whisper_model = whisper_model_path();
|
|
let whisper_model_ok = std::path::Path::new(&whisper_model).exists();
|
|
|
|
// Piper prüfen
|
|
let piper_ok = TokioCommand::new(&piper_binary())
|
|
.arg("--help")
|
|
.stdout(Stdio::null())
|
|
.stderr(Stdio::null())
|
|
.status()
|
|
.await
|
|
.map(|_| true) // piper --help gibt exit 0 oder 1, beides OK
|
|
.unwrap_or(false);
|
|
|
|
let piper_model = piper_model_path();
|
|
let piper_model_ok = std::path::Path::new(&piper_model).exists();
|
|
|
|
// OpenAI API Key (Fallback)
|
|
let openai_available = std::env::var("OPENAI_API_KEY")
|
|
.map(|k| !k.is_empty())
|
|
.unwrap_or(false);
|
|
|
|
let status = VoiceStatus {
|
|
whisper_available: whisper_ok && whisper_model_ok,
|
|
piper_available: piper_ok && piper_model_ok,
|
|
whisper_model: if whisper_model_ok { whisper_model } else { "Nicht gefunden".to_string() },
|
|
piper_model: if piper_model_ok { piper_model } else { "Nicht gefunden".to_string() },
|
|
openai_available,
|
|
};
|
|
|
|
println!("🎤 Voice-Status: Whisper={}, Piper={}, OpenAI={}",
|
|
status.whisper_available, status.piper_available, status.openai_available);
|
|
|
|
Ok(status)
|
|
}
|
|
|
|
/// Transkribiert Audio mit lokalem whisper-cli
|
|
/// Audio kommt als Base64-kodiertes WAV vom Frontend
|
|
#[tauri::command]
|
|
pub async fn transcribe_audio(
|
|
audio_base64: String,
|
|
format: String,
|
|
) -> Result<String, String> {
|
|
let audio_bytes = BASE64.decode(&audio_base64)
|
|
.map_err(|e| format!("Base64-Dekodierung fehlgeschlagen: {}", e))?;
|
|
|
|
// Temporäre WAV-Datei schreiben (whisper-cli braucht eine Datei)
|
|
let tmp_dir = std::env::temp_dir();
|
|
let input_path = tmp_dir.join(format!("claude-voice-input.{}", format));
|
|
|
|
tokio::fs::write(&input_path, &audio_bytes)
|
|
.await
|
|
.map_err(|e| format!("Temp-Datei schreiben fehlgeschlagen: {}", e))?;
|
|
|
|
// Falls nicht WAV: mit ffmpeg konvertieren (WebM → WAV 16kHz mono)
|
|
let wav_path = if format != "wav" {
|
|
let wav_path = tmp_dir.join("claude-voice-input.wav");
|
|
let ffmpeg_fut = TokioCommand::new("ffmpeg")
|
|
.args(["-y", "-i"])
|
|
.arg(&input_path)
|
|
.args(["-ar", "16000", "-ac", "1", "-f", "wav"])
|
|
.arg(&wav_path)
|
|
.stdout(Stdio::null())
|
|
.stderr(Stdio::null())
|
|
.status();
|
|
|
|
match tokio::time::timeout(FFMPEG_TIMEOUT, ffmpeg_fut).await {
|
|
Ok(Ok(status)) if status.success() => wav_path,
|
|
Ok(Ok(_)) => {
|
|
println!("⚠️ ffmpeg Konvertierung fehlgeschlagen (Exit != 0), versuche direkt...");
|
|
input_path.clone()
|
|
}
|
|
Ok(Err(e)) => {
|
|
let _ = tokio::fs::remove_file(&input_path).await;
|
|
return Err(format!(
|
|
"ffmpeg nicht ausführbar: {} — auf NixOS ffmpeg im Nix-Wrapper prüfen",
|
|
e
|
|
));
|
|
}
|
|
Err(_) => {
|
|
let _ = tokio::fs::remove_file(&input_path).await;
|
|
return Err(format!("ffmpeg Timeout nach {}s", FFMPEG_TIMEOUT.as_secs()));
|
|
}
|
|
}
|
|
} else {
|
|
input_path.clone()
|
|
};
|
|
|
|
// whisper-cli ausführen (Datei als letztes Argument, kein --file Flag)
|
|
let model = whisper_model_path();
|
|
let whisper_fut = TokioCommand::new(&whisper_binary())
|
|
.args([
|
|
"--model", &model,
|
|
"--language", "de",
|
|
"--no-timestamps",
|
|
"--no-prints",
|
|
"--threads", "4",
|
|
])
|
|
.arg(&wav_path)
|
|
.stdout(Stdio::piped())
|
|
.stderr(Stdio::piped())
|
|
.output();
|
|
|
|
let output = match tokio::time::timeout(WHISPER_TIMEOUT, whisper_fut).await {
|
|
Ok(Ok(out)) => out,
|
|
Ok(Err(e)) => {
|
|
let _ = tokio::fs::remove_file(&input_path).await;
|
|
if format != "wav" { let _ = tokio::fs::remove_file(&wav_path).await; }
|
|
return Err(format!(
|
|
"whisper-cli nicht ausführbar: {} — Binary fehlt? Auf NixOS whisper-cpp im Nix-Wrapper prüfen",
|
|
e
|
|
));
|
|
}
|
|
Err(_) => {
|
|
let _ = tokio::fs::remove_file(&input_path).await;
|
|
if format != "wav" { let _ = tokio::fs::remove_file(&wav_path).await; }
|
|
return Err(format!(
|
|
"whisper-cli Timeout nach {}s — Prozess hängt, Audio zu lang oder Binary defekt",
|
|
WHISPER_TIMEOUT.as_secs()
|
|
));
|
|
}
|
|
};
|
|
|
|
// Aufräumen
|
|
let _ = tokio::fs::remove_file(&input_path).await;
|
|
if format != "wav" {
|
|
let _ = tokio::fs::remove_file(&wav_path).await;
|
|
}
|
|
|
|
if !output.status.success() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
return Err(format!("whisper-cli Fehler: {}", stderr));
|
|
}
|
|
|
|
// Transkription parsen — whisper-cli gibt Text auf stdout
|
|
let text = String::from_utf8_lossy(&output.stdout)
|
|
.lines()
|
|
.filter(|l| !l.trim().is_empty())
|
|
.map(|l| l.trim().to_string())
|
|
.collect::<Vec<_>>()
|
|
.join(" ")
|
|
.trim()
|
|
.to_string();
|
|
|
|
println!("🎤 Transkription (lokal): \"{}\"", text);
|
|
|
|
Ok(text)
|
|
}
|
|
|
|
/// Text-to-Speech mit lokalem Piper-TTS
|
|
/// Gibt Base64-kodierten WAV-Audio zurück
|
|
#[tauri::command]
|
|
pub async fn text_to_speech(
|
|
text: String,
|
|
voice: Option<String>,
|
|
) -> Result<String, String> {
|
|
let model = piper_model_for_voice(voice.as_deref());
|
|
|
|
if !std::path::Path::new(&model).exists() {
|
|
return Err(format!("Piper-Modell nicht gefunden: {}", model));
|
|
}
|
|
|
|
// Piper über stdin füttern, RAW PCM auf stdout
|
|
let mut child = TokioCommand::new(&piper_binary())
|
|
.args(["--model", &model, "--output-raw"])
|
|
.stdin(Stdio::piped())
|
|
.stdout(Stdio::piped())
|
|
.stderr(Stdio::piped())
|
|
.spawn()
|
|
.map_err(|e| format!(
|
|
"Piper starten fehlgeschlagen: {} — Binary fehlt? Auf NixOS piper-tts im Nix-Wrapper prüfen",
|
|
e
|
|
))?;
|
|
|
|
// Text über stdin senden
|
|
if let Some(mut stdin) = child.stdin.take() {
|
|
stdin.write_all(text.as_bytes()).await
|
|
.map_err(|e| format!("Piper stdin schreiben fehlgeschlagen: {}", e))?;
|
|
drop(stdin); // EOF senden
|
|
}
|
|
|
|
// Timeout: hängt Piper (z.B. Modell-Mismatch), killen statt UI einfrieren.
|
|
let output = match tokio::time::timeout(PIPER_TIMEOUT, child.wait_with_output()).await {
|
|
Ok(Ok(out)) => out,
|
|
Ok(Err(e)) => return Err(format!("Piper Fehler: {}", e)),
|
|
Err(_) => {
|
|
return Err(format!(
|
|
"Piper Timeout nach {}s — Prozess hängt. Text zu lang oder Modell defekt?",
|
|
PIPER_TIMEOUT.as_secs()
|
|
));
|
|
}
|
|
};
|
|
|
|
if output.stdout.is_empty() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
return Err(format!("Piper hat kein Audio erzeugt: {}", stderr));
|
|
}
|
|
|
|
// Raw PCM → WAV Header hinzufügen (16-bit, 22050Hz, mono — Piper Default)
|
|
let pcm_data = &output.stdout;
|
|
let wav_data = pcm_to_wav(pcm_data, 22050, 1, 16);
|
|
|
|
let audio_base64 = BASE64.encode(&wav_data);
|
|
|
|
println!("🔊 TTS (Piper lokal): {} Zeichen → {} Bytes WAV", text.len(), wav_data.len());
|
|
|
|
Ok(audio_base64)
|
|
}
|
|
|
|
/// Wraps raw PCM samples in a 44-byte canonical WAV (RIFF) header.
///
/// `pcm` is copied verbatim after the header; `sample_rate`, `channels` and
/// `bits_per_sample` are written into the `fmt ` chunk together with the
/// derived byte rate and block alignment. All numbers are little-endian.
fn pcm_to_wav(pcm: &[u8], sample_rate: u32, channels: u16, bits_per_sample: u16) -> Vec<u8> {
    let bytes_per_second = sample_rate * channels as u32 * bits_per_sample as u32 / 8;
    let frame_bytes = channels * bits_per_sample / 8;
    let payload_len = pcm.len() as u32;
    // RIFF size = everything after the 8-byte "RIFF"+size prefix: 36 header bytes + data.
    let riff_len = 36 + payload_len;

    let riff_len_le = riff_len.to_le_bytes();
    let fmt_len_le = 16u32.to_le_bytes(); // size of the fmt chunk body
    let pcm_tag_le = 1u16.to_le_bytes(); // audio format 1 = PCM
    let channels_le = channels.to_le_bytes();
    let sample_rate_le = sample_rate.to_le_bytes();
    let byte_rate_le = bytes_per_second.to_le_bytes();
    let block_align_le = frame_bytes.to_le_bytes();
    let bits_le = bits_per_sample.to_le_bytes();
    let payload_len_le = payload_len.to_le_bytes();

    // Header fields in file order: RIFF header, fmt chunk, data chunk prefix.
    let header: [&[u8]; 13] = [
        b"RIFF",
        &riff_len_le,
        b"WAVE",
        b"fmt ",
        &fmt_len_le,
        &pcm_tag_le,
        &channels_le,
        &sample_rate_le,
        &byte_rate_le,
        &block_align_le,
        &bits_le,
        b"data",
        &payload_len_le,
    ];

    let mut out = Vec::with_capacity(44 + pcm.len());
    for field in header.iter() {
        out.extend_from_slice(field);
    }
    out.extend_from_slice(pcm);
    out
}
|
|
|
|
/// Verfügbare TTS-Stimmen — prüft welche Modelle lokal vorhanden sind
|
|
#[tauri::command]
|
|
pub async fn get_tts_voices() -> Result<Vec<serde_json::Value>, String> {
|
|
let voices_meta = vec![
|
|
("kerstin", "Kerstin (Deutsch)", "Weiblich, lokal, Standard"),
|
|
("thorsten-high", "Thorsten HQ (Deutsch)", "Männlich, hohe Qualität, lokal"),
|
|
("thorsten", "Thorsten (Deutsch)", "Männlich, mittlere Qualität, lokal"),
|
|
("eva", "Eva (Deutsch)", "Weiblich, lokal"),
|
|
("ramona", "Ramona (Deutsch)", "Weiblich, lokal"),
|
|
];
|
|
|
|
let mut result = Vec::new();
|
|
for (id, name, desc) in voices_meta {
|
|
let model_path = piper_model_for_voice(Some(id));
|
|
let available = std::path::Path::new(&model_path).exists();
|
|
result.push(serde_json::json!({
|
|
"id": id,
|
|
"name": name,
|
|
"description": desc,
|
|
"available": available,
|
|
"default": id == DEFAULT_VOICE,
|
|
}));
|
|
}
|
|
Ok(result)
|
|
}
|