Phase 2.2: Local voice (whisper-cli + piper-tts), conversation mode with VAD

OpenAI fully removed. Voice now runs offline with local binaries:
- whisper-cli (whisper-cpp 1.8.3) for speech-to-text
- piper-tts with the Thorsten voice (German) for text-to-speech
- GStreamer + PipeWire in shell.nix for WebKitGTK microphone access
- VoicePanel: real conversation with VAD silence detection, interrupt, loop
- Models in .gitignore (~250 MB)

[appimage]

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Eddy 2026-04-21 08:48:03 +02:00
parent 05c4913833
commit f287514af5
5 changed files with 856 additions and 682 deletions

.gitignore

@@ -25,6 +25,10 @@ result-*
.env
.env.local
# Voice models (too large for Git, ~250 MB)
models/*.bin
models/*.onnx
# OS
.DS_Store
Thumbs.db
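# Note: the Rust code below resolves models/ggml-base.bin (Whisper) and
# models/de_DE-thorsten-high.onnx (Piper), so only the model binaries are
# ignored here, not the models/ directory itself.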

shell.nix

@@ -30,10 +30,20 @@ pkgs.mkShell {
# Node.js (if not installed globally)
nodejs_22
# For audio (Whisper/TTS later)
# Phase 2.2: local voice (free of charge, offline)
whisper-cpp # speech-to-text (whisper-cli binary)
piper-tts # text-to-speech (German voice: Thorsten)
alsa-lib
ffmpeg
# GStreamer for WebKitGTK microphone access (getUserMedia)
gst_all_1.gstreamer
gst_all_1.gst-plugins-base
gst_all_1.gst-plugins-good
gst_all_1.gst-plugins-bad
gst_all_1.gst-plugins-ugly
pipewire # PipeWire support for audio capture
# Additional libraries for the Tauri CLI
bzip2
zlib
@@ -58,11 +68,27 @@ pkgs.mkShell {
pkgs.xz
pkgs.zstd
pkgs.openssl
pkgs.gst_all_1.gstreamer
pkgs.gst_all_1.gst-plugins-base
pkgs.gst_all_1.gst-plugins-good
pkgs.gst_all_1.gst-plugins-bad
pkgs.pipewire
pkgs.alsa-lib
]}:$LD_LIBRARY_PATH"
# GStreamer plugin paths for WebKitGTK microphone access
export GST_PLUGIN_PATH="${pkgs.lib.makeSearchPathOutput "lib" "lib/gstreamer-1.0" [
pkgs.gst_all_1.gstreamer
pkgs.gst_all_1.gst-plugins-base
pkgs.gst_all_1.gst-plugins-good
pkgs.gst_all_1.gst-plugins-bad
pkgs.gst_all_1.gst-plugins-ugly
]}''${GST_PLUGIN_PATH:+:$GST_PLUGIN_PATH}"
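# makeSearchPathOutput "lib" "lib/gstreamer-1.0" builds a colon-separated list of
# the plugin directories of the packages listed above; WebKitGTK's getUserMedia
# implementation locates its audio-capture elements via GST_PLUGIN_PATH. The
# ''${VAR:+:$VAR} idiom appends a pre-existing value only when one is set.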
echo "🦀 Claude Desktop Entwicklungsumgebung geladen"
echo " Rust: $(rustc --version 2>/dev/null || echo 'nicht gefunden')"
echo " Cargo: $(cargo --version 2>/dev/null || echo 'nicht gefunden')"
echo " Node: $(node --version 2>/dev/null || echo 'nicht gefunden')"
echo " GStreamer: $(gst-inspect-1.0 --version 2>/dev/null | head -1 || echo 'nicht gefunden')"
'';
}

src-tauri/Cargo.lock (generated)

@@ -494,6 +494,7 @@ dependencies = [
"tokio",
"tokio-tungstenite",
"uuid",
"webkit2gtk",
]
[[package]]

src-tauri/src/voice.rs

@@ -1,173 +1,295 @@
// Claude Desktop — Voice Interface
// Speech-to-text with the Whisper API, text-to-speech with OpenAI TTS
// Phase 2.2: local Whisper (whisper-cli) + Piper TTS, fully offline and free of charge
// Conversation mode: listen continuously, interrupt, respond
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64};
use serde::{Deserialize, Serialize};
use std::process::Stdio;
use tokio::process::Command as TokioCommand;
use tokio::io::AsyncWriteExt;
/// Whisper API configuration
const OPENAI_API_URL: &str = "https://api.openai.com/v1/audio/transcriptions";
const TTS_API_URL: &str = "https://api.openai.com/v1/audio/speech";
/// Paths to the local binaries (provided by shell.nix)
fn whisper_binary() -> String {
std::env::var("WHISPER_CPP_PATH")
.unwrap_or_else(|_| "whisper-cli".to_string())
}
/// Transcription result
fn piper_binary() -> String {
std::env::var("PIPER_TTS_PATH")
.unwrap_or_else(|_| "piper".to_string())
}
/// Model paths: relative to the executable, or absolute
fn whisper_model_path() -> String {
std::env::var("WHISPER_MODEL")
.unwrap_or_else(|_| {
let exe_dir = std::env::current_exe()
.ok()
.and_then(|p| p.parent().map(|p| p.to_path_buf()));
let candidates = vec![
exe_dir.as_ref().map(|d| d.join("../models/ggml-base.bin")),
exe_dir.as_ref().map(|d| d.join("models/ggml-base.bin")),
Some(std::path::PathBuf::from("models/ggml-base.bin")),
];
candidates.into_iter()
.flatten()
.find(|p| p.exists())
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_else(|| "models/ggml-base.bin".to_string())
})
}
fn piper_model_path() -> String {
std::env::var("PIPER_MODEL")
.unwrap_or_else(|_| {
let exe_dir = std::env::current_exe()
.ok()
.and_then(|p| p.parent().map(|p| p.to_path_buf()));
let candidates = vec![
exe_dir.as_ref().map(|d| d.join("../models/de_DE-thorsten-high.onnx")),
exe_dir.as_ref().map(|d| d.join("models/de_DE-thorsten-high.onnx")),
Some(std::path::PathBuf::from("models/de_DE-thorsten-high.onnx")),
];
candidates.into_iter()
.flatten()
.find(|p| p.exists())
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_else(|| "models/de_DE-thorsten-high.onnx".to_string())
})
}
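// Resolution order for both models: explicit env var (WHISPER_MODEL / PIPER_MODEL),
// then relative to the executable (../models, ./models), then the working directory.
// A sketch of an explicit override, assuming the tauri CLI is installed and the
// shell sits in the repo root:
//   WHISPER_MODEL=$PWD/models/ggml-base.bin \
//   PIPER_MODEL=$PWD/models/de_DE-thorsten-high.onnx \
//   cargo tauri dev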
/// Voice system status
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionResult {
pub text: String,
pub language: Option<String>,
pub duration: Option<f64>,
pub struct VoiceStatus {
pub whisper_available: bool,
pub piper_available: bool,
pub whisper_model: String,
pub piper_model: String,
pub openai_available: bool,
}
/// TTS voices
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TtsVoice {
Alloy,
Echo,
Fable,
Onyx,
Nova,
Shimmer,
/// Checks whether the local voice tools are available
#[tauri::command]
pub async fn check_voice_availability() -> Result<VoiceStatus, String> {
// check whisper-cli
let whisper_ok = TokioCommand::new(&whisper_binary())
.arg("--help")
.stdout(Stdio::null())
.stderr(Stdio::null())
.status()
.await
.map(|s| s.success())
.unwrap_or(false);
// check the model file
let whisper_model = whisper_model_path();
let whisper_model_ok = std::path::Path::new(&whisper_model).exists();
// check Piper
let piper_ok = TokioCommand::new(&piper_binary())
.arg("--help")
.stdout(Stdio::null())
.stderr(Stdio::null())
.status()
.await
.map(|_| true) // piper --help exits with 0 or 1, both are fine
.unwrap_or(false);
let piper_model = piper_model_path();
let piper_model_ok = std::path::Path::new(&piper_model).exists();
// OpenAI API key (fallback)
let openai_available = std::env::var("OPENAI_API_KEY")
.map(|k| !k.is_empty())
.unwrap_or(false);
let status = VoiceStatus {
whisper_available: whisper_ok && whisper_model_ok,
piper_available: piper_ok && piper_model_ok,
whisper_model: if whisper_model_ok { whisper_model } else { "Not found".to_string() },
piper_model: if piper_model_ok { piper_model } else { "Not found".to_string() },
openai_available,
};
println!("🎤 Voice-Status: Whisper={}, Piper={}, OpenAI={}",
status.whisper_available, status.piper_available, status.openai_available);
Ok(status)
}
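// With serde's default field naming, the frontend receives e.g.:
//   { "whisper_available": true, "piper_available": true,
//     "whisper_model": "models/ggml-base.bin",
//     "piper_model": "models/de_DE-thorsten-high.onnx",
//     "openai_available": false }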
impl TtsVoice {
fn as_str(&self) -> &str {
match self {
TtsVoice::Alloy => "alloy",
TtsVoice::Echo => "echo",
TtsVoice::Fable => "fable",
TtsVoice::Onyx => "onyx",
TtsVoice::Nova => "nova",
TtsVoice::Shimmer => "shimmer",
}
}
}
/// Fetches the OpenAI API key from an environment variable or the settings
fn get_openai_key() -> Result<String, String> {
// check the environment variable first
if let Ok(key) = std::env::var("OPENAI_API_KEY") {
if !key.is_empty() {
return Ok(key);
}
}
// alternatively: load from the settings (TODO)
Err("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.".to_string())
}
/// Transcribes audio with the OpenAI Whisper API
/// Transcribes audio with local whisper-cli
/// Audio arrives from the frontend as Base64-encoded WAV
#[tauri::command]
pub async fn transcribe_audio(
audio_base64: String,
format: String,
) -> Result<String, String> {
let api_key = get_openai_key()?;
// decode the Base64 payload
let audio_bytes = BASE64.decode(&audio_base64)
.map_err(|e| format!("Base64 decoding failed: {}", e))?;
// create a temporary file (the Whisper API needs a file upload)
// multipart request to the Whisper API, straight from the byte buffer
let client = reqwest::Client::new();
// write a temporary WAV file (whisper-cli needs a file)
let tmp_dir = std::env::temp_dir();
let input_path = tmp_dir.join(format!("claude-voice-input.{}", format));
let file_part = reqwest::multipart::Part::bytes(audio_bytes)
.file_name(format!("audio.{}", format))
.mime_str(&format!("audio/{}", format))
.map_err(|e| format!("MIME-Type fehlgeschlagen: {}", e))?;
let form = reqwest::multipart::Form::new()
.part("file", file_part)
.text("model", "whisper-1")
.text("language", "de") // Deutsch priorisieren
.text("response_format", "json");
let response = client
.post(OPENAI_API_URL)
.bearer_auth(&api_key)
.multipart(form)
.send()
tokio::fs::write(&input_path, &audio_bytes)
.await
.map_err(|e| format!("API-Request fehlgeschlagen: {}", e))?;
.map_err(|e| format!("Temp-Datei schreiben fehlgeschlagen: {}", e))?;
if !response.status().is_success() {
let error_text = response.text().await.unwrap_or_default();
return Err(format!("Whisper API Fehler: {}", error_text));
// Falls nicht WAV: mit ffmpeg konvertieren (WebM → WAV 16kHz mono)
let wav_path = if format != "wav" {
let wav_path = tmp_dir.join("claude-voice-input.wav");
let ffmpeg_result = TokioCommand::new("ffmpeg")
.args(["-y", "-i"])
.arg(&input_path)
.args(["-ar", "16000", "-ac", "1", "-f", "wav"])
.arg(&wav_path)
.stdout(Stdio::null())
.stderr(Stdio::null())
.status()
.await;
match ffmpeg_result {
Ok(status) if status.success() => wav_path,
_ => {
println!("⚠️ ffmpeg Konvertierung fehlgeschlagen, versuche direkt...");
input_path.clone()
}
}
} else {
input_path.clone()
};
// run whisper-cli (file as the last argument, no --file flag)
let model = whisper_model_path();
let output = TokioCommand::new(&whisper_binary())
.args([
"--model", &model,
"--language", "de",
"--no-timestamps",
"--no-prints",
"--threads", "4",
])
.arg(&wav_path)
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| format!("whisper-cli ausführen fehlgeschlagen: {}", e))?;
// Aufräumen
let _ = tokio::fs::remove_file(&input_path).await;
if format != "wav" {
let _ = tokio::fs::remove_file(&wav_path).await;
}
// Response parsen
#[derive(Deserialize)]
struct WhisperResponse {
text: String,
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(format!("whisper-cli Fehler: {}", stderr));
}
let result: WhisperResponse = response.json().await
.map_err(|e| format!("Response parsen fehlgeschlagen: {}", e))?;
// Transkription parsen — whisper-cli gibt Text auf stdout
let text = String::from_utf8_lossy(&output.stdout)
.lines()
.filter(|l| !l.trim().is_empty())
.map(|l| l.trim().to_string())
.collect::<Vec<_>>()
.join(" ")
.trim()
.to_string();
println!("🎤 Transkription: \"{}\"", result.text);
println!("🎤 Transkription (lokal): \"{}\"", text);
Ok(result.text)
Ok(text)
}
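// The conversion step above is equivalent to running by hand:
//   ffmpeg -y -i claude-voice-input.webm -ar 16000 -ac 1 -f wav claude-voice-input.wav
// 16 kHz mono is the input format the ggml Whisper models expect; feeding
// anything else to whisper-cli degrades or breaks the transcription.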
/// Text-to-speech with the OpenAI TTS API
/// Text-to-speech with local Piper TTS
/// Returns Base64-encoded WAV audio
#[tauri::command]
pub async fn text_to_speech(
text: String,
voice: Option<String>,
_voice: Option<String>, // ignored by Piper (the model determines the voice)
) -> Result<String, String> {
let api_key = get_openai_key()?;
let model = piper_model_path();
let voice_name = voice.unwrap_or_else(|| "nova".to_string());
let client = reqwest::Client::new();
let body = serde_json::json!({
"model": "tts-1",
"input": text,
"voice": voice_name,
"response_format": "mp3"
});
let response = client
.post(TTS_API_URL)
.bearer_auth(&api_key)
.json(&body)
.send()
.await
.map_err(|e| format!("TTS API-Request fehlgeschlagen: {}", e))?;
if !response.status().is_success() {
let error_text = response.text().await.unwrap_or_default();
return Err(format!("TTS API Fehler: {}", error_text));
if !std::path::Path::new(&model).exists() {
return Err(format!("Piper-Modell nicht gefunden: {}", model));
}
// return the audio bytes as Base64
let audio_bytes = response.bytes().await
.map_err(|e| format!("reading the audio failed: {}", e))?;
// feed Piper via stdin, raw PCM on stdout
let mut child = TokioCommand::new(&piper_binary())
.args(["--model", &model, "--output-raw"])
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|e| format!("Piper starten fehlgeschlagen: {}", e))?;
let audio_base64 = BASE64.encode(&audio_bytes);
// send the text via stdin
if let Some(mut stdin) = child.stdin.take() {
stdin.write_all(text.as_bytes()).await
.map_err(|e| format!("Piper stdin schreiben fehlgeschlagen: {}", e))?;
drop(stdin); // EOF senden
}
println!("🔊 TTS generiert: {} Zeichen → {} Bytes Audio", text.len(), audio_bytes.len());
let output = child.wait_with_output().await
.map_err(|e| format!("Piper Fehler: {}", e))?;
if output.stdout.is_empty() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(format!("Piper hat kein Audio erzeugt: {}", stderr));
}
// raw PCM → add a WAV header (16-bit, 22050 Hz, mono, Piper's default)
let pcm_data = &output.stdout;
let wav_data = pcm_to_wav(pcm_data, 22050, 1, 16);
let audio_base64 = BASE64.encode(&wav_data);
println!("🔊 TTS (Piper lokal): {} Zeichen → {} Bytes WAV", text.len(), wav_data.len());
Ok(audio_base64)
}
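// Note (assumption worth verifying): --output-raw emits 16-bit PCM at the sample
// rate defined in the voice's .onnx.json config. Hard-coding 22050 Hz above
// matches the thorsten-high voice; a 16 kHz Piper voice would need a different rate.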
/// Checks whether voice features are available (API key present)
#[tauri::command]
pub async fn check_voice_availability() -> Result<bool, String> {
match get_openai_key() {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
/// Converts raw PCM to WAV (adds a header)
fn pcm_to_wav(pcm: &[u8], sample_rate: u32, channels: u16, bits_per_sample: u16) -> Vec<u8> {
let byte_rate = sample_rate * channels as u32 * bits_per_sample as u32 / 8;
let block_align = channels * bits_per_sample / 8;
let data_size = pcm.len() as u32;
let file_size = 36 + data_size;
let mut wav = Vec::with_capacity(44 + pcm.len());
// RIFF Header
wav.extend_from_slice(b"RIFF");
wav.extend_from_slice(&file_size.to_le_bytes());
wav.extend_from_slice(b"WAVE");
// fmt Chunk
wav.extend_from_slice(b"fmt ");
wav.extend_from_slice(&16u32.to_le_bytes()); // chunk size
wav.extend_from_slice(&1u16.to_le_bytes()); // PCM Format
wav.extend_from_slice(&channels.to_le_bytes());
wav.extend_from_slice(&sample_rate.to_le_bytes());
wav.extend_from_slice(&byte_rate.to_le_bytes());
wav.extend_from_slice(&block_align.to_le_bytes());
wav.extend_from_slice(&bits_per_sample.to_le_bytes());
// data Chunk
wav.extend_from_slice(b"data");
wav.extend_from_slice(&data_size.to_le_bytes());
wav.extend_from_slice(pcm);
wav
}
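// Minimal sanity check for the header layout (an illustrative sketch; the
// offsets follow the RIFF layout written above, and 22050/1/16 are the Piper
// defaults assumed in text_to_speech):
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn wav_header_matches_piper_defaults() {
        let pcm = vec![0u8; 4]; // two 16-bit mono samples
        let wav = pcm_to_wav(&pcm, 22050, 1, 16);
        assert_eq!(wav.len(), 44 + 4); // 44-byte header + data
        assert_eq!(&wav[0..4], b"RIFF");
        assert_eq!(&wav[8..12], b"WAVE");
        // byte_rate = 22050 * 1 * 16 / 8 = 44100 (bytes 28..32)
        assert_eq!(u32::from_le_bytes(wav[28..32].try_into().unwrap()), 44_100);
        // block_align = 1 * 16 / 8 = 2 (bytes 32..34)
        assert_eq!(u16::from_le_bytes(wav[32..34].try_into().unwrap()), 2);
        // data_size = 4 (bytes 40..44)
        assert_eq!(u32::from_le_bytes(wav[40..44].try_into().unwrap()), 4);
    }
}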
/// Available TTS voices
/// Available TTS voices; with Piper, determined by the model
#[tauri::command]
pub async fn get_tts_voices() -> Result<Vec<serde_json::Value>, String> {
Ok(vec![
serde_json::json!({ "id": "alloy", "name": "Alloy", "description": "Neutral, ausgewogen" }),
serde_json::json!({ "id": "echo", "name": "Echo", "description": "Männlich, warm" }),
serde_json::json!({ "id": "fable", "name": "Fable", "description": "Expressiv, britisch" }),
serde_json::json!({ "id": "onyx", "name": "Onyx", "description": "Tief, autoritär" }),
serde_json::json!({ "id": "nova", "name": "Nova", "description": "Weiblich, freundlich" }),
serde_json::json!({ "id": "shimmer", "name": "Shimmer", "description": "Weiblich, sanft" }),
serde_json::json!({ "id": "thorsten-high", "name": "Thorsten (Deutsch)", "description": "Lokal, hohe Qualität, männlich" }),
])
}

File diff suppressed because it is too large.