From f287514af59a73b57faab3974a8b67422c627a0e Mon Sep 17 00:00:00 2001
From: Eddy
Date: Tue, 21 Apr 2026 08:48:03 +0200
Subject: [PATCH] Phase 2.2: Local voice - whisper-cli + piper-tts,
 conversation mode with VAD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OpenAI removed entirely. Voice now runs offline with local binaries:
- whisper-cli (whisper-cpp 1.8.3) for speech-to-text
- piper-tts with the Thorsten voice (German) for text-to-speech
- GStreamer + PipeWire in shell.nix for WebKitGTK microphone access
- VoicePanel: real conversation with VAD silence detection, interrupts, and a listen loop
- Models in .gitignore (~250 MB)

[appimage]

Co-Authored-By: Claude Opus 4.6
---
 .gitignore                           |    4 +
 shell.nix                            |   28 +-
 src-tauri/Cargo.lock                 |    1 +
 src-tauri/src/voice.rs               |  368 ++++++---
 src/lib/components/VoicePanel.svelte | 1137 +++++++++++++-------------
 5 files changed, 856 insertions(+), 682 deletions(-)

diff --git a/.gitignore b/.gitignore
index 44b12bc..e2c45d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,10 @@ result-*
 .env
 .env.local
 
+# Voice models (too large for Git, ~250 MB)
+models/*.bin
+models/*.onnx
+
 # OS
 .DS_Store
 Thumbs.db

diff --git a/shell.nix b/shell.nix
index 13960cf..6037934 100644
--- a/shell.nix
+++ b/shell.nix
@@ -30,10 +30,20 @@ pkgs.mkShell {
     # Node.js (falls nicht global)
     nodejs_22
 
-    # Für Audio (Whisper/TTS später)
+    # Phase 2.2: local voice (free, offline)
+    whisper-cpp  # speech-to-text (provides the whisper-cli binary)
+    piper-tts    # text-to-speech (German voice: Thorsten)
     alsa-lib
     ffmpeg
 
+    # GStreamer for WebKitGTK microphone access (getUserMedia)
+    gst_all_1.gstreamer
+    gst_all_1.gst-plugins-base
+    gst_all_1.gst-plugins-good
+    gst_all_1.gst-plugins-bad
+    gst_all_1.gst-plugins-ugly
+    pipewire  # PipeWire support for audio capture
+
     # Zusätzliche Bibliotheken für Tauri CLI
     bzip2
     zlib
@@ -58,11 +68,27 @@ pkgs.mkShell {
     pkgs.xz
     pkgs.zstd
     pkgs.openssl
+    pkgs.gst_all_1.gstreamer
+    pkgs.gst_all_1.gst-plugins-base
+    pkgs.gst_all_1.gst-plugins-good
+    pkgs.gst_all_1.gst-plugins-bad
+    pkgs.pipewire
+    pkgs.alsa-lib
   ]}:$LD_LIBRARY_PATH"
 
+  # GStreamer plugin paths for WebKitGTK microphone access
+  export GST_PLUGIN_PATH="${pkgs.lib.makeSearchPathOutput "lib" "lib/gstreamer-1.0" [
+    pkgs.gst_all_1.gstreamer
+    pkgs.gst_all_1.gst-plugins-base
+    pkgs.gst_all_1.gst-plugins-good
+    pkgs.gst_all_1.gst-plugins-bad
+    pkgs.gst_all_1.gst-plugins-ugly
+  ]}''${GST_PLUGIN_PATH:+:$GST_PLUGIN_PATH}"
+
   echo "🦀 Claude Desktop Entwicklungsumgebung geladen"
   echo "   Rust:  $(rustc --version 2>/dev/null || echo 'nicht gefunden')"
   echo "   Cargo: $(cargo --version 2>/dev/null || echo 'nicht gefunden')"
   echo "   Node:  $(node --version 2>/dev/null || echo 'nicht gefunden')"
+  echo "   GStreamer: $(gst-inspect-1.0 --version 2>/dev/null | head -1 || echo 'nicht gefunden')"
 '';
}

diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock
index 15a077c..f8419cd 100644
--- a/src-tauri/Cargo.lock
+++ b/src-tauri/Cargo.lock
@@ -494,6 +494,7 @@ dependencies = [
  "tokio",
  "tokio-tungstenite",
  "uuid",
+ "webkit2gtk",
 ]
 
 [[package]]

diff --git a/src-tauri/src/voice.rs b/src-tauri/src/voice.rs
index e16af6a..2c072e4 100644
--- a/src-tauri/src/voice.rs
+++ b/src-tauri/src/voice.rs
@@ -1,173 +1,295 @@
 // Claude Desktop — Voice Interface
-// Speech-to-Text mit Whisper API, Text-to-Speech mit OpenAI TTS
+// Phase 2.2: local Whisper (whisper-cli) + Piper-TTS, fully offline and free
+// Conversation mode: listen continuously, allow interruptions, respond
 
 use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64};
 use serde::{Deserialize, Serialize};
+use std::process::Stdio;
+use tokio::process::Command as TokioCommand;
+use tokio::io::AsyncWriteExt;
 
-/// Whisper API Konfiguration
-const OPENAI_API_URL: &str = "https://api.openai.com/v1/audio/transcriptions";
-const TTS_API_URL: &str = "https://api.openai.com/v1/audio/speech";
+/// Paths to the local binaries (provided by shell.nix)
+fn whisper_binary() -> String {
+    std::env::var("WHISPER_CPP_PATH")
+        .unwrap_or_else(|_| "whisper-cli".to_string())
+}
 
-/// Transkriptions-Ergebnis
+fn piper_binary() -> String {
+    std::env::var("PIPER_TTS_PATH")
+        .unwrap_or_else(|_| "piper".to_string())
+}
+
+/// Model paths: relative to the executable, or absolute
+fn whisper_model_path() -> String {
+    std::env::var("WHISPER_MODEL")
+        .unwrap_or_else(|_| {
+            let exe_dir = std::env::current_exe()
+                .ok()
+                .and_then(|p| p.parent().map(|p| p.to_path_buf()));
+
+            let candidates = vec![
+                exe_dir.as_ref().map(|d| d.join("../models/ggml-base.bin")),
+                exe_dir.as_ref().map(|d| d.join("models/ggml-base.bin")),
+                Some(std::path::PathBuf::from("models/ggml-base.bin")),
+            ];
+
+            candidates.into_iter()
+                .flatten()
+                .find(|p| p.exists())
+                .map(|p| p.to_string_lossy().to_string())
+                .unwrap_or_else(|| "models/ggml-base.bin".to_string())
+        })
+}
+
+fn piper_model_path() -> String {
+    std::env::var("PIPER_MODEL")
+        .unwrap_or_else(|_| {
+            let exe_dir = std::env::current_exe()
+                .ok()
+                .and_then(|p| p.parent().map(|p| p.to_path_buf()));
+
+            let candidates = vec![
+                exe_dir.as_ref().map(|d| d.join("../models/de_DE-thorsten-high.onnx")),
+                exe_dir.as_ref().map(|d| d.join("models/de_DE-thorsten-high.onnx")),
+                Some(std::path::PathBuf::from("models/de_DE-thorsten-high.onnx")),
+            ];
+
+            candidates.into_iter()
+                .flatten()
+                .find(|p| p.exists())
+                .map(|p| p.to_string_lossy().to_string())
+                .unwrap_or_else(|| "models/de_DE-thorsten-high.onnx".to_string())
+        })
+}
+
+/// Voice system status
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TranscriptionResult {
-    pub text: String,
-    pub language: Option<String>,
-    pub duration: Option<f64>,
+pub struct VoiceStatus {
+    pub whisper_available: bool,
+    pub piper_available: bool,
+    pub whisper_model: String,
+    pub piper_model: String,
+    pub openai_available: bool,
 }
 
-/// TTS-Stimmen
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum TtsVoice {
-    Alloy,
-    Echo,
-    Fable,
-    Onyx,
-    Nova,
-    Shimmer,
+/// Checks whether the local voice tools are available
+#[tauri::command]
+pub async fn check_voice_availability() -> Result<VoiceStatus, String> {
+    // Check whisper-cli
+    let whisper_ok = TokioCommand::new(&whisper_binary())
+        .arg("--help")
+        .stdout(Stdio::null())
+        .stderr(Stdio::null())
+        .status()
+        .await
+        .map(|s| s.success())
+        .unwrap_or(false);
+
+    // Check that the model file exists
+    let whisper_model = whisper_model_path();
+    let whisper_model_ok = std::path::Path::new(&whisper_model).exists();
+
+    // Check piper
+    let piper_ok = TokioCommand::new(&piper_binary())
+        .arg("--help")
+        .stdout(Stdio::null())
+        .stderr(Stdio::null())
+        .status()
+        .await
+        .map(|_| true) // piper --help may exit 0 or 1; either is fine
+        .unwrap_or(false);
+
+    let piper_model = piper_model_path();
+    let piper_model_ok = std::path::Path::new(&piper_model).exists();
+
+    // OpenAI API key (fallback only)
+    let openai_available = std::env::var("OPENAI_API_KEY")
+        .map(|k| !k.is_empty())
+        .unwrap_or(false);
+
+    let status = VoiceStatus {
+        whisper_available: whisper_ok && whisper_model_ok,
+        piper_available: piper_ok && piper_model_ok,
+        whisper_model: if whisper_model_ok { whisper_model } else { "Nicht gefunden".to_string() },
+        piper_model: if piper_model_ok { piper_model } else { "Nicht gefunden".to_string() },
+        openai_available,
+    };
+
+    println!("🎤 Voice-Status: Whisper={}, Piper={}, OpenAI={}",
+        status.whisper_available, status.piper_available, status.openai_available);
+
+    Ok(status)
 }
 
-impl TtsVoice {
-    fn as_str(&self) -> &str {
-        match self {
-            TtsVoice::Alloy => "alloy",
-            TtsVoice::Echo => "echo",
-            TtsVoice::Fable => "fable",
-            TtsVoice::Onyx => "onyx",
-            TtsVoice::Nova => "nova",
-            TtsVoice::Shimmer => "shimmer",
-        }
-    }
-}
-
-/// Holt den OpenAI API Key aus Umgebungsvariable oder Settings
-fn get_openai_key() -> Result<String, String> {
-    // Erst Umgebungsvariable prüfen
-    if let Ok(key) = std::env::var("OPENAI_API_KEY") {
-        if !key.is_empty() {
-            return Ok(key);
-        }
-    }
-
-    // Alternativ: Aus Settings laden (TODO)
-    Err("OpenAI API Key nicht gefunden. Setze OPENAI_API_KEY Umgebungsvariable.".to_string())
-}
-
-/// Transkribiert Audio mit OpenAI Whisper API
+/// Transcribes audio with the local whisper-cli
+/// Audio arrives from the frontend as Base64-encoded WAV
 #[tauri::command]
 pub async fn transcribe_audio(
     audio_base64: String,
     format: String,
 ) -> Result<String, String> {
-    let api_key = get_openai_key()?;
-
-    // Base64 dekodieren
     let audio_bytes = BASE64.decode(&audio_base64)
         .map_err(|e| format!("Base64-Dekodierung fehlgeschlagen: {}", e))?;
 
-    // Temporäre Datei erstellen (Whisper API braucht Datei-Upload)
-    // Multipart-Request an Whisper API — direkt aus dem Byte-Buffer
-    let client = reqwest::Client::new();
+    // Write a temporary file (whisper-cli needs a file on disk)
+    let tmp_dir = std::env::temp_dir();
+    let input_path = tmp_dir.join(format!("claude-voice-input.{}", format));
 
-    let file_part = reqwest::multipart::Part::bytes(audio_bytes)
-        .file_name(format!("audio.{}", format))
-        .mime_str(&format!("audio/{}", format))
-        .map_err(|e| format!("MIME-Type fehlgeschlagen: {}", e))?;
-
-    let form = reqwest::multipart::Form::new()
-        .part("file", file_part)
-        .text("model", "whisper-1")
-        .text("language", "de") // Deutsch priorisieren
-        .text("response_format", "json");
-
-    let response = client
-        .post(OPENAI_API_URL)
-        .bearer_auth(&api_key)
-        .multipart(form)
-        .send()
+    tokio::fs::write(&input_path, &audio_bytes)
         .await
-        .map_err(|e| format!("API-Request fehlgeschlagen: {}", e))?;
+        .map_err(|e| format!("Temp-Datei schreiben fehlgeschlagen: {}", e))?;
 
-    if !response.status().is_success() {
-        let error_text = response.text().await.unwrap_or_default();
-        return Err(format!("Whisper API Fehler: {}", error_text));
+    // If the input is not WAV: convert with ffmpeg (WebM → WAV, 16 kHz mono)
+    let wav_path = if format != "wav" {
+        let wav_path = tmp_dir.join("claude-voice-input.wav");
+        let ffmpeg_result = TokioCommand::new("ffmpeg")
+            .args(["-y", "-i"])
+            .arg(&input_path)
+            .args(["-ar", "16000", "-ac", "1", "-f", "wav"])
+            .arg(&wav_path)
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .status()
+            .await;
+
+        match ffmpeg_result {
+            Ok(status) if status.success() => wav_path,
+            _ => {
+                println!("⚠️ ffmpeg Konvertierung fehlgeschlagen, versuche direkt...");
+                input_path.clone()
+            }
+        }
+    } else {
+        input_path.clone()
+    };
+
+    // Run whisper-cli (the file is the last positional argument; there is no --file flag)
+    let model = whisper_model_path();
+    let output = TokioCommand::new(&whisper_binary())
+        .args([
+            "--model", &model,
+            "--language", "de",
+            "--no-timestamps",
+            "--no-prints",
+            "--threads", "4",
+        ])
+        .arg(&wav_path)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .output()
+        .await
+        .map_err(|e| format!("whisper-cli ausführen fehlgeschlagen: {}", e))?;
+
+    // Clean up temp files
+    let _ = tokio::fs::remove_file(&input_path).await;
+    if format != "wav" {
+        let _ = tokio::fs::remove_file(&wav_path).await;
     }
 
-    // Response parsen
-    #[derive(Deserialize)]
-    struct WhisperResponse {
-        text: String,
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        return Err(format!("whisper-cli Fehler: {}", stderr));
     }
 
-    let result: WhisperResponse = response.json().await
-        .map_err(|e| format!("Response parsen fehlgeschlagen: {}", e))?;
+    // Parse the transcription: whisper-cli prints the text on stdout
+    let text = String::from_utf8_lossy(&output.stdout)
+        .lines()
+        .filter(|l| !l.trim().is_empty())
+        .map(|l| l.trim().to_string())
+        .collect::<Vec<_>>()
+        .join(" ")
+        .trim()
+        .to_string();
 
-    println!("🎤 Transkription: \"{}\"", result.text);
+    println!("🎤 Transkription (lokal): \"{}\"", text);
 
-    Ok(result.text)
+    Ok(text)
 }
 
-/// Text-to-Speech mit OpenAI TTS API
+/// Text-to-speech with the local Piper TTS
+/// Returns Base64-encoded WAV audio
 #[tauri::command]
 pub async fn text_to_speech(
     text: String,
-    voice: Option<String>,
+    _voice: Option<String>, // Ignored for Piper (the model determines the voice)
 ) -> Result<String, String> {
-    let api_key = get_openai_key()?;
+    let model = piper_model_path();
 
-    let voice_name = voice.unwrap_or_else(|| "nova".to_string());
-
-    let client = reqwest::Client::new();
-
-    let body = serde_json::json!({
-        "model": "tts-1",
-        "input": text,
-        "voice": voice_name,
-        "response_format": "mp3"
-    });
-
-    let response = client
-        .post(TTS_API_URL)
-        .bearer_auth(&api_key)
-        .json(&body)
-        .send()
-        .await
-        .map_err(|e| format!("TTS API-Request fehlgeschlagen: {}", e))?;
-
-    if !response.status().is_success() {
-        let error_text = response.text().await.unwrap_or_default();
-        return Err(format!("TTS API Fehler: {}", error_text));
+    if !std::path::Path::new(&model).exists() {
+        return Err(format!("Piper-Modell nicht gefunden: {}", model));
     }
 
-    // Audio-Bytes als Base64 zurückgeben
-    let audio_bytes = response.bytes().await
-        .map_err(|e| format!("Audio lesen fehlgeschlagen: {}", e))?;
+    // Feed Piper via stdin; raw PCM comes out on stdout
+    let mut child = TokioCommand::new(&piper_binary())
+        .args(["--model", &model, "--output-raw"])
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .spawn()
+        .map_err(|e| format!("Piper starten fehlgeschlagen: {}", e))?;
 
-    let audio_base64 = BASE64.encode(&audio_bytes);
+    // Send the text via stdin
+    if let Some(mut stdin) = child.stdin.take() {
+        stdin.write_all(text.as_bytes()).await
+            .map_err(|e| format!("Piper stdin schreiben fehlgeschlagen: {}", e))?;
+        drop(stdin); // send EOF
+    }
 
-    println!("🔊 TTS generiert: {} Zeichen → {} Bytes Audio", text.len(), audio_bytes.len());
+    let output = child.wait_with_output().await
+        .map_err(|e| format!("Piper Fehler: {}", e))?;
+
+    if output.stdout.is_empty() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        return Err(format!("Piper hat kein Audio erzeugt: {}", stderr));
+    }
+
+    // Raw PCM: prepend a WAV header (16-bit, 22050 Hz, mono; Piper's default)
+    let pcm_data = &output.stdout;
+    let wav_data = pcm_to_wav(pcm_data, 22050, 1, 16);
+
+    let audio_base64 = BASE64.encode(&wav_data);
+
+    println!("🔊 TTS (Piper lokal): {} Zeichen → {} Bytes WAV", text.len(), wav_data.len());
 
     Ok(audio_base64)
 }
 
-/// Prüft ob Voice-Features verfügbar sind (API Key vorhanden)
-#[tauri::command]
-pub async fn check_voice_availability() -> Result<bool, String> {
-    match get_openai_key() {
-        Ok(_) => Ok(true),
-        Err(_) => Ok(false),
-    }
+/// Converts raw PCM to WAV (adds the 44-byte header)
+fn pcm_to_wav(pcm: &[u8], sample_rate: u32, channels: u16, bits_per_sample: u16) -> Vec<u8> {
+    let byte_rate = sample_rate * channels as u32 * bits_per_sample as u32 / 8;
+    let block_align = channels * bits_per_sample / 8;
+    let data_size = pcm.len() as u32;
+    let file_size = 36 + data_size;
+
+    let mut wav = Vec::with_capacity(44 + pcm.len());
+
+    // RIFF header
+    wav.extend_from_slice(b"RIFF");
+    wav.extend_from_slice(&file_size.to_le_bytes());
+    wav.extend_from_slice(b"WAVE");
+
+    // fmt chunk
+    wav.extend_from_slice(b"fmt ");
+    wav.extend_from_slice(&16u32.to_le_bytes()); // chunk size
+    wav.extend_from_slice(&1u16.to_le_bytes());  // PCM format
+    wav.extend_from_slice(&channels.to_le_bytes());
+    wav.extend_from_slice(&sample_rate.to_le_bytes());
+    wav.extend_from_slice(&byte_rate.to_le_bytes());
+    wav.extend_from_slice(&block_align.to_le_bytes());
+    wav.extend_from_slice(&bits_per_sample.to_le_bytes());
+
+    // data chunk
+    wav.extend_from_slice(b"data");
+    wav.extend_from_slice(&data_size.to_le_bytes());
+    wav.extend_from_slice(pcm);
+
+    wav
 }
 
-/// Verfügbare TTS-Stimmen
+/// Available TTS voices; with Piper the voice is model-based
 #[tauri::command]
 pub async fn get_tts_voices() -> Result<Vec<serde_json::Value>, String> {
     Ok(vec![
-        serde_json::json!({ "id": "alloy", "name": "Alloy", "description": "Neutral, ausgewogen" }),
-        serde_json::json!({ "id": "echo", "name": "Echo", "description": "Männlich, warm" }),
-        serde_json::json!({ "id": "fable", "name": "Fable", "description": "Expressiv, britisch" }),
-        serde_json::json!({ "id": "onyx", "name": "Onyx", "description": "Tief, autoritär" }),
-        serde_json::json!({ "id": "nova", "name": "Nova", "description": "Weiblich, freundlich" }),
-        serde_json::json!({ "id": "shimmer", "name": "Shimmer", "description": "Weiblich, sanft" }),
+        serde_json::json!({ "id": "thorsten-high", "name": "Thorsten (Deutsch)", "description": "Lokal, hohe Qualität, männlich" }),
     ])
 }
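[Note, not part of the patch: the hand-rolled 44-byte RIFF header in pcm_to_wav
is easy to get subtly wrong (field order, little-endian sizes), so a small unit
test is a cheap safeguard. A minimal sketch, assuming it lives in the same
module as pcm_to_wav:

    #[cfg(test)]
    mod wav_header_tests {
        use super::pcm_to_wav;

        #[test]
        fn header_matches_piper_defaults() {
            // One second of silence at Piper's defaults: 22050 Hz, mono, 16-bit
            let pcm = vec![0u8; 22050 * 2];
            let wav = pcm_to_wav(&pcm, 22050, 1, 16);

            assert_eq!(&wav[0..4], b"RIFF");
            assert_eq!(&wav[8..12], b"WAVE");
            // byte rate = sample_rate * channels * bits / 8 = 44100
            assert_eq!(u32::from_le_bytes(wav[28..32].try_into().unwrap()), 44_100);
            // the data chunk size must equal the raw PCM payload length
            assert_eq!(u32::from_le_bytes(wav[40..44].try_into().unwrap()), pcm.len() as u32);
            assert_eq!(wav.len(), 44 + pcm.len());
        }
    }
]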
diff --git a/src/lib/components/VoicePanel.svelte b/src/lib/components/VoicePanel.svelte
index 4516ec1..32f904d 100644
--- a/src/lib/components/VoicePanel.svelte
+++ b/src/lib/components/VoicePanel.svelte
@@ -5,15 +5,25 @@
   import { get } from 'svelte/store';
   import { isProcessing, messages, addMessage } from '$lib/stores/app';
 
-  // Voice-Zustand
-  let isListening = false;
-  let isSpeaking = false;
-  let isVoiceAvailable = false;
-  let voiceMode: 'push-to-talk' | 'continuous' = 'push-to-talk';
-  let selectedVoice = 'nova';
-  let availableVoices: { id: string; name: string; description: string }[] = [];
+  // === Types ===
+  interface VoiceStatus {
+    whisper_available: boolean;
+    piper_available: boolean;
+    whisper_model: string;
+    piper_model: string;
+    openai_available: boolean;
+  }
 
-  // Audio-Kontext
+  // === Conversation state ===
+  type ConversationState = 'idle' | 'listening' | 'transcribing' | 'waiting' | 'speaking';
+  let state: ConversationState = 'idle';
+  let conversationActive = false;
+
+  // === Voice status ===
+  let voiceStatus: VoiceStatus | null = null;
+  let isReady = false;
+
+  // === Audio ===
   let audioContext: AudioContext | null = null;
   let mediaStream: MediaStream | null = null;
   let mediaRecorder: MediaRecorder | null = null;
@@ -21,278 +31,365 @@
   let analyser: AnalyserNode | null = null;
   let animationFrame: number | null = null;
 
-  // Visualisierung
-  let canvasEl: HTMLCanvasElement;
-  let volumeLevel = 0;
+  // === VAD (voice activity detection) ===
+  const SILENCE_THRESHOLD = 0.03; // RMS threshold for silence
+  const SILENCE_DURATION = 1800;  // ms of silence before the recording ends
+  const MIN_RECORDING = 500;      // minimum recording duration in ms
+  let silenceStart = 0;
+  let recordingStart = 0;
+  let currentVolume = 0;
 
-  // Transkription (live)
-  let currentTranscript = '';
+  // === Conversation log ===
+  interface LogEntry {
+    role: 'user' | 'assistant';
+    text: string;
+    time: string;
+  }
+  let conversationLog: LogEntry[] = [];
+  const MAX_LOG_ENTRIES = 6;
 
-  // Fehler-Anzeige
-  let micError = '';
-
-  // TTS Audio-Element
+  // === TTS ===
   let ttsAudio: HTMLAudioElement | null = null;
 
-  // Event-Listener
+  // === Errors ===
+  let errorMsg = '';
+
+  // === Event listeners ===
   let ttsListener: UnlistenFn | null = null;
 
   onMount(async () => {
-    // Voice-Verfügbarkeit prüfen
     try {
-      isVoiceAvailable = await invoke('check_voice_availability');
-      if (isVoiceAvailable) {
-        availableVoices = await invoke('get_tts_voices');
+      voiceStatus = await invoke('check_voice_availability');
+      isReady = voiceStatus.whisper_available && voiceStatus.piper_available;
+
+      if (!isReady && voiceStatus) {
+        if (!voiceStatus.whisper_available) {
+          errorMsg = 'Whisper nicht verfügbar. Modell: ' + voiceStatus.whisper_model;
+        } else if (!voiceStatus.piper_available) {
+          errorMsg = 'Piper-TTS nicht verfügbar. Modell: ' + voiceStatus.piper_model;
+        }
       }
     } catch (err) {
-      console.warn('Voice nicht verfügbar:', err);
+      console.warn('Voice-Status Fehler:', err);
+      errorMsg = `Voice-System nicht verfügbar: ${err}`;
     }
-
-    // TTS-Event listener
-    ttsListener = await listen('tts-audio', (event) => {
-      playTtsAudio(event.payload);
-    });
   });
 
   onDestroy(() => {
-    stopListening();
+    stopConversation();
     ttsListener?.();
   });
 
-  async function startListening() {
-    if (isListening) return;
-    micError = '';
+  // === Conversation loop ===
+  async function startConversation() {
+    if (conversationActive) return;
+    conversationActive = true;
+    errorMsg = '';
+    conversationLog = [];
 
+    console.log('🎙️ Gespräch gestartet');
+
+    // Acquire microphone access
     try {
-      // Mikrofon-Zugriff — zuerst mit optimalen Constraints versuchen,
-      // bei OverconstrainedError (z.B. WebKitGTK/Tauri) auf Fallback ausweichen
-      let usedFallback = false;
+      await initMicrophone();
+    } catch (err) {
+      errorMsg = `Mikrofon-Fehler: ${err instanceof Error ? err.message : err}`;
+      conversationActive = false;
+      return;
+    }
+
+    // Loop: listen -> transcribe -> Claude -> speak -> listen
+    startListening();
+  }
+
+  function stopConversation() {
+    conversationActive = false;
+    stopSpeaking();
+    stopRecording();
+    cleanupAudio();
+    state = 'idle';
+    console.log('🎙️ Gespräch beendet');
+  }
+
+  async function initMicrophone() {
+    // Microphone access with a fallback chain
+    try {
+      mediaStream = await navigator.mediaDevices.getUserMedia({
+        audio: { echoCancellation: true, noiseSuppression: true, sampleRate: 16000 }
+      });
+    } catch {
       try {
-        mediaStream = await navigator.mediaDevices.getUserMedia({
-          audio: {
-            echoCancellation: true,
-            noiseSuppression: true,
-            sampleRate: 16000,
-          },
-        });
-        console.log('🎤 Mikrofon mit optimalen Constraints geöffnet');
-      } catch (constraintErr) {
-        // WebKitGTK wirft diverse Fehler (OverconstrainedError, TypeError "Invalid constraint", etc.)
-        console.warn('Mikrofon-Constraints fehlgeschlagen, versuche Fallbacks:', constraintErr);
-        try {
-          mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
-        } catch (_) {
-          // Auch { audio: true } fehlgeschlagen — versuche explizites Device
-          const devices = await navigator.mediaDevices.enumerateDevices();
-          const audioInput = devices.find(d => d.kind === 'audioinput');
-          if (audioInput) {
-            mediaStream = await navigator.mediaDevices.getUserMedia({
-              audio: { deviceId: { exact: audioInput.deviceId } }
-            });
-          } else {
-            mediaStream = await navigator.mediaDevices.getUserMedia({ audio: {} });
-          }
+        mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      } catch {
+        const devices = await navigator.mediaDevices.enumerateDevices();
+        const mic = devices.find(d => d.kind === 'audioinput');
+        if (mic) {
+          mediaStream = await navigator.mediaDevices.getUserMedia({
+            audio: { deviceId: { exact: mic.deviceId } }
+          });
+        } else {
+          throw new Error('Kein Mikrofon gefunden. Unter WebKitGTK (Tauri/Linux) wird PipeWire oder PulseAudio mit gst-plugin-pipewire benötigt.');
         }
-        usedFallback = true;
-        console.log('🎤 Mikrofon mit Fallback geöffnet');
       }
+    }
 
-      // Audio-Kontext für Visualisierung
-      audioContext = new AudioContext();
-      const source = audioContext.createMediaStreamSource(mediaStream);
-      analyser = audioContext.createAnalyser();
-      analyser.fftSize = 256;
-      source.connect(analyser);
+    audioContext = new AudioContext();
+    const source = audioContext.createMediaStreamSource(mediaStream);
+    analyser = audioContext.createAnalyser();
+    analyser.fftSize = 2048;
+    source.connect(analyser);
+  }
 
-      // MediaRecorder für Aufnahme
+  function startListening() {
+    if (!conversationActive || !mediaStream) return;
+    state = 'listening';
+    audioChunks = [];
+    silenceStart = 0;
+    recordingStart = Date.now();
 
+    // Start the MediaRecorder
+    try {
       mediaRecorder = new MediaRecorder(mediaStream, {
         mimeType: 'audio/webm;codecs=opus',
       });
-
-      audioChunks = [];
-
-      mediaRecorder.ondataavailable = (event) => {
-        if (event.data.size > 0) {
-          audioChunks.push(event.data);
-        }
-      };
-
-      mediaRecorder.onstop = async () => {
-        if (audioChunks.length > 0) {
-          await processAudio();
-        }
-      };
-
-      mediaRecorder.start(100); // Chunks alle 100ms
-      isListening = true;
-
-      // Visualisierung starten
-      visualize();
-
-      console.log('🎤 Aufnahme gestartet' + (usedFallback ? ' (Fallback-Modus)' : ''));
-    } catch (err) {
-      console.error('Mikrofon-Fehler:', err);
-      micError = `Mikrofon-Zugriff fehlgeschlagen: ${err instanceof Error ? err.message : err}`;
-      // Fehler nach 8 Sekunden ausblenden
-      setTimeout(() => { micError = ''; }, 8000);
+    } catch {
+      // Fallback without an explicit codec
+      mediaRecorder = new MediaRecorder(mediaStream);
     }
+
+    mediaRecorder.ondataavailable = (e) => {
+      if (e.data.size > 0) audioChunks.push(e.data);
+    };
+
+    mediaRecorder.onstop = () => {
+      if (audioChunks.length > 0 && conversationActive) {
+        processRecording();
+      }
+    };
+
+    mediaRecorder.start(100);
+
+    // Start the VAD loop
+    monitorVAD();
   }
 
-  function stopListening() {
-    if (!isListening) return;
+  function monitorVAD() {
+    if (!analyser || state !== 'listening') return;
 
-    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
-      mediaRecorder.stop();
+    const buffer = new Float32Array(analyser.fftSize);
+    analyser.getFloatTimeDomainData(buffer);
+
+    // Compute the RMS level
+    let sum = 0;
+    for (let i = 0; i < buffer.length; i++) {
+      sum += buffer[i] * buffer[i];
+    }
+    const rms = Math.sqrt(sum / buffer.length);
+    currentVolume = rms;
+
+    const now = Date.now();
+    const recordingDuration = now - recordingStart;
+
+    if (rms < SILENCE_THRESHOLD) {
+      // Silence
+      if (silenceStart === 0) silenceStart = now;
+
+      if (now - silenceStart > SILENCE_DURATION && recordingDuration > MIN_RECORDING) {
+        // Enough silence after enough speech: stop the recording
+        stopRecording();
+        return;
+      }
+    } else {
+      // Speech detected
+      silenceStart = 0;
     }
 
-    if (mediaStream) {
-      mediaStream.getTracks().forEach((track) => track.stop());
-      mediaStream = null;
-    }
+    animationFrame = requestAnimationFrame(() => monitorVAD());
+  }
 
+  function stopRecording() {
     if (animationFrame) {
       cancelAnimationFrame(animationFrame);
       animationFrame = null;
     }
-
-    if (audioContext) {
-      audioContext.close();
-      audioContext = null;
+    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
+      mediaRecorder.stop();
     }
-
-    isListening = false;
-    volumeLevel = 0;
-
-    console.log('🎤 Aufnahme gestoppt');
   }
 
-  async function processAudio() {
-    if (audioChunks.length === 0) return;
+  async function processRecording() {
+    if (!conversationActive) return;
+    state = 'transcribing';
 
-    const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
+    const blob = new Blob(audioChunks, { type: 'audio/webm' });
     audioChunks = [];
 
-    // Blob zu Base64 konvertieren
-    const reader = new FileReader();
-    reader.onloadend = async () => {
-      const base64 = (reader.result as string).split(',')[1];
+    // Too small? Skip it
+    if (blob.size < 1000) {
+      console.log('⏭️ Aufnahme zu kurz, übersprungen');
+      if (conversationActive) startListening();
+      return;
+    }
 
-      try {
-        currentTranscript = 'Transkribiere...';
-        const text: string = await invoke('transcribe_audio', {
-          audioBase64: base64,
-          format: 'webm',
-        });
+    // Blob -> Base64
+    const base64 = await blobToBase64(blob);
 
-        currentTranscript = text;
+    try {
+      const text: string = await invoke('transcribe_audio', {
+        audioBase64: base64,
+        format: 'webm',
+      });
 
-        if (text.trim()) {
-          // Nachricht an Chat senden
-          addMessage('user', text);
+      const cleaned = text.trim();
+      if (!cleaned || cleaned.length < 2) {
+        console.log('⏭️ Leere Transkription, weiter zuhören');
+        if (conversationActive) startListening();
+        return;
+      }
 
-          // An Claude senden (triggert TTS-Response)
-          await sendToClaudeWithTts(text);
-        }
-      } catch (err) {
-        console.error('Transkription fehlgeschlagen:', err);
-        currentTranscript = `Fehler: ${err}`;
-      }
-    };
-    reader.readAsDataURL(audioBlob);
-  }
+      // Log the user message
+      addLogEntry('user', cleaned);
+      addMessage('user', cleaned);
 
-  async function sendToClaudeWithTts(text: string) {
-    // Nachricht an Claude senden und Antwort per TTS vorlesen
-    try {
-      $isProcessing = true;
+      // Ask Claude
+      state = 'waiting';
+      await invoke('send_message', { message: cleaned });
 
-      // Claude-Request abfeuern
-      await invoke('send_message', { message: text });
+      // Wait for the response
+      const response = await waitForResponse();
 
-      // Auf Ende der Verarbeitung warten (all-stopped Event)
-      await new Promise<void>((resolve) => {
-        // Timeout nach 120 Sekunden als Sicherheitsnetz
-        const timeout = setTimeout(() => {
-          console.warn('TTS-Timeout: Claude hat nach 120s nicht geantwortet');
-          unlisten();
-          resolve();
-        }, 120_000);
+      if (response && conversationActive) {
+        addLogEntry('assistant', response);
 
-        let unlisten: UnlistenFn;
+        // Prepare the text for TTS (strip code blocks and Markdown)
+        let ttsText = prepareTtsText(response);
 
-        listen('all-stopped', () => {
-          clearTimeout(timeout);
-          unlisten();
-          resolve();
-        }).then((fn) => {
-          unlisten = fn;
-        });
-      });
+        // Play the TTS audio
+        state = 'speaking';
+        await speakAndWait(ttsText);
+      }
 
-      // Letzte Assistant-Nachricht aus dem Store holen
-      const allMessages = get(messages);
-      const lastAssistant = [...allMessages]
-        .reverse()
-        .find((m) => m.role === 'assistant' && m.content.trim());
-
-      if (lastAssistant) {
-        // TTS auf max 500 Zeichen begrenzen (lange Antworten abschneiden)
-        let ttsText = lastAssistant.content.trim();
-        if (ttsText.length > 500) {
-          // Am letzten Satzende vor 500 Zeichen abschneiden
-          const cutoff = ttsText.lastIndexOf('.', 500);
-          ttsText = cutoff > 200
-            ? ttsText.substring(0, cutoff + 1)
-            : ttsText.substring(0, 500) + '…';
-        }
-
-        await speakText(ttsText);
+      // Keep listening
+      if (conversationActive) {
+        startListening();
       }
     } catch (err) {
-      console.error('sendToClaudeWithTts fehlgeschlagen:', err);
+      console.error('Verarbeitung fehlgeschlagen:', err);
+      errorMsg = `Fehler: ${err}`;
+      setTimeout(() => { errorMsg = ''; }, 5000);
+      if (conversationActive) startListening();
     }
   }
 
-  async function speakText(text: string) {
-    if (isSpeaking) {
-      stopSpeaking();
-    }
+  async function waitForResponse(): Promise<string | null> {
+    return new Promise((resolve) => {
+      const timeout = setTimeout(() => {
+        if (unlisten) unlisten();
+        resolve(null);
+      }, 120_000);
 
-    try {
-      isSpeaking = true;
-      const audioBase64: string = await invoke('text_to_speech', {
-        text,
-        voice: selectedVoice,
+      let unlisten: UnlistenFn;
+      listen('all-stopped', () => {
+        clearTimeout(timeout);
+        if (unlisten) unlisten();
+
+        // Fetch the last assistant message from the store
+        const allMessages = get(messages);
+        const last = [...allMessages].reverse().find(m => m.role === 'assistant' && m.content.trim());
+        resolve(last ? last.content.trim() : null);
+      }).then((fn: UnlistenFn) => { unlisten = fn; });
+    });
+  }
+
+  function prepareTtsText(text: string): string {
+    // Strip code blocks
+    let clean = text.replace(/```[\s\S]*?```/g, '');
+    // Strip inline code
+    clean = clean.replace(/`[^`]+`/g, '');
+    // Strip Markdown formatting
+    clean = clean.replace(/[*_~#]+/g, '');
+    // Strip URLs
+    clean = clean.replace(/https?:\/\/\S+/g, '');
+    // Collapse repeated whitespace and newlines
+    clean = clean.replace(/\s+/g, ' ').trim();
+
+    // Cap at 600 characters (cut at a sentence boundary)
+    if (clean.length > 600) {
+      const cutoff = clean.lastIndexOf('.', 600);
+      clean = cutoff > 200 ? clean.substring(0, cutoff + 1) : clean.substring(0, 600) + '…';
+    }
+
+    return clean;
+  }
+
+  async function speakAndWait(text: string): Promise<void> {
+    if (!text || !conversationActive) return;
+
+    try {
+      const audioBase64: string = await invoke('text_to_speech', {
+        text,
+        voice: null,
       });
 
-      playTtsAudio(audioBase64);
-    } catch (err) {
-      console.error('TTS fehlgeschlagen:', err);
-      isSpeaking = false;
-    }
-  }
+      return new Promise((resolve) => {
+        if (ttsAudio) {
+          ttsAudio.pause();
+          ttsAudio = null;
+        }
 
-  function playTtsAudio(base64: string) {
-    if (ttsAudio) {
-      ttsAudio.pause();
-      ttsAudio = null;
-    }
+        // Piper returns WAV
+        ttsAudio = new Audio(`data:audio/wav;base64,${audioBase64}`);
 
-    ttsAudio = new Audio(`data:audio/mp3;base64,${base64}`);
+        ttsAudio.onended = () => {
+          ttsAudio = null;
+          resolve();
+        };
 
-    ttsAudio.onended = () => {
-      isSpeaking = false;
-      // Bei Continuous-Modus: Wieder zuhören
-      if (voiceMode === 'continuous' && !isListening) {
-        startListening();
-      }
-    };
+        ttsAudio.onerror = (e) => {
+          console.error('TTS-Wiedergabe fehlgeschlagen:', e);
+          ttsAudio = null;
+          resolve();
+        };
 
-    ttsAudio.onerror = () => {
-      isSpeaking = false;
-    };
+        ttsAudio.play().catch(err => {
+          console.error('Audio-Play fehlgeschlagen:', err);
+          resolve();
+        });
 
-    ttsAudio.play();
+        // Interrupt detection: while TTS is playing, check whether the user speaks
+        monitorInterrupt(resolve);
+      });
+    } catch (err) {
+      console.error('TTS fehlgeschlagen:', err);
+    }
+  }
+
+  function monitorInterrupt(onInterrupt: () => void) {
+    if (!analyser || state !== 'speaking') return;
+
+    const buffer = new Float32Array(analyser.fftSize);
+    analyser.getFloatTimeDomainData(buffer);
+
+    let sum = 0;
+    for (let i = 0; i < buffer.length; i++) {
+      sum += buffer[i] * buffer[i];
+    }
+    const rms = Math.sqrt(sum / buffer.length);
+    currentVolume = rms;
+
+    // The user is speaking loudly enough: interrupt Claude
+    if (rms > SILENCE_THRESHOLD * 3) {
+      console.log('⚡ Unterbrochen! (RMS:', rms.toFixed(3), ')');
+      stopSpeaking();
+      onInterrupt();
+      return;
+    }
+
+    // Keep checking
+    if (state === 'speaking' && ttsAudio && !ttsAudio.paused) {
+      requestAnimationFrame(() => monitorInterrupt(onInterrupt));
+    }
   }
 
   function stopSpeaking() {
@@ -301,213 +398,152 @@
     if (ttsAudio) {
       ttsAudio.pause();
       ttsAudio.currentTime = 0;
       ttsAudio = null;
     }
-    isSpeaking = false;
   }
 
-  // Unterbrechung: User spricht während Claude spricht
-  function handleInterrupt() {
-    if (isSpeaking) {
-      stopSpeaking();
-      console.log('⚡ Claude unterbrochen');
+  function cleanupAudio() {
+    if (mediaStream) {
+      mediaStream.getTracks().forEach(t => t.stop());
+      mediaStream = null;
+    }
+    if (audioContext) {
+      audioContext.close();
+      audioContext = null;
+    }
+    analyser = null;
+    if (animationFrame) {
+      cancelAnimationFrame(animationFrame);
+      animationFrame = null;
     }
   }
 
-  function visualize() {
-    if (!analyser || !canvasEl) return;
+  // === Helpers ===
 
-    const ctx = canvasEl.getContext('2d');
-    if (!ctx) return;
-
-    const bufferLength = analyser.frequencyBinCount;
-    const dataArray = new Uint8Array(bufferLength);
-
-    function draw() {
-      if (!analyser) return;
-
-      animationFrame = requestAnimationFrame(draw);
-      analyser.getByteFrequencyData(dataArray);
-
-      // Durchschnittliche Lautstärke berechnen
-      const average = dataArray.reduce((a, b) => a + b, 0) / bufferLength;
-      volumeLevel = average / 255;
-
-      // VAD: Bei hoher Lautstärke während TTS → Unterbrechen
-      if (volumeLevel > 0.3 && isSpeaking) {
-        handleInterrupt();
-      }
-
-      // Canvas zeichnen
-      ctx.fillStyle = 'var(--bg-secondary)';
-      ctx.fillRect(0, 0, canvasEl.width, canvasEl.height);
-
-      const barWidth = (canvasEl.width / bufferLength) * 2.5;
-      let x = 0;
-
-      for (let i = 0; i < bufferLength; i++) {
-        const barHeight = (dataArray[i] / 255) * canvasEl.height;
-
-        // Farbverlauf basierend auf Höhe
-        const hue = (i / bufferLength) * 60 + 200; // Blau-Violett
-        ctx.fillStyle = `hsl(${hue}, 70%, ${50 + volumeLevel * 30}%)`;
-
-        ctx.fillRect(x, canvasEl.height - barHeight, barWidth, barHeight);
-        x += barWidth + 1;
-      }
-    }
-
-    draw();
+  function blobToBase64(blob: Blob): Promise<string> {
+    return new Promise((resolve, reject) => {
+      const reader = new FileReader();
+      reader.onloadend = () => {
+        const result = reader.result as string;
+        resolve(result.split(',')[1]);
+      };
+      reader.onerror = reject;
+      reader.readAsDataURL(blob);
+    });
   }
 
-  function toggleVoiceMode() {
-    voiceMode = voiceMode === 'push-to-talk' ? 'continuous' : 'push-to-talk';
-    if (voiceMode === 'push-to-talk' && isListening) {
-      stopListening();
+  function addLogEntry(role: 'user' | 'assistant', text: string) {
+    const now = new Date();
+    const time = now.toLocaleTimeString('de-DE', { hour: '2-digit', minute: '2-digit' });
+    conversationLog = [...conversationLog, { role, text, time }].slice(-MAX_LOG_ENTRIES);
+  }
+
+  function getStateLabel(): string {
+    switch (state) {
+      case 'listening': return 'Höre zu…';
+      case 'transcribing': return 'Transkribiere…';
+      case 'waiting': return 'Claude denkt nach…';
+      case 'speaking': return 'Claude spricht…';
+      default: return 'Bereit';
     }
   }
 
-  // Push-to-Talk Handling
-  function handlePttDown() {
-    if (voiceMode === 'push-to-talk') {
-      startListening();
-    }
-  }
-
-  function handlePttUp() {
-    if (voiceMode === 'push-to-talk') {
-      stopListening();
-    }
-  }
-
-  // Keyboard-Shortcut (Leertaste für PTT)
-  function handleKeydown(e: KeyboardEvent) {
-    if (e.code === 'Space' && !e.repeat && voiceMode === 'push-to-talk' && e.target === document.body) {
-      e.preventDefault();
-      handlePttDown();
-    }
-  }
-
-  function handleKeyup(e: KeyboardEvent) {
-    if (e.code === 'Space' && voiceMode === 'push-to-talk') {
-      handlePttUp();
+  function getStateIcon(): string {
+    switch (state) {
+      case 'listening': return '🎤';
+      case 'transcribing': return '⏳';
+      case 'waiting': return '🤔';
+      case 'speaking': return '🔊';
+      default: return '💬';
     }
   }
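[Note, not part of the patch: monitorVAD and monitorInterrupt both reduce to an
RMS gate with a trailing-silence hang-over. For documentation, the same decision
logic as a self-contained Rust sketch; struct and field names are illustrative,
the thresholds mirror SILENCE_THRESHOLD, SILENCE_DURATION, and MIN_RECORDING:

    /// Illustrative port of the frontend VAD gate; not used by the app.
    struct VadGate {
        silence_threshold: f32, // RMS floor, mirrors SILENCE_THRESHOLD = 0.03
        silence_ms: u64,        // trailing-silence hang-over, mirrors SILENCE_DURATION = 1800
        min_recording_ms: u64,  // mirrors MIN_RECORDING = 500
        silence_started_at: Option<u64>,
    }

    impl VadGate {
        /// Returns true once the recording should stop: enough trailing
        /// silence after a minimum amount of recorded audio.
        fn update(&mut self, samples: &[f32], now_ms: u64, recording_start_ms: u64) -> bool {
            if samples.is_empty() {
                return false;
            }
            let rms = (samples.iter().map(|s| s * s).sum::<f32>() / samples.len() as f32).sqrt();
            if rms < self.silence_threshold {
                let started = *self.silence_started_at.get_or_insert(now_ms);
                now_ms - started > self.silence_ms
                    && now_ms - recording_start_ms > self.min_recording_ms
            } else {
                self.silence_started_at = None; // speech resets the hang-over timer
                false
            }
        }
    }

The same gate, scaled by a factor of three, is what monitorInterrupt uses to
decide that the user is talking over the TTS playback.]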
 
-
-  🎤 Sprachsteuerung
-
-  {#if !isVoiceAvailable}
-    API Key fehlt
-  {/if}
+
+  🎙️ Gespräch
+
+  {#if voiceStatus}
+    Whisper {voiceStatus.whisper_available ? '✓' : '✗'}
+    Piper {voiceStatus.piper_available ? '✓' : '✗'}
+  {/if}
 
-  {#if isVoiceAvailable}
-
-    {#if isListening}
-      Höre zu...
-    {/if}
-
-    {#if voiceMode === 'push-to-talk'}
-      Oder Leertaste gedrückt halten
-    {/if}
-
-    {#if currentTranscript}
-      Du: {currentTranscript}
-    {/if}
-
-    {#if isSpeaking}
-      Claude spricht...
-    {/if}
-
-    {#if micError}
-      ⚠️ {micError}
-    {/if}
-
+  {#if isReady}
+
+    🎙️ Gespräch starten
+
+    {#if conversationActive}
+      {getStateIcon()}
+      {getStateLabel()}
+
+      {#if state === 'speaking'}
+      {/if}
+    {/if}
+
+    {#if conversationLog.length > 0}
+      {#each conversationLog as entry}
+        {entry.time}
+        {entry.role === 'user' ? 'Du' : 'Claude'}:
+        {entry.text.length > 120 ? entry.text.substring(0, 120) + '…' : entry.text}
+      {/each}
+    {/if}
   {:else}
-
-    Setze OPENAI_API_KEY Umgebungsvariable für Sprachsteuerung.
-
-    Oder warte auf lokale Whisper/Piper Integration.
-
+    {#if voiceStatus}
+      {#if !voiceStatus.whisper_available}
+        ⚠️ Whisper-Modell nicht gefunden
+        Lade ggml-base.bin in den models/ Ordner
+      {/if}
+      {#if !voiceStatus.piper_available}
+        ⚠️ Piper-TTS-Modell nicht gefunden
+        Lade de_DE-thorsten-high.onnx in den models/ Ordner
+      {/if}
+    {:else}
+      Voice-System wird geladen...
+    {/if}
   {/if}
 
+  {#if errorMsg}
+    ⚠️ {errorMsg}
+  {/if}
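[Note, not part of the patch: both "Modell nicht gefunden" branches in the panel
trace back to the lookup order in whisper_model_path and piper_model_path. A
tiny standalone check that mirrors that order can save a debugging round-trip;
the file names are the ones this patch expects in models/:

    // Sketch of a standalone check; run it from the project root.
    fn main() {
        let models = [
            "models/ggml-base.bin",            // Whisper speech-to-text model
            "models/de_DE-thorsten-high.onnx", // Piper voice (Thorsten, German)
        ];
        for rel in models {
            let found = std::path::Path::new(rel).exists();
            println!("{rel} -> {}", if found { "found" } else { "missing" });
        }
    }
]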