claude-desktop/src-tauri/src/voice.rs
Eddy f51241efa6 Phase 10 Sprach-Interface + Phase 9 Nacharbeiten
Voice (Phase 10):
- voice.rs: OpenAI Whisper (STT) + TTS Backend
- ChatPanel: Mikrofon-Button, VAD (Pause 1.5s), Live-Pegel
- SettingsPanel: OpenAI-Key Konfiguration

Phase 9 Nacharbeiten:
- Auto-Extract vor Compacting (Entscheidungen/TODOs/Insights)
- get_tool_hints() - relevante KB-Einträge bei Tool-Start
- activeKnowledgeHints Store, Anzeige im KnowledgePanel

Tech-Schulden:
- Dead-Code in memory.rs entfernt (MemorySystem struct)
- cargo-check Warnings behoben

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 18:24:28 +02:00

188 lines
5.8 KiB
Rust

// Claude Desktop — Voice Interface
// Speech-to-Text mit Whisper API, Text-to-Speech mit OpenAI TTS
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64};
use serde::{Deserialize, Serialize};
use std::io::Write;
/// Whisper API Konfiguration
const OPENAI_API_URL: &str = "https://api.openai.com/v1/audio/transcriptions";
const TTS_API_URL: &str = "https://api.openai.com/v1/audio/speech";
/// Result of a speech-to-text transcription.
///
/// NOTE(review): `transcribe_audio` in this file currently returns a plain
/// `String`, not this struct — presumably it is reserved for richer
/// responses (e.g. `verbose_json`) or used by a caller outside this file;
/// confirm before removing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionResult {
    // Transcribed text.
    pub text: String,
    // Detected language, if reported by the API.
    pub language: Option<String>,
    // Audio duration in seconds, if reported by the API.
    pub duration: Option<f64>,
}
/// TTS-Stimmen
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TtsVoice {
Alloy,
Echo,
Fable,
Onyx,
Nova,
Shimmer,
}
impl TtsVoice {
fn as_str(&self) -> &str {
match self {
TtsVoice::Alloy => "alloy",
TtsVoice::Echo => "echo",
TtsVoice::Fable => "fable",
TtsVoice::Onyx => "onyx",
TtsVoice::Nova => "nova",
TtsVoice::Shimmer => "shimmer",
}
}
}
/// Resolves the OpenAI API key.
///
/// Currently only the `OPENAI_API_KEY` environment variable is consulted;
/// loading the key from persisted settings is still a TODO.
///
/// # Errors
/// Returns a human-readable (German) message when the variable is unset,
/// not valid Unicode, or empty.
fn get_openai_key() -> Result<String, String> {
    std::env::var("OPENAI_API_KEY")
        .ok()
        // An empty value counts as "not configured".
        .filter(|key| !key.is_empty())
        .ok_or_else(|| {
            "OpenAI API Key nicht gefunden. Setze OPENAI_API_KEY Umgebungsvariable.".to_string()
        })
}
/// Transcribes audio with the OpenAI Whisper API.
///
/// # Arguments
/// * `audio_base64` — base64-encoded audio payload from the frontend.
/// * `format` — audio container/extension (e.g. "webm", "wav"); used for the
///   multipart file name and the `audio/<format>` MIME type.
///
/// # Errors
/// Returns a (German) error string when base64 decoding, the HTTP request,
/// or response parsing fails, or when the API responds with a non-2xx status.
#[tauri::command]
pub async fn transcribe_audio(
    audio_base64: String,
    format: String,
) -> Result<String, String> {
    let api_key = get_openai_key()?;

    // Decode the base64 payload into raw audio bytes.
    let audio_bytes = BASE64
        .decode(&audio_base64)
        .map_err(|e| format!("Base64-Dekodierung fehlgeschlagen: {}", e))?;

    // Build the multipart upload directly from the in-memory bytes.
    // (The previous implementation wrote a temp file first and only deleted
    // it after a successful send(), leaking the file on every earlier error
    // path; streaming from memory removes the temp file entirely.)
    let file_part = reqwest::multipart::Part::bytes(audio_bytes)
        .file_name(format!("audio.{}", format))
        .mime_str(&format!("audio/{}", format))
        .map_err(|e| format!("MIME-Type fehlgeschlagen: {}", e))?;

    let form = reqwest::multipart::Form::new()
        .part("file", file_part)
        .text("model", "whisper-1")
        .text("language", "de") // prefer German transcription
        .text("response_format", "json");

    let client = reqwest::Client::new();
    let response = client
        .post(OPENAI_API_URL)
        .bearer_auth(&api_key)
        .multipart(form)
        .send()
        .await
        .map_err(|e| format!("API-Request fehlgeschlagen: {}", e))?;

    if !response.status().is_success() {
        let error_text = response.text().await.unwrap_or_default();
        return Err(format!("Whisper API Fehler: {}", error_text));
    }

    // Only the `text` field of the JSON response is needed.
    #[derive(Deserialize)]
    struct WhisperResponse {
        text: String,
    }
    let result: WhisperResponse = response
        .json()
        .await
        .map_err(|e| format!("Response parsen fehlgeschlagen: {}", e))?;

    println!("🎤 Transkription: \"{}\"", result.text);
    Ok(result.text)
}
/// Converts text to speech via the OpenAI TTS API.
///
/// Returns the generated MP3 audio as a base64-encoded string. When no
/// `voice` is supplied, "nova" is used.
///
/// # Errors
/// Returns a (German) error string when the request fails, the API responds
/// with a non-2xx status, or the audio body cannot be read.
#[tauri::command]
pub async fn text_to_speech(
    text: String,
    voice: Option<String>,
) -> Result<String, String> {
    let api_key = get_openai_key()?;

    // Fall back to the default voice when none was requested.
    let voice_name = voice.unwrap_or_else(|| "nova".to_string());

    let payload = serde_json::json!({
        "model": "tts-1",
        "input": text,
        "voice": voice_name,
        "response_format": "mp3"
    });

    let response = reqwest::Client::new()
        .post(TTS_API_URL)
        .bearer_auth(&api_key)
        .json(&payload)
        .send()
        .await
        .map_err(|e| format!("TTS API-Request fehlgeschlagen: {}", e))?;

    if !response.status().is_success() {
        let error_text = response.text().await.unwrap_or_default();
        return Err(format!("TTS API Fehler: {}", error_text));
    }

    // Hand the audio back to the frontend as base64.
    let audio_bytes = response
        .bytes()
        .await
        .map_err(|e| format!("Audio lesen fehlgeschlagen: {}", e))?;
    let audio_base64 = BASE64.encode(&audio_bytes);

    println!("🔊 TTS generiert: {} Zeichen → {} Bytes Audio", text.len(), audio_bytes.len());
    Ok(audio_base64)
}
/// Reports whether voice features are available (an OpenAI API key is
/// configured). Never fails: a missing key yields `Ok(false)`, not an error.
#[tauri::command]
pub async fn check_voice_availability() -> Result<bool, String> {
    Ok(get_openai_key().is_ok())
}
/// Lists the available TTS voices as `{ id, name, description }` objects
/// for display in the frontend settings UI.
#[tauri::command]
pub async fn get_tts_voices() -> Result<Vec<serde_json::Value>, String> {
    // (id, display name, German description) for every supported voice.
    let voices = [
        ("alloy", "Alloy", "Neutral, ausgewogen"),
        ("echo", "Echo", "Männlich, warm"),
        ("fable", "Fable", "Expressiv, britisch"),
        ("onyx", "Onyx", "Tief, autoritär"),
        ("nova", "Nova", "Weiblich, freundlich"),
        ("shimmer", "Shimmer", "Weiblich, sanft"),
    ];
    Ok(voices
        .iter()
        .map(|(id, name, description)| {
            serde_json::json!({ "id": id, "name": name, "description": description })
        })
        .collect())
}