// Claude Desktop — Voice Interface // Speech-to-Text mit Whisper API, Text-to-Speech mit OpenAI TTS use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64}; use serde::{Deserialize, Serialize}; use std::io::Write; /// Whisper API Konfiguration const OPENAI_API_URL: &str = "https://api.openai.com/v1/audio/transcriptions"; const TTS_API_URL: &str = "https://api.openai.com/v1/audio/speech"; /// Transkriptions-Ergebnis #[derive(Debug, Clone, Serialize, Deserialize)] pub struct TranscriptionResult { pub text: String, pub language: Option, pub duration: Option, } /// TTS-Stimmen #[derive(Debug, Clone, Serialize, Deserialize)] pub enum TtsVoice { Alloy, Echo, Fable, Onyx, Nova, Shimmer, } impl TtsVoice { fn as_str(&self) -> &str { match self { TtsVoice::Alloy => "alloy", TtsVoice::Echo => "echo", TtsVoice::Fable => "fable", TtsVoice::Onyx => "onyx", TtsVoice::Nova => "nova", TtsVoice::Shimmer => "shimmer", } } } /// Holt den OpenAI API Key aus Umgebungsvariable oder Settings fn get_openai_key() -> Result { // Erst Umgebungsvariable prüfen if let Ok(key) = std::env::var("OPENAI_API_KEY") { if !key.is_empty() { return Ok(key); } } // Alternativ: Aus Settings laden (TODO) Err("OpenAI API Key nicht gefunden. Setze OPENAI_API_KEY Umgebungsvariable.".to_string()) } /// Transkribiert Audio mit OpenAI Whisper API #[tauri::command] pub async fn transcribe_audio( audio_base64: String, format: String, ) -> Result { let api_key = get_openai_key()?; // Base64 dekodieren let audio_bytes = BASE64.decode(&audio_base64) .map_err(|e| format!("Base64-Dekodierung fehlgeschlagen: {}", e))?; // Temporäre Datei erstellen (Whisper API braucht Datei-Upload) let temp_dir = std::env::temp_dir(); let temp_file = temp_dir.join(format!("whisper_audio_{}.{}", uuid::Uuid::new_v4(), format)); let mut file = std::fs::File::create(&temp_file) .map_err(|e| format!("Temp-Datei erstellen fehlgeschlagen: {}", e))?; file.write_all(&audio_bytes) .map_err(|e| format!("Audio schreiben fehlgeschlagen: {}", e))?; drop(file); // Multipart-Request an Whisper API let client = reqwest::Client::new(); let file_part = reqwest::multipart::Part::file(&temp_file) .await .map_err(|e| format!("Datei lesen fehlgeschlagen: {}", e))? .file_name(format!("audio.{}", format)) .mime_str(&format!("audio/{}", format)) .map_err(|e| format!("MIME-Type fehlgeschlagen: {}", e))?; let form = reqwest::multipart::Form::new() .part("file", file_part) .text("model", "whisper-1") .text("language", "de") // Deutsch priorisieren .text("response_format", "json"); let response = client .post(OPENAI_API_URL) .bearer_auth(&api_key) .multipart(form) .send() .await .map_err(|e| format!("API-Request fehlgeschlagen: {}", e))?; // Temp-Datei löschen let _ = std::fs::remove_file(&temp_file); if !response.status().is_success() { let error_text = response.text().await.unwrap_or_default(); return Err(format!("Whisper API Fehler: {}", error_text)); } // Response parsen #[derive(Deserialize)] struct WhisperResponse { text: String, } let result: WhisperResponse = response.json().await .map_err(|e| format!("Response parsen fehlgeschlagen: {}", e))?; println!("🎤 Transkription: \"{}\"", result.text); Ok(result.text) } /// Text-to-Speech mit OpenAI TTS API #[tauri::command] pub async fn text_to_speech( text: String, voice: Option, ) -> Result { let api_key = get_openai_key()?; let voice_name = voice.unwrap_or_else(|| "nova".to_string()); let client = reqwest::Client::new(); let body = serde_json::json!({ "model": "tts-1", "input": text, "voice": voice_name, "response_format": "mp3" }); let response = client .post(TTS_API_URL) .bearer_auth(&api_key) .json(&body) .send() .await .map_err(|e| format!("TTS API-Request fehlgeschlagen: {}", e))?; if !response.status().is_success() { let error_text = response.text().await.unwrap_or_default(); return Err(format!("TTS API Fehler: {}", error_text)); } // Audio-Bytes als Base64 zurückgeben let audio_bytes = response.bytes().await .map_err(|e| format!("Audio lesen fehlgeschlagen: {}", e))?; let audio_base64 = BASE64.encode(&audio_bytes); println!("🔊 TTS generiert: {} Zeichen → {} Bytes Audio", text.len(), audio_bytes.len()); Ok(audio_base64) } /// Prüft ob Voice-Features verfügbar sind (API Key vorhanden) #[tauri::command] pub async fn check_voice_availability() -> Result { match get_openai_key() { Ok(_) => Ok(true), Err(_) => Ok(false), } } /// Verfügbare TTS-Stimmen #[tauri::command] pub async fn get_tts_voices() -> Result, String> { Ok(vec![ serde_json::json!({ "id": "alloy", "name": "Alloy", "description": "Neutral, ausgewogen" }), serde_json::json!({ "id": "echo", "name": "Echo", "description": "Männlich, warm" }), serde_json::json!({ "id": "fable", "name": "Fable", "description": "Expressiv, britisch" }), serde_json::json!({ "id": "onyx", "name": "Onyx", "description": "Tief, autoritär" }), serde_json::json!({ "id": "nova", "name": "Nova", "description": "Weiblich, freundlich" }), serde_json::json!({ "id": "shimmer", "name": "Shimmer", "description": "Weiblich, sanft" }), ]) }