#!/usr/bin/env python3
"""
Streaming LLM -> chunked TTS -> queued playback script for Riko.

Per conversation turn the script:

1. waits until the playback queue is drained and switches the avatar to idle,
2. records the user's speech and transcribes it,
3. streams the LLM answer and cuts it into sentence-sized chunks,
4. synthesises each chunk to a WAV file and hands it to a background
   playback worker,
5. appends the exchange to the JSON history file.
"""
import json
import os
import shutil
import time
import uuid
import wave
from contextlib import suppress
from pathlib import Path
from queue import Queue
from threading import Event, Lock, Thread

import soundfile as sf
import yaml
from dotenv import load_dotenv
from openai import OpenAI

# --- Project-local modules ---
from process.asr_func.asr_auto_record import record_on_speech
from process.asr_func.asr_transcribe_groq import transcribe_audio_groq
from process.tts_func.elevenlabs_ping import elevenlabs_gen
from process.tts_func.sovits_ping import play_audio, get_wav_duration
from process.tts_func.tts_preprocess import clean_llm_output
from process.vrm_func.vrm_ping import vrm_talk, vrm_animate
from process.vrm_func.vrm_states_ping import set_vrm_state

# ---------------------------
# Setup & Keys
# ---------------------------
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise EnvironmentError('Please set OPENAI_API_KEY in your environment')

client = OpenAI(api_key=OPENAI_API_KEY)

CONFIG_PATH = os.path.expanduser('character_config.yaml')
if not os.path.exists(CONFIG_PATH):
    raise FileNotFoundError(f"Config not found at {CONFIG_PATH}")

# Explicit UTF-8 so a non-ASCII system prompt is read the same way on every
# platform instead of depending on the locale's default encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    char_config = yaml.safe_load(f)

HISTORY_FILE = char_config['history_file']
MODEL = char_config.get('model', 'gpt-4.1-mini')

SYSTEM_PROMPT = [
    {
        "role": "system",
        "content": [
            {
                "type": "input_text",
                "text": char_config['presets']['default']['system_prompt'],
            }
        ],
    }
]

# Directory holding the Mixamo animation clips passed to vrm_animate.
ANIMATION_DIR = Path("animations/mixamo")

# Pause inserted by the playback worker after every spoken chunk.
SPEECH_GAP_SECONDS = 0.4

# Duration reported when a WAV file cannot be inspected at all.
FALLBACK_WAV_DURATION_SECONDS = 3.0

# Ordered (expression, markers) table: the first entry with a matching marker
# wins, so the order here defines the priority between expressions.
_EXPRESSION_MARKERS = (
    ("happy", ('*giggle', '*laugh', '*smile', '*smirk', '*cheer')),
    ("angry", ('*angry', '*glare', '*mad', '*frustrat')),
    ("sad", ('*sad', '*cry', '*sigh', '*sorrow')),
    ("surprised", ('*gasp', '*shock', '*surpris', '*wide')),
    ("neutral", ('*neutral', '*stare')),
)


# ---------------------------
# Helper functions
# ---------------------------
def get_vrm_expression(text: str) -> str:
    """Map action markers such as ``*giggles*`` in *text* to an expression.

    The match is case-insensitive and substring based. Returns ``"relaxed"``
    when no marker from the table is present.
    """
    text_lower = text.lower()
    for expression, markers in _EXPRESSION_MARKERS:
        if any(marker in text_lower for marker in markers):
            return expression
    return "relaxed"


def load_history():
    """Return the stored message list, or a copy of the system prompt.

    The copy is shallow: the returned list is new, but the message dicts are
    shared with ``SYSTEM_PROMPT`` and must not be mutated in place.
    """
    if os.path.exists(HISTORY_FILE):
        with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    return SYSTEM_PROMPT.copy()


def save_history(history):
    """Write *history* to ``HISTORY_FILE`` as indented JSON."""
    with open(HISTORY_FILE, 'w', encoding='utf-8') as f:
        json.dump(history, f, indent=2)


def ensure_dirs():
    """Create the ``client/audio`` and ``audio`` output directories."""
    Path('client/audio').mkdir(parents=True, exist_ok=True)
    Path('audio').mkdir(parents=True, exist_ok=True)


def copy_to_public(client_path: Path, public_path: Path):
    """Copy *client_path* to *public_path*, preserving file metadata."""
    shutil.copy2(client_path, public_path)


def fallback_get_wav_duration(p: Path) -> float:
    """Return the duration of WAV file *p* in seconds using only the stdlib.

    Best effort: any failure (missing file, malformed header, zero frame
    rate) yields ``FALLBACK_WAV_DURATION_SECONDS`` instead of raising.
    """
    try:
        with wave.open(str(p), 'rb') as wf:
            return wf.getnframes() / float(wf.getframerate())
    except Exception:
        return FALLBACK_WAV_DURATION_SECONDS


def _start_animation(clip_name: str) -> None:
    """Ask the VRM front end to start the Mixamo clip *clip_name*."""
    vrm_animate("start_mixamo", str(ANIMATION_DIR / clip_name))


# ---------------------------
# Streaming helper
# ---------------------------
def stream_text_chunks(messages, min_len=60, max_len=350):
    """Stream the model's answer to *messages* as speakable text chunks.

    A chunk is yielded (whitespace-stripped) when the accumulated text ends
    with a sentence terminator and is at least *min_len* characters long, or
    as soon as it reaches *max_len* characters. Whatever remains when the
    text output is reported as done is yielded as a final chunk.
    """
    sentence_endings = ('. ', '? ', '! ', '\n', '." ', '?" ')
    buffer = ""
    with client.responses.stream(
        model=MODEL,
        input=messages,
        temperature=1,
        top_p=1,
        max_output_tokens=2048,
    ) as stream:
        for event in stream:
            if event.type == "response.output_text.delta":
                buffer += event.delta
                at_sentence_end = (
                    buffer.endswith(sentence_endings)
                    and len(buffer) >= min_len
                )
                if at_sentence_end or len(buffer) >= max_len:
                    yield buffer.strip()
                    buffer = ""
            elif event.type == "response.output_text.done":
                if buffer.strip():
                    yield buffer.strip()
                # Reset so text that was already emitted cannot be yielded
                # again if more delta/done events follow.
                buffer = ""
        # The return value is unused. NOTE(review): the call is kept in case
        # it finalises or validates the stream -- confirm against the SDK.
        stream.get_final_response()


# ---------------------------
# Playback worker (Queue)
# ---------------------------
class PlaybackWorker:
    """Background thread that speaks queued audio chunks one after another.

    ``queue_finished_event`` is set while no chunk is pending and cleared as
    soon as one is enqueued; ``wait_until_finished`` blocks on it.
    """

    def __init__(self):
        self.q = Queue()
        self.thread = Thread(target=self._run, daemon=True)
        self._running = False
        self.queue_finished_event = Event()
        self.queue_finished_event.set()
        self._talking = False
        # Makes "clear event + put item" (producer) and "queue empty? ->
        # set event" (worker) atomic with respect to each other. Without it
        # the worker can set the event right after the producer cleared it,
        # reporting "finished" while an item is still queued.
        self._state_lock = Lock()

    def start(self):
        """Start the worker thread; repeated calls are no-ops."""
        if not self._running:
            self._running = True
            self.thread.start()

    def enqueue(self, public_audio_path: Path, expression: str,
                assistant_text: str, duration: float):
        """Queue one chunk for playback and mark the queue as busy."""
        with self._state_lock:
            self.queue_finished_event.clear()
            self.q.put((public_audio_path, expression, assistant_text,
                        duration))

    def wait_until_finished(self, timeout=None):
        """Block until the queue is drained.

        Returns the result of ``Event.wait``: ``False`` only if *timeout*
        elapsed first.
        """
        return self.queue_finished_event.wait(timeout)

    def _run(self):
        """Worker loop: consume items until the ``None`` sentinel arrives."""
        while True:
            item = self.q.get()
            if item is None:
                # Release any waiter before the thread exits.
                self.queue_finished_event.set()
                break
            public_audio_path, expression, assistant_text, duration = item

            # Switch to the talking animation once per burst of chunks.
            # Best effort: a failure is reported and retried on the next
            # chunk because _talking stays False.
            if not self._talking:
                try:
                    _start_animation("Talking.fbx")
                    set_vrm_state("talking")
                    self._talking = True
                except Exception as e:
                    print("Animation error:", e)

            try:
                # Drive the 3D model; the duration is passed as an int.
                # NOTE(review): earlier comments here described a blocking
                # play_audio call that is not present in the code. Audio
                # output presumably happens inside vrm_talk -- confirm.
                vrm_talk(str(public_audio_path), expression, assistant_text,
                         int(duration))
            except Exception as e:
                print("Playback error:", e)

            # Short breathing pause between chunks.
            time.sleep(SPEECH_GAP_SECONDS)

            with self._state_lock:
                if self.q.empty():
                    self.queue_finished_event.set()
                    self._talking = False

    def stop(self):
        """Ask the worker to exit after the queued items and wait for it."""
        self.q.put(None)
        # join() raises RuntimeError on a thread that was never started.
        if self.thread.is_alive():
            self.thread.join()


# ---------------------------
# Main orchestration
# ---------------------------
def _synthesize_chunk(tts_text: str):
    """Synthesise *tts_text* to a WAV file and return ``(path, duration)``.

    The file is written under ``client/audio`` and copied to ``audio``; the
    returned path is the copy in ``audio``.
    """
    filename = f"output_{uuid.uuid4().hex}.wav"
    client_out = Path('client') / 'audio' / filename
    public_out = Path('audio') / filename
    client_out.parent.mkdir(parents=True, exist_ok=True)

    try:
        elevenlabs_gen(tts_text, output_wav_pth=str(client_out))
    except TypeError:
        # Retry with a positional path. Presumably this covers versions of
        # elevenlabs_gen that use a different keyword name -- TODO confirm.
        elevenlabs_gen(tts_text, str(client_out))

    copy_to_public(client_out, public_out)

    try:
        duration = get_wav_duration(public_out)
    except Exception:
        duration = fallback_get_wav_duration(public_out)
    return public_out, duration


def main_loop():
    """Run the record -> transcribe -> LLM -> TTS -> playback loop.

    Runs until interrupted with Ctrl+C. Any other exception inside a turn is
    printed and the loop continues with the next turn after one second.
    """
    ensure_dirs()
    playback = PlaybackWorker()
    playback.start()

    while True:
        try:
            print("\n⏳ Waiting for playback queue to finish...")
            playback.wait_until_finished()
            print("✅ Queue finished, ready for input")

            _start_animation("Idle.fbx")
            set_vrm_state("idle")

            recording_path = Path("audio") / "conversation.wav"
            recording_path.parent.mkdir(parents=True, exist_ok=True)
            conversation_recording = str(recording_path)

            record_on_speech(
                output_file=conversation_recording,
                samplerate=44100,
                channels=1,
                silence_threshold=0.02,
                silence_duration=2,
                device=None,
            )

            # The thinking animation is cosmetic; keep going if it fails.
            try:
                _start_animation("Thinking.fbx")
                set_vrm_state("thinking")
            except Exception as e:
                print("Animation error:", e)

            user_spoken_text = transcribe_audio_groq(
                aud_path=conversation_recording
            )

            messages = load_history()
            messages.append({
                "role": "user",
                "content": [{"type": "input_text", "text": user_spoken_text}]
            })

            print("[llm] streaming response...")
            spoken_chunks = []

            for chunk in stream_text_chunks(messages):
                print("[chunk]", chunk)
                spoken_chunks.append(chunk)

                expression = get_vrm_expression(chunk)
                tts_read_text = clean_llm_output(chunk)

                # Skip chunks that contain nothing speakable once cleaned
                # (e.g. only silent action markers).
                if not tts_read_text.strip():
                    print(f"[Skip] Chunk enthielt nur stumme Aktionen: {chunk}")
                    continue

                public_out, duration = _synthesize_chunk(tts_read_text)
                playback.enqueue(public_out, expression, chunk, duration)

            final_text = " ".join(spoken_chunks).strip()
            print("[llm final]", final_text)

            messages.append({
                "role": "assistant",
                "content": [{"type": "output_text", "text": final_text}]
            })
            save_history(messages)

            time.sleep(0.1)

        except KeyboardInterrupt:
            print("Interrupted by user, stopping.")
            playback.stop()
            break
        except Exception as e:
            print("Error in main loop:", e)
            time.sleep(1)


if __name__ == '__main__':
    main_loop()