# Ai_Assistant/server/main_chat.py

#!/usr/bin/env python3
"""
Streaming LLM -> chunked TTS -> queued playback script for Riko.
"""

import os
import time
import uuid
import json
import shutil
import yaml
from pathlib import Path
from openai import OpenAI
from contextlib import suppress
from queue import Queue
from threading import Event, Thread
from dotenv import load_dotenv
import soundfile as sf

# --- Load project-local modules ---
from process.asr_func.asr_auto_record import record_on_speech
from process.asr_func.asr_transcribe_groq import transcribe_audio_groq
from process.tts_func.elevenlabs_ping import elevenlabs_gen
from process.tts_func.sovits_ping import play_audio, get_wav_duration
from process.tts_func.tts_preprocess import clean_llm_output
from process.vrm_func.vrm_ping import vrm_talk, vrm_animate
from process.vrm_func.vrm_states_ping import set_vrm_state

# ---------------------------
# Setup & Keys
# ---------------------------
# load_dotenv() runs before the lookup below, so any variables it adds to the
# process environment are visible to os.getenv.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail at import time: nothing in this script works without the key.
    raise EnvironmentError('Please set OPENAI_API_KEY in your environment')

# Shared OpenAI client used by stream_text_chunks().
client = OpenAI(api_key=OPENAI_API_KEY)

# The character configuration is looked up relative to the current working
# directory (expanduser is a no-op for a path without a leading "~").
CONFIG_PATH = os.path.expanduser('character_config.yaml')
if not os.path.exists(CONFIG_PATH):
    raise FileNotFoundError(f"Config not found at {CONFIG_PATH}")

# Decode explicitly as UTF-8 so that non-ASCII characters in the prompt are
# read the same way on every platform instead of depending on the locale's
# default encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    char_config = yaml.safe_load(f)

# 'history_file' is required (KeyError if missing); 'model' is optional.
HISTORY_FILE = char_config['history_file']
MODEL = char_config.get('model', 'gpt-4.1-mini')

# Initial conversation used when no history file exists yet: a single system
# message whose text comes from the 'default' preset of the character config.
SYSTEM_PROMPT = [
    {
        "role": "system",
        "content": [
            {"type": "input_text", "text": char_config['presets']['default']['system_prompt']}
        ]
    }
]

# ---------------------------
# Helper functions
# ---------------------------
def get_vrm_expression(text: str) -> str:
    text_lower = text.lower()
    if any(word in text_lower for word in ['*giggle', '*laugh', '*smile', '*smirk', '*cheer']):
        return "happy"
    elif any(word in text_lower for word in ['*angry', '*glare', '*mad', '*frustrat']):
        return "angry"
    elif any(word in text_lower for word in ['*sad', '*cry', '*sigh', '*sorrow']):
        return "sad"
    elif any(word in text_lower for word in ['*gasp', '*shock', '*surpris', '*wide']):
        return "surprised"
    elif any(word in text_lower for word in ['*neutral', '*stare']):
        return "neutral"
    else:
        return "relaxed"

def load_history():
    """Return the conversation history to send to the model.

    When HISTORY_FILE exists, its parsed JSON content is returned as-is.
    Otherwise a shallow copy of SYSTEM_PROMPT is returned, so appending to
    the result does not grow the module-level default list.
    """
    if not os.path.exists(HISTORY_FILE):
        return list(SYSTEM_PROMPT)
    with open(HISTORY_FILE, 'r') as history_fh:
        return json.load(history_fh)

def save_history(history):
    """Persist *history* to HISTORY_FILE as indented JSON.

    The data is first written to a sibling temporary file and then moved
    over HISTORY_FILE with os.replace, so an interruption or a serialization
    error in the middle of the write cannot leave a truncated history file
    behind.

    Args:
        history: JSON-serializable conversation history (the list built in
            main_loop).

    Raises:
        TypeError: if *history* contains values json cannot serialize.
        OSError: if the temporary file cannot be written or swapped in.
    """
    tmp_path = f"{HISTORY_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(history, f, indent=2)
        os.replace(tmp_path, HISTORY_FILE)
    finally:
        # After a successful replace the temporary name no longer exists;
        # after a failure this removes the partial file.
        with suppress(OSError):
            os.remove(tmp_path)

def ensure_dirs():
    """Create the ``client/audio`` and ``audio`` directories if missing.

    Both paths are relative to the current working directory; directories
    that already exist are left alone.
    """
    for folder in ('client/audio', 'audio'):
        Path(folder).mkdir(parents=True, exist_ok=True)

def copy_to_public(client_path: Path, public_path: Path):
    """Copy the file at *client_path* to *public_path* via shutil.copy2.

    Nothing is returned; errors raised by the copy propagate to the caller.
    """
    shutil.copy2(src=client_path, dst=public_path)

def fallback_get_wav_duration(p: Path):
    """Return the length of the WAV file *p* in seconds using the stdlib.

    The duration is the frame count divided by the frame rate. Any failure
    (unreadable file, invalid header, zero frame rate, ...) yields 3.0
    instead of raising.
    """
    try:
        import wave

        wav_reader = wave.open(str(p), 'r')
        try:
            frame_count = wav_reader.getnframes()
            frame_rate = wav_reader.getframerate()
            return frame_count / float(frame_rate)
        finally:
            wav_reader.close()
    except Exception:
        # Best-effort fallback value used when the file cannot be inspected.
        return 3.0

# ---------------------------
# Streaming helper
# ---------------------------
def _split_ready_text(buffer, min_len, max_len):
    """Split *buffer* into ``(ready, rest)``; ``ready`` is None if nothing is due.

    A chunk is due when the buffer contains a sentence ending whose end lies
    at or beyond *min_len* characters, or when the buffer has grown to
    *max_len* characters, in which case it is cut at the last whitespace so
    that a word is not split whenever that can be avoided.
    """
    sentence_endings = ('. ', '? ', '! ', '\n', '." ', '?" ')

    # Position just past the last sentence ending anywhere in the buffer.
    cut = -1
    for ending in sentence_endings:
        pos = buffer.rfind(ending)
        if pos != -1:
            cut = max(cut, pos + len(ending))
    if cut >= min_len:
        return buffer[:cut], buffer[cut:]

    if len(buffer) >= max_len:
        space = max(buffer.rfind(' '), buffer.rfind('\n'))
        if space > 0:
            return buffer[:space], buffer[space + 1:]
        # No whitespace at all: fall back to emitting the whole buffer.
        return buffer, ""

    return None, buffer


def stream_text_chunks(messages, min_len=60, max_len=350):
    """Stream a model response and yield it in speakable text chunks.

    Text deltas are accumulated in a buffer. A chunk is yielded (stripped)
    as soon as the buffer contains a sentence ending at or beyond *min_len*
    characters, or once it reaches *max_len* characters. Whatever remains
    is yielded when the text output is reported done or the stream ends.

    Args:
        messages: conversation passed as ``input`` to the Responses API.
        min_len: minimum number of buffered characters before a sentence
            ending is allowed to close a chunk.
        max_len: buffer size at which a chunk is emitted even without a
            sentence ending.

    Yields:
        Non-empty, stripped text chunks in response order.
    """
    buffer = ""

    with client.responses.stream(
        model=MODEL,
        input=messages,
        temperature=1,
        top_p=1,
        max_output_tokens=2048,
    ) as stream:

        for event in stream:
            if event.type == "response.output_text.delta":
                buffer += event.delta
                ready, buffer = _split_ready_text(buffer, min_len, max_len)
                if ready is not None and ready.strip():
                    yield ready.strip()
            elif event.type == "response.output_text.done":
                if buffer.strip():
                    yield buffer.strip()
                # Reset so already-yielded text is not emitted again if more
                # deltas follow this event.
                buffer = ""

        # Flush anything left if the stream ended without a "done" event.
        if buffer.strip():
            yield buffer.strip()

        # The return value is not used. NOTE(review): the call is kept from
        # the original flow; presumably it waits for the stream to finish and
        # raises if no completed response arrived - confirm before removing.
        stream.get_final_response()

# ---------------------------
# Playback worker (Queue)
# ---------------------------
class PlaybackWorker:
    """Background worker that hands queued speech clips to the avatar in order.

    Clips are added with enqueue() and processed one at a time on a daemon
    thread. ``queue_finished_event`` is set while nothing is pending and is
    cleared as soon as a clip is enqueued; wait_until_finished() blocks on it.
    """

    # Pause (seconds) inserted after each clip before the next one is started.
    PAUSE_SECONDS = 0.4

    def __init__(self):
        """Create the queue, the (not yet started) worker thread and the flags."""
        # Local import: the module itself only imports Event and Thread.
        from threading import Lock

        self.q = Queue()
        self.thread = Thread(target=self._run, daemon=True)
        self._running = False
        # Set while the queue is drained; starts out set (nothing pending).
        self.queue_finished_event = Event()
        self.queue_finished_event.set()
        # True while the "talking" animation/state is believed to be active.
        self._talking = False
        # Makes "clear event + put" and "check empty + set event" atomic with
        # respect to each other, so the event cannot be set between an
        # enqueue's clear() and its put().
        self._state_lock = Lock()

    def start(self):
        """Start the worker thread; repeated calls are ignored."""
        if not self._running:
            self._running = True
            self.thread.start()

    def enqueue(self, public_audio_path: Path, expression: str, assistant_text: str, duration: float):
        """Queue one clip for playback and mark the queue as not finished.

        Args:
            public_audio_path: path of the audio file passed to vrm_talk.
            expression: expression name passed to vrm_talk.
            assistant_text: text belonging to the clip, passed to vrm_talk.
            duration: clip length in seconds (truncated to int for vrm_talk).
        """
        with self._state_lock:
            self.queue_finished_event.clear()
            self.q.put((public_audio_path, expression, assistant_text, duration))

    def wait_until_finished(self, timeout=None):
        """Block until the queue is drained; return False if *timeout* expired."""
        return self.queue_finished_event.wait(timeout)

    def _run(self):
        """Worker loop: process clips until the ``None`` sentinel arrives."""
        while True:
            item = self.q.get()
            if item is None:
                break
            public_audio_path, expression, assistant_text, duration = item

            # Switch to the talking animation once per burst of clips. This is
            # best-effort: on failure the flag stays False and the switch is
            # retried with the next clip.
            if not self._talking:
                try:
                    talking_anim = Path("animations/mixamo") / "Talking.fbx"
                    vrm_animate("start_mixamo", str(talking_anim))
                    set_vrm_state("talking")
                    self._talking = True
                except Exception as e:
                    print("Talking animation error:", e)

            try:
                # Drive the 3D model; the duration is passed as whole seconds
                # (the original author notes vrm_talk needs an int).
                # NOTE(review): older comments here mentioned play_audio
                # waiting for the sound to finish, but play_audio is never
                # called; whether vrm_talk blocks for the clip duration is
                # not visible from this file - confirm.
                vrm_talk(str(public_audio_path), expression, assistant_text, int(duration))
            except Exception as e:
                print("Playback error:", e)

            # Short breathing pause between clips.
            time.sleep(self.PAUSE_SECONDS)

            with self._state_lock:
                if self.q.empty():
                    self.queue_finished_event.set()
                    self._talking = False

        # The worker is exiting and nothing more will be played, so release
        # anyone still waiting on the event.
        self.queue_finished_event.set()

    def stop(self):
        """Ask the worker to exit after the queued clips and wait for it.

        Does nothing if the thread was never started or has already exited.
        """
        if not self.thread.is_alive():
            return
        self.q.put(None)
        self.thread.join()

# ---------------------------
# Main orchestration
# ---------------------------
def _speak_chunk(playback, chunk):
    """Synthesize one response chunk and queue it for playback.

    The chunk is cleaned for TTS; chunks that contain nothing to speak are
    skipped. Otherwise audio is generated into ``client/audio``, copied to
    ``audio`` and enqueued together with the expression derived from the
    chunk and the clip duration.
    """
    expression = get_vrm_expression(chunk)
    tts_read_text = clean_llm_output(chunk)

    # Skip chunks that consist only of silent actions: there is nothing to
    # synthesize for them.
    if not tts_read_text.strip():
        print(f"[Skip] Chunk enthielt nur stumme Aktionen: {chunk}")
        return

    filename = f"output_{uuid.uuid4().hex}.wav"
    client_out = Path('client') / 'audio' / filename
    public_out = Path('audio') / filename
    client_out.parent.mkdir(parents=True, exist_ok=True)

    try:
        elevenlabs_gen(tts_read_text, output_wav_pth=str(client_out))
    except TypeError:
        # Retry with a positional path. NOTE(review): presumably meant for an
        # elevenlabs_gen variant without the output_wav_pth keyword; a
        # TypeError raised inside the function triggers this retry as well.
        elevenlabs_gen(tts_read_text, str(client_out))

    copy_to_public(client_out, public_out)

    try:
        duration = get_wav_duration(public_out)
    except Exception:
        duration = fallback_get_wav_duration(public_out)

    playback.enqueue(public_out, expression, chunk, duration)


def main_loop():
    """Run the voice-assistant cycle until interrupted with Ctrl+C.

    Each iteration waits for pending playback to finish, switches the avatar
    to idle, records the user, transcribes the recording, streams the model's
    reply chunk by chunk into TTS and the playback queue, and finally appends
    the user and assistant messages to the saved history. Errors inside an
    iteration are printed and the loop continues after a one-second pause.

    NOTE(review): generated ``output_*.wav`` files are never deleted by this
    function, so both audio directories grow over time.
    """
    ensure_dirs()
    playback = PlaybackWorker()
    playback.start()

    while True:
        try:
            print("\n⏳ Waiting for playback queue to finish...")
            playback.wait_until_finished()
            print("✅ Queue finished, ready for input")

            idle_anim = Path("animations/mixamo") / "Idle.fbx"
            vrm_animate("start_mixamo", str(idle_anim))
            set_vrm_state("idle")

            recording_file = Path("audio") / "conversation.wav"
            recording_file.parent.mkdir(parents=True, exist_ok=True)
            conversation_recording = str(recording_file)

            record_on_speech(
                output_file=conversation_recording,
                samplerate=44100,
                channels=1,
                silence_threshold=0.02,
                silence_duration=2,
                device=None
            )

            # The thinking animation is cosmetic; a failure must not stop
            # the conversation, but it is reported instead of being hidden.
            try:
                thinking_anim = Path("animations/mixamo") / "Thinking.fbx"
                vrm_animate("start_mixamo", str(thinking_anim))
                set_vrm_state("thinking")
            except Exception as e:
                print("Thinking animation error:", e)

            user_spoken_text = transcribe_audio_groq(aud_path=conversation_recording)

            # Nothing intelligible was recorded: do not send an empty user
            # message to the model or store it in the history.
            if user_spoken_text is None or (
                isinstance(user_spoken_text, str) and not user_spoken_text.strip()
            ):
                print("[asr] empty transcription, listening again")
                continue

            messages = load_history()
            messages.append({
                "role": "user",
                "content": [{"type": "input_text", "text": user_spoken_text}]
            })

            print("[llm] streaming response...")
            spoken_chunks = []

            for chunk in stream_text_chunks(messages):
                print("[chunk]", chunk)
                spoken_chunks.append(chunk)
                _speak_chunk(playback, chunk)

            final_text = " ".join(spoken_chunks).strip()
            print("[llm final]", final_text)

            messages.append({
                "role": "assistant",
                "content": [{"type": "output_text", "text": final_text}]
            })
            save_history(messages)
            time.sleep(0.1)

        except KeyboardInterrupt:
            print("Interrupted by user, stopping.")
            playback.stop()
            break
        except Exception as e:
            print("Error in main loop:", e)
            time.sleep(1)

# Start the assistant loop only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main_loop()
Initial release 0.5 2026-05-24 13:31:30 +02:00			`#!/usr/bin/env python3`
			`"""`
			`Streaming LLM -> chunked TTS -> queued playback script for Riko.`
			`"""`

			`import os`
			`import time`
			`import uuid`
			`import json`
			`import shutil`
			`import yaml`
			`from pathlib import Path`
			`from openai import OpenAI`
			`from contextlib import suppress`
			`from queue import Queue`
			`from threading import Event, Thread`
			`from dotenv import load_dotenv`
			`import soundfile as sf`

			`# --- Eigene Module laden ---`
			`from process.asr_func.asr_auto_record import record_on_speech`
			`from process.asr_func.asr_transcribe_groq import transcribe_audio_groq`
			`from process.tts_func.elevenlabs_ping import elevenlabs_gen`
			`from process.tts_func.sovits_ping import play_audio, get_wav_duration`
			`from process.tts_func.tts_preprocess import clean_llm_output`
			`from process.vrm_func.vrm_ping import vrm_talk, vrm_animate`
			`from process.vrm_func.vrm_states_ping import set_vrm_state`

			`# ---------------------------`
			`# Setup & Keys`
			`# ---------------------------`
			`load_dotenv()`
			`OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')`
			`if not OPENAI_API_KEY:`
			`raise EnvironmentError('Please set OPENAI_API_KEY in your environment')`

			`client = OpenAI(api_key=OPENAI_API_KEY)`

			`CONFIG_PATH = os.path.expanduser('character_config.yaml')`
			`if not os.path.exists(CONFIG_PATH):`
			`raise FileNotFoundError(f"Config not found at {CONFIG_PATH}")`

			`with open(CONFIG_PATH, 'r') as f:`
			`char_config = yaml.safe_load(f)`

			`HISTORY_FILE = char_config['history_file']`
			`MODEL = char_config.get('model', 'gpt-4.1-mini')`

			`SYSTEM_PROMPT = [`
			`{`
			`"role": "system",`
			`"content": [`
			`{"type": "input_text", "text": char_config['presets']['default']['system_prompt']}`
			`]`
			`}`
			`]`

			`# ---------------------------`
			`# Hilfs-Funktionen`
			`# ---------------------------`
			`def get_vrm_expression(text: str) -> str:`
			`text_lower = text.lower()`
			`if any(word in text_lower for word in ['giggle', 'laugh', 'smile', 'smirk', '*cheer']):`
			`return "happy"`
			`elif any(word in text_lower for word in ['angry', 'glare', 'mad', 'frustrat']):`
			`return "angry"`
			`elif any(word in text_lower for word in ['sad', 'cry', 'sigh', 'sorrow']):`
			`return "sad"`
			`elif any(word in text_lower for word in ['gasp', 'shock', 'surpris', 'wide']):`
			`return "surprised"`
			`elif any(word in text_lower for word in ['neutral', 'stare']):`
			`return "neutral"`
			`else:`
			`return "relaxed"`

			`def load_history():`
			`if os.path.exists(HISTORY_FILE):`
			`with open(HISTORY_FILE, 'r') as f:`
			`return json.load(f)`
			`return SYSTEM_PROMPT.copy()`

			`def save_history(history):`
			`with open(HISTORY_FILE, 'w') as f:`
			`json.dump(history, f, indent=2)`

			`def ensure_dirs():`
			`Path('client/audio').mkdir(parents=True, exist_ok=True)`
			`Path('audio').mkdir(parents=True, exist_ok=True)`

			`def copy_to_public(client_path: Path, public_path: Path):`
			`shutil.copy2(client_path, public_path)`

			`def fallback_get_wav_duration(p: Path):`
			`try:`
			`import wave, contextlib`
			`with contextlib.closing(wave.open(str(p),'r')) as wf:`
			`frames = wf.getnframes()`
			`rate = wf.getframerate()`
			`return frames / float(rate)`
			`except Exception:`
			`return 3.0`

			`# ---------------------------`
			`# Streaming helper`
			`# ---------------------------`
			`def stream_text_chunks(messages, min_len=60, max_len=350):`
			`buffer = ""`

			`with client.responses.stream(`
			`model=MODEL,`
			`input=messages,`
			`temperature=1,`
			`top_p=1,`
			`max_output_tokens=2048,`
			`) as stream:`

			`for event in stream:`
			`if event.type == "response.output_text.delta":`
			`buffer += event.delta`
			`if buffer.endswith(('. ', '? ', '! ', '\n', '." ', '?" ')) and len(buffer) >= min_len:`
			`yield buffer.strip()`
			`buffer = ""`
			`elif len(buffer) >= max_len:`
			`yield buffer.strip()`
			`buffer = ""`
			`elif event.type == "response.output_text.done":`
			`if buffer.strip():`
			`yield buffer.strip()`
			`final_response = stream.get_final_response()`

			`# ---------------------------`
			`# Playback worker (Queue)`
			`# ---------------------------`
			`class PlaybackWorker:`
			`def __init__(self):`
			`self.q = Queue()`
			`self.thread = Thread(target=self._run, daemon=True)`
			`self._running = False`
			`self.queue_finished_event = Event()`
			`self.queue_finished_event.set()`
			`self._talking = False`

			`def start(self):`
			`if not self._running:`
			`self._running = True`
			`self.thread.start()`

			`def enqueue(self, public_audio_path: Path, expression: str, assistant_text: str, duration: float):`
			`self.queue_finished_event.clear()`
			`self.q.put((public_audio_path, expression, assistant_text, duration))`

			`def wait_until_finished(self, timeout=None):`
			`return self.queue_finished_event.wait(timeout)`

			`def _run(self):`
			`while True:`
			`item = self.q.get()`
			`if item is None:`
			`break`
			`public_audio_path, expression, assistant_text, duration = item`

			`try:`
			`if not self._talking:`
			`thinking_anim = Path("animations/mixamo") / "Talking.fbx"`
			`vrm_animate("start_mixamo", str(thinking_anim))`
			`set_vrm_state("talking")`
			`self._talking = True`
			`except Exception as e:`
			`pass`

			`try:`
			`# 3D Modell ansteuern (braucht int)`
			`vrm_talk(str(public_audio_path), expression, assistant_text, int(duration))`

			`# --- FIX FÜR DEN TON: Python spielt das Audio jetzt laut ab! ---`

			`except Exception as e:`
			`print("Playback error:", e)`

			`try:`
			`# Da play_audio schon wartet, bis der Ton fertig ist,`
			`# packen wir hier nur noch unsere Atempause von 0.4s obendrauf!`
			`time.sleep(0.4)`
			`except Exception:`
			`time.sleep(0.4)`

			`try:`
			`if self.q.empty():`
			`self.queue_finished_event.set()`
			`idle_path = Path("animations/mixamo") / "Idle.fbx"`
			`self._talking = False`
			`except Exception as e:`
			`pass`

			`def stop(self):`
			`self.q.put(None)`
			`self.thread.join()`

			`# ---------------------------`
			`# Main orchestration`
			`# ---------------------------`
			`def main_loop():`
			`ensure_dirs()`
			`playback = PlaybackWorker()`
			`playback.start()`

			`while True:`
			`try:`
			`print("\n⏳ Waiting for playback queue to finish...")`
			`playback.wait_until_finished()`
			`print("✅ Queue finished, ready for input")`

			`idle_anim = Path("animations/mixamo") / "Idle.fbx"`
			`vrm_animate("start_mixamo", str(idle_anim))`
			`set_vrm_state("idle")`

			`conversation_recording = Path("audio") / "conversation.wav"`
			`conversation_recording.parent.mkdir(parents=True, exist_ok=True)`
			`conversation_recording = str(conversation_recording)`

			`record_on_speech(`
			`output_file=conversation_recording,`
			`samplerate=44100,`
			`channels=1,`
			`silence_threshold=0.02,`
			`silence_duration=2,`
			`device=None`
			`)`

			`try:`
			`thinking_anim = Path("animations/mixamo") / "Thinking.fbx"`
			`vrm_animate("start_mixamo", str(thinking_anim))`
			`set_vrm_state("thinking")`
			`except Exception:`
			`pass`

			`user_spoken_text = transcribe_audio_groq(aud_path=conversation_recording)`

			`messages = load_history()`
			`messages.append({`
			`"role": "user",`
			`"content": [{"type": "input_text", "text": user_spoken_text}]`
			`})`

			`print("[llm] streaming response...")`
			`full_assistant_text = ""`

			`for chunk in stream_text_chunks(messages):`
			`print("[chunk]", chunk)`
			`full_assistant_text += (chunk + " ")`

			`expression = get_vrm_expression(chunk)`
			`tts_read_text = clean_llm_output(chunk)`

			`# --- DER LEBENSRETTENDE FIX: Stumme Aktionen überspringen ---`
			`if not tts_read_text.strip():`
			`print(f"[Skip] Chunk enthielt nur stumme Aktionen: {chunk}")`
			`continue`

			`uid = uuid.uuid4().hex`
			`filename = f"output_{uid}.wav"`
			`client_out = Path('client') / 'audio' / filename`
			`public_out = Path('audio') / filename`
			`client_out.parent.mkdir(parents=True, exist_ok=True)`

			`try:`
			`elevenlabs_gen(tts_read_text, output_wav_pth=str(client_out))`
			`except TypeError:`
			`elevenlabs_gen(tts_read_text, str(client_out))`

			`copy_to_public(client_out, public_out)`

			`try:`
			`duration = get_wav_duration(public_out)`
			`except Exception:`
			`duration = fallback_get_wav_duration(public_out)`

			`playback.enqueue(public_out, expression, chunk, duration)`

			`final_text = full_assistant_text.strip()`
			`print("[llm final]", final_text)`

			`messages.append({`
			`"role": "assistant",`
			`"content": [{"type": "output_text", "text": final_text}]`
			`})`
			`save_history(messages)`
			`time.sleep(0.1)`

			`except KeyboardInterrupt:`
			`print("Interrupted by user, stopping.")`
			`playback.stop()`
			`break`
			`except Exception as e:`
			`print("Error in main loop:", e)`
			`time.sleep(1)`

			`if __name__ == '__main__':`
			`main_loop()`