Ai_Assistant/server/main_chat.py

300 lines
9.7 KiB
Python
Raw Permalink Normal View History

2026-05-24 13:31:30 +02:00
#!/usr/bin/env python3
"""
Streaming LLM -> chunked TTS -> queued playback script for Riko.
"""
import copy
import json
import os
import shutil
import time
import uuid
from contextlib import suppress
from pathlib import Path
from queue import Queue
from threading import Event, Lock, Thread

import soundfile as sf
import yaml
from dotenv import load_dotenv
from openai import OpenAI

# --- Project-local modules ---
from process.asr_func.asr_auto_record import record_on_speech
from process.asr_func.asr_transcribe_groq import transcribe_audio_groq
from process.tts_func.elevenlabs_ping import elevenlabs_gen
from process.tts_func.sovits_ping import play_audio, get_wav_duration
from process.tts_func.tts_preprocess import clean_llm_output
from process.vrm_func.vrm_ping import vrm_talk, vrm_animate
from process.vrm_func.vrm_states_ping import set_vrm_state
# ---------------------------
# Setup & Keys
# ---------------------------
# Load variables from a local .env file into the process environment before
# any key lookup happens.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise EnvironmentError('Please set OPENAI_API_KEY in your environment')
client = OpenAI(api_key=OPENAI_API_KEY)

# The character configuration is looked up relative to the current working
# directory (expanduser only matters if the path ever starts with "~").
CONFIG_PATH = os.path.expanduser('character_config.yaml')
if not os.path.exists(CONFIG_PATH):
    raise FileNotFoundError(f"Config not found at {CONFIG_PATH}")
# Read the YAML explicitly as UTF-8 so a non-ASCII system prompt is decoded
# the same way on every platform instead of depending on the locale encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    char_config = yaml.safe_load(f)

# Path of the JSON file holding the running conversation.
HISTORY_FILE = char_config['history_file']
# Model name sent to the Responses API; falls back to a default when the
# config does not name one.
MODEL = char_config.get('model', 'gpt-4.1-mini')
# Seed conversation used when no history file exists yet.
SYSTEM_PROMPT = [
    {
        "role": "system",
        "content": [
            {"type": "input_text", "text": char_config['presets']['default']['system_prompt']}
        ]
    }
]
# ---------------------------
# Helper functions
# ---------------------------
def get_vrm_expression(text: str) -> str:
    """Pick a VRM facial expression from action cues found in *text*.

    The text is lower-cased and searched for asterisk-prefixed cue fragments
    (for example ``*giggle``). Expression groups are checked in a fixed
    priority order and the first group with any matching cue wins.

    Args:
        text: Text to scan for cues.

    Returns:
        One of ``"happy"``, ``"angry"``, ``"sad"``, ``"surprised"``,
        ``"neutral"``, or ``"relaxed"`` when no cue matches.
    """
    lowered = text.lower()
    # Order matters: earlier rows take precedence over later ones.
    cue_table = (
        ("happy", ('*giggle', '*laugh', '*smile', '*smirk', '*cheer')),
        ("angry", ('*angry', '*glare', '*mad', '*frustrat')),
        ("sad", ('*sad', '*cry', '*sigh', '*sorrow')),
        ("surprised", ('*gasp', '*shock', '*surpris', '*wide')),
        ("neutral", ('*neutral', '*stare')),
    )
    for expression, cues in cue_table:
        for cue in cues:
            if cue in lowered:
                return expression
    return "relaxed"
def load_history():
    """Return the conversation history as a list of message dicts.

    When ``HISTORY_FILE`` exists its JSON content is returned. Otherwise a
    fresh conversation seeded with ``SYSTEM_PROMPT`` is returned.

    Returns:
        The message list to send to the model.

    Raises:
        json.JSONDecodeError: If the history file exists but is not valid JSON.
    """
    if os.path.exists(HISTORY_FILE):
        with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    # Deep copy so that in-place edits of the returned messages can never
    # leak into the module-level SYSTEM_PROMPT template.
    return copy.deepcopy(SYSTEM_PROMPT)
def save_history(history):
    """Persist *history* to ``HISTORY_FILE`` as indented JSON.

    The data is first written to a sibling temporary file and then moved over
    the real file with ``os.replace``. An interruption in the middle of the
    write (for example Ctrl+C) therefore cannot leave a truncated history
    file behind.

    Args:
        history: JSON-serialisable list of message dicts.
    """
    tmp_path = f"{HISTORY_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(history, f, indent=2)
        os.replace(tmp_path, HISTORY_FILE)
    except BaseException:
        # Do not leave a half-written temp file around; the previous history
        # file is still intact at this point.
        with suppress(OSError):
            os.remove(tmp_path)
        raise
def ensure_dirs():
    """Create the audio output directories used by this script.

    Both ``client/audio`` and ``audio`` are created relative to the current
    working directory; directories that already exist are left untouched.
    """
    for folder in ('client/audio', 'audio'):
        Path(folder).mkdir(parents=True, exist_ok=True)
def copy_to_public(client_path: Path, public_path: Path):
    """Copy a generated audio file to its public location.

    The destination directory is created on demand so the copy also works
    when it was removed (or never created) while the script is running.

    Args:
        client_path: Existing source file.
        public_path: Destination file path; file metadata is preserved.
    """
    Path(public_path).parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(client_path, public_path)
def fallback_get_wav_duration(p: Path):
    """Compute a WAV file's length in seconds using only the stdlib.

    Args:
        p: Path of the WAV file to inspect.

    Returns:
        Frame count divided by frame rate, or ``3.0`` when the file cannot be
        read or evaluated for any reason.
    """
    try:
        import wave

        # Wave_read objects are context managers and close themselves on exit.
        with wave.open(str(p), 'rb') as wav_file:
            frame_count = wav_file.getnframes()
            sample_rate = wav_file.getframerate()
            return frame_count / float(sample_rate)
    except Exception:
        return 3.0
# ---------------------------
# Streaming helper
# ---------------------------
def stream_text_chunks(messages, min_len=60, max_len=350):
    """Stream a model response and yield it in speakable text chunks.

    Text deltas are accumulated in a buffer. A chunk is emitted when the
    buffer ends on a sentence boundary and is at least *min_len* characters
    long, or when it reaches *max_len* characters. Whatever is left when the
    text output is done (or the stream ends) is emitted as a final chunk.

    Args:
        messages: Conversation passed as ``input`` to the Responses API.
        min_len: Minimum buffer length before a sentence boundary triggers a
            chunk.
        max_len: Buffer length at which a chunk is forced even without a
            sentence boundary.

    Yields:
        Non-empty, whitespace-stripped text chunks in response order.
    """
    sentence_endings = ('. ', '? ', '! ', '\n', '." ', '?" ', '!" ')
    buffer = ""
    with client.responses.stream(
        model=MODEL,
        input=messages,
        temperature=1,
        top_p=1,
        max_output_tokens=2048,
    ) as stream:
        for event in stream:
            if event.type == "response.output_text.delta":
                buffer += event.delta
                if len(buffer) >= min_len and buffer.endswith(sentence_endings):
                    chunk, buffer = buffer.strip(), ""
                elif len(buffer) >= max_len:
                    # Forced split: cut at the last space so a word is not
                    # torn in two; the tail stays buffered for the next chunk.
                    cut = buffer.rfind(" ")
                    if cut <= 0:
                        cut = len(buffer)
                    chunk, buffer = buffer[:cut].strip(), buffer[cut:]
                else:
                    continue
                if chunk:
                    yield chunk
            elif event.type == "response.output_text.done":
                if buffer.strip():
                    yield buffer.strip()
                # Reset so that later text events cannot re-emit this text.
                buffer = ""
        # Flush text left over if the stream ended without a "done" event.
        if buffer.strip():
            yield buffer.strip()
        # Kept from the original code, which fetched (and ignored) the final
        # response here. NOTE(review): presumably this surfaces an error when
        # the response never completed - confirm against the SDK.
        stream.get_final_response()
# ---------------------------
# Playback worker (Queue)
# ---------------------------
class PlaybackWorker:
    """Background thread that plays queued speech chunks one after another.

    Items are handed to ``vrm_talk`` in FIFO order. ``queue_finished_event``
    is set while nothing is pending and cleared as soon as something is
    enqueued, so callers can block on ``wait_until_finished``.
    """

    def __init__(self, pause: float = 0.4):
        """Create the worker without starting its thread.

        Args:
            pause: Seconds to wait after each chunk before taking the next.
        """
        self.q = Queue()
        self.thread = Thread(target=self._run, daemon=True)
        self._running = False
        self.queue_finished_event = Event()
        # Nothing is queued yet, so the worker starts out "finished".
        self.queue_finished_event.set()
        self._talking = False
        self._pause = pause
        # Serialises "clear + put" in enqueue() against "empty-check + set"
        # in the worker, so the event is never set while an item is pending.
        self._state_lock = Lock()

    def start(self):
        """Start the worker thread; later calls are no-ops."""
        if not self._running:
            self._running = True
            self.thread.start()

    def enqueue(self, public_audio_path: Path, expression: str, assistant_text: str, duration: float):
        """Queue one speech chunk for playback.

        Args:
            public_audio_path: Path of the audio file handed to ``vrm_talk``.
            expression: Expression name handed to ``vrm_talk``.
            assistant_text: Text that belongs to the audio.
            duration: Audio length in seconds; truncated to ``int`` on use.
        """
        with self._state_lock:
            self.queue_finished_event.clear()
            self.q.put((public_audio_path, expression, assistant_text, duration))

    def wait_until_finished(self, timeout=None):
        """Block until the queue has drained.

        Args:
            timeout: Maximum seconds to wait, or ``None`` to wait forever.

        Returns:
            ``True`` if the queue is finished, ``False`` on timeout.
        """
        return self.queue_finished_event.wait(timeout)

    def _begin_talking(self):
        """Switch the avatar to its talking animation once per burst."""
        if self._talking:
            return
        try:
            talking_anim = Path("animations/mixamo") / "Talking.fbx"
            vrm_animate("start_mixamo", str(talking_anim))
            set_vrm_state("talking")
            self._talking = True
        except Exception as e:
            # The animation is cosmetic: report the problem and keep playing.
            print("Talking animation error:", e)

    def _run(self):
        """Worker loop: handle queued items until the ``None`` sentinel."""
        while True:
            item = self.q.get()
            if item is None:
                break
            public_audio_path, expression, assistant_text, duration = item
            self._begin_talking()
            try:
                # Drive the 3D model (vrm_talk needs an int duration).
                # NOTE(review): older comments here said the audio is played
                # from Python via play_audio, but no such call exists in this
                # worker - confirm that vrm_talk handles audible playback.
                vrm_talk(str(public_audio_path), expression, assistant_text, int(duration))
            except Exception as e:
                print("Playback error:", e)
            # Short breathing pause between consecutive chunks.
            time.sleep(self._pause)
            with self._state_lock:
                if self.q.empty():
                    self._talking = False
                    self.queue_finished_event.set()

    def stop(self):
        """Finish the queued items, then shut the worker thread down.

        Does nothing when the worker was never started.
        """
        if not self._running:
            return
        self.q.put(None)
        self.thread.join()
# ---------------------------
# Main orchestration
# ---------------------------
def _set_avatar_animation(animation_file: str, state: str) -> None:
    """Best-effort switch of the avatar's Mixamo animation and VRM state."""
    try:
        vrm_animate("start_mixamo", str(Path("animations/mixamo") / animation_file))
        set_vrm_state(state)
    except Exception as e:
        # Avatar feedback is cosmetic; the conversation must keep going.
        print(f"Avatar update failed ({state}):", e)


def _synthesize_chunk(chunk: str):
    """Turn one text chunk into an audio file ready for playback.

    Returns:
        ``(public_path, expression, duration)``, or ``None`` when nothing
        speakable is left after cleaning the chunk.
    """
    expression = get_vrm_expression(chunk)
    tts_read_text = clean_llm_output(chunk)
    # Skip chunks that consist only of silent actions.
    if not tts_read_text.strip():
        print(f"[Skip] Chunk enthielt nur stumme Aktionen: {chunk}")
        return None

    filename = f"output_{uuid.uuid4().hex}.wav"
    client_out = Path('client') / 'audio' / filename
    public_out = Path('audio') / filename
    client_out.parent.mkdir(parents=True, exist_ok=True)
    try:
        elevenlabs_gen(tts_read_text, output_wav_pth=str(client_out))
    except TypeError:
        # Retry positionally in case the helper does not accept the keyword.
        elevenlabs_gen(tts_read_text, str(client_out))
    copy_to_public(client_out, public_out)
    try:
        duration = get_wav_duration(public_out)
    except Exception:
        duration = fallback_get_wav_duration(public_out)
    return public_out, expression, duration


def main_loop():
    """Run the voice conversation loop until interrupted.

    Each turn waits for pending playback, records the user, transcribes the
    recording, streams the model's reply chunk by chunk through TTS into the
    playback queue, and finally saves the updated history. ``Ctrl+C`` stops
    the playback worker and ends the loop; any other error is printed and
    the loop continues after a short delay.
    """
    ensure_dirs()
    playback = PlaybackWorker()
    playback.start()
    while True:
        try:
            print("\n⏳ Waiting for playback queue to finish...")
            playback.wait_until_finished()
            print("✅ Queue finished, ready for input")
            _set_avatar_animation("Idle.fbx", "idle")

            recording = Path("audio") / "conversation.wav"
            recording.parent.mkdir(parents=True, exist_ok=True)
            conversation_recording = str(recording)
            record_on_speech(
                output_file=conversation_recording,
                samplerate=44100,
                channels=1,
                silence_threshold=0.02,
                silence_duration=2,
                device=None
            )
            _set_avatar_animation("Thinking.fbx", "thinking")

            user_spoken_text = transcribe_audio_groq(aud_path=conversation_recording)
            # Do not spend an LLM turn (or pollute the history) on silence.
            if not user_spoken_text or not str(user_spoken_text).strip():
                print("[asr] empty transcription, listening again")
                continue

            messages = load_history()
            messages.append({
                "role": "user",
                "content": [{"type": "input_text", "text": user_spoken_text}]
            })
            print("[llm] streaming response...")
            reply_chunks = []
            for chunk in stream_text_chunks(messages):
                print("[chunk]", chunk)
                # Skipped (action-only) chunks still belong to the history.
                reply_chunks.append(chunk)
                synthesized = _synthesize_chunk(chunk)
                if synthesized is None:
                    continue
                public_out, expression, duration = synthesized
                playback.enqueue(public_out, expression, chunk, duration)

            final_text = " ".join(reply_chunks).strip()
            print("[llm final]", final_text)
            messages.append({
                "role": "assistant",
                "content": [{"type": "output_text", "text": final_text}]
            })
            save_history(messages)
            time.sleep(0.1)
        except KeyboardInterrupt:
            print("Interrupted by user, stopping.")
            playback.stop()
            break
        except Exception as e:
            print("Error in main loop:", e)
            time.sleep(1)
# Start the conversation loop only when this file is executed as a script.
if __name__ == "__main__":
    main_loop()