300 lines
9.7 KiB
Python
300 lines
9.7 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Streaming LLM -> chunked TTS -> queued playback script for Riko.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import time
|
||
|
|
import uuid
|
||
|
|
import json
|
||
|
|
import shutil
|
||
|
|
import yaml
|
||
|
|
from pathlib import Path
|
||
|
|
from openai import OpenAI
|
||
|
|
from contextlib import suppress
|
||
|
|
from queue import Queue
|
||
|
|
from threading import Event, Thread
|
||
|
|
from dotenv import load_dotenv
|
||
|
|
import soundfile as sf
|
||
|
|
|
||
|
|
# --- Load project-local modules ---
|
||
|
|
from process.asr_func.asr_auto_record import record_on_speech
|
||
|
|
from process.asr_func.asr_transcribe_groq import transcribe_audio_groq
|
||
|
|
from process.tts_func.elevenlabs_ping import elevenlabs_gen
|
||
|
|
from process.tts_func.sovits_ping import play_audio, get_wav_duration
|
||
|
|
from process.tts_func.tts_preprocess import clean_llm_output
|
||
|
|
from process.vrm_func.vrm_ping import vrm_talk, vrm_animate
|
||
|
|
from process.vrm_func.vrm_states_ping import set_vrm_state
|
||
|
|
|
||
|
|
# ---------------------------
|
||
|
|
# Setup & Keys
|
||
|
|
# ---------------------------
|
||
|
|
# Pull variables from a local .env file into the process environment before
# the API key is looked up.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise EnvironmentError('Please set OPENAI_API_KEY in your environment')

# Shared OpenAI client used by the streaming helper further down.
client = OpenAI(api_key=OPENAI_API_KEY)

# The character configuration is looked up relative to the working directory.
CONFIG_PATH = os.path.expanduser('character_config.yaml')
if not os.path.exists(CONFIG_PATH):
    raise FileNotFoundError(f"Config not found at {CONFIG_PATH}")

# Read the YAML explicitly as UTF-8: the platform default encoding is not
# UTF-8 everywhere and would garble a non-ASCII system prompt.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    char_config = yaml.safe_load(f)

# 'history_file' is required; 'model' falls back to 'gpt-4.1-mini'.
HISTORY_FILE = char_config['history_file']
MODEL = char_config.get('model', 'gpt-4.1-mini')

# Initial conversation: a single system message built from the default
# preset's system prompt.
SYSTEM_PROMPT = [
    {
        "role": "system",
        "content": [
            {"type": "input_text", "text": char_config['presets']['default']['system_prompt']}
        ]
    }
]
|
||
|
|
|
||
|
|
# ---------------------------
|
||
|
|
# Helper functions
|
||
|
|
# ---------------------------
|
||
|
|
def get_vrm_expression(text: str) -> str:
    """Pick an avatar expression from action cues found in *text*.

    The text is lower-cased and searched for cue substrings such as
    ``*giggle`` or ``*sigh``. Cue groups are tried in a fixed order and the
    first group with a hit decides the result; ``"relaxed"`` is returned when
    no cue is present.
    """
    # Order matters: earlier entries win when several cues are present.
    cue_table = (
        ("happy", ('*giggle', '*laugh', '*smile', '*smirk', '*cheer')),
        ("angry", ('*angry', '*glare', '*mad', '*frustrat')),
        ("sad", ('*sad', '*cry', '*sigh', '*sorrow')),
        ("surprised", ('*gasp', '*shock', '*surpris', '*wide')),
        ("neutral", ('*neutral', '*stare')),
    )
    lowered = text.lower()
    for expression, cues in cue_table:
        for cue in cues:
            if cue in lowered:
                return expression
    return "relaxed"
|
||
|
|
|
||
|
|
def load_history():
    """Return the saved conversation, or a fresh one seeded with the system prompt.

    Reads ``HISTORY_FILE`` as JSON. A shallow copy of ``SYSTEM_PROMPT`` is
    returned instead when the file does not exist, cannot be decoded, or does
    not contain a JSON list, so an empty or damaged history file does not make
    every later turn fail.

    Returns:
        list: The message list to send to the model.
    """
    # EAFP: open directly instead of checking for existence first.
    try:
        with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
            history = json.load(f)
    except FileNotFoundError:
        return SYSTEM_PROMPT.copy()
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"[history] could not read {HISTORY_FILE}, starting fresh: {exc}")
        return SYSTEM_PROMPT.copy()

    if not isinstance(history, list):
        # Callers append messages to the result, so anything but a list is unusable.
        print(f"[history] unexpected content in {HISTORY_FILE}, starting fresh")
        return SYSTEM_PROMPT.copy()
    return history
|
||
|
|
|
||
|
|
def save_history(history):
    """Write *history* to ``HISTORY_FILE`` as indented JSON.

    The data is first written to a sibling ``.tmp`` file and then moved over
    the real file, so an interruption or a serialisation error part-way
    through does not leave a truncated history behind.

    Args:
        history: JSON-serialisable message list.
    """
    tmp_path = f"{HISTORY_FILE}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(history, f, indent=2)
    # Swap the finished file into place in a single step.
    os.replace(tmp_path, HISTORY_FILE)
|
||
|
|
|
||
|
|
def ensure_dirs():
    """Create the ``client/audio`` and ``audio`` directories if they are missing."""
    for folder in ('client/audio', 'audio'):
        Path(folder).mkdir(parents=True, exist_ok=True)
|
||
|
|
|
||
|
|
def copy_to_public(client_path: Path, public_path: Path):
    """Copy the file at *client_path* to *public_path*, keeping its metadata.

    Args:
        client_path: Existing source file.
        public_path: Destination path handed to ``shutil.copy2``.
    """
    source, destination = client_path, public_path
    shutil.copy2(source, destination)
|
||
|
|
|
||
|
|
def fallback_get_wav_duration(p: Path):
    """Return the length of the WAV file *p* in seconds, or 3.0 on any failure.

    The duration is the frame count divided by the frame rate, both read from
    the WAV header with the standard-library ``wave`` module. Any exception
    (missing file, invalid header, zero frame rate, ...) yields 3.0.
    """
    try:
        import wave

        with wave.open(str(p), 'rb') as wav_file:
            frame_count = wav_file.getnframes()
            sample_rate = wav_file.getframerate()
        return frame_count / float(sample_rate)
    except Exception:
        return 3.0
|
||
|
|
|
||
|
|
# ---------------------------
|
||
|
|
# Streaming helper
|
||
|
|
# ---------------------------
|
||
|
|
# Text endings treated as a sentence boundary when splitting streamed text.
_SENTENCE_ENDINGS = ('. ', '? ', '! ', '\n', '." ', '?" ')


def _find_cut(buffer, min_len, max_len):
    """Return the index at which *buffer* should be split, or None to keep buffering."""
    if not buffer:
        return None

    # Earliest sentence boundary that leaves a chunk of at least min_len characters.
    cut = None
    for ending in _SENTENCE_ENDINGS:
        pos = buffer.find(ending, max(min_len - len(ending), 0))
        if pos != -1:
            end = pos + len(ending)
            if cut is None or end < cut:
                cut = end
    if cut is not None:
        return cut

    if len(buffer) < max_len:
        return None

    # No sentence boundary available: prefer breaking after the last space so
    # a word is not split in two; otherwise emit the whole buffer.
    last_space = buffer.rfind(' ', max(min_len - 1, 0))
    return last_space + 1 if last_space != -1 else len(buffer)


def stream_text_chunks(messages, min_len=60, max_len=350):
    """Stream a model response and yield it as text chunks.

    Text deltas are accumulated in a buffer. A chunk is emitted at the first
    sentence boundary that makes it at least *min_len* characters long. A
    boundary is detected anywhere in the buffer, not only at its very end,
    because a single delta may carry text past the end of a sentence. If the
    buffer reaches *max_len* without a boundary, it is split after the last
    space (or emitted whole when there is none).

    Args:
        messages: Conversation passed as ``input`` to the Responses API.
        min_len: Minimum length of a chunk that ends at a sentence boundary.
        max_len: Buffer length at which a chunk is forced out.

    Yields:
        str: Non-empty, whitespace-stripped pieces of the response, in order.
    """
    buffer = ""

    with client.responses.stream(
        model=MODEL,
        input=messages,
        temperature=1,
        top_p=1,
        max_output_tokens=2048,
    ) as stream:

        for event in stream:
            if event.type == "response.output_text.delta":
                buffer += event.delta
                while True:
                    cut = _find_cut(buffer, min_len, max_len)
                    if cut is None:
                        break
                    chunk, buffer = buffer[:cut].strip(), buffer[cut:]
                    if chunk:
                        yield chunk
            elif event.type == "response.output_text.done":
                # Flush the remainder and clear it so it cannot be emitted twice.
                chunk, buffer = buffer.strip(), ""
                if chunk:
                    yield chunk

        # Emit anything still buffered if the stream ended without a "done" event.
        leftover = buffer.strip()
        if leftover:
            yield leftover

        # The returned response object is not used here.
        # NOTE(review): presumably kept to make sure the response completed -
        # confirm whether this call is still needed.
        stream.get_final_response()
|
||
|
|
|
||
|
|
# ---------------------------
|
||
|
|
# Playback worker (Queue)
|
||
|
|
# ---------------------------
|
||
|
|
class PlaybackWorker:
    """Background thread that hands queued audio clips to the avatar one at a time.

    Items are processed in FIFO order. ``queue_finished_event`` is set while
    nothing is pending and cleared as soon as an item is enqueued, so callers
    can block on :meth:`wait_until_finished`.
    """

    def __init__(self, pause: float = 0.4):
        """Create the worker without starting its thread.

        Args:
            pause: Seconds to sleep after each clip has been handed over.
        """
        # Local import: the file-level threading import only provides Event and Thread.
        from threading import Lock

        self.q = Queue()
        self.thread = Thread(target=self._run, daemon=True)
        self._running = False
        self.queue_finished_event = Event()
        # Nothing is pending yet, so the worker starts out "finished".
        self.queue_finished_event.set()
        self._talking = False
        self._pause = pause
        # Makes "clear + put" and "check empty + set" mutually exclusive.
        self._state_lock = Lock()

    def start(self):
        """Start the worker thread; does nothing if it is already running."""
        if not self._running:
            self._running = True
            self.thread.start()

    def enqueue(self, public_audio_path: Path, expression: str, assistant_text: str, duration: float):
        """Queue one clip for playback and mark the queue as busy.

        Args:
            public_audio_path: Path of the audio file passed on to ``vrm_talk``.
            expression: Expression name passed on to ``vrm_talk``.
            assistant_text: Text belonging to the clip.
            duration: Clip length in seconds; truncated to ``int`` for ``vrm_talk``.
        """
        # Without the lock the worker could see an empty queue, then this
        # method clears the event and queues the item, and the worker would
        # still set the event - signalling "finished" with a clip pending.
        with self._state_lock:
            self.queue_finished_event.clear()
            self.q.put((public_audio_path, expression, assistant_text, duration))

    def wait_until_finished(self, timeout=None):
        """Block until nothing is pending; return False if *timeout* expired first."""
        return self.queue_finished_event.wait(timeout)

    def _run(self):
        """Worker loop: process items until the ``None`` sentinel arrives."""
        while True:
            item = self.q.get()
            if item is None:
                # Shutting down: release anyone still waiting on the event.
                with self._state_lock:
                    self.queue_finished_event.set()
                break
            public_audio_path, expression, assistant_text, duration = item

            # Switch to the talking animation once per burst of clips. This is
            # best-effort: a failure is reported and retried with the next clip.
            if not self._talking:
                try:
                    talking_anim = Path("animations/mixamo") / "Talking.fbx"
                    vrm_animate("start_mixamo", str(talking_anim))
                    set_vrm_state("talking")
                    self._talking = True
                except Exception as e:
                    print("Talking animation error:", e)

            try:
                # Drive the 3D model (the duration is passed as an int).
                vrm_talk(str(public_audio_path), expression, assistant_text, int(duration))
                # NOTE(review): earlier comments here referred to a play_audio
                # call that is not present. Whether vrm_talk blocks until the
                # clip has finished playing is not visible from this file.
            except Exception as e:
                print("Playback error:", e)

            # Short breathing pause between clips.
            time.sleep(self._pause)

            with self._state_lock:
                if self.q.empty():
                    self.queue_finished_event.set()
                    self._talking = False

    def stop(self):
        """Stop the worker after the already queued items have been processed.

        Does nothing if the worker was never started.
        """
        if not self._running:
            return
        self.q.put(None)
        self.thread.join()
        self._running = False
        # A Thread object can only be started once; prepare a fresh one so
        # start() works again.
        self.thread = Thread(target=self._run, daemon=True)
|
||
|
|
|
||
|
|
# ---------------------------
|
||
|
|
# Main orchestration
|
||
|
|
# ---------------------------
|
||
|
|
def _try_animate(clip_name, state):
    """Best-effort switch of the avatar to a Mixamo clip and the matching state."""
    try:
        vrm_animate("start_mixamo", str(Path("animations/mixamo") / clip_name))
        set_vrm_state(state)
    except Exception as e:
        print(f"[vrm] could not switch to {state}:", e)


def _speak_chunk(chunk, playback):
    """Synthesise one response chunk to a WAV file and queue it for playback."""
    expression = get_vrm_expression(chunk)
    tts_read_text = clean_llm_output(chunk)

    # Skip chunks that contain nothing to speak after cleaning (silent actions only).
    if not tts_read_text.strip():
        print(f"[Skip] Chunk enthielt nur stumme Aktionen: {chunk}")
        return

    uid = uuid.uuid4().hex
    filename = f"output_{uid}.wav"
    client_out = Path('client') / 'audio' / filename
    public_out = Path('audio') / filename
    client_out.parent.mkdir(parents=True, exist_ok=True)

    try:
        elevenlabs_gen(tts_read_text, output_wav_pth=str(client_out))
    except TypeError:
        # Retry positionally if the keyword form is rejected - presumably for
        # an elevenlabs_gen variant with a different parameter name (TODO confirm).
        elevenlabs_gen(tts_read_text, str(client_out))

    copy_to_public(client_out, public_out)

    try:
        duration = get_wav_duration(public_out)
    except Exception:
        duration = fallback_get_wav_duration(public_out)

    playback.enqueue(public_out, expression, chunk, duration)


def main_loop():
    """Run the voice assistant until interrupted with Ctrl+C.

    Each turn waits for queued playback to finish, records the user, gets a
    transcription, streams the model's reply chunk by chunk into TTS and the
    playback queue, and finally saves the updated history. Errors inside a
    turn are printed and the loop continues after a one-second pause.
    """
    ensure_dirs()
    playback = PlaybackWorker()
    playback.start()

    while True:
        try:
            print("\n⏳ Waiting for playback queue to finish...")
            playback.wait_until_finished()
            print("✅ Queue finished, ready for input")

            # Best-effort like the thinking and talking animations: a failing
            # avatar call must not stop the assistant from listening.
            _try_animate("Idle.fbx", "idle")

            conversation_recording = Path("audio") / "conversation.wav"
            conversation_recording.parent.mkdir(parents=True, exist_ok=True)
            conversation_recording = str(conversation_recording)

            record_on_speech(
                output_file=conversation_recording,
                samplerate=44100,
                channels=1,
                silence_threshold=0.02,
                silence_duration=2,
                device=None
            )

            _try_animate("Thinking.fbx", "thinking")

            user_spoken_text = transcribe_audio_groq(aud_path=conversation_recording)
            # Nothing recognised: do not send an empty user message to the model.
            if not user_spoken_text or not str(user_spoken_text).strip():
                print("[asr] empty transcription, listening again")
                continue

            messages = load_history()
            messages.append({
                "role": "user",
                "content": [{"type": "input_text", "text": user_spoken_text}]
            })

            print("[llm] streaming response...")
            spoken_chunks = []

            for chunk in stream_text_chunks(messages):
                print("[chunk]", chunk)
                spoken_chunks.append(chunk)
                _speak_chunk(chunk, playback)

            final_text = " ".join(spoken_chunks).strip()
            print("[llm final]", final_text)

            messages.append({
                "role": "assistant",
                "content": [{"type": "output_text", "text": final_text}]
            })
            save_history(messages)
            time.sleep(0.1)

        except KeyboardInterrupt:
            print("Interrupted by user, stopping.")
            playback.stop()
            break
        except Exception as e:
            print("Error in main loop:", e)
            time.sleep(1)
|
||
|
|
|
||
|
|
# Run the assistant only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main_loop()
|