Ai_Assistant/server/_archive/main_chat.py

416 lines
14 KiB
Python
Raw Normal View History

2026-05-24 13:31:30 +02:00
from faster_whisper import WhisperModel
from process.asr_func.asr_auto_record import record_on_speech, transcribe_audio
from process.asr_func.asr_transcribe_groq import record_on_speech, transcribe_audio_groq
from process.llm_funcs.llm_scr import llm_response, llm_response_with_memory
from process.tts_func.sovits_ping import sovits_gen, play_audio, get_wav_duration
from process.tts_func.tts_preprocess import clean_llm_output
from process.vrm_func.vrm_ping import vrm_talk, vrm_animate
from process.vrm_func.vrm_states_ping import set_vrm_state
from pathlib import Path
import os
import time
### transcribe audio
import uuid
import soundfile as sf
import asyncio
import threading
import random
import os
import time
import uuid
import json
import shutil
import yaml
from pathlib import Path
from openai import OpenAI
from contextlib import suppress
from queue import Queue
from threading import Event, Thread
from dotenv import load_dotenv
from pathlib import Path
# Run python-dotenv's loader before the environment lookups below
# (presumably picks up a local .env file - confirm against the deployment setup).
load_dotenv()
# API key as found in the environment (None when the variable is unset).
# NOTE(review): this lowercase name is not referenced again in the visible part
# of the file; the OpenAI client is built from OPENAI_API_KEY further down.
openai_api_key = os.getenv("OPENAI_API_KEY")
def get_wav_duration(path):
    """Return the playing time of the audio file at ``path`` in seconds.

    The file is opened with ``sf.SoundFile`` and the object's length is
    divided by its sample rate.  Defining this function here rebinds the
    ``get_wav_duration`` name that was imported from
    ``process.tts_func.sovits_ping`` at the top of the module.
    """
    with sf.SoundFile(path) as audio:
        total_frames = len(audio)
        sample_rate = audio.samplerate
    return total_frames / sample_rate
#!/usr/bin/env python3
"""
Streaming LLM -> chunked TTS -> queued playback script for Riko.
What it does:
- Records user speech (uses your record_on_speech)
- Transcribes the recording (transcribe_audio_groq)
- Streams LLM text (OpenAI Responses streaming)
- For each chunk: generate TTS via sovits_gen, copy to public audio dir, enqueue for playback
- Playback loop calls vrm_talk and vrm_animate and waits for the audio's duration to avoid overlap
- At the end of the stream, the full assistant text is appended to the JSON history file
Fill in or import the helper functions you already have in your project:
- record_on_speech(output_file, samplerate, channels, silence_threshold, silence_duration, device)
- transcribe_audio_groq(aud_path)
- clean_asr_output(text)
- sovits_gen(in_text, emotion, output_wav_pth) -> returns path to generated wav (must write to output_wav_pth)
- vrm_talk(public_audio_path, expression, llm_output, duration)
- vrm_animate(cmd, path)
- clean_llm_output(text)
- get_emotion(text, emotion_model, tokenizer)
- map_emotion_to_expression(emotion)
- get_wav_duration(path)
Make sure character_config.yaml contains the history_file key and presets.default.system_prompt; model is optional and defaults to gpt-4.1-mini (see your example config).
"""
import os
import time
import uuid
import json
import shutil
import yaml
from pathlib import Path
from openai import OpenAI
from contextlib import suppress
from queue import Queue
from threading import Thread
# ---------------------------
# Load config + OpenAI client
# ---------------------------
# Locate the character configuration and fail early when it is missing.
CONFIG_PATH = os.path.expanduser('character_config.yaml')
if not os.path.exists(CONFIG_PATH):
    raise FileNotFoundError(f"Config not found at {CONFIG_PATH}")

with open(CONFIG_PATH) as f:
    char_config = yaml.safe_load(f)

# Where the conversation history is persisted, and which model to query.
HISTORY_FILE = char_config['history_file']
MODEL = char_config.get('model', 'gpt-4.1-mini')  # e.g. set to 'gpt-4.1' to override

# Generic prompt kept around for testing; swap it into the entry below
# in place of the configured preset when needed.
CASE_SYSTEM_PROMPT = "You are a helpful assistant."  # override for testing.

# Opening message of a new conversation, taken from the default preset.
_default_prompt_part = {
    "type": "input_text",
    "text": char_config['presets']['default']['system_prompt'],
}
SYSTEM_PROMPT = [{"role": "system", "content": [_default_prompt_part]}]

# The client cannot be built without a key, so stop at import time.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise EnvironmentError('Please set OPENAI_API_KEY in your environment')
client = OpenAI(api_key=OPENAI_API_KEY)
# ---------------------------
# History utilities
# ---------------------------
def load_history():
    """Return the stored conversation history, or a fresh system prompt.

    Returns:
        The JSON content of ``HISTORY_FILE`` when that file exists; otherwise
        an independent deep copy of ``SYSTEM_PROMPT``, so callers can mutate
        the returned messages without altering the module-level template.

    Raises:
        json.JSONDecodeError: if the history file exists but is not valid JSON.
        OSError: if the file exists but cannot be read.
    """
    import copy  # local import keeps the file's import block untouched

    # EAFP: open directly instead of checking for existence first, so the file
    # cannot disappear between the check and the read.
    try:
        with open(HISTORY_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # A shallow list copy would still share the nested message dicts.
        return copy.deepcopy(SYSTEM_PROMPT)
def save_history(history):
    """Write ``history`` to ``HISTORY_FILE`` as indented JSON.

    The data is first written to a sibling ``<HISTORY_FILE>.tmp`` file and
    then moved over the real file with ``os.replace``.  Opening the history
    file directly in ``'w'`` mode truncates it before anything is written, so
    a serialization error or crash mid-dump would destroy the stored
    conversation; with the temporary file the previous history stays intact.

    Args:
        history: JSON-serializable list of message dicts.

    Raises:
        TypeError / ValueError: if ``history`` cannot be serialized.
        OSError: if the temporary file cannot be written or moved.
    """
    tmp_path = f"{HISTORY_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(history, f, indent=2)
        os.replace(tmp_path, HISTORY_FILE)
    except BaseException:
        # Do not leave a half-written temporary file behind; the original
        # error is what the caller needs to see, so cleanup failures are ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
# ---------------------------
# Streaming helper
# ---------------------------
def stream_text_chunks(messages, min_len=30, max_len=120):
    """Yield speakable text chunks while the model response streams in.

    Text deltas from ``client.responses.stream`` are collected in a buffer
    and released as a chunk when either

    * the buffer ends with sentence punctuation and holds at least
      ``min_len`` characters, or
    * the buffer reaches ``max_len`` characters, in which case it is cut at
      the last space so a word is not split across two chunks.

    Whatever is left is flushed on ``response.output_text.done`` and once
    more when the stream ends.  Chunks are stripped of surrounding
    whitespace and empty chunks are never yielded.

    Args:
        messages: conversation history passed as ``input`` to the model.
        min_len: minimum buffer length before a sentence end releases a chunk.
        max_len: buffer length at which a chunk is released regardless of
            punctuation.

    Yields:
        Non-empty ``str`` chunks in the order they were produced.
    """
    # The previous tuple contained an empty string, and every string "ends
    # with" '', so the punctuation rule fired on any buffer of min_len
    # characters.  Only real sentence terminators are listed here
    # (ASCII plus the common ellipsis and full-width forms).
    sentence_endings = ('.', '?', '!', '…', '。', '？', '！')
    buffer = ""
    with client.responses.stream(
        model=MODEL,
        input=messages,
        temperature=1,
        top_p=1,
        max_output_tokens=2048,
    ) as stream:
        for event in stream:
            # event.type values: "response.output_text.delta", "response.output_text.done", ...
            if event.type == "response.output_text.delta":
                buffer += event.delta
                if len(buffer) >= min_len and buffer.rstrip().endswith(sentence_endings):
                    # Rule 1: sentence finished and long enough.
                    ready, buffer = buffer, ""
                elif len(buffer) >= max_len:
                    # Rule 2: too long; cut at a word boundary when there is one.
                    cut = len(buffer) if buffer[-1:].isspace() else buffer.rfind(" ")
                    if cut <= 0:
                        cut = len(buffer)
                    ready, buffer = buffer[:cut], buffer[cut:]
                else:
                    continue
            elif event.type == "response.output_text.done":
                # Flush and clear, so a later "done" event cannot repeat the text.
                ready, buffer = buffer, ""
            else:
                continue
            ready = ready.strip()
            if ready:
                yield ready
    # The stream may end without a "done" event; do not lose the tail.
    # (The final response object used to be fetched here but was never used.)
    leftover = buffer.strip()
    if leftover:
        yield leftover
# ---------------------------
# Playback worker (single-threaded sequential playback)
# ---------------------------
class PlaybackWorker:
    """Play queued TTS chunks strictly one after another on a background thread.

    Each queued item is announced to the avatar through ``vrm_talk``; the
    worker then sleeps for the chunk's duration so consecutive chunks do not
    overlap.  ``queue_finished_event`` is set whenever nothing is queued or
    playing and cleared as soon as a chunk is enqueued.
    """

    def __init__(self):
        # queue items are tuples:
        # (public_audio_path (Path), expression (str), assistant_text (str), duration (float))
        self.q = Queue()
        self.thread = Thread(target=self._run, daemon=True)
        self._running = False
        # Set while the queue is drained; starts set because nothing is queued yet.
        self.queue_finished_event = Event()
        self.queue_finished_event.set()
        # Whether the avatar is currently in the "talking" animation state.
        self._talking = False
        # Makes "clear event + put" (producer) and "queue empty? -> set event"
        # (worker) mutually exclusive.  Without it the worker could see an
        # empty queue, the producer could clear the event and enqueue, and the
        # worker could then set the event although a chunk is still waiting.
        self._state_lock = threading.Lock()

    def start(self):
        """Start the worker thread; calling it again while running is a no-op."""
        if self._running:
            return
        if self.thread.ident is not None:
            # A Thread object can only be started once; build a new one after stop().
            self.thread = Thread(target=self._run, daemon=True)
        self._running = True
        self.thread.start()

    def enqueue(self, public_audio_path: Path, expression: str, assistant_text: str, duration: float):
        """Queue one audio chunk for playback and mark the queue as busy."""
        with self._state_lock:
            self.queue_finished_event.clear()
            self.q.put((public_audio_path, expression, assistant_text, duration))

    def wait_until_finished(self, timeout=None):
        """
        Wait until the playback queue is completely empty and processed.
        Args:
            timeout: Maximum time to wait in seconds (None = wait forever)
        Returns:
            True if queue finished, False if timeout occurred
        """
        return self.queue_finished_event.wait(timeout)

    @staticmethod
    def _safe_duration(duration):
        """Return ``duration`` as a sleepable number of seconds, or 0.2 if unusable."""
        try:
            seconds = float(duration)
        except (TypeError, ValueError):
            return 0.2
        # Negative, NaN and infinite values cannot be passed to time.sleep().
        if 0 <= seconds < float("inf"):
            return seconds
        return 0.2

    def _run(self):
        """Worker loop: play items until the ``None`` stop sentinel arrives."""
        while True:
            item = self.q.get()
            if item is None:
                break
            public_audio_path, expression, assistant_text, duration = item
            # Start the talking animation once when the first chunk of a sequence begins.
            # Subsequent chunks won't retrigger the animation (avoids jump/cut).
            try:
                if not self._talking:
                    talking_anim = Path("animations/mixamo") / "Talking.fbx"
                    vrm_animate("start_mixamo", str(talking_anim))
                    set_vrm_state("talking")
                    self._talking = True
            except Exception as e:
                print("vrm_animate (start talking) failed:", e)
            # Call vrm_talk for every chunk so the client receives the audio cue + metadata
            try:
                vrm_talk(str(public_audio_path), expression, assistant_text, int(duration))
            except Exception as e:
                print("vrm_talk failed:", e)
            # wait for the audio's duration so we don't overlap
            time.sleep(self._safe_duration(duration))
            # If nothing else is queued, clear the talking flag and signal
            # waiters.  The idle animation itself is not triggered here.
            with self._state_lock:
                if self.q.empty():
                    self._talking = False
                    self.queue_finished_event.set()
        # Stop sentinel received: release anyone blocked in wait_until_finished().
        with self._state_lock:
            self._talking = False
            self.queue_finished_event.set()

    def stop(self):
        """Let already queued items finish, then stop and join the worker.

        Does nothing when the worker was never started (joining an unstarted
        thread raises ``RuntimeError``).
        """
        if not self._running:
            return
        self.q.put(None)
        self.thread.join()
        self._running = False
# ---------------------------
# Utilities
# ---------------------------
def ensure_dirs():
    """Create the ``client/audio`` and ``audio`` directories when missing.

    Missing parent directories are created as well; directories that already
    exist are left alone.
    """
    for directory in (Path('client/audio'), Path('audio')):
        directory.mkdir(parents=True, exist_ok=True)
def copy_to_public(client_path: Path, public_path: Path):
    """Copy the audio file at ``client_path`` to ``public_path``.

    ``public_path`` is the location that is later handed to ``vrm_talk``.
    The copy is made with ``shutil.copy2``.
    """
    shutil.copy2(src=client_path, dst=public_path)
# Fallback duration reader (if you don't have one)
def fallback_get_wav_duration(p: Path, default: float = 3.0):
    """Return the duration of a WAV file in seconds using only the stdlib.

    Args:
        p: path of the WAV file (``Path`` or ``str``).
        default: value returned when the file cannot be read as WAV; it
            defaults to 3.0, the value that used to be hard-coded here.

    Returns:
        ``frames / frame rate`` as a float, or ``default`` if reading the
        file fails for any reason (missing file, bad header, zero frame rate).
    """
    import wave

    try:
        # Wave_read objects are context managers, so no closing() wrapper is needed.
        with wave.open(str(p), 'rb') as wf:
            return wf.getnframes() / float(wf.getframerate())
    except Exception as e:
        # Best effort by design: playback must go on even without a readable
        # file, but say so instead of failing silently.
        print(f"fallback_get_wav_duration: could not read {p} ({e}); assuming {default}s")
        return default
# ---------------------------
# Main orchestration
# ---------------------------
def _speak_chunk(playback, chunk, expression):
    """Synthesize one streamed text chunk and queue it for playback.

    Returns True when audio was queued and False when the cleaned chunk
    contained nothing to speak (nothing is sent to the TTS in that case).
    """
    tts_read_text = clean_llm_output(chunk)
    if not tts_read_text or not str(tts_read_text).strip():
        return False

    # Unique file name per chunk so queued audio is never overwritten.
    filename = f"output_{uuid.uuid4().hex}.wav"
    client_out = Path('client') / 'audio' / filename
    public_out = Path('audio') / filename
    client_out.parent.mkdir(parents=True, exist_ok=True)

    # generate TTS (blocking). Expected to write client_out
    try:
        sovits_gen(tts_read_text, output_wav_pth=str(client_out))
    except TypeError:
        # Fallback for a sovits_gen that rejects the output_wav_pth keyword:
        # pass the output path positionally instead.
        sovits_gen(tts_read_text, str(client_out))

    # copy to public path expected by VRM bridge
    copy_to_public(client_out, public_out)

    try:
        duration = get_wav_duration(public_out)
    except Exception:
        duration = fallback_get_wav_duration(public_out)

    # enqueue for sequential playback
    playback.enqueue(public_out, expression, chunk, duration)
    return True


def main_loop():
    """Run the voice conversation loop until interrupted with Ctrl+C.

    Each turn: wait for queued audio to finish, switch the avatar to idle,
    record the user, transcribe the recording, stream the model's answer and
    speak it chunk by chunk, then append the full answer to the history file.
    Errors inside a turn are printed and the loop continues after one second.
    """
    ensure_dirs()
    playback = PlaybackWorker()
    playback.start()
    # Load any models or tokenizers you have for emotion detection here
    # whisper_model, emotion_model, tokenizer = load_your_models()

    # Temporary: a fixed expression until an emotion model is plugged in, e.g.
    # expression = map_emotion_to_expression(get_emotion(chunk, model, tokenizer))
    expression = "relaxed"

    while True:
        try:
            print("\n⏳ Waiting for playback queue to finish...")
            playback.wait_until_finished()
            print("✅ Queue finished, ready for input")

            # 1) Idle animation + state.  Failures here are deliberately not
            # caught locally; they reach the handler at the bottom of the loop.
            idle_anim = Path("animations/mixamo") / "Idle.fbx"
            vrm_animate("start_mixamo", str(idle_anim))
            set_vrm_state("idle")

            # cont = input(">>> ENTER TO CONTINUE") # UNCOMMENT IF YOU WANT PUSH TO TALK

            # 2) Record the user
            conversation_recording = Path("audio") / "conversation.wav"
            conversation_recording.parent.mkdir(parents=True, exist_ok=True)
            conversation_recording = str(conversation_recording)
            record_on_speech(
                output_file=conversation_recording,
                samplerate=44100,
                channels=1,
                silence_threshold=0.02,  # Adjust based on your microphone sensitivity
                silence_duration=2,  # Stop after 2 seconds of silence
                device=None  # Use default device, or specify by ID or name
            )
            # TODO: a "listening" state while recording is not working yet.
            # set_vrm_state("listening")

            # 3) Thinking animation (best effort, but report failures)
            try:
                thinking_anim = Path("animations/mixamo") / "Thinking.fbx"
                vrm_animate("start_mixamo", str(thinking_anim))
                set_vrm_state("thinking")
            except Exception as e:
                print("vrm_animate (thinking) failed:", e)

            # 4) Transcribe
            user_spoken_text = transcribe_audio_groq(aud_path=conversation_recording)
            if not user_spoken_text or not str(user_spoken_text).strip():
                # Nothing intelligible was recorded: do not query the model
                # or store an empty user message in the history.
                print("[asr] empty transcription, skipping turn")
                continue

            # 5) Build messages history
            messages = load_history()
            messages.append({
                "role": "user",
                "content": [
                    {"type": "input_text", "text": user_spoken_text}
                ]
            })

            # 6) Stream model and generate TTS per chunk
            print("[llm] streaming response...")
            spoken_chunks = []
            for chunk in stream_text_chunks(messages):
                print("[chunk]", chunk)
                spoken_chunks.append(chunk)
                _speak_chunk(playback, chunk, expression)

            # 7) After streaming ends, append the full assistant message to history and save
            final_text = " ".join(spoken_chunks).strip()
            print("[llm final]", final_text)
            messages.append({
                "role": "assistant",
                "content": [
                    {"type": "output_text", "text": final_text}
                ]
            })
            save_history(messages)

            # Queued audio keeps playing in the worker; the next iteration
            # blocks in wait_until_finished() before recording again.
            time.sleep(0.1)
        except KeyboardInterrupt:
            print("Interrupted by user, stopping.")
            playback.stop()
            break
        except Exception as e:
            print("Error in main loop:", e)
            time.sleep(1)


if __name__ == '__main__':
    main_loop()