"""Chat orchestration: speech/text in -> LLM -> TTS audio -> VRM avatar animation."""

import asyncio
import os
import random
import threading
import time

### transcribe audio
import uuid
from pathlib import Path

import soundfile as sf
from faster_whisper import WhisperModel

from process.asr_func.asr_auto_record import record_on_speech, transcribe_audio
from process.asr_func.asr_push_to_talk import record_and_transcribe
from process.asr_func.asr_text_clean import clean_asr_output
from process.asr_func.asr_transcribe_groq import transcribe_audio_groq
from process.extensions.emotion_class_func.emotion_classifier import (
    get_emotion,
    load_emotion_model,
    map_emotion_to_expression,
)
from process.faiss_func.faiss_memory import MemoryManager
from process.llm_funcs.llm_scr import llm_response, llm_response_with_memory
from process.mcp_funcs.mcp_llm_func import llm_response_mcp
from process.tts_func.sovits_ping import (
    get_wav_duration,
    play_audio,
    sovits_gen,
    sovits_gen_emotion,
)
from process.tts_func.tts_preprocess import clean_llm_output
from process.vision_func.gemini_vision import describe_image
from process.vrm_func.vrm_ping import vrm_animate, vrm_talk
from streaming_chat import animate_and_respond_streaming

# Directories (relative to the avatar client) that hold the animation assets.
_MIXAMO_DIR = Path("animations/mixamo")
_VRMA_DIR = Path("animations/vrma_xr")

# Values accepted by the ``mode`` argument of ``animate_and_respond``.
_SUPPORTED_MODES = ("mcp", "Memory", "default")

# Substrings of a click ``region`` that count as the avatar's right / left side.
_RIGHT_SIDE_PARTS = (
    "right_hand",
    "right_arm",
    "right_shoulder",
    "right_thigh",
    "right_shin",
    "right_foot",
)
_LEFT_SIDE_PARTS = (
    "left_hand",
    "left_arm",
    "left_shoulder",
    "left_thigh",
    "left_shin",
    "left_foot",
)


# NOTE: this local definition shadows the ``get_wav_duration`` imported from
# ``process.tts_func.sovits_ping`` above; every call in this module uses this one.
def get_wav_duration(path) -> float:
    """Return the duration of the audio file at *path* in seconds.

    The duration is the frame count divided by the sample rate, as reported
    by ``soundfile.SoundFile``.
    """
    with sf.SoundFile(path) as f:
        return len(f) / f.samplerate


print(" \n ========= Starting Chat... ================ \n")

# The Whisper and emotion models are NOT loaded at import time; they are only
# loaded by the script entry point at the bottom of this file.
# memory = MemoryManager()


def _play_mixamo(name: str) -> None:
    """Start the Mixamo animation ``animations/mixamo/<name>.fbx`` on the avatar."""
    vrm_animate("start_mixamo", str(_MIXAMO_DIR / f"{name}.fbx"))


def _speak(tts_read_text, llm_output, expression, audio_dir: Path) -> float:
    """Synthesize *tts_read_text*, start avatar speech, and return the clip length.

    Args:
        tts_read_text: Cleaned text handed to the TTS engine.
        llm_output: Raw LLM text forwarded to ``vrm_talk`` alongside the audio.
        expression: Expression name passed to both TTS and ``vrm_talk``.
        audio_dir: Directory the generated wav is written to (created if missing).

    Returns:
        Duration of the generated wav file in seconds.
    """
    # A unique filename per utterance so clips never overwrite each other.
    filename = f"output_{uuid.uuid4().hex}.wav"
    output_wav_path = audio_dir / filename
    # Path handed to ``vrm_talk``; it is relative ("audio/<file>") rather than
    # the on-disk location under ``audio_dir``.
    public_audio_path = Path("audio") / filename
    output_wav_path.parent.mkdir(parents=True, exist_ok=True)

    ### generate audio and save it to the audio directory
    # gen_aud_path = sovits_gen(tts_read_text, output_wav_path)
    gen_aud_path = sovits_gen_emotion(tts_read_text, expression, output_wav_path)
    print("Saving TTS to: ", gen_aud_path)

    duration = get_wav_duration(output_wav_path)
    vrm_talk(str(public_audio_path), expression, llm_output, int(duration))
    return duration


def animate_and_respond(
    user_message: str, click_interact: bool = False, mode: str = "mcp"
) -> None:
    """Answer *user_message* with the LLM and drive the avatar while doing so.

    Args:
        user_message: Text the user said or typed.
        click_interact: When true, the Thinking and Talking animations are
            skipped.
        mode: One of
            ``"mcp"`` - delegate entirely to ``animate_and_respond_streaming``
            (the streaming pipeline handles TTS, playback and idle);
            ``"default"`` - respond with no tool calling and no memory;
            ``"Memory"`` - (TO BE IMPLEMENTED, DO NOT USE) respond with
            long-term memory; the context is currently a placeholder string.

    Raises:
        ValueError: If *mode* is not one of the supported values. Previously an
            unknown mode crashed later with ``UnboundLocalError`` after the
            thinking animation had already started.
    """
    if mode not in _SUPPORTED_MODES:
        raise ValueError(
            f"Unsupported mode {mode!r}; expected one of {_SUPPORTED_MODES}"
        )

    if not click_interact:
        ### play a thinking animation
        _play_mixamo("Thinking")

    # user_spoken_text = clean_asr_output(user_message)
    # ^ deprecated because of the UI; maybe re-enable for calling mode?
    user_spoken_text = user_message
    print("\n User : ", user_spoken_text)

    ### pass to LLM and get a LLM output.
    if mode == "mcp":
        # Delegate entirely to the streaming pipeline (handles TTS, playback, idle)
        animate_and_respond_streaming(user_message, click_interact)
        return

    if mode == "Memory":
        context = "GET CONTEXT FROM FAISS HERE"
        llm_output = llm_response_with_memory(user_spoken_text, context)
    else:  # mode == "default" (validated above)
        llm_output = llm_response(user_spoken_text)

    print("Riko : \n", llm_output)
    tts_read_text = clean_llm_output(llm_output)

    ### calculate emotion
    # emotion = get_emotion(llm_output, emotion_model, tokenizer)
    # expression = map_emotion_to_expression(emotion)
    # print("Expression : \n", expression)
    # Emotion classification is under development; use a fixed expression
    # until it is stable, then restore the lines above.
    expression = "relaxed"

    base_dir = Path(__file__).resolve().parent.parent  # ~/riko_project_v1
    duration = _speak(
        tts_read_text, llm_output, expression, base_dir / "client" / "audio"
    )

    ## talking animation
    if not click_interact:
        _play_mixamo("Talking")

    # NOTE(review): the wait and the idle reset below run for click
    # interactions as well - confirm this matches the intended behavior.
    print("waiting for audio to finish...")
    time.sleep(duration)

    _play_mixamo("Idle")


def transcribe_and_respond(audio_file_path) -> None:
    """Transcribe the audio file with Groq and respond to the transcript.

    Args:
        audio_file_path: Path (``str`` or path-like) of the recorded audio.
    """
    audio_file_path = str(audio_file_path)
    # user_message = transcribe_audio(whisper_model, aud_path=audio_file_path)
    user_message = transcribe_audio_groq(aud_path=audio_file_path)
    animate_and_respond(user_message)


async def transcribe_audio_wrapper(audio_file_path):
    """Return the Groq transcription of *audio_file_path*.

    Although declared ``async``, the transcription call itself is made
    synchronously (nothing is awaited), so it runs to completion before
    control returns to the event loop.
    """
    audio_file_path = str(audio_file_path)
    # user_message = transcribe_audio(whisper_model, aud_path=audio_file_path)
    return transcribe_audio_groq(aud_path=audio_file_path)


def describe_image_and_respond(image_file_path, additional_text=None) -> None:
    """Describe an image and respond to the description.

    Args:
        image_file_path: Path of the image passed to ``describe_image``.
        additional_text: Optional user text; when truthy it is placed in front
            of the image description in the message sent to the LLM.
    """
    image_message = describe_image(image_file_path)

    if additional_text:
        print(f"Processing image with additional context: {additional_text}")
        user_message = f"{additional_text} \n [Image description: \n {image_message}]"
    else:
        user_message = f"[Image description: \n {image_message}]"

    animate_and_respond(user_message)


# quick helper to schedule a return to idle
def schedule_idle(delay: float = 2.0) -> None:
    """Start a ``threading.Timer`` that plays the Idle animation after *delay* seconds."""
    threading.Timer(delay, _play_mixamo, args=("Idle",)).start()


def _pick_click_reaction(region: str, bone) -> tuple:
    """Return ``(vrma filename, crop_start, idle delay)`` for a click.

    The checks run in priority order; the first match wins.
    """
    if region in {"chest", "bust", "belly"}:  # Touch torso
        return "woah.vrma", 0.85, 3.0
    if bone in {"left_cat_ear", "right_cat_ear"}:  # Touch ears
        return "touch_ears.vrma", 0.95, 3.0
    if any(part in region for part in _RIGHT_SIDE_PARTS):  # Touch right side
        return "lookright.vrma", 0.72, 2.2
    if any(part in region for part in _LEFT_SIDE_PARTS):  # Touch left side
        return "lookleft.vrma", 0.82, 2.1
    if region in {"head", "neck", "hair"}:  # Touch head/hair
        return "headpat_cover.vrma", 0.72, 2.8
    # Default playful reaction
    return "stop_it.vrma", 0.82, 1.7


def handle_click_interaction(payload: dict) -> None:
    """React to a click on the avatar with a sound and a matching animation.

    Args:
        payload: Click event; the ``"region"`` and ``"bone"`` keys are read.
            A missing or ``None`` region is treated as an empty string, which
            selects the default reaction unless the bone matches.
    """
    # ``or ""`` also covers a key that is present but set to None, which
    # would otherwise fail on ``.lower()``.
    region = (payload.get("region") or "").lower()
    bone = payload.get("bone", "")

    print("🎯 Playing animation for", region, "bone:", bone)

    # --- 1. Immediate feedback: random talk sound ---
    feedback_sounds = [
        ("oh.wav", "oh?"),
        ("hey.wav", "hey"),
    ]
    sound_file, text = random.choice(feedback_sounds)
    soundpath = Path("public/sounds") / sound_file
    vrm_talk(str(soundpath), "relaxed", text, 1)

    # --- 2. Pick animation based on region/bone ---
    anim_file, crop_start, idle_delay = _pick_click_reaction(region, bone)
    vrm_animate(
        "start_vrma",
        str(_VRMA_DIR / anim_file),
        play_once=False,
        crop_start=crop_start,
        crop_end=0.0,
    )
    schedule_idle(idle_delay)


print(" \n ========= Finsihed Starting Chat... ================ \n")


def _run_voice_loop() -> None:
    """Run the standalone loop: record speech, transcribe, answer, speak, repeat.

    The loop never returns; stop it by interrupting the process.
    """
    # NOTE(review): these models were previously created by module-level code
    # that had been commented out, leaving this loop to fail with NameError.
    # They are restored here, with the same settings, so importing the module
    # stays cheap. The model path assumes the project layout
    # <project root>/server/models/ - confirm.
    whisper_model = WhisperModel("base.en", device="cuda", compute_type="float16")
    base_dir = Path(__file__).resolve().parent.parent  # ~/riko_project_v1
    model_path = base_dir / "server" / "models" / "model_quantized.onnx"
    emotion_model, tokenizer = load_emotion_model(str(model_path))

    conversation_recording = Path("audio") / "conversation.wav"

    while True:
        ## play an idle animation
        _play_mixamo("Idle")

        conversation_recording.parent.mkdir(parents=True, exist_ok=True)

        record_on_speech(
            output_file=conversation_recording,
            samplerate=44100,
            channels=1,
            silence_threshold=0.02,  # Adjust based on your microphone sensitivity
            silence_duration=1,  # Stop after 1 second of silence
            device=None,  # Use default device, or specify by ID or name
        )

        ### play a thinking animation
        _play_mixamo("Thinking")

        user_spoken_text = transcribe_audio(
            whisper_model, aud_path=conversation_recording
        )

        #### use push to talk.
        # user_spoken_text = record_and_transcribe(whisper_model, conversation_recording)

        user_spoken_text = clean_asr_output(user_spoken_text)
        print("\n User : ", user_spoken_text)

        # search for relevant memories
        # context = memory.get_context_block(user_spoken_text, top_k= 2)
        # print("Memories : \n", context)

        ### pass to LLM and get a LLM output.
        llm_output = llm_response(user_spoken_text)
        # llm_output = llm_response_with_memory(user_spoken_text,context)
        print("Riko : \n", llm_output)

        tts_read_text = clean_llm_output(llm_output)

        ### calculate emotion
        emotion = get_emotion(llm_output, emotion_model, tokenizer)
        expression = map_emotion_to_expression(emotion)
        print("Expression : \n", expression)

        # Audio is written relative to the current working directory here.
        duration = _speak(
            tts_read_text, llm_output, expression, Path("client", "audio")
        )

        ## talking animation
        _play_mixamo("Talking")

        print("waiting for audio to finish...")
        time.sleep(duration)

        # clean up audio files
        # [fp.unlink() for fp in Path("audio").glob("*.wav") if fp.is_file()]


if __name__ == "__main__":
    _run_voice_loop()