# 0. IMPORT ALL FILES!
import json
import os
import queue
import sys
from pathlib import Path

import gradio as gr
import numpy as np
import sounddevice as sd
import soundfile as sf
import yaml
from dotenv import load_dotenv
from faster_whisper import WhisperModel
from groq import Groq
from openai import OpenAI
from scipy.io.wavfile import read

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

client_groq = Groq(api_key=groq_api_key)


def record_on_speech(output_file="conversation.wav", samplerate=44100, channels=1,
                     silence_threshold=0.01, silence_duration=1, device=None):
    """
    Record from the microphone, starting when speech is detected and
    stopping after a period of silence.

    Audio blocks are discarded until one exceeds ``silence_threshold``; from
    that block on, every block is written to ``output_file`` until the
    accumulated run of below-threshold audio reaches ``silence_duration``.

    Args:
        output_file (str): Path to save the recorded audio. An existing file
            at this path is deleted first; missing parent directories are
            created.
        samplerate (int): Sampling rate in Hz. Default is 44100.
        channels (int): Number of audio channels. Default is 1 (mono).
        silence_threshold (float): RMS level separating speech from silence.
            Default is 0.01.
        silence_duration (float): Seconds of continuous silence that end the
            recording. Default is 1.
        device (int or str): Input device ID or name. Default is None
            (use system default).

    Returns:
        str: ``output_file`` exactly as passed in. It is returned even when
        recording was interrupted or failed (the error is only printed to
        stderr), so the file may be missing or incomplete in those cases.
    """
    if os.path.exists(output_file):
        os.remove(output_file)
        print(f"Existing file '{output_file}' was deleted.")

    q = queue.Queue()

    def callback(indata, frames, time, status):
        """Callback for audio input: queue a copy of each incoming block."""
        if status:
            print(status, file=sys.stderr)
        # Copy because the stream hands over a buffer that is only valid
        # for the duration of the callback.
        q.put(indata.copy())

    def rms_level(data):
        """Calculate the RMS level of the audio."""
        return np.sqrt(np.mean(np.square(data)))

    try:
        # mode='x' refuses to create a file inside a missing directory, so
        # make sure the destination folder exists before opening it.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)

        # Open the sound file
        with sf.SoundFile(output_file, mode='x', samplerate=samplerate,
                          channels=channels, subtype='PCM_16') as file:
            with sd.InputStream(samplerate=samplerate, device=device,
                                channels=channels, callback=callback):
                print("Listening for speech...")
                silent_time = 0
                recording_started = False

                while True:
                    data = q.get()
                    rms = rms_level(data)

                    if not recording_started:
                        if rms > silence_threshold:
                            print("Voice detected, starting recording...")
                            recording_started = True

                    # Not an elif: the block that triggered the start must
                    # itself be written to the file.
                    if recording_started:
                        file.write(data)
                        if rms < silence_threshold:
                            silent_time += len(data) / samplerate
                        else:
                            silent_time = 0

                        if silent_time >= silence_duration:
                            print("Silence detected, stopping recording...")
                            break
    except KeyboardInterrupt:
        print("\nRecording interrupted.")
    except Exception as e:
        # Best-effort: report the problem and still hand the path back.
        print(f"Error: {type(e).__name__}: {e}", file=sys.stderr)

    return output_file


def transcribe_audio_groq(aud_path="conversation.wav"):
    """
    Transcribe an audio file with Groq's ``whisper-large-v3-turbo`` model.

    Args:
        aud_path (str): Path of the audio file to upload.

    Returns:
        The ``text`` attribute of the transcription response, which is also
        printed to stdout.
    """
    with open(aud_path, "rb") as audio_file:
        transcription = client_groq.audio.transcriptions.create(
            file=(aud_path, audio_file.read()),
            model="whisper-large-v3-turbo",
            response_format="verbose_json",
            prompt="The following is a conversation between Hakuro and Koratex",
        )
    print(transcription.text)
    return transcription.text


if __name__ == "__main__":
    print('Running module')

    conversation_recording = str(Path("audio") / "conversation.wav")

    record_on_speech(
        output_file=conversation_recording,
        samplerate=44100,
        channels=1,
        silence_threshold=0.02,  # Adjust based on your microphone sensitivity
        silence_duration=1,      # Stop after 1 second of silence
        device=None              # Use default device, or specify by ID or name
    )

    user_spoken_text = transcribe_audio_groq(aud_path=conversation_recording)
    print(user_spoken_text)