Ai_Assistant/server/process/asr_func/asr_transcribe_groq.py


import os
from groq import Groq
import yaml
import gradio as gr
import json
import os
from openai import OpenAI
from dotenv import load_dotenv
from pathlib import Path


# 0. IMPORT ALL FILES!
import os
import sounddevice as sd
import numpy as np
import soundfile as sf
import queue
import sys
from scipy.io.wavfile import read
from faster_whisper import WhisperModel


load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
client_groq = Groq(api_key = groq_api_key)


def record_on_speech(output_file="conversation.wav", samplerate=44100, channels=1, silence_threshold=0.01, silence_duration=1, device=None):
    """
    Records audio from the microphone, starting only when the user speaks and stopping after a period of silence.

    Args:
        output_file (str): Path to save the recorded audio.
        samplerate (int): Sampling rate in Hz. Default is 44100.
        channels (int): Number of audio channels. Default is 1 (mono).
        silence_threshold (float): RMS threshold to detect silence. Default is 0.01 (normalized amplitude).
        silence_duration (float): Duration in seconds of silence to stop recording. Default is 2.
        device (int or str): Input device ID or name. Default is None (use system default).

    Returns:
        None
    """

    if os.path.exists(output_file):
        os.remove(output_file)
        print(f"Existing file '{output_file}' was deleted.")

    q = queue.Queue()

    def callback(indata, frames, time, status):
        """Callback for audio input."""
        if status:
            print(status, file=sys.stderr)
        q.put(indata.copy())

    def rms_level(data):
        """Calculate the RMS level of the audio."""
        return np.sqrt(np.mean(np.square(data)))

    try:
        # Open the sound file
        with sf.SoundFile(output_file, mode='x', samplerate=samplerate,
                          channels=channels, subtype='PCM_16') as file:
            with sd.InputStream(samplerate=samplerate, device=device,
                                channels=channels, callback=callback):
                print("Listening for speech...")
                silent_time = 0
                recording_started = False

                while True:
                    data = q.get()
                    rms = rms_level(data)

                    if not recording_started:
                        if rms > silence_threshold:
                            print("Voice detected, starting recording...")
                            recording_started = True

                    if recording_started:
                        file.write(data)

                        if rms < silence_threshold:
                            silent_time += len(data) / samplerate
                        else:
                            silent_time = 0

                        if silent_time >= silence_duration:
                            print("Silence detected, stopping recording...")
                            break

    except KeyboardInterrupt:
        print("\nRecording interrupted.")
    except Exception as e:
        print(f"Error: {type(e).__name__}: {e}", file=sys.stderr)

    return output_file


def transcribe_audio_groq(aud_path = "conversation.wav"):
    with open(aud_path, "rb") as file:
        transcription = client_groq.audio.transcriptions.create(
        file=(aud_path, file.read()),
        model="whisper-large-v3-turbo",
        response_format="verbose_json",
        prompt="The following is a conversation between Riko and Rayen",
        )
        print(transcription.text)

        return transcription.text


if __name__ == "__main__":
    print('Running module')

    conversation_recording = "~/riko_project_v1/conversation.wav"
    conversation_recording = Path("audio") / "conversation.wav"
    conversation_recording = str(conversation_recording)

    record_on_speech(
            output_file=conversation_recording,
            samplerate=44100,
            channels=1,
            silence_threshold=0.02,  # Adjust based on your microphone sensitivity
            silence_duration=1,     # Stop after 3 seconds of silence
            device=None             # Use default device, or specify by ID or name
        )

    user_spoken_text = transcribe_audio_groq(aud_path=conversation_recording)
    print(user_spoken_text)