Initial commit

chickenflyshigh 2024-10-29 22:05:42 +11:00
commit 51efcd9c0f
25 changed files with 4973 additions and 0 deletions

3
README.md Normal file
@@ -0,0 +1,3 @@
#
This

51
language.py Normal file
@@ -0,0 +1,51 @@
import os
from dotenv import load_dotenv
load_dotenv()
# Download voices here: https://github.com/rhasspy/piper/blob/master/VOICES.md
ROOT_DIR = os.getenv('ROOT_DIR',os.path.dirname(__file__))
PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts')
LANGUAGES = ['en', 'fr', 'ru', 'zh', 'vi']
# piper-tts
config_file = {'en': 'glados.onnx.json',
'fr': 'fr_fr_FR_mls_medium_fr_FR-mls-medium.onnx.json',
'ru': 'ru_ru_RU_dmitri_medium_ru_RU-dmitri-medium.onnx.json',
'zh': 'zh_zh_CN_huayan_medium_zh_CN-huayan-medium.onnx.json',
'vi': 'vi_vi_VN_vais1000_medium_vi_VN-vais1000-medium.onnx.json'
}
onnx_file = {'en': 'glados.onnx',
'fr': 'fr_FR-mls-medium.onnx',
'ru': 'ru_RU-dmitri-medium.onnx',
'zh': 'zh_CN-huayan-medium.onnx',
'vi': 'vi_VN-vais1000-medium.onnx'
}
prompt = { 'en': 'Respond only in english in under 100 words.',
'fr': 'Répondez uniquement en français en moins de 100 mots.',
'ru': 'Отвечайте только на русском языке менее 100 слов.',
'zh': '只用中文回答不超过100个字。一点英文都不可以用。',
'vi': 'Chỉ trả lời bằng tiếng Việt dưới 100 từ.'
}
def piper_voice_language(lang):
if lang in LANGUAGES:
return lang
else:
return 'en'
# retrieve the corresponding piper-tts voice files for the language
def files_language(lang):
language = piper_voice_language(lang)
conf_path = os.path.join(PIPER_TTS_DIR, config_file[language])
onnx_path = os.path.join(PIPER_TTS_DIR, onnx_file[language])
return (onnx_path, conf_path)
def get_variables(lang):
language = piper_voice_language(lang)
starting_prompt = prompt[language]
onnx_path, conf_path = files_language(language)
return (onnx_path, conf_path, language, starting_prompt)
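
A quick usage sketch of this module as main.py and piper.py consume it; this assumes the French voice files listed above have already been downloaded into the piper-tts folder.

import language

onnx_path, conf_path, lang, starting_prompt = language.get_variables('fr')
print(lang)       # 'fr'
print(onnx_path)  # .../piper-tts/fr_FR-mls-medium.onnx
# Unsupported codes fall back to English:
print(language.piper_voice_language('de'))  # 'en'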

274
main.py Normal file
@@ -0,0 +1,274 @@
##########################################################################################
##### import basic libraries #####
import os, time, random, json, requests, sys, pyaudio
import pvporcupine
import struct
from datetime import datetime
import language
from whisper import whisper_pipeline
import piper as pp
from piper import play_audio, eng_piper, play_prompt
import ollama as ol
from dotenv import load_dotenv
load_dotenv()
##########################################################################################
##########################################################################################
# By default this requires at least 6GB of VRAM when using a CUDA-capable GPU. Requirements can be lowered by switching the Ollama and Whisper models to smaller ones.
# You can also consider Leopard, a small and relatively accurate audio transcription model. Edit whisper.py to use Leopard instead of WhisperX, and ensure the function output is just a string.
# For basic audio transcription you can also use Vosk or Google Speech-to-Text. Google Speech is less accurate and subject to rate limiting/charges; Vosk is relatively inaccurate but uses minimal resources.
# For more languages, look at language.py and download the corresponding voices into the piper-tts folder.
# Ensure you have Ollama installed. If the models are not yet downloaded from Ollama, the first run may take some time depending on which model you choose.
##########################################################################################
##########################################################################################
### Root directory for where you store llm_media, wake_words and piper_tts folders and files
# By default, it is the same directory as this file
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__)) # for where you want to keep all your files
### From the ROOT_DIR, create the necessary folders if they don't exist
WAKE_WORDS_DIR = os.path.join(ROOT_DIR, 'wake_words')
LLM_MEDIA_DIR = os.path.join(ROOT_DIR, 'llm_media')
PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts')
if not os.path.exists(WAKE_WORDS_DIR):
os.makedirs(WAKE_WORDS_DIR)
sys.exit(f"Wake words directory have just been created in {ROOT_DIR}. Please download the wake word files from the Picovoice Console and place them in the wake_words directory.")
if not os.path.exists(LLM_MEDIA_DIR):
os.makedirs(LLM_MEDIA_DIR)
if not os.path.exists(PIPER_TTS_DIR):
os.makedirs(PIPER_TTS_DIR)
sys.exit(f"Piper TTS directory have just been created in {ROOT_DIR}. Please download the piper-tts files from the Piper TTS repository and place them in the piper-tts directory.")
### Custom prompts
CUSTOM_PROMPTS_DIR = os.path.join(LLM_MEDIA_DIR, 'custom_prompts')
if not os.path.exists(CUSTOM_PROMPTS_DIR):
os.makedirs(CUSTOM_PROMPTS_DIR)
##########################################################################################
##########################################################################################
##### Environment Variables #####
# Please define the following environment variables in a .env file or in your system environment. The only required ones are PICOVOICE_ACCESS_KEY, WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE.
# WAKEWORD_FILE_INITIAL is the model file for the word that starts (and continues) the chat, and WAKEWORD_FILE_CONTINUE is the model file for the word that ends it. See WAKE_WORD_1 and WAKE_WORD_2 below.
### Llama Variables
BASE_URL = os.getenv('BASE_URL', 'http://localhost:11434')
LLM_MODEL = os.getenv('LLM_MODEL', 'llama3.2:latest') # about 3GB of VRAM/RAM required for LLAMA3.2 model
### Threshold for audio peaks
THRESHOLD = int(os.getenv('THRESHOLD', '1000'))
### WhisperX Variables
MODEL_ID = os.getenv('MODEL_ID', 'openai/whisper-large-v3-turbo')
# Language code for whisper pipeline, defaults to en. Possible languages = ['en', 'fr', 'ru', 'zh', 'vi']
LANG = 'en'
for arg in sys.argv:
if arg.lower().startswith('lang='):
LANG = arg.split('=')[1]
### Piper variables
# LANG is checked for validity else it defaults to 'en'. Prompt is used for the OLLAMA model at the start.
PIPER_MODEL_FILE, PIPER_CONFIG_FILE, LANG, PROMPT = language.get_variables(LANG)
# Prompt for the Llama model. Change the environment variable PROMPT to change the prompt.
PROMPT = os.getenv('PROMPT', PROMPT)
### Picovoice
ACCESS_KEY = os.getenv('PICOVOICE_ACCESS_KEY')
WAKEWORD_FILE_INITIAL = os.getenv('WAKEWORD_FILE_INITIAL')
WAKEWORD_FILE_CONTINUE = os.getenv('WAKEWORD_FILE_CONTINUE')
WAKE_WORD_1 = os.getenv('WAKE_WORD_1', 'Hey Penguin') # Wake word to start and continue the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_INITIAL
WAKE_WORD_2 = os.getenv('WAKE_WORD_2', 'Bye Penguin') # Wake word to end the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_CONTINUE
if not ACCESS_KEY or not WAKEWORD_FILE_INITIAL or not WAKEWORD_FILE_CONTINUE:
sys.exit("""Please set the PICOVOICE_ACCESS_KEY environment variable to your Picovoice Access Key. It is free to setup an account and get an access key and everything is done locally after verifying your account.
Then set the WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE environment variables to be the basename of the wake word files in the wake_words directory. You can generate these files from the Picovoice Console.""")
##### Voice prompts by Piper for the conversation #####
# You can change these to your own voice prompts
WELCOME_PROMPT = os.getenv('WELCOME_PROMPT', 'Hello, I am Pengames. How can I help you today?')
LISTENING_PROMPT = os.getenv('LISTENING_PROMPT', 'Listening...') # To know when the bot is listening
GOODBYE_PROMPT = os.getenv('GOODBYE_PROMPT', 'Goodbye for now. Have an amazing day. Big COOL Penguin signing off.')
CONTINUE_CONVO_INSTRUCTIONS_PROMPT = os.getenv('CONTINUE_CONVO_INSTRUCTIONS', f'Do you want to continue the conversation? Call me {WAKE_WORD_1} to continue, otherwise say {WAKE_WORD_2} to exit immediately.') # For the first continue prompt; uses its own env var so it can be set independently of CONTINUE_CONVO below
CONTINUE_CONVO_PROMPT = os.getenv('CONTINUE_CONVO', 'Do you want to continue the conversation?') # For subsequent continue prompts once you already know the instructions
##### Custom audio files for responses
# You can also set custom audio outputs for various responses by using the play_audio function from piper.py instead of generating speech with Piper. Put the audio files in llm_media/custom_prompts.
USE_CUSTOM_AUDIO = os.getenv('USE_CUSTOM_AUDIO', 'False') == 'True' # Set the env var to "True" to use custom audio files for responses; otherwise Piper generates them.
CUSTOM_JSON_FILE = os.getenv('CUSTOM_JSON_FILE', 'custom_prompts.json')
CUSTOM_JSON_FILE = os.path.join(ROOT_DIR, CUSTOM_JSON_FILE)
print(f"Using custom audio: {USE_CUSTOM_AUDIO}")
custom_prompts = {}
if os.path.exists(CUSTOM_JSON_FILE): # the JSON file is optional; fall back to the defaults below when it is missing
    with open(CUSTOM_JSON_FILE, 'r') as f:
        custom_prompts = json.load(f)
CUSTOM_WELCOME_PROMPTS = custom_prompts.get('CUSTOM_WELCOME_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_welcome.wav')])
CUSTOM_LISTENING_PROMPTS = custom_prompts.get('CUSTOM_LISTENING_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_listening.wav')])
CUSTOM_CONTINUE_CONVO_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_continue_convo.wav')])
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_continue_convo_instructions.wav')])
CUSTOM_GOODBYE_PROMPTS = custom_prompts.get('CUSTOM_GOODBYE_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_goodbye.wav')])
CUSTOM_WELCOME_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_WELCOME_PROMPTS]
CUSTOM_LISTENING_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_LISTENING_PROMPTS]
CUSTOM_CONTINUE_CONVO_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_PROMPTS]
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS]
CUSTOM_GOODBYE_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_GOODBYE_PROMPTS]
##########################################################################################
# Load the functions and define helper variables/functions. Do not edit these variables
##### WHISPER PIPELINE #####
pipe = whisper_pipeline(model_id=MODEL_ID, whisper_lang=LANG)
##### Piper Function For TTS #####
# Download piper-tts directly onto your system or via pip.
piper = lambda text: pp.piper(text, model = PIPER_MODEL_FILE, config = PIPER_CONFIG_FILE)
##### Llama Model #####
LLM_RESPONSE_PATH = os.path.join(LLM_MEDIA_DIR, 'llm_response.txt') # File to store all the responses
# Define the API endpoint
API_URL = f"{BASE_URL}/api/generate"
OLLAMA_JSON = os.path.join(LLM_MEDIA_DIR, 'llm_context.json')
converse = lambda text, llm_model = LLM_MODEL, llm_response_path = LLM_RESPONSE_PATH, have_context = False: ol.converse(text, ping_url=BASE_URL, api_url=API_URL, llm_model=llm_model, llm_response_path=llm_response_path, prompt=PROMPT, context_file=OLLAMA_JSON, have_context=have_context)
##### Porcupine Wake Word Detection #####
# Path to the wake word model file
WAKE_WORD_PATH_1 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_INITIAL)
WAKE_WORD_PATH_2 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_CONTINUE)
##########################################################################################
##########################################################################################
##### Pico Voice Wake Word Detection With Porcupine #####
def porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=()):
    detector = pvporcupine.create(access_key=ACCESS_KEY, keyword_paths=list(WAKE_WORD_PATHS))
    # Set up an audio stream matching the detector's expected sample rate and frame length
    pa = pyaudio.PyAudio()
    audio_stream = pa.open(
        rate=detector.sample_rate,
        channels=1,
        format=pyaudio.paInt16,
        input=True,
        frames_per_buffer=detector.frame_length
    )
    return detector, audio_stream, pa
# Pipeline from capturing words to reply
def speech_to_response(threshold = THRESHOLD, audio_input_file = pp.AUDIO_INPUT_FILE, llm_model = LLM_MODEL, llm_response_path = LLM_RESPONSE_PATH, have_context = False):
    # Record from the microphone until silence, using pyaudio to track peaks. The recording is saved to the audio_input_file location.
pp.capture_audio_until_silence(threshold=threshold)
# Run whisper pipeline or leopard pipeline on the recently saved audio file to transcribe.
transcribed_text = pipe(audio_input_file)['text']
# Pipe the transcribed text straight into the Ollama LLAMA model and output response into a file
response = converse(transcribed_text, llm_model=llm_model, llm_response_path=llm_response_path, have_context=have_context)
print(response)
# Get Piper to read out the result.
piper(response)
# Initial response to wake word
def convo_initialised():
play_prompt(fallback_prompt=WELCOME_PROMPT, audio_files=CUSTOM_WELCOME_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
speech_to_response()
# Function to continue the conversation
def continue_convo(time_limit=60):
play_prompt(fallback_prompt=CONTINUE_CONVO_INSTRUCTIONS_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
# Wake word detection to continue or exit the conversation
    # Listen for WAKE_WORD_1 to continue and WAKE_WORD_2 to end, matching the spoken instructions above
    porcupine_continue, audio_stream_continue, pa_continue = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_1])
    porcupine_end, audio_stream_end, pa_end = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2])
start = time.time()
try:
while (time.time() - start < time_limit):
# Read a frame of audio
pcm_continue = audio_stream_continue.read(porcupine_continue.frame_length, exception_on_overflow=False)
pcm_unpacked_continue = struct.unpack_from("h" * porcupine_continue.frame_length, pcm_continue)
pcm_end = audio_stream_end.read(porcupine_end.frame_length, exception_on_overflow=False)
pcm_unpacked_end = struct.unpack_from("h" * porcupine_end.frame_length, pcm_end)
# Check if the wake word is detected
keyword_index_continue = porcupine_continue.process(pcm_unpacked_continue)
keyword_index_end = porcupine_end.process(pcm_unpacked_end)
if keyword_index_continue >= 0:
play_prompt(fallback_prompt=LISTENING_PROMPT, audio_files=CUSTOM_LISTENING_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
speech_to_response(have_context=True)
play_prompt(fallback_prompt=CONTINUE_CONVO_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
start = time.time()
elif keyword_index_end >= 0:
break
except KeyboardInterrupt:
print("Stopping...")
finally:
# Clean up resources
audio_stream_continue.stop_stream()
audio_stream_continue.close()
pa_continue.terminate()
porcupine_continue.delete()
audio_stream_end.stop_stream()
audio_stream_end.close()
pa_end.terminate()
porcupine_end.delete()
play_prompt(fallback_prompt=GOODBYE_PROMPT, audio_files=CUSTOM_GOODBYE_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
##########################################################################################
##########################################################################################
##### Main Loop #####
# Initialize Porcupine with the initial custom wake word (WAKE_WORD_1)
porcupine_initial, audio_stream_initial, pa_initial = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_1])
print("Listening for wake word...")
try:
while True:
# Read a frame of audio
pcm_initial = audio_stream_initial.read(porcupine_initial.frame_length, exception_on_overflow=False)
pcm_unpacked_initial = struct.unpack_from("h" * porcupine_initial.frame_length, pcm_initial)
# Check if the wake word is detected
keyword_index = porcupine_initial.process(pcm_unpacked_initial)
if keyword_index >= 0:
convo_initialised()
continue_convo()
except KeyboardInterrupt:
print("Stopping...")
finally:
# Clean up resources
audio_stream_initial.stop_stream()
audio_stream_initial.close()
pa_initial.terminate()
porcupine_initial.delete()
##########################################################################################
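
For reference, a minimal .env for this script might look like the following. The key names come from the code above; every value is a placeholder, and the .ppn filenames in particular are hypothetical and must match the files you generated in the Picovoice Console.

PICOVOICE_ACCESS_KEY=your-picovoice-access-key
WAKEWORD_FILE_INITIAL=Hey-Penguin.ppn      # hypothetical filename
WAKEWORD_FILE_CONTINUE=Bye-Penguin.ppn     # hypothetical filename
# Optional overrides (defaults shown):
BASE_URL=http://localhost:11434
LLM_MODEL=llama3.2:latest
THRESHOLD=1000
USE_CUSTOM_AUDIO=False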

81
ollama.py Normal file
@@ -0,0 +1,81 @@
import pyaudio, wave, time, os, numpy as np, requests, subprocess, sys, json
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))
##########################################################################################
##### OLLAMA #####
def ping_api(url):
"""Ping the specified API URL and return the response status."""
try:
# Send a GET request to the API
response = requests.get(url)
# Check the response status code
if response.status_code == 200:
print("API is reachable.")
return True
else:
print(f"API responded with status code: {response.status_code}")
return False
except requests.exceptions.RequestException as e:
print(f"Error pinging API: {e}")
return False
def start_ollama_server(llm_model):
    """Warm up the model via `ollama run` (requires Ollama to be installed)."""
    # Adjust the command as necessary for your setup; the throwaway prompt just loads the model and keeps it resident for 24 hours
    command = ["ollama", "run", "--keepalive", "24h", llm_model, "Don't Say Anything"]
    subprocess.run(command)
def converse(input_text, ping_url, api_url, llm_model, llm_response_path, prompt, context_file, have_context = False):
"""
Send a prompt to the Ollama API and return the response.
input_text: The text to send to the API.
ping_url: The URL to ping to check if the API is running.
api_url: The URL of the Ollama API.
llm_model: The LLM model to use.
llm_response_path: The path to save the LLM responses and prompts.
prompt: The prompt to use for the conversation.
context_file: The path to the context file.
"""
    # Ping the Ollama server and start it if unreachable
if not ping_api(ping_url):
try:
start_ollama_server(llm_model)
except Exception as e:
print(f"Error starting Ollama server: {e}. If you are using another Ollama server, please ensure you have correctly specified the BASE_URL and that the server is running and not firewalled off.")
sys.exit(1)
payload = { "model": llm_model, "prompt": f'{prompt} {input_text}', "stream": False, "keep_alive": "24h" }
if have_context:
# load json context file
with open(context_file, 'r') as f:
context = json.load(f).get('context')
payload.update({'context': context})
# Make the POST request
response = requests.post(api_url, json=payload)
# Check for errors and print the response
if not response.ok:
print("Error:", response.status_code, response.text)
# Save the context and all other responses of the API call to a file
with open(context_file, 'w') as f:
json.dump(response.json(), f)
# Save the conversations to a file
with open(llm_response_path, "a") as f:
f.write(f'[{datetime.now().isoformat()}] Prompt: {input_text}\n')
        f.write(f"[{response.json().get('created_at')}] Response: {response.json().get('response')}\n")
return response.json().get('response')
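
A minimal sketch of calling converse directly, assuming a local Ollama instance with llama3.2 already pulled; the paths are illustrative and the llm_media folder must exist. Passing have_context=True on follow-up calls replays the context saved from the previous response, which is how main.py keeps multi-turn conversations coherent.

import ollama as ol

reply = ol.converse(
    "What is the capital of France?",
    ping_url="http://localhost:11434",
    api_url="http://localhost:11434/api/generate",
    llm_model="llama3.2:latest",
    llm_response_path="llm_media/llm_response.txt",
    prompt="Respond only in english in under 100 words.",
    context_file="llm_media/llm_context.json",
    have_context=False,  # True on follow-up turns to reuse the saved context
)
print(reply)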

Binary file not shown.

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,493 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "en-gb-x-rp"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
],
"̺": [
152
],
"̻": [
153
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "en_GB",
"family": "en",
"region": "GB",
"name_native": "English",
"name_english": "English",
"country_english": "Great Britain"
},
"dataset": "jenny_dioco"
}
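
All of the voice configs in this commit share this structure: phoneme_id_map maps espeak IPA symbols (plus punctuation and the special _ pad, ^ start and $ end markers) to the integer IDs the ONNX model consumes. Below is a rough Python sketch of the lookup, assuming already-phonemized text; actual Piper inference reportedly also interleaves the pad ID between symbols, as done here.

import json

with open('piper-tts/glados.onnx.json') as f:  # any config in this commit works
    id_map = json.load(f)['phoneme_id_map']

def to_ids(phonemes: str) -> list:
    ids = id_map['^'][:]                        # start-of-sequence marker
    for ch in phonemes:
        ids += id_map.get(ch, id_map['_'])      # unknown symbols fall back to pad
        ids += id_map['_']                      # pad interleaved between symbols
    ids += id_map['$']                          # end-of-sequence marker
    return ids

print(to_ids('həlˈoʊ'))  # [1, 20, 0, 59, 0, 24, 0, 120, 0, 27, 0, 100, 0, 2]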

@@ -0,0 +1,508 @@
{
"piper_version": "1.2.0",
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "en-gb-x-rp"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
" ": [
3
],
"!": [
4
],
"\"": [
150
],
"#": [
149
],
"$": [
2
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
":": [
11
],
";": [
12
],
"?": [
13
],
"X": [
156
],
"^": [
1
],
"_": [
0
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"g": [
154
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʦ": [
155
],
"ʰ": [
145
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"ˤ": [
146
],
"̃": [
141
],
"̧": [
140
],
"̩": [
144
],
"̪": [
142
],
"̯": [
143
],
"̺": [
152
],
"̻": [
153
],
"β": [
125
],
"ε": [
147
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"↑": [
151
],
"↓": [
148
],
"ⱱ": [
129
]
},
"num_symbols": 256,
"num_speakers": 4,
"speaker_id_map": {
"prudence": 0,
"spike": 1,
"obadiah": 2,
"poppy": 3
},
"piper_version": "1.0.0",
"language": {
"code": "en_GB",
"family": "en",
"region": "GB",
"name_native": "English",
"name_english": "English",
"country_english": "Great Britain"
},
"dataset": "semaine"
}

@@ -0,0 +1,493 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "en-us"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
],
"̺": [
152
],
"̻": [
153
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "en_US",
"family": "en",
"region": "US",
"name_native": "English",
"name_english": "English",
"country_english": "United States"
},
"dataset": "ryan"
}

Binary file not shown.

@@ -0,0 +1,634 @@
{
"dataset": "mls",
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "fr"
},
"language": {
"code": "fr_FR",
"family": "fr",
"region": "FR",
"name_native": "Français",
"name_english": "French",
"country_english": "France"
},
"inference": {
"noise_scale": 0.333,
"length_scale": 1,
"noise_w": 0.333
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
" ": [
3
],
"!": [
4
],
"\"": [
150
],
"#": [
149
],
"$": [
2
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
":": [
11
],
";": [
12
],
"?": [
13
],
"X": [
156
],
"^": [
1
],
"_": [
0
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"g": [
154
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʦ": [
155
],
"ʰ": [
145
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"ˤ": [
146
],
"̃": [
141
],
"̊": [
158
],
"̝": [
157
],
"̧": [
140
],
"̩": [
144
],
"̪": [
142
],
"̯": [
143
],
"̺": [
152
],
"̻": [
153
],
"β": [
125
],
"ε": [
147
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"↑": [
151
],
"↓": [
148
],
"ⱱ": [
129
]
},
"num_symbols": 256,
"num_speakers": 125,
"speaker_id_map": {
"1840": 0,
"3698": 1,
"123": 2,
"1474": 3,
"12709": 4,
"7423": 5,
"9242": 6,
"8778": 7,
"3060": 8,
"4512": 9,
"6249": 10,
"12541": 11,
"13634": 12,
"10065": 13,
"6128": 14,
"5232": 15,
"5764": 16,
"12713": 17,
"12823": 18,
"6070": 19,
"12501": 20,
"9121": 21,
"1649": 22,
"2776": 23,
"11772": 24,
"5612": 25,
"11822": 26,
"1590": 27,
"5525": 28,
"10827": 29,
"1243": 30,
"13142": 31,
"62": 32,
"13177": 33,
"10620": 34,
"8102": 35,
"8582": 36,
"11875": 37,
"7239": 38,
"9854": 39,
"7377": 40,
"10082": 41,
"12512": 42,
"1329": 43,
"2506": 44,
"6856": 45,
"10058": 46,
"103": 47,
"14": 48,
"6381": 49,
"1664": 50,
"11954": 51,
"66": 52,
"1127": 53,
"3270": 54,
"13611": 55,
"13658": 56,
"12968": 57,
"1989": 58,
"12981": 59,
"7193": 60,
"6348": 61,
"7679": 62,
"2284": 63,
"3182": 64,
"3503": 65,
"2033": 66,
"2771": 67,
"7614": 68,
"125": 69,
"3204": 70,
"5595": 71,
"5553": 72,
"694": 73,
"1624": 74,
"1887": 75,
"2926": 76,
"7150": 77,
"3190": 78,
"3344": 79,
"4699": 80,
"1798": 81,
"1745": 82,
"5077": 83,
"753": 84,
"52": 85,
"4174": 86,
"4018": 87,
"12899": 88,
"1844": 89,
"4396": 90,
"1817": 91,
"2155": 92,
"2946": 93,
"4336": 94,
"4609": 95,
"1977": 96,
"10957": 97,
"204": 98,
"4650": 99,
"5295": 100,
"5968": 101,
"4744": 102,
"2825": 103,
"9804": 104,
"707": 105,
"30": 106,
"115": 107,
"5840": 108,
"2587": 109,
"2607": 110,
"2544": 111,
"28": 112,
"27": 113,
"177": 114,
"112": 115,
"94": 116,
"2596": 117,
"3595": 118,
"7032": 119,
"7848": 120,
"11247": 121,
"7439": 122,
"2904": 123,
"6362": 124
},
"piper_version": "1.0.0"
}

BIN
piper-tts/glados.onnx Normal file

Binary file not shown.

497
piper-tts/glados.onnx.json Normal file
@@ -0,0 +1,497 @@
{
"dataset": "glados",
"audio": {
"sample_rate": 22050,
"quality": "stacked_llama"
},
"espeak": {
"voice": "en-us"
},
"language": {
"code": "en-us"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
" ": [
3
],
"!": [
4
],
"\"": [
150
],
"#": [
149
],
"$": [
2
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
":": [
11
],
";": [
12
],
"?": [
13
],
"X": [
156
],
"^": [
1
],
"_": [
0
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"g": [
154
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʦ": [
155
],
"ʰ": [
145
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"ˤ": [
146
],
"̃": [
141
],
"̧": [
140
],
"̩": [
144
],
"̪": [
142
],
"̯": [
143
],
"̺": [
152
],
"̻": [
153
],
"β": [
125
],
"ε": [
147
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"↑": [
151
],
"↓": [
148
],
"ⱱ": [
129
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0"
}

Binary file not shown.

@@ -0,0 +1,487 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "ru"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "ru_RU",
"family": "ru",
"region": "RU",
"name_native": "Русский",
"name_english": "Russian",
"country_english": "Russia"
},
"dataset": "dmitri"
}

BIN
piper-tts/silero_vad.onnx Normal file

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,492 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "vi"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
],
"̺": [
152
],
"̻": [
153
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "vi_VN",
"family": "vi",
"region": "VN",
"name_native": "Tiếng Việt",
"name_english": "Vietnamese",
"country_english": "Vietnam"
},
"dataset": "vais1000"
}

Binary file not shown.

@@ -0,0 +1,487 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "cmn"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "zh_CN",
"family": "zh",
"region": "CN",
"name_native": "简体中文",
"name_english": "Chinese",
"country_english": "China"
},
"dataset": "huayan"
}

116
piper.py Normal file
@@ -0,0 +1,116 @@
import pygame, os, subprocess, language, wave, pyaudio, time, numpy as np, random
from dotenv import load_dotenv
load_dotenv()
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))
PIPER_AUDIO_OUTPUT_FILE = os.path.join(ROOT_DIR, 'llm_media/chat_voice_out.wav')
AUDIO_INPUT_FILE = os.path.join(ROOT_DIR, 'llm_media/input_audio.wav')
def play_audio(audio_file, volume=0.5):
# Initialize pygame mixer
pygame.mixer.init()
pygame.mixer.music.load(audio_file)
pygame.mixer.music.play()
pygame.mixer.music.set_volume(volume)
    while pygame.mixer.music.get_busy():
        pygame.time.wait(100)  # poll gently instead of busy-waiting
def piper(text, model, config):
    # Pipe the text straight into the piper-tts CLI, which writes the spoken audio to a WAV file
    subprocess.run(['piper-tts', '--sentence-silence', '0.5', '--model', model, '--config', config, '--output_file', PIPER_AUDIO_OUTPUT_FILE], input=(text + '\n').encode())
play_audio(PIPER_AUDIO_OUTPUT_FILE)
def eng_piper(text):
piper(text, model = eng_piper_model, config = eng_piper_conf)
def play_prompt(fallback_prompt: str = "Missing Fallback Prompt", audio_files=(), use_custom: bool = True):
    if use_custom:
        # Keep only the custom audio files that actually exist on disk
        valid_files = [f for f in audio_files if os.path.exists(f)]
        if not valid_files:
            eng_piper(fallback_prompt)
            return
        play_audio(random.choice(valid_files), volume=1)
    else:
        eng_piper(fallback_prompt)
eng_piper_model, eng_piper_conf = language.files_language('en')
# Capture the audio input
def capture_audio_until_silence(threshold=800, silence_duration=3, output_file = AUDIO_INPUT_FILE):
"""
Capture audio until a period of silence is detected.
threshold: The audio level that defines silence.
silence_duration: The duration of silence to wait for before stopping.
output_file: The file to save the recorded audio to.
"""
# PyAudio configuration
p = pyaudio.PyAudio()
chunk = 1024
sample_format = pyaudio.paInt16
channels = 2
rate = 44100
# Start recording
stream = p.open(format=sample_format,
channels=channels,
rate=rate,
input=True,
frames_per_buffer=chunk)
print("Listening...")
audio_frames = []
last_time = time.time()
try:
while True:
# Read audio data
data = stream.read(chunk)
audio_frames.append(data)
# Convert data to numpy array for analysis
audio_data = np.frombuffer(data, dtype=np.int16)
peak = np.abs(audio_data).max()
# Check if the sound level exceeds the threshold
if peak > threshold:
last_time = time.time() # Reset the silence timer
else:
# Check for silence
if time.time() - last_time > silence_duration:
print(f"No sound detected for {silence_duration} seconds. Stopping...")
break
except KeyboardInterrupt:
print("Stopped by user.")
finally:
# Stop and close the stream
stream.stop_stream()
stream.close()
p.terminate()
with wave.open(output_file, 'wb') as wf:
wf.setnchannels(channels)
wf.setsampwidth(p.get_sample_size(sample_format))
wf.setframerate(rate)
wf.writeframes(b''.join(audio_frames))
if __name__ == "__main__":
zh_piper_model, zh_piper_conf = language.files_language('zh')
eng_piper("Hello, I am Piper. I am a text-to-speech model.")
piper("你好,我是派珀。我是一个文本转语音模型。", model = zh_piper_model, config = zh_piper_conf)
# capture_audio_until_silence() # you can try out whisper on the recording you have just made by running whisper.py
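
The silence threshold is compared against raw signed 16-bit sample peaks (0 to 32767), so a sensible value depends on microphone gain and room noise. Below is a rough calibration sketch under the same PyAudio settings as above: record a couple of seconds of silence and take a multiple of the ambient peak.

import numpy as np, pyaudio

def suggest_threshold(seconds=2, margin=2.0):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=2, rate=44100,
                    input=True, frames_per_buffer=1024)
    peaks = []
    for _ in range(int(44100 / 1024 * seconds)):
        data = np.frombuffer(stream.read(1024), dtype=np.int16)
        peaks.append(int(np.abs(data).max()))
    stream.stop_stream(); stream.close(); p.terminate()
    return int(max(peaks) * margin)  # export this as THRESHOLD in your .env

if __name__ == "__main__":
    print(suggest_threshold())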

32
record_audio.py Normal file
@@ -0,0 +1,32 @@
import sounddevice as sd
import numpy as np
import wave, os, time
from dotenv import load_dotenv
load_dotenv()
def record_audio(duration, filename):
# Sampling frequency
fs = 44100
# Start recording
print("Recording...")
recording = sd.rec(int(duration * fs), samplerate=fs, channels=2, dtype=np.int16)
sd.wait() # Wait until recording is finished
print("Recording finished")
# Save as WAV file
with wave.open(filename, 'wb') as wf:
wf.setnchannels(2)
wf.setsampwidth(2)
wf.setframerate(fs)
wf.writeframes(recording.tobytes())
if __name__ == "__main__":
ROOT_DIR = os.getenv('ROOT_DIR', './')
for i in range(1, 5):
print(f"Recording {i}...")
duration = 5 # seconds
        filename = os.path.join(ROOT_DIR, f'llm_media/recording{i}.wav')
record_audio(duration, filename)
print(f"Saved as {filename}")
print("Break for 3 seconds...")
time.sleep(3)

282
requirements.txt Normal file
@@ -0,0 +1,282 @@
accelerate==1.0.1
aiofiles==23.2.1
aiohappyeyeballs==2.4.3
aiohttp==3.10.10
aiosignal==1.3.1
alembic==1.13.3
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.6.2.post1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asteroid-filterbanks==0.4.0
asttokens==2.4.1
async-lru==2.0.4
attrs==24.2.0
audioread==3.0.1
av==12.3.0
babel==2.16.0
bark==0.1.5
beautifulsoup4==4.12.3
bleach==6.1.0
boto3==1.35.49
botocore==1.35.49
certifi==2024.8.30
cffi==1.17.1
chardet==3.0.4
charset-normalizer==3.4.0
click==8.1.7
coloredlogs==15.0.1
colorlog==6.8.2
comm==0.2.2
contourpy==1.3.0
cpm-kernels==1.0.11
ctranslate2==4.4.0
cycler==0.12.1
debugpy==1.8.7
decorator==5.1.1
deep-translator==1.11.4
defusedxml==0.7.1
diffusers==0.31.0
dill==0.3.8
docopt==0.6.2
easyocr==1.7.2
einops==0.8.0
encodec==0.1.1
executing==2.1.0
fastapi==0.115.3
faster-whisper==1.0.3
fastjsonschema==2.20.0
ffmpy==0.4.0
filelock==3.16.1
flatbuffers==24.3.25
fonttools==4.54.1
fqdn==1.5.1
frozenlist==1.5.0
fsspec==2024.9.0
funcy==2.0
googletrans==3.0.0
gradio==5.4.0
gradio_client==1.4.2
greenlet==3.1.1
h11==0.14.0
h2==3.2.0
hpack==3.0.0
hstspreload==2024.10.1
httpcore==1.0.6
httpx==0.27.2
huggingface-hub==0.26.1
humanfriendly==10.0
hyperframe==5.2.0
HyperPyYAML==1.2.2
idna==2.10
imageio==2.36.0
importlib_metadata==8.5.0
ipykernel==6.29.5
ipython==8.29.0
ipython-autotime==0.3.2
ipywidgets==8.1.5
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.4
jmespath==1.0.1
joblib==1.4.2
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
julius==0.2.7
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
kiwisolver==1.4.7
langdetect==1.0.9
langid==1.1.6
latex2mathml==3.77.0
lazy_loader==0.4
librosa==0.10.2.post1
lightning==2.4.0
lightning-utilities==0.11.8
llvmlite==0.43.0
Mako==1.3.6
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
matplotlib-inline==0.1.7
mdtex2html==1.3.0
mdurl==0.1.2
mistune==3.0.2
MouseInfo==0.1.3
mpmath==1.3.0
msgpack==1.1.0
multidict==6.1.0
multiprocess==0.70.16
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.1
nltk==3.9.1
notebook==7.2.2
notebook_shim==0.2.4
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
omegaconf==2.3.0
onnxruntime==1.19.2
opencv-python==4.10.0.84
opencv-python-headless==4.10.0.84
optimum==1.23.2
optuna==4.0.0
orjson==3.10.10
overrides==7.7.0
packaging==24.1
pandas==2.2.3
pandocfilters==1.5.1
parso==0.8.4
pexpect==4.9.0
pillow==11.0.0
platformdirs==4.3.6
pooch==1.8.2
primePy==1.3
prometheus_client==0.21.0
prompt_toolkit==3.0.48
propcache==0.2.0
protobuf==5.28.3
psutil==6.1.0
ptyprocess==0.7.0
pure_eval==0.2.3
pvporcupine==3.0.3
pyannote.audio==3.1.1
pyannote.core==5.0.0
pyannote.database==5.1.0
pyannote.metrics==3.2.1
pyannote.pipeline==3.0.1
pyarrow==17.0.0
PyAudio==0.2.14
PyAutoGUI==0.9.54
pyclipper==1.3.0.post6
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
pydub==0.25.1
pygame==2.6.1
PyGetWindow==0.0.9
Pygments==2.18.0
PyMsgBox==1.0.9
pyparsing==3.2.0
pyperclip==1.9.0
pypinyin==0.53.0
PyRect==0.2.0
PyScreeze==1.0.1
pytesseract==0.3.13
python-bidi==0.6.3
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-json-logger==2.0.7
python-multipart==0.0.12
python3-xlib==0.15
pytorch-lightning==2.4.0
pytorch-metric-learning==2.6.0
pyttsx3==2.98
pytweening==1.2.0
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986==1.5.0
rfc3986-validator==0.1.1
rich==13.9.3
rpds-py==0.20.0
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.12
ruff==0.7.1
s3transfer==0.10.3
safehttpx==0.1.1
safetensors==0.4.5
scikit-image==0.24.0
scikit-learn==1.5.2
scipy==1.14.1
semantic-version==2.10.0
semver==3.0.2
Send2Trash==1.8.3
sentencepiece==0.2.0
setuptools==75.2.0
shapely==2.0.6
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
sounddevice==0.5.1
soundfile==0.12.1
soupsieve==2.6
soxr==0.5.0.post1
speechbrain==1.0.1
SpeechRecognition==3.11.0
SQLAlchemy==2.0.36
srt==3.5.3
stack-data==0.6.3
starlette==0.41.2
sympy==1.13.1
tabulate==0.9.0
tensorboardX==2.6.2.2
terminado==0.18.1
threadpoolctl==3.5.0
tifffile==2024.9.20
tinycss2==1.4.0
tokenizers==0.20.1
tomlkit==0.12.0
torch==2.5.0
torch-audiomentations==0.11.1
torch_pitch_shift==1.2.5
torchaudio==2.5.0
torchmetrics==1.5.1
torchvision==0.20.0
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
transformers==4.46.0
triton==3.1.0
typer==0.12.5
types-python-dateutil==2.9.0.20241003
typing_extensions==4.12.2
tzdata==2024.2
uri-template==1.3.0
urllib3==2.2.3
uvicorn==0.32.0
vosk==0.3.45
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
websockets==12.0
widgetsnbextension==4.0.13
xxhash==3.5.0
yarl==1.16.0
zipp==3.20.2

43
whisper.py Normal file
@@ -0,0 +1,43 @@
##########################################################################################
##### WhisperX #####
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
def whisper_pipeline(model_id: str, whisper_lang: str):
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa"
)
model.generation_config.language = whisper_lang # define your language of choice here
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
generate_kwargs={"max_new_tokens": 128},
torch_dtype=torch_dtype,
device=device
)
return pipe
##########################################################################################
if __name__ == "__main__":
# Example Usage
import os
from dotenv import load_dotenv
load_dotenv()
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))
model_id = "openai/whisper-tiny"
whisper_lang = "en"
whisper = whisper_pipeline(model_id, whisper_lang)
audio_file = os.path.join(ROOT_DIR, "llm_media/input_audio.wav")
result = whisper(audio_file)['text']
print(result)