commit 51efcd9c0fb1e0e72a3cd8bf60ece9cbe6a216c4 Author: chickenflyshigh Date: Tue Oct 29 22:05:42 2024 +1100 Initial commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..e1ac3cc --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# + +This diff --git a/language.py b/language.py new file mode 100644 index 0000000..ea5f4b3 --- /dev/null +++ b/language.py @@ -0,0 +1,51 @@ +import os +from dotenv import load_dotenv +load_dotenv() + +# https://github.com/rhasspy/piper/blob/master/VOICES.md download here + +ROOT_DIR = os.getenv('ROOT_DIR',os.path.dirname(__file__)) +PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts') +LANGUAGES = ['en', 'fr', 'ru', 'zh', 'vi'] + +# piper-tts + +config_file = {'en': 'glados.onnx.json', + 'fr': 'fr_fr_FR_mls_medium_fr_FR-mls-medium.onnx.json', + 'ru': 'ru_ru_RU_dmitri_medium_ru_RU-dmitri-medium.onnx.json', + 'zh': 'zh_zh_CN_huayan_medium_zh_CN-huayan-medium.onnx.json', + 'vi': 'vi_vi_VN_vais1000_medium_vi_VN-vais1000-medium.onnx.json' + } +onnx_file = {'en': 'glados.onnx', + 'fr': 'fr_FR-mls-medium.onnx', + 'ru': 'ru_RU-dmitri-medium.onnx', + 'zh': 'zh_CN-huayan-medium.onnx', + 'vi': 'vi_VN-vais1000-medium.onnx' + } + +prompt = { 'en': 'Respond only in english in under 100 words.', + 'fr': 'Répondez uniquement en français en moins de 100 mots.', + 'ru': 'Отвечайте только на русском языке менее 100 слов.', + 'zh': '只用中文回答,不超过100个字。一点英文都不可以用。', + 'vi': 'Chỉ trả lời bằng tiếng Việt dưới 100 từ.' + } + +def piper_voice_language(lang): + if lang in LANGUAGES: + return lang + else: + return 'en' + +# retrieve the corresponding piper-tts voice files for the language +def files_language(lang): + language = piper_voice_language(lang) + conf_path = os.path.join(PIPER_TTS_DIR, config_file[language]) + onnx_path = os.path.join(PIPER_TTS_DIR, onnx_file[language]) + return (onnx_path, conf_path) + +def get_variables(lang): + language = piper_voice_language(lang) + starting_prompt = prompt[language] + onnx_path, conf_path = files_language(language) + return (onnx_path, conf_path, language, starting_prompt) + diff --git a/main.py b/main.py new file mode 100644 index 0000000..387f141 --- /dev/null +++ b/main.py @@ -0,0 +1,274 @@ +########################################################################################## +##### import basic libraries ##### +import os, time, random, json, requests, sys, pyaudio +import pvporcupine +import struct +from datetime import datetime +import language +from whisper import whisper_pipeline +import piper as pp +from piper import play_audio, eng_piper, play_prompt +import ollama as ol +from dotenv import load_dotenv +load_dotenv() +########################################################################################## + + +########################################################################################## +# By default this requires at least 6GB VRAM if using a CUDA supported GPU. Requirements can be lowered if the Ollama and Whisper models are changed to a smaller model. +# You can also consider using Leopard for a small and relatively accurate audio transcription model. Edit the whisper.py file to use Leopard instead of WhisperX. Ensure the function output is just a string. +# for basic audio transcription, you can also use vosk or google speech to text. Google speech is less accurate and subject to rate limiting/charges. Vosk is relatively inaccurate but uses minimal resources. + +# for more languages, look at the language.py file and download more languages from there into the piper-tts folder +# ensure you have ollama installed. If you don't have the models downloaded from Ollama it may take some time depending on which model you choose +########################################################################################## + + +########################################################################################## +### Root directory for where you store llm_media, wake_words and piper_tts folders and files +# By default, it is the same directory as this file +ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__)) # for where you want to keep all your files + +### From the ROOT_DIR, create the necessary folders if they don't exist +WAKE_WORDS_DIR = os.path.join(ROOT_DIR, 'wake_words') +LLM_MEDIA_DIR = os.path.join(ROOT_DIR, 'llm_media') +PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts') +if not os.path.exists(WAKE_WORDS_DIR): + os.makedirs(WAKE_WORDS_DIR) + sys.exit(f"Wake words directory have just been created in {ROOT_DIR}. Please download the wake word files from the Picovoice Console and place them in the wake_words directory.") +if not os.path.exists(LLM_MEDIA_DIR): + os.makedirs(LLM_MEDIA_DIR) +if not os.path.exists(PIPER_TTS_DIR): + os.makedirs(PIPER_TTS_DIR) + sys.exit(f"Piper TTS directory have just been created in {ROOT_DIR}. Please download the piper-tts files from the Piper TTS repository and place them in the piper-tts directory.") + +### Custom prompts +CUSTOM_PROMPTS_DIR = os.path.join(LLM_MEDIA_DIR, 'custom_prompts') +if not os.path.exists(CUSTOM_PROMPTS_DIR): + os.makedirs(CUSTOM_PROMPTS_DIR) + +########################################################################################## + + +########################################################################################## +##### Environmental Variables ##### +# Please define the following environmental variables in a .env file or in your system environment. The only required ones are: PICOVOICE_ACCESS_KEY, WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE +# WAKEWORD_FILE_INITIAL is the word you say to initiate the chat and WAKEWORD_FILE_CONTINUE is the word to continue the chat after the initial dialogue. + +### Llama Variables +BASE_URL = os.getenv('BASE_URL', 'http://localhost:11434') +LLM_MODEL = os.getenv('LLM_MODEL', 'llama3.2:latest') # about 3GB of VRAM/RAM required for LLAMA3.2 model + +### Threshold for audio peaks +THRESHOLD = int(os.getenv('THRESHOLD', '1000')) + +### WhisperX Variables +MODEL_ID = os.getenv('MODEL_ID', 'openai/whisper-large-v3-turbo') + +# Language code for whisper pipeline, defaults to en. Possible languages = ['en', 'fr', 'ru', 'zh', 'vi'] +LANG = 'en' +for arg in sys.argv: + if arg.lower().startswith('lang='): + LANG = arg.split('=')[1] + +### Piper variables +# LANG is checked for validity else it defaults to 'en'. Prompt is used for the OLLAMA model at the start. +PIPER_MODEL_FILE, PIPER_CONFIG_FILE, LANG, PROMPT = language.get_variables(LANG) + +# Prompt for the Llama model. Change the environment variable PROMPT to change the prompt. +PROMPT = os.getenv('PROMPT', PROMPT) + +### Picovoice +ACCESS_KEY = os.getenv('PICOVOICE_ACCESS_KEY') +WAKEWORD_FILE_INITIAL = os.getenv('WAKEWORD_FILE_INITIAL') +WAKEWORD_FILE_CONTINUE = os.getenv('WAKEWORD_FILE_CONTINUE') +WAKE_WORD_1 = os.getenv('WAKE_WORD_1', 'Hey Penguin') # Wake word to start and continue the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_INITIAL +WAKE_WORD_2 = os.getenv('WAKE_WORD_2', 'Bye Penguin') # Wake word to end the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_CONTINUE + +if not ACCESS_KEY or not WAKEWORD_FILE_INITIAL or not WAKEWORD_FILE_CONTINUE: + sys.exit("""Please set the PICOVOICE_ACCESS_KEY environment variable to your Picovoice Access Key. It is free to setup an account and get an access key and everything is done locally after verifying your account. + Then set the WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE environment variables to be the basename of the wake word files in the wake_words directory. You can generate these files from the Picovoice Console.""") + +##### Voice prompts by Piper for the conversation ##### +# You can change these to your own voice prompts + +WELCOME_PROMPT = os.getenv('WELCOME_PROMPT', 'Hello, I am Pengames. How can I help you today?') +LISTENING_PROMPT = os.getenv('LISTENING_PROMPT', 'Listening...') # To know when the bot is listening +GOODBYE_PROMPT = os.getenv('GOODBYE_PROMPT', 'Goodbye for now. Have an amazing day. Big COOL Penguin signing off.') +CONTINUE_CONVO_INSTRUCTIONS_PROMPT = os.getenv('CONTINUE_CONVO', f'Do you want to continue the conversation? Call me {WAKE_WORD_1} to continue otherwise say {WAKE_WORD_2} to exit immediately.') # For the first continue prompt +CONTINUE_CONVO_PROMPT= os.getenv('CONTINUE_CONVO', f'Do you want to continue the conversation?') # For subsequent continue prompts after you already know the instructions + +##### Custom audio files for responses +# You can also set custom audio outputs for various responses by using the play_audio function in the ollama.py file instead of the piper function in the piper.py file. Put the files in the llm_media/custom_prompts +USE_CUSTOM_AUDIO = True if os.getenv('USE_CUSTOM_AUDIO', False) == 'True' else False # Set to "True" if you want to use custom audio files for responses. Set to False to use the piper function for responses. +CUSTOM_JSON_FILE = os.getenv('CUSTOM_JSON_FILE', 'custom_prompts.json') +CUSTOM_JSON_FILE = os.path.join(ROOT_DIR, CUSTOM_JSON_FILE) + +print(USE_CUSTOM_AUDIO) +with open(CUSTOM_JSON_FILE, 'r') as f: + custom_prompts = json.load(f) + CUSTOM_WELCOME_PROMPTS = custom_prompts.get('CUSTOM_WELCOME_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_welcome.wav')]) + CUSTOM_LISTENING_PROMPTS = custom_prompts.get('CUSTOM_LISTENING_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_listening.wav')]) + CUSTOM_CONTINUE_CONVO_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_continue_convo.wav')]) + CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_continue_convo_instructions.wav')]) + CUSTOM_GOODBYE_PROMPTS = custom_prompts.get('CUSTOM_GOODBYE_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_goodbye.wav')]) + + CUSTOM_WELCOME_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_WELCOME_PROMPTS] + CUSTOM_LISTENING_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_LISTENING_PROMPTS] + CUSTOM_CONTINUE_CONVO_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_PROMPTS] + CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS] + CUSTOM_GOODBYE_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_GOODBYE_PROMPTS] + +########################################################################################## + + + + + + + + + + + + + + + + + + +# Load the functions and define helper variables/functions. Do not edit these variables + +##### WHISPER PIPELINE ##### +pipe = whisper_pipeline(model_id=MODEL_ID, whisper_lang=LANG) + +##### Piper Function For TTS ##### +# Download piper-tts directly onto your system or via pip. +piper = lambda text: pp.piper(text, model = PIPER_MODEL_FILE, config = PIPER_CONFIG_FILE) + + +##### Llama Model ##### +LLM_RESPONSE_PATH = os.path.join(LLM_MEDIA_DIR, 'llm_response.txt') # File to store all the responses + +# Define the API endpoint +API_URL = f"{BASE_URL}/api/generate" +OLLAMA_JSON = os.path.join(LLM_MEDIA_DIR, 'llm_context.json') +converse = lambda text, llm_model = LLM_MODEL, llm_response_path = LLM_RESPONSE_PATH, have_context = False: ol.converse(text, ping_url=BASE_URL, api_url=API_URL, llm_model=llm_model, llm_response_path=llm_response_path, prompt=PROMPT, context_file=OLLAMA_JSON, have_context=have_context) +##### Porcupine Wake Word Detection ##### +# Path to the wake word model file +WAKE_WORD_PATH_1 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_INITIAL) +WAKE_WORD_PATH_2 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_CONTINUE) + +########################################################################################## + + + +########################################################################################## +##### Pico Voice Wake Word Detection With Porcupine ##### +def porcupine(ACCESS_KEY = ACCESS_KEY, WAKE_WORD_PATHS: list = []): + porcupine = pvporcupine.create(access_key=ACCESS_KEY, keyword_paths=WAKE_WORD_PATHS) + + # Set up audio streams + pa = pyaudio.PyAudio() + audio_stream = pa.open( + rate=porcupine.sample_rate, + channels=1, + format=pyaudio.paInt16, + input=True, + frames_per_buffer=porcupine.frame_length + ) + return porcupine, audio_stream, pa + +# Pipeline from capturing words to reply +def speech_to_response(threshold = THRESHOLD, audio_input_file = pp.AUDIO_INPUT_FILE, llm_model = LLM_MODEL, llm_response_path = LLM_RESPONSE_PATH, have_context = False): + # Input stream until no more words using pyaudio to identify peaks. Saved to the audio_input_file location. + pp.capture_audio_until_silence(threshold=threshold) + + # Run whisper pipeline or leopard pipeline on the recently saved audio file to transcribe. + transcribed_text = pipe(audio_input_file)['text'] + + # Pipe the transcribed text straight into the Ollama LLAMA model and output response into a file + response = converse(transcribed_text, llm_model=llm_model, llm_response_path=llm_response_path, have_context=have_context) + print(response) + + # Get Piper to read out the result. + piper(response) + +# Initial response to wake word +def convo_initialised(): + play_prompt(fallback_prompt=WELCOME_PROMPT, audio_files=CUSTOM_WELCOME_PROMPTS, use_custom=USE_CUSTOM_AUDIO) + speech_to_response() + +# Function to continue the conversation +def continue_convo(time_limit=60): + play_prompt(fallback_prompt=CONTINUE_CONVO_INSTRUCTIONS_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS, use_custom=USE_CUSTOM_AUDIO) + # Wake word detection to continue or exit the conversation + porcupine_continue, audio_stream_continue, pa_continue = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2]) + porcupine_end, audio_stream_end, pa_end = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_1]) + start = time.time() + try: + while (time.time() - start < time_limit): + # Read a frame of audio + pcm_continue = audio_stream_continue.read(porcupine_continue.frame_length, exception_on_overflow=False) + pcm_unpacked_continue = struct.unpack_from("h" * porcupine_continue.frame_length, pcm_continue) + + pcm_end = audio_stream_end.read(porcupine_end.frame_length, exception_on_overflow=False) + pcm_unpacked_end = struct.unpack_from("h" * porcupine_end.frame_length, pcm_end) + + # Check if the wake word is detected + keyword_index_continue = porcupine_continue.process(pcm_unpacked_continue) + keyword_index_end = porcupine_end.process(pcm_unpacked_end) + + if keyword_index_continue >= 0: + play_prompt(fallback_prompt=LISTENING_PROMPT, audio_files=CUSTOM_LISTENING_PROMPTS, use_custom=USE_CUSTOM_AUDIO) + speech_to_response(have_context=True) + play_prompt(fallback_prompt=CONTINUE_CONVO_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_PROMPTS, use_custom=USE_CUSTOM_AUDIO) + start = time.time() + elif keyword_index_end >= 0: + break + + except KeyboardInterrupt: + print("Stopping...") + + finally: + # Clean up resources + audio_stream_continue.stop_stream() + audio_stream_continue.close() + pa_continue.terminate() + porcupine_continue.delete() + + audio_stream_end.stop_stream() + audio_stream_end.close() + pa_end.terminate() + porcupine_end.delete() + + play_prompt(fallback_prompt=GOODBYE_PROMPT, audio_files=CUSTOM_GOODBYE_PROMPTS, use_custom=USE_CUSTOM_AUDIO) +########################################################################################## + + +########################################################################################## +##### Main Loop ##### +# Initialize Porcupine with custom wake word +porcupine_initial, audio_stream_initial, pa_initial = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2]) +print("Listening for wake word...") +try: + while True: + # Read a frame of audio + pcm_initial = audio_stream_initial.read(porcupine_initial.frame_length, exception_on_overflow=False) + pcm_unpacked_initial = struct.unpack_from("h" * porcupine_initial.frame_length, pcm_initial) + + # Check if the wake word is detected + keyword_index = porcupine_initial.process(pcm_unpacked_initial) + if keyword_index >= 0: + convo_initialised() + continue_convo() +except KeyboardInterrupt: + print("Stopping...") +finally: + # Clean up resources + audio_stream_initial.stop_stream() + audio_stream_initial.close() + pa_initial.terminate() + porcupine_initial.delete() +########################################################################################## \ No newline at end of file diff --git a/ollama.py b/ollama.py new file mode 100644 index 0000000..8271540 --- /dev/null +++ b/ollama.py @@ -0,0 +1,81 @@ +import pyaudio, wave, time, os, numpy as np, requests, subprocess, sys, json +from datetime import datetime +from dotenv import load_dotenv +load_dotenv() + +ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__)) + +########################################################################################## + +##### OLLAMA ##### +def ping_api(url): + """Ping the specified API URL and return the response status.""" + try: + # Send a GET request to the API + response = requests.get(url) + + # Check the response status code + if response.status_code == 200: + print("API is reachable.") + return True + else: + print(f"API responded with status code: {response.status_code}") + return False + except requests.exceptions.RequestException as e: + print(f"Error pinging API: {e}") + return False + +def start_ollama_server(llm_model): + """Start the Ollama server using subprocess.""" + # Adjust the command as necessary for your setup + command = ["ollama", "run", "--keepalive", "24h", llm_model, 'Don\'t Say Anything'] # Replace with the correct command to start the server + + # Start the server in a new process + subprocess.run(command) + + + +def converse(input_text, ping_url, api_url, llm_model, llm_response_path, prompt, context_file, have_context = False): + """ + Send a prompt to the Ollama API and return the response. + input_text: The text to send to the API. + ping_url: The URL to ping to check if the API is running. + api_url: The URL of the Ollama API. + llm_model: The LLM model to use. + llm_response_path: The path to save the LLM responses and prompts. + prompt: The prompt to use for the conversation. + context_file: The path to the context file. + """ + # Ping the Llama + if not ping_api(ping_url): + try: + start_ollama_server(llm_model) + except Exception as e: + print(f"Error starting Ollama server: {e}. If you are using another Ollama server, please ensure you have correctly specified the BASE_URL and that the server is running and not firewalled off.") + sys.exit(1) + + payload = { "model": llm_model, "prompt": f'{prompt} {input_text}', "stream": False, "keep_alive": "24h" } + + if have_context: + # load json context file + with open(context_file, 'r') as f: + context = json.load(f).get('context') + payload.update({'context': context}) + + # Make the POST request + response = requests.post(api_url, json=payload) + + # Check for errors and print the response + if not response.ok: + print("Error:", response.status_code, response.text) + + # Save the context and all other responses of the API call to a file + with open(context_file, 'w') as f: + json.dump(response.json(), f) + + # Save the conversations to a file + with open(llm_response_path, "a") as f: + f.write(f'[{datetime.now().isoformat()}] Prompt: {input_text}\n') + f.write(f'[{response.json().get('created_at')}] Response: {response.json().get('response')}\n') + + return response.json().get('response') \ No newline at end of file diff --git a/piper-tts/en_GB-jenny_dioco-medium.onnx b/piper-tts/en_GB-jenny_dioco-medium.onnx new file mode 100644 index 0000000..463c55e Binary files /dev/null and b/piper-tts/en_GB-jenny_dioco-medium.onnx differ diff --git a/piper-tts/en_GB-semaine-medium.onnx b/piper-tts/en_GB-semaine-medium.onnx new file mode 100644 index 0000000..14cfda5 Binary files /dev/null and b/piper-tts/en_GB-semaine-medium.onnx differ diff --git a/piper-tts/en_US-ryan-medium.onnx b/piper-tts/en_US-ryan-medium.onnx new file mode 100644 index 0000000..abb1756 Binary files /dev/null and b/piper-tts/en_US-ryan-medium.onnx differ diff --git a/piper-tts/en_en_GB_jenny_dioco_medium_en_GB-jenny_dioco-medium.onnx.json b/piper-tts/en_en_GB_jenny_dioco_medium_en_GB-jenny_dioco-medium.onnx.json new file mode 100644 index 0000000..8828175 --- /dev/null +++ b/piper-tts/en_en_GB_jenny_dioco_medium_en_GB-jenny_dioco-medium.onnx.json @@ -0,0 +1,493 @@ +{ + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "en-gb-x-rp" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + "_": [ + 0 + ], + "^": [ + 1 + ], + "$": [ + 2 + ], + " ": [ + 3 + ], + "!": [ + 4 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "β": [ + 125 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "ⱱ": [ + 129 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + "̧": [ + 140 + ], + "̃": [ + 141 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̩": [ + 144 + ], + "ʰ": [ + 145 + ], + "ˤ": [ + 146 + ], + "ε": [ + 147 + ], + "↓": [ + 148 + ], + "#": [ + 149 + ], + "\"": [ + 150 + ], + "↑": [ + 151 + ], + "̺": [ + 152 + ], + "̻": [ + 153 + ] + }, + "num_symbols": 256, + "num_speakers": 1, + "speaker_id_map": {}, + "piper_version": "1.0.0", + "language": { + "code": "en_GB", + "family": "en", + "region": "GB", + "name_native": "English", + "name_english": "English", + "country_english": "Great Britain" + }, + "dataset": "jenny_dioco" +} \ No newline at end of file diff --git a/piper-tts/en_en_GB_semaine_medium_en_GB-semaine-medium.onnx.json b/piper-tts/en_en_GB_semaine_medium_en_GB-semaine-medium.onnx.json new file mode 100644 index 0000000..896245a --- /dev/null +++ b/piper-tts/en_en_GB_semaine_medium_en_GB-semaine-medium.onnx.json @@ -0,0 +1,508 @@ +{ + "piper_version": "1.2.0", + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "en-gb-x-rp" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + " ": [ + 3 + ], + "!": [ + 4 + ], + "\"": [ + 150 + ], + "#": [ + 149 + ], + "$": [ + 2 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "X": [ + 156 + ], + "^": [ + 1 + ], + "_": [ + 0 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "g": [ + 154 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʦ": [ + 155 + ], + "ʰ": [ + 145 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "ˤ": [ + 146 + ], + "̃": [ + 141 + ], + "̧": [ + 140 + ], + "̩": [ + 144 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̺": [ + 152 + ], + "̻": [ + 153 + ], + "β": [ + 125 + ], + "ε": [ + 147 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "↑": [ + 151 + ], + "↓": [ + 148 + ], + "ⱱ": [ + 129 + ] + }, + "num_symbols": 256, + "num_speakers": 4, + "speaker_id_map": { + "prudence": 0, + "spike": 1, + "obadiah": 2, + "poppy": 3 + }, + "piper_version": "1.0.0", + "language": { + "code": "en_GB", + "family": "en", + "region": "GB", + "name_native": "English", + "name_english": "English", + "country_english": "Great Britain" + }, + "dataset": "semaine" +} diff --git a/piper-tts/en_en_US_ryan_medium_en_US-ryan-medium.onnx.json b/piper-tts/en_en_US_ryan_medium_en_US-ryan-medium.onnx.json new file mode 100644 index 0000000..90e0706 --- /dev/null +++ b/piper-tts/en_en_US_ryan_medium_en_US-ryan-medium.onnx.json @@ -0,0 +1,493 @@ +{ + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "en-us" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + "_": [ + 0 + ], + "^": [ + 1 + ], + "$": [ + 2 + ], + " ": [ + 3 + ], + "!": [ + 4 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "β": [ + 125 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "ⱱ": [ + 129 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + "̧": [ + 140 + ], + "̃": [ + 141 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̩": [ + 144 + ], + "ʰ": [ + 145 + ], + "ˤ": [ + 146 + ], + "ε": [ + 147 + ], + "↓": [ + 148 + ], + "#": [ + 149 + ], + "\"": [ + 150 + ], + "↑": [ + 151 + ], + "̺": [ + 152 + ], + "̻": [ + 153 + ] + }, + "num_symbols": 256, + "num_speakers": 1, + "speaker_id_map": {}, + "piper_version": "1.0.0", + "language": { + "code": "en_US", + "family": "en", + "region": "US", + "name_native": "English", + "name_english": "English", + "country_english": "United States" + }, + "dataset": "ryan" +} \ No newline at end of file diff --git a/piper-tts/fr_FR-mls-medium.onnx b/piper-tts/fr_FR-mls-medium.onnx new file mode 100644 index 0000000..438463b Binary files /dev/null and b/piper-tts/fr_FR-mls-medium.onnx differ diff --git a/piper-tts/fr_fr_FR_mls_medium_fr_FR-mls-medium.onnx.json b/piper-tts/fr_fr_FR_mls_medium_fr_FR-mls-medium.onnx.json new file mode 100644 index 0000000..1d116b3 --- /dev/null +++ b/piper-tts/fr_fr_FR_mls_medium_fr_FR-mls-medium.onnx.json @@ -0,0 +1,634 @@ +{ + "dataset": "mls", + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "fr" + }, + "language": { + "code": "fr_FR", + "family": "fr", + "region": "FR", + "name_native": "Français", + "name_english": "French", + "country_english": "France" + }, + "inference": { + "noise_scale": 0.333, + "length_scale": 1, + "noise_w": 0.333 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + " ": [ + 3 + ], + "!": [ + 4 + ], + "\"": [ + 150 + ], + "#": [ + 149 + ], + "$": [ + 2 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "X": [ + 156 + ], + "^": [ + 1 + ], + "_": [ + 0 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "g": [ + 154 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʦ": [ + 155 + ], + "ʰ": [ + 145 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "ˤ": [ + 146 + ], + "̃": [ + 141 + ], + "̊": [ + 158 + ], + "̝": [ + 157 + ], + "̧": [ + 140 + ], + "̩": [ + 144 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̺": [ + 152 + ], + "̻": [ + 153 + ], + "β": [ + 125 + ], + "ε": [ + 147 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "↑": [ + 151 + ], + "↓": [ + 148 + ], + "ⱱ": [ + 129 + ] + }, + "num_symbols": 256, + "num_speakers": 125, + "speaker_id_map": { + "1840": 0, + "3698": 1, + "123": 2, + "1474": 3, + "12709": 4, + "7423": 5, + "9242": 6, + "8778": 7, + "3060": 8, + "4512": 9, + "6249": 10, + "12541": 11, + "13634": 12, + "10065": 13, + "6128": 14, + "5232": 15, + "5764": 16, + "12713": 17, + "12823": 18, + "6070": 19, + "12501": 20, + "9121": 21, + "1649": 22, + "2776": 23, + "11772": 24, + "5612": 25, + "11822": 26, + "1590": 27, + "5525": 28, + "10827": 29, + "1243": 30, + "13142": 31, + "62": 32, + "13177": 33, + "10620": 34, + "8102": 35, + "8582": 36, + "11875": 37, + "7239": 38, + "9854": 39, + "7377": 40, + "10082": 41, + "12512": 42, + "1329": 43, + "2506": 44, + "6856": 45, + "10058": 46, + "103": 47, + "14": 48, + "6381": 49, + "1664": 50, + "11954": 51, + "66": 52, + "1127": 53, + "3270": 54, + "13611": 55, + "13658": 56, + "12968": 57, + "1989": 58, + "12981": 59, + "7193": 60, + "6348": 61, + "7679": 62, + "2284": 63, + "3182": 64, + "3503": 65, + "2033": 66, + "2771": 67, + "7614": 68, + "125": 69, + "3204": 70, + "5595": 71, + "5553": 72, + "694": 73, + "1624": 74, + "1887": 75, + "2926": 76, + "7150": 77, + "3190": 78, + "3344": 79, + "4699": 80, + "1798": 81, + "1745": 82, + "5077": 83, + "753": 84, + "52": 85, + "4174": 86, + "4018": 87, + "12899": 88, + "1844": 89, + "4396": 90, + "1817": 91, + "2155": 92, + "2946": 93, + "4336": 94, + "4609": 95, + "1977": 96, + "10957": 97, + "204": 98, + "4650": 99, + "5295": 100, + "5968": 101, + "4744": 102, + "2825": 103, + "9804": 104, + "707": 105, + "30": 106, + "115": 107, + "5840": 108, + "2587": 109, + "2607": 110, + "2544": 111, + "28": 112, + "27": 113, + "177": 114, + "112": 115, + "94": 116, + "2596": 117, + "3595": 118, + "7032": 119, + "7848": 120, + "11247": 121, + "7439": 122, + "2904": 123, + "6362": 124 + }, + "piper_version": "1.0.0" +} \ No newline at end of file diff --git a/piper-tts/glados.onnx b/piper-tts/glados.onnx new file mode 100644 index 0000000..ab20f6b Binary files /dev/null and b/piper-tts/glados.onnx differ diff --git a/piper-tts/glados.onnx.json b/piper-tts/glados.onnx.json new file mode 100644 index 0000000..f901241 --- /dev/null +++ b/piper-tts/glados.onnx.json @@ -0,0 +1,497 @@ +{ + "dataset": "glados", + "audio": { + "sample_rate": 22050, + "quality": "stacked_llama" + }, + "espeak": { + "voice": "en-us" + }, + "language": { + "code": "en-us" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + " ": [ + 3 + ], + "!": [ + 4 + ], + "\"": [ + 150 + ], + "#": [ + 149 + ], + "$": [ + 2 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "X": [ + 156 + ], + "^": [ + 1 + ], + "_": [ + 0 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "g": [ + 154 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʦ": [ + 155 + ], + "ʰ": [ + 145 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "ˤ": [ + 146 + ], + "̃": [ + 141 + ], + "̧": [ + 140 + ], + "̩": [ + 144 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̺": [ + 152 + ], + "̻": [ + 153 + ], + "β": [ + 125 + ], + "ε": [ + 147 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "↑": [ + 151 + ], + "↓": [ + 148 + ], + "ⱱ": [ + 129 + ] + }, + "num_symbols": 256, + "num_speakers": 1, + "speaker_id_map": {}, + "piper_version": "1.0.0" +} diff --git a/piper-tts/ru_RU-dmitri-medium.onnx b/piper-tts/ru_RU-dmitri-medium.onnx new file mode 100644 index 0000000..850fd01 Binary files /dev/null and b/piper-tts/ru_RU-dmitri-medium.onnx differ diff --git a/piper-tts/ru_ru_RU_dmitri_medium_ru_RU-dmitri-medium.onnx.json b/piper-tts/ru_ru_RU_dmitri_medium_ru_RU-dmitri-medium.onnx.json new file mode 100644 index 0000000..89ff93d --- /dev/null +++ b/piper-tts/ru_ru_RU_dmitri_medium_ru_RU-dmitri-medium.onnx.json @@ -0,0 +1,487 @@ +{ + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "ru" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + "_": [ + 0 + ], + "^": [ + 1 + ], + "$": [ + 2 + ], + " ": [ + 3 + ], + "!": [ + 4 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "β": [ + 125 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "ⱱ": [ + 129 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + "̧": [ + 140 + ], + "̃": [ + 141 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̩": [ + 144 + ], + "ʰ": [ + 145 + ], + "ˤ": [ + 146 + ], + "ε": [ + 147 + ], + "↓": [ + 148 + ], + "#": [ + 149 + ], + "\"": [ + 150 + ], + "↑": [ + 151 + ] + }, + "num_symbols": 256, + "num_speakers": 1, + "speaker_id_map": {}, + "piper_version": "1.0.0", + "language": { + "code": "ru_RU", + "family": "ru", + "region": "RU", + "name_native": "Русский", + "name_english": "Russian", + "country_english": "Russia" + }, + "dataset": "dmitri" +} \ No newline at end of file diff --git a/piper-tts/silero_vad.onnx b/piper-tts/silero_vad.onnx new file mode 100644 index 0000000..e6db48d Binary files /dev/null and b/piper-tts/silero_vad.onnx differ diff --git a/piper-tts/vi_VN-vais1000-medium.onnx b/piper-tts/vi_VN-vais1000-medium.onnx new file mode 100644 index 0000000..41c8c51 Binary files /dev/null and b/piper-tts/vi_VN-vais1000-medium.onnx differ diff --git a/piper-tts/vi_vi_VN_vais1000_medium_vi_VN-vais1000-medium.onnx.json b/piper-tts/vi_vi_VN_vais1000_medium_vi_VN-vais1000-medium.onnx.json new file mode 100644 index 0000000..b11c938 --- /dev/null +++ b/piper-tts/vi_vi_VN_vais1000_medium_vi_VN-vais1000-medium.onnx.json @@ -0,0 +1,492 @@ +{ + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "vi" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_map": {}, + "phoneme_id_map": { + "_": [ + 0 + ], + "^": [ + 1 + ], + "$": [ + 2 + ], + " ": [ + 3 + ], + "!": [ + 4 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "β": [ + 125 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "ⱱ": [ + 129 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + "̧": [ + 140 + ], + "̃": [ + 141 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̩": [ + 144 + ], + "ʰ": [ + 145 + ], + "ˤ": [ + 146 + ], + "ε": [ + 147 + ], + "↓": [ + 148 + ], + "#": [ + 149 + ], + "\"": [ + 150 + ], + "↑": [ + 151 + ], + "̺": [ + 152 + ], + "̻": [ + 153 + ] + }, + "num_symbols": 256, + "num_speakers": 1, + "speaker_id_map": {}, + "piper_version": "1.0.0", + "language": { + "code": "vi_VN", + "family": "vi", + "region": "VN", + "name_native": "Tiếng Việt", + "name_english": "Vietnamese", + "country_english": "Vietnam" + }, + "dataset": "vais1000" +} \ No newline at end of file diff --git a/piper-tts/zh_CN-huayan-medium.onnx b/piper-tts/zh_CN-huayan-medium.onnx new file mode 100644 index 0000000..fd1b1c7 Binary files /dev/null and b/piper-tts/zh_CN-huayan-medium.onnx differ diff --git a/piper-tts/zh_zh_CN_huayan_medium_zh_CN-huayan-medium.onnx.json b/piper-tts/zh_zh_CN_huayan_medium_zh_CN-huayan-medium.onnx.json new file mode 100644 index 0000000..f0e6e6e --- /dev/null +++ b/piper-tts/zh_zh_CN_huayan_medium_zh_CN-huayan-medium.onnx.json @@ -0,0 +1,487 @@ +{ + "audio": { + "sample_rate": 22050, + "quality": "medium" + }, + "espeak": { + "voice": "cmn" + }, + "inference": { + "noise_scale": 0.667, + "length_scale": 1, + "noise_w": 0.8 + }, + "phoneme_type": "espeak", + "phoneme_map": {}, + "phoneme_id_map": { + "_": [ + 0 + ], + "^": [ + 1 + ], + "$": [ + 2 + ], + " ": [ + 3 + ], + "!": [ + 4 + ], + "'": [ + 5 + ], + "(": [ + 6 + ], + ")": [ + 7 + ], + ",": [ + 8 + ], + "-": [ + 9 + ], + ".": [ + 10 + ], + ":": [ + 11 + ], + ";": [ + 12 + ], + "?": [ + 13 + ], + "a": [ + 14 + ], + "b": [ + 15 + ], + "c": [ + 16 + ], + "d": [ + 17 + ], + "e": [ + 18 + ], + "f": [ + 19 + ], + "h": [ + 20 + ], + "i": [ + 21 + ], + "j": [ + 22 + ], + "k": [ + 23 + ], + "l": [ + 24 + ], + "m": [ + 25 + ], + "n": [ + 26 + ], + "o": [ + 27 + ], + "p": [ + 28 + ], + "q": [ + 29 + ], + "r": [ + 30 + ], + "s": [ + 31 + ], + "t": [ + 32 + ], + "u": [ + 33 + ], + "v": [ + 34 + ], + "w": [ + 35 + ], + "x": [ + 36 + ], + "y": [ + 37 + ], + "z": [ + 38 + ], + "æ": [ + 39 + ], + "ç": [ + 40 + ], + "ð": [ + 41 + ], + "ø": [ + 42 + ], + "ħ": [ + 43 + ], + "ŋ": [ + 44 + ], + "œ": [ + 45 + ], + "ǀ": [ + 46 + ], + "ǁ": [ + 47 + ], + "ǂ": [ + 48 + ], + "ǃ": [ + 49 + ], + "ɐ": [ + 50 + ], + "ɑ": [ + 51 + ], + "ɒ": [ + 52 + ], + "ɓ": [ + 53 + ], + "ɔ": [ + 54 + ], + "ɕ": [ + 55 + ], + "ɖ": [ + 56 + ], + "ɗ": [ + 57 + ], + "ɘ": [ + 58 + ], + "ə": [ + 59 + ], + "ɚ": [ + 60 + ], + "ɛ": [ + 61 + ], + "ɜ": [ + 62 + ], + "ɞ": [ + 63 + ], + "ɟ": [ + 64 + ], + "ɠ": [ + 65 + ], + "ɡ": [ + 66 + ], + "ɢ": [ + 67 + ], + "ɣ": [ + 68 + ], + "ɤ": [ + 69 + ], + "ɥ": [ + 70 + ], + "ɦ": [ + 71 + ], + "ɧ": [ + 72 + ], + "ɨ": [ + 73 + ], + "ɪ": [ + 74 + ], + "ɫ": [ + 75 + ], + "ɬ": [ + 76 + ], + "ɭ": [ + 77 + ], + "ɮ": [ + 78 + ], + "ɯ": [ + 79 + ], + "ɰ": [ + 80 + ], + "ɱ": [ + 81 + ], + "ɲ": [ + 82 + ], + "ɳ": [ + 83 + ], + "ɴ": [ + 84 + ], + "ɵ": [ + 85 + ], + "ɶ": [ + 86 + ], + "ɸ": [ + 87 + ], + "ɹ": [ + 88 + ], + "ɺ": [ + 89 + ], + "ɻ": [ + 90 + ], + "ɽ": [ + 91 + ], + "ɾ": [ + 92 + ], + "ʀ": [ + 93 + ], + "ʁ": [ + 94 + ], + "ʂ": [ + 95 + ], + "ʃ": [ + 96 + ], + "ʄ": [ + 97 + ], + "ʈ": [ + 98 + ], + "ʉ": [ + 99 + ], + "ʊ": [ + 100 + ], + "ʋ": [ + 101 + ], + "ʌ": [ + 102 + ], + "ʍ": [ + 103 + ], + "ʎ": [ + 104 + ], + "ʏ": [ + 105 + ], + "ʐ": [ + 106 + ], + "ʑ": [ + 107 + ], + "ʒ": [ + 108 + ], + "ʔ": [ + 109 + ], + "ʕ": [ + 110 + ], + "ʘ": [ + 111 + ], + "ʙ": [ + 112 + ], + "ʛ": [ + 113 + ], + "ʜ": [ + 114 + ], + "ʝ": [ + 115 + ], + "ʟ": [ + 116 + ], + "ʡ": [ + 117 + ], + "ʢ": [ + 118 + ], + "ʲ": [ + 119 + ], + "ˈ": [ + 120 + ], + "ˌ": [ + 121 + ], + "ː": [ + 122 + ], + "ˑ": [ + 123 + ], + "˞": [ + 124 + ], + "β": [ + 125 + ], + "θ": [ + 126 + ], + "χ": [ + 127 + ], + "ᵻ": [ + 128 + ], + "ⱱ": [ + 129 + ], + "0": [ + 130 + ], + "1": [ + 131 + ], + "2": [ + 132 + ], + "3": [ + 133 + ], + "4": [ + 134 + ], + "5": [ + 135 + ], + "6": [ + 136 + ], + "7": [ + 137 + ], + "8": [ + 138 + ], + "9": [ + 139 + ], + "̧": [ + 140 + ], + "̃": [ + 141 + ], + "̪": [ + 142 + ], + "̯": [ + 143 + ], + "̩": [ + 144 + ], + "ʰ": [ + 145 + ], + "ˤ": [ + 146 + ], + "ε": [ + 147 + ], + "↓": [ + 148 + ], + "#": [ + 149 + ], + "\"": [ + 150 + ], + "↑": [ + 151 + ] + }, + "num_symbols": 256, + "num_speakers": 1, + "speaker_id_map": {}, + "piper_version": "1.0.0", + "language": { + "code": "zh_CN", + "family": "zh", + "region": "CN", + "name_native": "简体中文", + "name_english": "Chinese", + "country_english": "China" + }, + "dataset": "huayan" +} \ No newline at end of file diff --git a/piper.py b/piper.py new file mode 100644 index 0000000..17eba80 --- /dev/null +++ b/piper.py @@ -0,0 +1,116 @@ +import pygame, os, subprocess, language, wave, pyaudio, time, numpy as np, random +from dotenv import load_dotenv +load_dotenv() + +ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__)) +PIPER_AUDIO_OUTPUT_FILE = os.path.join(ROOT_DIR, 'llm_media/chat_voice_out.wav') +AUDIO_INPUT_FILE = os.path.join(ROOT_DIR, 'llm_media/input_audio.wav') + +def play_audio(audio_file, volume=0.5): + # Initialize pygame mixer + pygame.mixer.init() + pygame.mixer.music.load(audio_file) + pygame.mixer.music.play() + pygame.mixer.music.set_volume(volume) + while pygame.mixer.music.get_busy(): + continue + + + +def piper(text, model, config): + process1 = subprocess.run(['echo', text], stdout=subprocess.PIPE) + subprocess.run(['piper-tts', '--sentence-silence', '0.5', '--model', model, '--config', config, '--output_file', PIPER_AUDIO_OUTPUT_FILE], input=process1.stdout) + play_audio(PIPER_AUDIO_OUTPUT_FILE) + +def eng_piper(text): + piper(text, model = eng_piper_model, config = eng_piper_conf) + + + +def play_prompt(fallback_prompt: str = "Missing Fallback Prompt", audio_files: list = [], use_custom: bool = True): + if use_custom: + valid_files = [] + for i in audio_files: + if os.path.exists(i): + valid_files.append(i) + if len(valid_files) == 0: + eng_piper(fallback_prompt) + return + number_of_files = len(valid_files) + file = random.randint(0, number_of_files-1) + play_audio(valid_files[file], volume=1) + else: + eng_piper(fallback_prompt) + +eng_piper_model, eng_piper_conf = language.files_language('en') + + + +# Capture the audio input + +def capture_audio_until_silence(threshold=800, silence_duration=3, output_file = AUDIO_INPUT_FILE): + """ + Capture audio until a period of silence is detected. + threshold: The audio level that defines silence. + silence_duration: The duration of silence to wait for before stopping. + output_file: The file to save the recorded audio to. + """ + # PyAudio configuration + p = pyaudio.PyAudio() + chunk = 1024 + sample_format = pyaudio.paInt16 + channels = 2 + rate = 44100 + + # Start recording + stream = p.open(format=sample_format, + channels=channels, + rate=rate, + input=True, + frames_per_buffer=chunk) + + print("Listening...") + + audio_frames = [] + last_time = time.time() + + try: + while True: + # Read audio data + data = stream.read(chunk) + audio_frames.append(data) + + # Convert data to numpy array for analysis + audio_data = np.frombuffer(data, dtype=np.int16) + peak = np.abs(audio_data).max() + + # Check if the sound level exceeds the threshold + if peak > threshold: + last_time = time.time() # Reset the silence timer + else: + # Check for silence + if time.time() - last_time > silence_duration: + print(f"No sound detected for {silence_duration} seconds. Stopping...") + break + + except KeyboardInterrupt: + print("Stopped by user.") + + finally: + # Stop and close the stream + stream.stop_stream() + stream.close() + p.terminate() + with wave.open(output_file, 'wb') as wf: + wf.setnchannels(channels) + wf.setsampwidth(p.get_sample_size(sample_format)) + wf.setframerate(rate) + wf.writeframes(b''.join(audio_frames)) + + + +if __name__ == "__main__": + zh_piper_model, zh_piper_conf = language.files_language('zh') + eng_piper("Hello, I am Piper. I am a text-to-speech model.") + piper("你好,我是派珀。我是一个文本转语音模型。", model = zh_piper_model, config = zh_piper_conf) + # capture_audio_until_silence() # you can try out whisper on the recording you have just made by running whisper.py \ No newline at end of file diff --git a/record_audio.py b/record_audio.py new file mode 100644 index 0000000..7085f7a --- /dev/null +++ b/record_audio.py @@ -0,0 +1,32 @@ +import sounddevice as sd +import numpy as np +import wave, os, time +from dotenv import load_dotenv +load_dotenv() +def record_audio(duration, filename): + # Sampling frequency + fs = 44100 + + # Start recording + print("Recording...") + recording = sd.rec(int(duration * fs), samplerate=fs, channels=2, dtype=np.int16) + sd.wait() # Wait until recording is finished + print("Recording finished") + + # Save as WAV file + with wave.open(filename, 'wb') as wf: + wf.setnchannels(2) + wf.setsampwidth(2) + wf.setframerate(fs) + wf.writeframes(recording.tobytes()) + +if __name__ == "__main__": + ROOT_DIR = os.getenv('ROOT_DIR', './') + for i in range(1, 5): + print(f"Recording {i}...") + duration = 5 # seconds + filename = os.path.join(ROOT_DIR, 'llm_media/recording{i}.wav') + record_audio(duration, filename) + print(f"Saved as {filename}") + print("Break for 3 seconds...") + time.sleep(3) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..831e382 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,282 @@ +accelerate==1.0.1 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.3 +aiohttp==3.10.10 +aiosignal==1.3.1 +alembic==1.13.3 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +anyio==4.6.2.post1 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asteroid-filterbanks==0.4.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +audioread==3.0.1 +av==12.3.0 +babel==2.16.0 +bark==0.1.5 +beautifulsoup4==4.12.3 +bleach==6.1.0 +boto3==1.35.49 +botocore==1.35.49 +certifi==2024.8.30 +cffi==1.17.1 +chardet==3.0.4 +charset-normalizer==3.4.0 +click==8.1.7 +coloredlogs==15.0.1 +colorlog==6.8.2 +comm==0.2.2 +contourpy==1.3.0 +cpm-kernels==1.0.11 +ctranslate2==4.4.0 +cycler==0.12.1 +debugpy==1.8.7 +decorator==5.1.1 +deep-translator==1.11.4 +defusedxml==0.7.1 +diffusers==0.31.0 +dill==0.3.8 +docopt==0.6.2 +easyocr==1.7.2 +einops==0.8.0 +encodec==0.1.1 +executing==2.1.0 +fastapi==0.115.3 +faster-whisper==1.0.3 +fastjsonschema==2.20.0 +ffmpy==0.4.0 +filelock==3.16.1 +flatbuffers==24.3.25 +fonttools==4.54.1 +fqdn==1.5.1 +frozenlist==1.5.0 +fsspec==2024.9.0 +funcy==2.0 +googletrans==3.0.0 +gradio==5.4.0 +gradio_client==1.4.2 +greenlet==3.1.1 +h11==0.14.0 +h2==3.2.0 +hpack==3.0.0 +hstspreload==2024.10.1 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.26.1 +humanfriendly==10.0 +hyperframe==5.2.0 +HyperPyYAML==1.2.2 +idna==2.10 +imageio==2.36.0 +importlib_metadata==8.5.0 +ipykernel==6.29.5 +ipython==8.29.0 +ipython-autotime==0.3.2 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +Jinja2==3.1.4 +jmespath==1.0.1 +joblib==1.4.2 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +julius==0.2.7 +jupyter==1.1.1 +jupyter-console==6.6.3 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +kiwisolver==1.4.7 +langdetect==1.0.9 +langid==1.1.6 +latex2mathml==3.77.0 +lazy_loader==0.4 +librosa==0.10.2.post1 +lightning==2.4.0 +lightning-utilities==0.11.8 +llvmlite==0.43.0 +Mako==1.3.6 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.2 +matplotlib-inline==0.1.7 +mdtex2html==1.3.0 +mdurl==0.1.2 +mistune==3.0.2 +MouseInfo==0.1.3 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.1 +nltk==3.9.1 +notebook==7.2.2 +notebook_shim==0.2.4 +numba==0.60.0 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +omegaconf==2.3.0 +onnxruntime==1.19.2 +opencv-python==4.10.0.84 +opencv-python-headless==4.10.0.84 +optimum==1.23.2 +optuna==4.0.0 +orjson==3.10.10 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +pillow==11.0.0 +platformdirs==4.3.6 +pooch==1.8.2 +primePy==1.3 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 +protobuf==5.28.3 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pvporcupine==3.0.3 +pyannote.audio==3.1.1 +pyannote.core==5.0.0 +pyannote.database==5.1.0 +pyannote.metrics==3.2.1 +pyannote.pipeline==3.0.1 +pyarrow==17.0.0 +PyAudio==0.2.14 +PyAutoGUI==0.9.54 +pyclipper==1.3.0.post6 +pycparser==2.22 +pydantic==2.9.2 +pydantic_core==2.23.4 +pydub==0.25.1 +pygame==2.6.1 +PyGetWindow==0.0.9 +Pygments==2.18.0 +PyMsgBox==1.0.9 +pyparsing==3.2.0 +pyperclip==1.9.0 +pypinyin==0.53.0 +PyRect==0.2.0 +PyScreeze==1.0.1 +pytesseract==0.3.13 +python-bidi==0.6.3 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.12 +python3-xlib==0.15 +pytorch-lightning==2.4.0 +pytorch-metric-learning==2.6.0 +pyttsx3==2.98 +pytweening==1.2.0 +pytz==2024.2 +PyYAML==6.0.2 +pyzmq==26.2.0 +referencing==0.35.1 +regex==2024.9.11 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986==1.5.0 +rfc3986-validator==0.1.1 +rich==13.9.3 +rpds-py==0.20.0 +ruamel.yaml==0.18.6 +ruamel.yaml.clib==0.2.12 +ruff==0.7.1 +s3transfer==0.10.3 +safehttpx==0.1.1 +safetensors==0.4.5 +scikit-image==0.24.0 +scikit-learn==1.5.2 +scipy==1.14.1 +semantic-version==2.10.0 +semver==3.0.2 +Send2Trash==1.8.3 +sentencepiece==0.2.0 +setuptools==75.2.0 +shapely==2.0.6 +shellingham==1.5.4 +six==1.16.0 +sniffio==1.3.1 +sortedcontainers==2.4.0 +sounddevice==0.5.1 +soundfile==0.12.1 +soupsieve==2.6 +soxr==0.5.0.post1 +speechbrain==1.0.1 +SpeechRecognition==3.11.0 +SQLAlchemy==2.0.36 +srt==3.5.3 +stack-data==0.6.3 +starlette==0.41.2 +sympy==1.13.1 +tabulate==0.9.0 +tensorboardX==2.6.2.2 +terminado==0.18.1 +threadpoolctl==3.5.0 +tifffile==2024.9.20 +tinycss2==1.4.0 +tokenizers==0.20.1 +tomlkit==0.12.0 +torch==2.5.0 +torch-audiomentations==0.11.1 +torch_pitch_shift==1.2.5 +torchaudio==2.5.0 +torchmetrics==1.5.1 +torchvision==0.20.0 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 +transformers==4.46.0 +triton==3.1.0 +typer==0.12.5 +types-python-dateutil==2.9.0.20241003 +typing_extensions==4.12.2 +tzdata==2024.2 +uri-template==1.3.0 +urllib3==2.2.3 +uvicorn==0.32.0 +vosk==0.3.45 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +websockets==12.0 +widgetsnbextension==4.0.13 +xxhash==3.5.0 +yarl==1.16.0 +zipp==3.20.2 diff --git a/whisper.py b/whisper.py new file mode 100644 index 0000000..67b6b20 --- /dev/null +++ b/whisper.py @@ -0,0 +1,43 @@ +########################################################################################## +##### WhisperX ##### + +import torch +from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + +def whisper_pipeline(model_id: str, whisper_lang: str): + device = "cuda:0" if torch.cuda.is_available() else "cpu" + torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + + model = AutoModelForSpeechSeq2Seq.from_pretrained( + model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa" + ) + model.generation_config.language = whisper_lang # define your language of choice here + model.to(device) + + processor = AutoProcessor.from_pretrained(model_id) + + pipe = pipeline( + "automatic-speech-recognition", + model=model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + generate_kwargs={"max_new_tokens": 128}, + torch_dtype=torch_dtype, + device=device + ) + + return pipe +########################################################################################## + +if __name__ == "__main__": + # Example Usage + import os + from dotenv import load_dotenv + load_dotenv() + ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__)) + model_id = "openai/whisper-tiny" + whisper_lang = "en" + whisper = whisper_pipeline(model_id, whisper_lang) + audio_file = os.path.join(ROOT_DIR, "llm_media/input_audio.wav") + result = whisper(audio_file)['text'] + print(result)