Initial commit

chickenflyshigh 2024-10-29 22:05:42 +11:00
commit 51efcd9c0f
25 changed files with 4973 additions and 0 deletions

3
README.md Normal file
@@ -0,0 +1,3 @@
#
This

51
language.py Normal file
@@ -0,0 +1,51 @@
import os
from dotenv import load_dotenv
load_dotenv()
# Download voices here: https://github.com/rhasspy/piper/blob/master/VOICES.md
ROOT_DIR = os.getenv('ROOT_DIR',os.path.dirname(__file__))
PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts')
LANGUAGES = ['en', 'fr', 'ru', 'zh', 'vi']
# piper-tts
config_file = {'en': 'glados.onnx.json',
'fr': 'fr_fr_FR_mls_medium_fr_FR-mls-medium.onnx.json',
'ru': 'ru_ru_RU_dmitri_medium_ru_RU-dmitri-medium.onnx.json',
'zh': 'zh_zh_CN_huayan_medium_zh_CN-huayan-medium.onnx.json',
'vi': 'vi_vi_VN_vais1000_medium_vi_VN-vais1000-medium.onnx.json'
}
onnx_file = {'en': 'glados.onnx',
'fr': 'fr_FR-mls-medium.onnx',
'ru': 'ru_RU-dmitri-medium.onnx',
'zh': 'zh_CN-huayan-medium.onnx',
'vi': 'vi_VN-vais1000-medium.onnx'
}
prompt = { 'en': 'Respond only in english in under 100 words.',
'fr': 'Répondez uniquement en français en moins de 100 mots.',
'ru': 'Отвечайте только на русском языке менее 100 слов.',
'zh': '只用中文回答不超过100个字。一点英文都不可以用。',
'vi': 'Chỉ trả lời bằng tiếng Việt dưới 100 từ.'
}
def piper_voice_language(lang):
if lang in LANGUAGES:
return lang
else:
return 'en'
# retrieve the corresponding piper-tts voice files for the language
def files_language(lang):
language = piper_voice_language(lang)
conf_path = os.path.join(PIPER_TTS_DIR, config_file[language])
onnx_path = os.path.join(PIPER_TTS_DIR, onnx_file[language])
return (onnx_path, conf_path)
def get_variables(lang):
language = piper_voice_language(lang)
starting_prompt = prompt[language]
onnx_path, conf_path = files_language(language)
return (onnx_path, conf_path, language, starting_prompt)
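
A quick usage sketch of this module as main.py and piper.py consume it; this assumes the French voice files listed above have already been downloaded into the piper-tts folder.

import language

onnx_path, conf_path, lang, starting_prompt = language.get_variables('fr')
print(lang)       # 'fr'
print(onnx_path)  # .../piper-tts/fr_FR-mls-medium.onnx
# Unsupported codes fall back to English:
print(language.piper_voice_language('de'))  # 'en'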

274
main.py Normal file
@@ -0,0 +1,274 @@
##########################################################################################
##### import basic libraries #####
import os, time, random, json, requests, sys, pyaudio
import pvporcupine
import struct
from datetime import datetime
import language
from whisper import whisper_pipeline
import piper as pp
from piper import play_audio, eng_piper, play_prompt
import ollama as ol
from dotenv import load_dotenv
load_dotenv()
##########################################################################################
##########################################################################################
# By default this requires at least 6GB of VRAM when using a CUDA-capable GPU. Requirements can be lowered by switching the Ollama and Whisper models to smaller ones.
# You can also consider Leopard, a small and relatively accurate audio transcription model. Edit whisper.py to use Leopard instead of WhisperX, and ensure the function output is just a string.
# For basic audio transcription you can also use Vosk or Google Speech-to-Text. Google Speech is less accurate and subject to rate limiting/charges; Vosk is relatively inaccurate but uses minimal resources.
# For more languages, look at language.py and download the corresponding voices into the piper-tts folder.
# Ensure you have Ollama installed. If the models are not yet downloaded from Ollama, the first run may take some time depending on which model you choose.
##########################################################################################
##########################################################################################
### Root directory for where you store llm_media, wake_words and piper_tts folders and files
# By default, it is the same directory as this file
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__)) # for where you want to keep all your files
### From the ROOT_DIR, create the necessary folders if they don't exist
WAKE_WORDS_DIR = os.path.join(ROOT_DIR, 'wake_words')
LLM_MEDIA_DIR = os.path.join(ROOT_DIR, 'llm_media')
PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts')
if not os.path.exists(WAKE_WORDS_DIR):
os.makedirs(WAKE_WORDS_DIR)
sys.exit(f"Wake words directory have just been created in {ROOT_DIR}. Please download the wake word files from the Picovoice Console and place them in the wake_words directory.")
if not os.path.exists(LLM_MEDIA_DIR):
os.makedirs(LLM_MEDIA_DIR)
if not os.path.exists(PIPER_TTS_DIR):
os.makedirs(PIPER_TTS_DIR)
sys.exit(f"Piper TTS directory have just been created in {ROOT_DIR}. Please download the piper-tts files from the Piper TTS repository and place them in the piper-tts directory.")
### Custom prompts
CUSTOM_PROMPTS_DIR = os.path.join(LLM_MEDIA_DIR, 'custom_prompts')
if not os.path.exists(CUSTOM_PROMPTS_DIR):
os.makedirs(CUSTOM_PROMPTS_DIR)
##########################################################################################
##########################################################################################
##### Environment Variables #####
# Please define the following environment variables in a .env file or in your system environment. The only required ones are PICOVOICE_ACCESS_KEY, WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE.
# WAKEWORD_FILE_INITIAL is the model file for the word that starts (and continues) the chat, and WAKEWORD_FILE_CONTINUE is the model file for the word that ends it. See WAKE_WORD_1 and WAKE_WORD_2 below.
### Llama Variables
BASE_URL = os.getenv('BASE_URL', 'http://localhost:11434')
LLM_MODEL = os.getenv('LLM_MODEL', 'llama3.2:latest') # about 3GB of VRAM/RAM required for LLAMA3.2 model
### Threshold for audio peaks
THRESHOLD = int(os.getenv('THRESHOLD', '1000'))
### WhisperX Variables
MODEL_ID = os.getenv('MODEL_ID', 'openai/whisper-large-v3-turbo')
# Language code for whisper pipeline, defaults to en. Possible languages = ['en', 'fr', 'ru', 'zh', 'vi']
LANG = 'en'
for arg in sys.argv:
if arg.lower().startswith('lang='):
LANG = arg.split('=')[1]
### Piper variables
# LANG is checked for validity else it defaults to 'en'. Prompt is used for the OLLAMA model at the start.
PIPER_MODEL_FILE, PIPER_CONFIG_FILE, LANG, PROMPT = language.get_variables(LANG)
# Prompt for the Llama model. Change the environment variable PROMPT to change the prompt.
PROMPT = os.getenv('PROMPT', PROMPT)
### Picovoice
ACCESS_KEY = os.getenv('PICOVOICE_ACCESS_KEY')
WAKEWORD_FILE_INITIAL = os.getenv('WAKEWORD_FILE_INITIAL')
WAKEWORD_FILE_CONTINUE = os.getenv('WAKEWORD_FILE_CONTINUE')
WAKE_WORD_1 = os.getenv('WAKE_WORD_1', 'Hey Penguin') # Wake word to start and continue the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_INITIAL
WAKE_WORD_2 = os.getenv('WAKE_WORD_2', 'Bye Penguin') # Wake word to end the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_CONTINUE
if not ACCESS_KEY or not WAKEWORD_FILE_INITIAL or not WAKEWORD_FILE_CONTINUE:
sys.exit("""Please set the PICOVOICE_ACCESS_KEY environment variable to your Picovoice Access Key. It is free to setup an account and get an access key and everything is done locally after verifying your account.
Then set the WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE environment variables to be the basename of the wake word files in the wake_words directory. You can generate these files from the Picovoice Console.""")
##### Voice prompts by Piper for the conversation #####
# You can change these to your own voice prompts
WELCOME_PROMPT = os.getenv('WELCOME_PROMPT', 'Hello, I am Pengames. How can I help you today?')
LISTENING_PROMPT = os.getenv('LISTENING_PROMPT', 'Listening...') # To know when the bot is listening
GOODBYE_PROMPT = os.getenv('GOODBYE_PROMPT', 'Goodbye for now. Have an amazing day. Big COOL Penguin signing off.')
CONTINUE_CONVO_INSTRUCTIONS_PROMPT = os.getenv('CONTINUE_CONVO_INSTRUCTIONS', f'Do you want to continue the conversation? Call me {WAKE_WORD_1} to continue, otherwise say {WAKE_WORD_2} to exit immediately.') # For the first continue prompt; uses its own env var so it can be set independently of CONTINUE_CONVO below
CONTINUE_CONVO_PROMPT = os.getenv('CONTINUE_CONVO', 'Do you want to continue the conversation?') # For subsequent continue prompts once you already know the instructions
##### Custom audio files for responses
# You can also set custom audio outputs for various responses by using the play_audio function from piper.py instead of generating speech with Piper. Put the audio files in llm_media/custom_prompts.
USE_CUSTOM_AUDIO = os.getenv('USE_CUSTOM_AUDIO', 'False') == 'True' # Set the env var to "True" to use custom audio files for responses; otherwise Piper generates them.
CUSTOM_JSON_FILE = os.getenv('CUSTOM_JSON_FILE', 'custom_prompts.json')
CUSTOM_JSON_FILE = os.path.join(ROOT_DIR, CUSTOM_JSON_FILE)
print(f"Using custom audio: {USE_CUSTOM_AUDIO}")
custom_prompts = {}
if os.path.exists(CUSTOM_JSON_FILE): # the JSON file is optional; fall back to the defaults below when it is missing
    with open(CUSTOM_JSON_FILE, 'r') as f:
        custom_prompts = json.load(f)
CUSTOM_WELCOME_PROMPTS = custom_prompts.get('CUSTOM_WELCOME_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_welcome.wav')])
CUSTOM_LISTENING_PROMPTS = custom_prompts.get('CUSTOM_LISTENING_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_listening.wav')])
CUSTOM_CONTINUE_CONVO_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_continue_convo.wav')])
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_continue_convo_instructions.wav')])
CUSTOM_GOODBYE_PROMPTS = custom_prompts.get('CUSTOM_GOODBYE_PROMPTS', [os.path.join(CUSTOM_PROMPTS_DIR, 'custom_goodbye.wav')])
CUSTOM_WELCOME_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_WELCOME_PROMPTS]
CUSTOM_LISTENING_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_LISTENING_PROMPTS]
CUSTOM_CONTINUE_CONVO_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_PROMPTS]
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS]
CUSTOM_GOODBYE_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_GOODBYE_PROMPTS]
##########################################################################################
# Load the functions and define helper variables/functions. Do not edit these variables
##### WHISPER PIPELINE #####
pipe = whisper_pipeline(model_id=MODEL_ID, whisper_lang=LANG)
##### Piper Function For TTS #####
# Download piper-tts directly onto your system or via pip.
piper = lambda text: pp.piper(text, model = PIPER_MODEL_FILE, config = PIPER_CONFIG_FILE)
##### Llama Model #####
LLM_RESPONSE_PATH = os.path.join(LLM_MEDIA_DIR, 'llm_response.txt') # File to store all the responses
# Define the API endpoint
API_URL = f"{BASE_URL}/api/generate"
OLLAMA_JSON = os.path.join(LLM_MEDIA_DIR, 'llm_context.json')
converse = lambda text, llm_model = LLM_MODEL, llm_response_path = LLM_RESPONSE_PATH, have_context = False: ol.converse(text, ping_url=BASE_URL, api_url=API_URL, llm_model=llm_model, llm_response_path=llm_response_path, prompt=PROMPT, context_file=OLLAMA_JSON, have_context=have_context)
##### Porcupine Wake Word Detection #####
# Path to the wake word model file
WAKE_WORD_PATH_1 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_INITIAL)
WAKE_WORD_PATH_2 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_CONTINUE)
##########################################################################################
##########################################################################################
##### Pico Voice Wake Word Detection With Porcupine #####
def porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=()):
    detector = pvporcupine.create(access_key=ACCESS_KEY, keyword_paths=list(WAKE_WORD_PATHS))
    # Set up an audio stream matching the detector's expected sample rate and frame length
    pa = pyaudio.PyAudio()
    audio_stream = pa.open(
        rate=detector.sample_rate,
        channels=1,
        format=pyaudio.paInt16,
        input=True,
        frames_per_buffer=detector.frame_length
    )
    return detector, audio_stream, pa
# Pipeline from capturing words to reply
def speech_to_response(threshold = THRESHOLD, audio_input_file = pp.AUDIO_INPUT_FILE, llm_model = LLM_MODEL, llm_response_path = LLM_RESPONSE_PATH, have_context = False):
    # Record from the microphone until silence, using pyaudio to track peaks. The recording is saved to the audio_input_file location.
pp.capture_audio_until_silence(threshold=threshold)
# Run whisper pipeline or leopard pipeline on the recently saved audio file to transcribe.
transcribed_text = pipe(audio_input_file)['text']
# Pipe the transcribed text straight into the Ollama LLAMA model and output response into a file
response = converse(transcribed_text, llm_model=llm_model, llm_response_path=llm_response_path, have_context=have_context)
print(response)
# Get Piper to read out the result.
piper(response)
# Initial response to wake word
def convo_initialised():
play_prompt(fallback_prompt=WELCOME_PROMPT, audio_files=CUSTOM_WELCOME_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
speech_to_response()
# Function to continue the conversation
def continue_convo(time_limit=60):
play_prompt(fallback_prompt=CONTINUE_CONVO_INSTRUCTIONS_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
# Wake word detection to continue or exit the conversation
    # Listen for WAKE_WORD_1 to continue and WAKE_WORD_2 to end, matching the spoken instructions above
    porcupine_continue, audio_stream_continue, pa_continue = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_1])
    porcupine_end, audio_stream_end, pa_end = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2])
start = time.time()
try:
while (time.time() - start < time_limit):
# Read a frame of audio
pcm_continue = audio_stream_continue.read(porcupine_continue.frame_length, exception_on_overflow=False)
pcm_unpacked_continue = struct.unpack_from("h" * porcupine_continue.frame_length, pcm_continue)
pcm_end = audio_stream_end.read(porcupine_end.frame_length, exception_on_overflow=False)
pcm_unpacked_end = struct.unpack_from("h" * porcupine_end.frame_length, pcm_end)
# Check if the wake word is detected
keyword_index_continue = porcupine_continue.process(pcm_unpacked_continue)
keyword_index_end = porcupine_end.process(pcm_unpacked_end)
if keyword_index_continue >= 0:
play_prompt(fallback_prompt=LISTENING_PROMPT, audio_files=CUSTOM_LISTENING_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
speech_to_response(have_context=True)
play_prompt(fallback_prompt=CONTINUE_CONVO_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
start = time.time()
elif keyword_index_end >= 0:
break
except KeyboardInterrupt:
print("Stopping...")
finally:
# Clean up resources
audio_stream_continue.stop_stream()
audio_stream_continue.close()
pa_continue.terminate()
porcupine_continue.delete()
audio_stream_end.stop_stream()
audio_stream_end.close()
pa_end.terminate()
porcupine_end.delete()
play_prompt(fallback_prompt=GOODBYE_PROMPT, audio_files=CUSTOM_GOODBYE_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
##########################################################################################
##########################################################################################
##### Main Loop #####
# Initialize Porcupine with the initial custom wake word (WAKE_WORD_1)
porcupine_initial, audio_stream_initial, pa_initial = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_1])
print("Listening for wake word...")
try:
while True:
# Read a frame of audio
pcm_initial = audio_stream_initial.read(porcupine_initial.frame_length, exception_on_overflow=False)
pcm_unpacked_initial = struct.unpack_from("h" * porcupine_initial.frame_length, pcm_initial)
# Check if the wake word is detected
keyword_index = porcupine_initial.process(pcm_unpacked_initial)
if keyword_index >= 0:
convo_initialised()
continue_convo()
except KeyboardInterrupt:
print("Stopping...")
finally:
# Clean up resources
audio_stream_initial.stop_stream()
audio_stream_initial.close()
pa_initial.terminate()
porcupine_initial.delete()
##########################################################################################
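
For reference, a minimal .env for this script might look like the following. The key names come from the code above; every value is a placeholder, and the .ppn filenames in particular are hypothetical and must match the files you generated in the Picovoice Console.

PICOVOICE_ACCESS_KEY=your-picovoice-access-key
WAKEWORD_FILE_INITIAL=Hey-Penguin.ppn      # hypothetical filename
WAKEWORD_FILE_CONTINUE=Bye-Penguin.ppn     # hypothetical filename
# Optional overrides (defaults shown):
BASE_URL=http://localhost:11434
LLM_MODEL=llama3.2:latest
THRESHOLD=1000
USE_CUSTOM_AUDIO=False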

81
ollama.py Normal file
@@ -0,0 +1,81 @@
import pyaudio, wave, time, os, numpy as np, requests, subprocess, sys, json
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))
##########################################################################################
##### OLLAMA #####
def ping_api(url):
"""Ping the specified API URL and return the response status."""
try:
# Send a GET request to the API
response = requests.get(url)
# Check the response status code
if response.status_code == 200:
print("API is reachable.")
return True
else:
print(f"API responded with status code: {response.status_code}")
return False
except requests.exceptions.RequestException as e:
print(f"Error pinging API: {e}")
return False
def start_ollama_server(llm_model):
    """Warm up the model via `ollama run` (requires Ollama to be installed)."""
    # Adjust the command as necessary for your setup; the throwaway prompt just loads the model and keeps it resident for 24 hours
    command = ["ollama", "run", "--keepalive", "24h", llm_model, "Don't Say Anything"]
    subprocess.run(command)
def converse(input_text, ping_url, api_url, llm_model, llm_response_path, prompt, context_file, have_context = False):
"""
Send a prompt to the Ollama API and return the response.
input_text: The text to send to the API.
ping_url: The URL to ping to check if the API is running.
api_url: The URL of the Ollama API.
llm_model: The LLM model to use.
llm_response_path: The path to save the LLM responses and prompts.
prompt: The prompt to use for the conversation.
context_file: The path to the context file.
"""
    # Ping the Ollama server and start it if unreachable
if not ping_api(ping_url):
try:
start_ollama_server(llm_model)
except Exception as e:
print(f"Error starting Ollama server: {e}. If you are using another Ollama server, please ensure you have correctly specified the BASE_URL and that the server is running and not firewalled off.")
sys.exit(1)
payload = { "model": llm_model, "prompt": f'{prompt} {input_text}', "stream": False, "keep_alive": "24h" }
if have_context:
# load json context file
with open(context_file, 'r') as f:
context = json.load(f).get('context')
payload.update({'context': context})
# Make the POST request
response = requests.post(api_url, json=payload)
# Check for errors and print the response
if not response.ok:
print("Error:", response.status_code, response.text)
# Save the context and all other responses of the API call to a file
with open(context_file, 'w') as f:
json.dump(response.json(), f)
# Save the conversations to a file
with open(llm_response_path, "a") as f:
f.write(f'[{datetime.now().isoformat()}] Prompt: {input_text}\n')
        f.write(f"[{response.json().get('created_at')}] Response: {response.json().get('response')}\n")
return response.json().get('response')
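
A minimal sketch of calling converse directly, assuming a local Ollama instance with llama3.2 already pulled; the paths are illustrative and the llm_media folder must exist. Passing have_context=True on follow-up calls replays the context saved from the previous response, which is how main.py keeps multi-turn conversations coherent.

import ollama as ol

reply = ol.converse(
    "What is the capital of France?",
    ping_url="http://localhost:11434",
    api_url="http://localhost:11434/api/generate",
    llm_model="llama3.2:latest",
    llm_response_path="llm_media/llm_response.txt",
    prompt="Respond only in english in under 100 words.",
    context_file="llm_media/llm_context.json",
    have_context=False,  # True on follow-up turns to reuse the saved context
)
print(reply)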

Binary file not shown.

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,493 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "en-gb-x-rp"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
],
"̺": [
152
],
"̻": [
153
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "en_GB",
"family": "en",
"region": "GB",
"name_native": "English",
"name_english": "English",
"country_english": "Great Britain"
},
"dataset": "jenny_dioco"
}
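
All of the voice configs in this commit share this structure: phoneme_id_map maps espeak IPA symbols (plus punctuation and the special _ pad, ^ start and $ end markers) to the integer IDs the ONNX model consumes. Below is a rough Python sketch of the lookup, assuming already-phonemized text; actual Piper inference reportedly also interleaves the pad ID between symbols, as done here.

import json

with open('piper-tts/glados.onnx.json') as f:  # any config in this commit works
    id_map = json.load(f)['phoneme_id_map']

def to_ids(phonemes: str) -> list:
    ids = id_map['^'][:]                        # start-of-sequence marker
    for ch in phonemes:
        ids += id_map.get(ch, id_map['_'])      # unknown symbols fall back to pad
        ids += id_map['_']                      # pad interleaved between symbols
    ids += id_map['$']                          # end-of-sequence marker
    return ids

print(to_ids('həlˈoʊ'))  # [1, 20, 0, 59, 0, 24, 0, 120, 0, 27, 0, 100, 0, 2]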

@@ -0,0 +1,508 @@
{
"piper_version": "1.2.0",
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "en-gb-x-rp"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
" ": [
3
],
"!": [
4
],
"\"": [
150
],
"#": [
149
],
"$": [
2
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
":": [
11
],
";": [
12
],
"?": [
13
],
"X": [
156
],
"^": [
1
],
"_": [
0
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"g": [
154
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʦ": [
155
],
"ʰ": [
145
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"ˤ": [
146
],
"̃": [
141
],
"̧": [
140
],
"̩": [
144
],
"̪": [
142
],
"̯": [
143
],
"̺": [
152
],
"̻": [
153
],
"β": [
125
],
"ε": [
147
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"↑": [
151
],
"↓": [
148
],
"ⱱ": [
129
]
},
"num_symbols": 256,
"num_speakers": 4,
"speaker_id_map": {
"prudence": 0,
"spike": 1,
"obadiah": 2,
"poppy": 3
},
"piper_version": "1.0.0",
"language": {
"code": "en_GB",
"family": "en",
"region": "GB",
"name_native": "English",
"name_english": "English",
"country_english": "Great Britain"
},
"dataset": "semaine"
}

@@ -0,0 +1,493 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "en-us"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
],
"̺": [
152
],
"̻": [
153
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "en_US",
"family": "en",
"region": "US",
"name_native": "English",
"name_english": "English",
"country_english": "United States"
},
"dataset": "ryan"
}

Binary file not shown.

@@ -0,0 +1,634 @@
{
"dataset": "mls",
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "fr"
},
"language": {
"code": "fr_FR",
"family": "fr",
"region": "FR",
"name_native": "Français",
"name_english": "French",
"country_english": "France"
},
"inference": {
"noise_scale": 0.333,
"length_scale": 1,
"noise_w": 0.333
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
" ": [
3
],
"!": [
4
],
"\"": [
150
],
"#": [
149
],
"$": [
2
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
":": [
11
],
";": [
12
],
"?": [
13
],
"X": [
156
],
"^": [
1
],
"_": [
0
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"g": [
154
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʦ": [
155
],
"ʰ": [
145
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"ˤ": [
146
],
"̃": [
141
],
"̊": [
158
],
"̝": [
157
],
"̧": [
140
],
"̩": [
144
],
"̪": [
142
],
"̯": [
143
],
"̺": [
152
],
"̻": [
153
],
"β": [
125
],
"ε": [
147
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"↑": [
151
],
"↓": [
148
],
"ⱱ": [
129
]
},
"num_symbols": 256,
"num_speakers": 125,
"speaker_id_map": {
"1840": 0,
"3698": 1,
"123": 2,
"1474": 3,
"12709": 4,
"7423": 5,
"9242": 6,
"8778": 7,
"3060": 8,
"4512": 9,
"6249": 10,
"12541": 11,
"13634": 12,
"10065": 13,
"6128": 14,
"5232": 15,
"5764": 16,
"12713": 17,
"12823": 18,
"6070": 19,
"12501": 20,
"9121": 21,
"1649": 22,
"2776": 23,
"11772": 24,
"5612": 25,
"11822": 26,
"1590": 27,
"5525": 28,
"10827": 29,
"1243": 30,
"13142": 31,
"62": 32,
"13177": 33,
"10620": 34,
"8102": 35,
"8582": 36,
"11875": 37,
"7239": 38,
"9854": 39,
"7377": 40,
"10082": 41,
"12512": 42,
"1329": 43,
"2506": 44,
"6856": 45,
"10058": 46,
"103": 47,
"14": 48,
"6381": 49,
"1664": 50,
"11954": 51,
"66": 52,
"1127": 53,
"3270": 54,
"13611": 55,
"13658": 56,
"12968": 57,
"1989": 58,
"12981": 59,
"7193": 60,
"6348": 61,
"7679": 62,
"2284": 63,
"3182": 64,
"3503": 65,
"2033": 66,
"2771": 67,
"7614": 68,
"125": 69,
"3204": 70,
"5595": 71,
"5553": 72,
"694": 73,
"1624": 74,
"1887": 75,
"2926": 76,
"7150": 77,
"3190": 78,
"3344": 79,
"4699": 80,
"1798": 81,
"1745": 82,
"5077": 83,
"753": 84,
"52": 85,
"4174": 86,
"4018": 87,
"12899": 88,
"1844": 89,
"4396": 90,
"1817": 91,
"2155": 92,
"2946": 93,
"4336": 94,
"4609": 95,
"1977": 96,
"10957": 97,
"204": 98,
"4650": 99,
"5295": 100,
"5968": 101,
"4744": 102,
"2825": 103,
"9804": 104,
"707": 105,
"30": 106,
"115": 107,
"5840": 108,
"2587": 109,
"2607": 110,
"2544": 111,
"28": 112,
"27": 113,
"177": 114,
"112": 115,
"94": 116,
"2596": 117,
"3595": 118,
"7032": 119,
"7848": 120,
"11247": 121,
"7439": 122,
"2904": 123,
"6362": 124
},
"piper_version": "1.0.0"
}

BIN
piper-tts/glados.onnx Normal file

Binary file not shown.

497
piper-tts/glados.onnx.json Normal file
@@ -0,0 +1,497 @@
{
"dataset": "glados",
"audio": {
"sample_rate": 22050,
"quality": "stacked_llama"
},
"espeak": {
"voice": "en-us"
},
"language": {
"code": "en-us"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
" ": [
3
],
"!": [
4
],
"\"": [
150
],
"#": [
149
],
"$": [
2
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
":": [
11
],
";": [
12
],
"?": [
13
],
"X": [
156
],
"^": [
1
],
"_": [
0
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"g": [
154
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʦ": [
155
],
"ʰ": [
145
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"ˤ": [
146
],
"̃": [
141
],
"̧": [
140
],
"̩": [
144
],
"̪": [
142
],
"̯": [
143
],
"̺": [
152
],
"̻": [
153
],
"β": [
125
],
"ε": [
147
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"↑": [
151
],
"↓": [
148
],
"ⱱ": [
129
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0"
}

Binary file not shown.

@@ -0,0 +1,487 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "ru"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "ru_RU",
"family": "ru",
"region": "RU",
"name_native": "Русский",
"name_english": "Russian",
"country_english": "Russia"
},
"dataset": "dmitri"
}

BIN
piper-tts/silero_vad.onnx Normal file

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,492 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "vi"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
],
"̺": [
152
],
"̻": [
153
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "vi_VN",
"family": "vi",
"region": "VN",
"name_native": "Tiếng Việt",
"name_english": "Vietnamese",
"country_english": "Vietnam"
},
"dataset": "vais1000"
}

Binary file not shown.

@@ -0,0 +1,487 @@
{
"audio": {
"sample_rate": 22050,
"quality": "medium"
},
"espeak": {
"voice": "cmn"
},
"inference": {
"noise_scale": 0.667,
"length_scale": 1,
"noise_w": 0.8
},
"phoneme_type": "espeak",
"phoneme_map": {},
"phoneme_id_map": {
"_": [
0
],
"^": [
1
],
"$": [
2
],
" ": [
3
],
"!": [
4
],
"'": [
5
],
"(": [
6
],
")": [
7
],
",": [
8
],
"-": [
9
],
".": [
10
],
":": [
11
],
";": [
12
],
"?": [
13
],
"a": [
14
],
"b": [
15
],
"c": [
16
],
"d": [
17
],
"e": [
18
],
"f": [
19
],
"h": [
20
],
"i": [
21
],
"j": [
22
],
"k": [
23
],
"l": [
24
],
"m": [
25
],
"n": [
26
],
"o": [
27
],
"p": [
28
],
"q": [
29
],
"r": [
30
],
"s": [
31
],
"t": [
32
],
"u": [
33
],
"v": [
34
],
"w": [
35
],
"x": [
36
],
"y": [
37
],
"z": [
38
],
"æ": [
39
],
"ç": [
40
],
"ð": [
41
],
"ø": [
42
],
"ħ": [
43
],
"ŋ": [
44
],
"œ": [
45
],
"ǀ": [
46
],
"ǁ": [
47
],
"ǂ": [
48
],
"ǃ": [
49
],
"ɐ": [
50
],
"ɑ": [
51
],
"ɒ": [
52
],
"ɓ": [
53
],
"ɔ": [
54
],
"ɕ": [
55
],
"ɖ": [
56
],
"ɗ": [
57
],
"ɘ": [
58
],
"ə": [
59
],
"ɚ": [
60
],
"ɛ": [
61
],
"ɜ": [
62
],
"ɞ": [
63
],
"ɟ": [
64
],
"ɠ": [
65
],
"ɡ": [
66
],
"ɢ": [
67
],
"ɣ": [
68
],
"ɤ": [
69
],
"ɥ": [
70
],
"ɦ": [
71
],
"ɧ": [
72
],
"ɨ": [
73
],
"ɪ": [
74
],
"ɫ": [
75
],
"ɬ": [
76
],
"ɭ": [
77
],
"ɮ": [
78
],
"ɯ": [
79
],
"ɰ": [
80
],
"ɱ": [
81
],
"ɲ": [
82
],
"ɳ": [
83
],
"ɴ": [
84
],
"ɵ": [
85
],
"ɶ": [
86
],
"ɸ": [
87
],
"ɹ": [
88
],
"ɺ": [
89
],
"ɻ": [
90
],
"ɽ": [
91
],
"ɾ": [
92
],
"ʀ": [
93
],
"ʁ": [
94
],
"ʂ": [
95
],
"ʃ": [
96
],
"ʄ": [
97
],
"ʈ": [
98
],
"ʉ": [
99
],
"ʊ": [
100
],
"ʋ": [
101
],
"ʌ": [
102
],
"ʍ": [
103
],
"ʎ": [
104
],
"ʏ": [
105
],
"ʐ": [
106
],
"ʑ": [
107
],
"ʒ": [
108
],
"ʔ": [
109
],
"ʕ": [
110
],
"ʘ": [
111
],
"ʙ": [
112
],
"ʛ": [
113
],
"ʜ": [
114
],
"ʝ": [
115
],
"ʟ": [
116
],
"ʡ": [
117
],
"ʢ": [
118
],
"ʲ": [
119
],
"ˈ": [
120
],
"ˌ": [
121
],
"ː": [
122
],
"ˑ": [
123
],
"˞": [
124
],
"β": [
125
],
"θ": [
126
],
"χ": [
127
],
"ᵻ": [
128
],
"ⱱ": [
129
],
"0": [
130
],
"1": [
131
],
"2": [
132
],
"3": [
133
],
"4": [
134
],
"5": [
135
],
"6": [
136
],
"7": [
137
],
"8": [
138
],
"9": [
139
],
"̧": [
140
],
"̃": [
141
],
"̪": [
142
],
"̯": [
143
],
"̩": [
144
],
"ʰ": [
145
],
"ˤ": [
146
],
"ε": [
147
],
"↓": [
148
],
"#": [
149
],
"\"": [
150
],
"↑": [
151
]
},
"num_symbols": 256,
"num_speakers": 1,
"speaker_id_map": {},
"piper_version": "1.0.0",
"language": {
"code": "zh_CN",
"family": "zh",
"region": "CN",
"name_native": "简体中文",
"name_english": "Chinese",
"country_english": "China"
},
"dataset": "huayan"
}

116
piper.py Normal file
@@ -0,0 +1,116 @@
import pygame, os, subprocess, language, wave, pyaudio, time, numpy as np, random
from dotenv import load_dotenv
load_dotenv()
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))
PIPER_AUDIO_OUTPUT_FILE = os.path.join(ROOT_DIR, 'llm_media/chat_voice_out.wav')
AUDIO_INPUT_FILE = os.path.join(ROOT_DIR, 'llm_media/input_audio.wav')
def play_audio(audio_file, volume=0.5):
# Initialize pygame mixer
pygame.mixer.init()
pygame.mixer.music.load(audio_file)
pygame.mixer.music.play()
pygame.mixer.music.set_volume(volume)
    while pygame.mixer.music.get_busy():
        pygame.time.wait(100)  # poll gently instead of busy-waiting
def piper(text, model, config):
    # Pipe the text straight into the piper-tts CLI, which writes the spoken audio to a WAV file
    subprocess.run(['piper-tts', '--sentence-silence', '0.5', '--model', model, '--config', config, '--output_file', PIPER_AUDIO_OUTPUT_FILE], input=(text + '\n').encode())
play_audio(PIPER_AUDIO_OUTPUT_FILE)
def eng_piper(text):
piper(text, model = eng_piper_model, config = eng_piper_conf)
def play_prompt(fallback_prompt: str = "Missing Fallback Prompt", audio_files=(), use_custom: bool = True):
    if use_custom:
        # Keep only the custom audio files that actually exist on disk
        valid_files = [f for f in audio_files if os.path.exists(f)]
        if not valid_files:
            eng_piper(fallback_prompt)
            return
        play_audio(random.choice(valid_files), volume=1)
    else:
        eng_piper(fallback_prompt)
eng_piper_model, eng_piper_conf = language.files_language('en')
# Capture the audio input
def capture_audio_until_silence(threshold=800, silence_duration=3, output_file = AUDIO_INPUT_FILE):
"""
Capture audio until a period of silence is detected.
threshold: The audio level that defines silence.
silence_duration: The duration of silence to wait for before stopping.
output_file: The file to save the recorded audio to.
"""
# PyAudio configuration
p = pyaudio.PyAudio()
chunk = 1024
sample_format = pyaudio.paInt16
channels = 2
rate = 44100
# Start recording
stream = p.open(format=sample_format,
channels=channels,
rate=rate,
input=True,
frames_per_buffer=chunk)
print("Listening...")
audio_frames = []
last_time = time.time()
try:
while True:
# Read audio data
data = stream.read(chunk)
audio_frames.append(data)
# Convert data to numpy array for analysis
audio_data = np.frombuffer(data, dtype=np.int16)
peak = np.abs(audio_data).max()
# Check if the sound level exceeds the threshold
if peak > threshold:
last_time = time.time() # Reset the silence timer
else:
# Check for silence
if time.time() - last_time > silence_duration:
print(f"No sound detected for {silence_duration} seconds. Stopping...")
break
except KeyboardInterrupt:
print("Stopped by user.")
finally:
# Stop and close the stream
stream.stop_stream()
stream.close()
p.terminate()
with wave.open(output_file, 'wb') as wf:
wf.setnchannels(channels)
wf.setsampwidth(p.get_sample_size(sample_format))
wf.setframerate(rate)
wf.writeframes(b''.join(audio_frames))
if __name__ == "__main__":
zh_piper_model, zh_piper_conf = language.files_language('zh')
eng_piper("Hello, I am Piper. I am a text-to-speech model.")
piper("你好,我是派珀。我是一个文本转语音模型。", model = zh_piper_model, config = zh_piper_conf)
# capture_audio_until_silence() # you can try out whisper on the recording you have just made by running whisper.py
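
The silence threshold is compared against raw signed 16-bit sample peaks (0 to 32767), so a sensible value depends on microphone gain and room noise. Below is a rough calibration sketch under the same PyAudio settings as above: record a couple of seconds of silence and take a multiple of the ambient peak.

import numpy as np, pyaudio

def suggest_threshold(seconds=2, margin=2.0):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=2, rate=44100,
                    input=True, frames_per_buffer=1024)
    peaks = []
    for _ in range(int(44100 / 1024 * seconds)):
        data = np.frombuffer(stream.read(1024), dtype=np.int16)
        peaks.append(int(np.abs(data).max()))
    stream.stop_stream(); stream.close(); p.terminate()
    return int(max(peaks) * margin)  # export this as THRESHOLD in your .env

if __name__ == "__main__":
    print(suggest_threshold())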

32
record_audio.py Normal file
@@ -0,0 +1,32 @@
import sounddevice as sd
import numpy as np
import wave, os, time
from dotenv import load_dotenv
load_dotenv()
def record_audio(duration, filename):
# Sampling frequency
fs = 44100
# Start recording
print("Recording...")
recording = sd.rec(int(duration * fs), samplerate=fs, channels=2, dtype=np.int16)
sd.wait() # Wait until recording is finished
print("Recording finished")
# Save as WAV file
with wave.open(filename, 'wb') as wf:
wf.setnchannels(2)
wf.setsampwidth(2)
wf.setframerate(fs)
wf.writeframes(recording.tobytes())
if __name__ == "__main__":
ROOT_DIR = os.getenv('ROOT_DIR', './')
for i in range(1, 5):
print(f"Recording {i}...")
duration = 5 # seconds
        filename = os.path.join(ROOT_DIR, f'llm_media/recording{i}.wav')
record_audio(duration, filename)
print(f"Saved as {filename}")
print("Break for 3 seconds...")
time.sleep(3)

282
requirements.txt Normal file
@@ -0,0 +1,282 @@
accelerate==1.0.1
aiofiles==23.2.1
aiohappyeyeballs==2.4.3
aiohttp==3.10.10
aiosignal==1.3.1
alembic==1.13.3
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.6.2.post1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asteroid-filterbanks==0.4.0
asttokens==2.4.1
async-lru==2.0.4
attrs==24.2.0
audioread==3.0.1
av==12.3.0
babel==2.16.0
bark==0.1.5
beautifulsoup4==4.12.3
bleach==6.1.0
boto3==1.35.49
botocore==1.35.49
certifi==2024.8.30
cffi==1.17.1
chardet==3.0.4
charset-normalizer==3.4.0
click==8.1.7
coloredlogs==15.0.1
colorlog==6.8.2
comm==0.2.2
contourpy==1.3.0
cpm-kernels==1.0.11
ctranslate2==4.4.0
cycler==0.12.1
debugpy==1.8.7
decorator==5.1.1
deep-translator==1.11.4
defusedxml==0.7.1
diffusers==0.31.0
dill==0.3.8
docopt==0.6.2
easyocr==1.7.2
einops==0.8.0
encodec==0.1.1
executing==2.1.0
fastapi==0.115.3
faster-whisper==1.0.3
fastjsonschema==2.20.0
ffmpy==0.4.0
filelock==3.16.1
flatbuffers==24.3.25
fonttools==4.54.1
fqdn==1.5.1
frozenlist==1.5.0
fsspec==2024.9.0
funcy==2.0
googletrans==3.0.0
gradio==5.4.0
gradio_client==1.4.2
greenlet==3.1.1
h11==0.14.0
h2==3.2.0
hpack==3.0.0
hstspreload==2024.10.1
httpcore==1.0.6
httpx==0.27.2
huggingface-hub==0.26.1
humanfriendly==10.0
hyperframe==5.2.0
HyperPyYAML==1.2.2
idna==2.10
imageio==2.36.0
importlib_metadata==8.5.0
ipykernel==6.29.5
ipython==8.29.0
ipython-autotime==0.3.2
ipywidgets==8.1.5
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.4
jmespath==1.0.1
joblib==1.4.2
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
julius==0.2.7
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
kiwisolver==1.4.7
langdetect==1.0.9
langid==1.1.6
latex2mathml==3.77.0
lazy_loader==0.4
librosa==0.10.2.post1
lightning==2.4.0
lightning-utilities==0.11.8
llvmlite==0.43.0
Mako==1.3.6
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
matplotlib-inline==0.1.7
mdtex2html==1.3.0
mdurl==0.1.2
mistune==3.0.2
MouseInfo==0.1.3
mpmath==1.3.0
msgpack==1.1.0
multidict==6.1.0
multiprocess==0.70.16
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.1
nltk==3.9.1
notebook==7.2.2
notebook_shim==0.2.4
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
omegaconf==2.3.0
onnxruntime==1.19.2
opencv-python==4.10.0.84
opencv-python-headless==4.10.0.84
optimum==1.23.2
optuna==4.0.0
orjson==3.10.10
overrides==7.7.0
packaging==24.1
pandas==2.2.3
pandocfilters==1.5.1
parso==0.8.4
pexpect==4.9.0
pillow==11.0.0
platformdirs==4.3.6
pooch==1.8.2
primePy==1.3
prometheus_client==0.21.0
prompt_toolkit==3.0.48
propcache==0.2.0
protobuf==5.28.3
psutil==6.1.0
ptyprocess==0.7.0
pure_eval==0.2.3
pvporcupine==3.0.3
pyannote.audio==3.1.1
pyannote.core==5.0.0
pyannote.database==5.1.0
pyannote.metrics==3.2.1
pyannote.pipeline==3.0.1
pyarrow==17.0.0
PyAudio==0.2.14
PyAutoGUI==0.9.54
pyclipper==1.3.0.post6
pycparser==2.22
pydantic==2.9.2
pydantic_core==2.23.4
pydub==0.25.1
pygame==2.6.1
PyGetWindow==0.0.9
Pygments==2.18.0
PyMsgBox==1.0.9
pyparsing==3.2.0
pyperclip==1.9.0
pypinyin==0.53.0
PyRect==0.2.0
PyScreeze==1.0.1
pytesseract==0.3.13
python-bidi==0.6.3
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-json-logger==2.0.7
python-multipart==0.0.12
python3-xlib==0.15
pytorch-lightning==2.4.0
pytorch-metric-learning==2.6.0
pyttsx3==2.98
pytweening==1.2.0
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986==1.5.0
rfc3986-validator==0.1.1
rich==13.9.3
rpds-py==0.20.0
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.12
ruff==0.7.1
s3transfer==0.10.3
safehttpx==0.1.1
safetensors==0.4.5
scikit-image==0.24.0
scikit-learn==1.5.2
scipy==1.14.1
semantic-version==2.10.0
semver==3.0.2
Send2Trash==1.8.3
sentencepiece==0.2.0
setuptools==75.2.0
shapely==2.0.6
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
sounddevice==0.5.1
soundfile==0.12.1
soupsieve==2.6
soxr==0.5.0.post1
speechbrain==1.0.1
SpeechRecognition==3.11.0
SQLAlchemy==2.0.36
srt==3.5.3
stack-data==0.6.3
starlette==0.41.2
sympy==1.13.1
tabulate==0.9.0
tensorboardX==2.6.2.2
terminado==0.18.1
threadpoolctl==3.5.0
tifffile==2024.9.20
tinycss2==1.4.0
tokenizers==0.20.1
tomlkit==0.12.0
torch==2.5.0
torch-audiomentations==0.11.1
torch_pitch_shift==1.2.5
torchaudio==2.5.0
torchmetrics==1.5.1
torchvision==0.20.0
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
transformers==4.46.0
triton==3.1.0
typer==0.12.5
types-python-dateutil==2.9.0.20241003
typing_extensions==4.12.2
tzdata==2024.2
uri-template==1.3.0
urllib3==2.2.3
uvicorn==0.32.0
vosk==0.3.45
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
websockets==12.0
widgetsnbextension==4.0.13
xxhash==3.5.0
yarl==1.16.0
zipp==3.20.2

43
whisper.py Normal file
@@ -0,0 +1,43 @@
##########################################################################################
##### WhisperX #####
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
def whisper_pipeline(model_id: str, whisper_lang: str):
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa"
)
model.generation_config.language = whisper_lang # define your language of choice here
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
generate_kwargs={"max_new_tokens": 128},
torch_dtype=torch_dtype,
device=device
)
return pipe
##########################################################################################
if __name__ == "__main__":
# Example Usage
import os
from dotenv import load_dotenv
load_dotenv()
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))
model_id = "openai/whisper-tiny"
whisper_lang = "en"
whisper = whisper_pipeline(model_id, whisper_lang)
audio_file = os.path.join(ROOT_DIR, "llm_media/input_audio.wav")
result = whisper(audio_file)['text']
print(result)