# (stripped non-Python metadata residue: "274 lines, 14 KiB, Python")
##########################################################################################
|
|
##### import basic libraries #####
|
|
import os, time, random, json, requests, sys, pyaudio
|
|
import pvporcupine
|
|
import struct
|
|
from datetime import datetime
|
|
import language
|
|
from whisper import whisper_pipeline
|
|
import piper as pp
|
|
from piper import play_audio, eng_piper, play_prompt
|
|
import ollama as ol
|
|
from dotenv import load_dotenv
|
|
load_dotenv()
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
# By default this requires at least 6GB VRAM if using a CUDA supported GPU. Requirements can be lowered if the Ollama and Whisper models are changed to a smaller model.
|
|
# You can also consider using Leopard for a small and relatively accurate audio transcription model. Edit the whisper.py file to use Leopard instead of WhisperX. Ensure the function output is just a string.
|
|
# for basic audio transcription, you can also use vosk or google speech to text. Google speech is less accurate and subject to rate limiting/charges. Vosk is relatively inaccurate but uses minimal resources.
|
|
|
|
# for more languages, look at the language.py file and download more languages from there into the piper-tts folder
|
|
# ensure you have ollama installed. If you don't have the models downloaded from Ollama it may take some time depending on which model you choose
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
### Root directory for where you store llm_media, wake_words and piper_tts folders and files
|
|
# By default, it is the same directory as this file
|
|
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))  # for where you want to keep all your files


### From the ROOT_DIR, create the necessary folders if they don't exist

WAKE_WORDS_DIR = os.path.join(ROOT_DIR, 'wake_words')
LLM_MEDIA_DIR = os.path.join(ROOT_DIR, 'llm_media')
PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts')

# The wake-word and piper-tts directories must be populated manually, so creating
# one for the first time is treated as a fatal condition that tells the user what
# to download. exist_ok=True avoids a race/crash if the directory appears between
# the exists() check and the makedirs() call.
if not os.path.exists(WAKE_WORDS_DIR):
    os.makedirs(WAKE_WORDS_DIR, exist_ok=True)
    sys.exit(f"Wake words directory have just been created in {ROOT_DIR}. Please download the wake word files from the Picovoice Console and place them in the wake_words directory.")

if not os.path.exists(LLM_MEDIA_DIR):
    os.makedirs(LLM_MEDIA_DIR, exist_ok=True)

if not os.path.exists(PIPER_TTS_DIR):
    os.makedirs(PIPER_TTS_DIR, exist_ok=True)
    sys.exit(f"Piper TTS directory have just been created in {ROOT_DIR}. Please download the piper-tts files from the Piper TTS repository and place them in the piper-tts directory.")


### Custom prompts

# Holds optional pre-recorded audio files used instead of live TTS (see
# USE_CUSTOM_AUDIO below). Safe to create silently — defaults cover a missing dir.
CUSTOM_PROMPTS_DIR = os.path.join(LLM_MEDIA_DIR, 'custom_prompts')
os.makedirs(CUSTOM_PROMPTS_DIR, exist_ok=True)
|
|
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
##### Environmental Variables #####
|
|
# Please define the following environmental variables in a .env file or in your system environment. The only required ones are: PICOVOICE_ACCESS_KEY, WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE
|
|
# WAKEWORD_FILE_INITIAL is the word you say to initiate the chat and WAKEWORD_FILE_CONTINUE is the word to continue the chat after the initial dialogue.
|
|
|
|
### Llama Variables
|
|
BASE_URL = os.getenv('BASE_URL', 'http://localhost:11434')
LLM_MODEL = os.getenv('LLM_MODEL', 'llama3.2:latest')  # about 3GB of VRAM/RAM required for LLAMA3.2 model


### Threshold for audio peaks (int16 amplitude used by the silence detector)

THRESHOLD = int(os.getenv('THRESHOLD', '1000'))


### WhisperX Variables

MODEL_ID = os.getenv('MODEL_ID', 'openai/whisper-large-v3-turbo')


# Language code for whisper pipeline, defaults to en. Possible languages = ['en', 'fr', 'ru', 'zh', 'vi']
# Can be overridden from the command line with an argument of the form `lang=fr`.
LANG = 'en'
for arg in sys.argv:
    if arg.lower().startswith('lang='):
        # maxsplit=1 so a value that itself contains '=' is not truncated
        LANG = arg.split('=', 1)[1]
|
|
|
|
### Piper variables

# LANG is checked for validity else it defaults to 'en'. Prompt is used for the OLLAMA model at the start.
# NOTE(review): language.get_variables presumably returns the Piper voice model
# and config paths for LANG plus a language-specific default system prompt —
# confirm against language.py, which is not visible here.
PIPER_MODEL_FILE, PIPER_CONFIG_FILE, LANG, PROMPT = language.get_variables(LANG)


# Prompt for the Llama model. Change the environment variable PROMPT to change the prompt.
# The value from language.get_variables above only serves as the fallback default.
PROMPT = os.getenv('PROMPT', PROMPT)
|
|
|
|
### Picovoice

# The access key and the two wake-word model files are mandatory; the spoken
# wake words below are only interpolated into the prompts read out to the user.
ACCESS_KEY = os.getenv('PICOVOICE_ACCESS_KEY')
WAKEWORD_FILE_INITIAL = os.getenv('WAKEWORD_FILE_INITIAL')
WAKEWORD_FILE_CONTINUE = os.getenv('WAKEWORD_FILE_CONTINUE')
WAKE_WORD_1 = os.getenv('WAKE_WORD_1', 'Hey Penguin')  # Wake word to start and continue the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_INITIAL
WAKE_WORD_2 = os.getenv('WAKE_WORD_2', 'Bye Penguin')  # Wake word to end the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_CONTINUE


# Abort early with setup instructions if any required Picovoice setting is missing.
if not all((ACCESS_KEY, WAKEWORD_FILE_INITIAL, WAKEWORD_FILE_CONTINUE)):
    sys.exit("""Please set the PICOVOICE_ACCESS_KEY environment variable to your Picovoice Access Key. It is free to setup an account and get an access key and everything is done locally after verifying your account.

Then set the WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE environment variables to be the basename of the wake word files in the wake_words directory. You can generate these files from the Picovoice Console.""")
|
|
|
|
##### Voice prompts by Piper for the conversation #####

# You can change these to your own voice prompts

WELCOME_PROMPT = os.getenv('WELCOME_PROMPT', 'Hello, I am Pengames. How can I help you today?')
LISTENING_PROMPT = os.getenv('LISTENING_PROMPT', 'Listening...')  # To know when the bot is listening
GOODBYE_PROMPT = os.getenv('GOODBYE_PROMPT', 'Goodbye for now. Have an amazing day. Big COOL Penguin signing off.')

# BUG FIX: both continue prompts previously read the SAME env var 'CONTINUE_CONVO',
# making independent overrides impossible. The instructions prompt now prefers
# 'CONTINUE_CONVO_INSTRUCTIONS' but still falls back to 'CONTINUE_CONVO' so any
# existing configuration keeps its old behavior.
CONTINUE_CONVO_INSTRUCTIONS_PROMPT = os.getenv('CONTINUE_CONVO_INSTRUCTIONS', os.getenv('CONTINUE_CONVO', f'Do you want to continue the conversation? Call me {WAKE_WORD_1} to continue otherwise say {WAKE_WORD_2} to exit immediately.'))  # For the first continue prompt
CONTINUE_CONVO_PROMPT = os.getenv('CONTINUE_CONVO', f'Do you want to continue the conversation?')  # For subsequent continue prompts after you already know the instructions


##### Custom audio files for responses

# You can also set custom audio outputs for various responses by using the play_audio function in the ollama.py file instead of the piper function in the piper.py file. Put the files in the llm_media/custom_prompts

# Set the env var to the string "True" to play pre-recorded audio files for
# responses; anything else (or unset) falls back to live Piper TTS.
USE_CUSTOM_AUDIO = os.getenv('USE_CUSTOM_AUDIO', 'False') == 'True'

# JSON file mapping prompt categories to lists of audio file names; resolved
# relative to ROOT_DIR.
CUSTOM_JSON_FILE = os.getenv('CUSTOM_JSON_FILE', 'custom_prompts.json')
CUSTOM_JSON_FILE = os.path.join(ROOT_DIR, CUSTOM_JSON_FILE)
|
|
|
|
# Load the optional custom-prompt manifest. The JSON file maps prompt-category
# names to lists of audio file names that live in CUSTOM_PROMPTS_DIR.
# BUG FIX: the file was previously opened unconditionally, crashing at startup
# with FileNotFoundError even when USE_CUSTOM_AUDIO was disabled; a missing file
# now simply means the built-in defaults below are used. (A leftover debug
# print(USE_CUSTOM_AUDIO) was also removed.)
custom_prompts = {}
if os.path.exists(CUSTOM_JSON_FILE):
    with open(CUSTOM_JSON_FILE, 'r') as f:
        custom_prompts = json.load(f)

# Defaults are bare file names; every entry — default or from the JSON — is
# resolved against CUSTOM_PROMPTS_DIR exactly once below. (The original joined
# the defaults twice, which only worked because os.path.join discards the first
# argument when the second is already absolute.)
CUSTOM_WELCOME_PROMPTS = custom_prompts.get('CUSTOM_WELCOME_PROMPTS', ['custom_welcome.wav'])
CUSTOM_LISTENING_PROMPTS = custom_prompts.get('CUSTOM_LISTENING_PROMPTS', ['custom_listening.wav'])
CUSTOM_CONTINUE_CONVO_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_PROMPTS', ['custom_continue_convo.wav'])
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS', ['custom_continue_convo_instructions.wav'])
CUSTOM_GOODBYE_PROMPTS = custom_prompts.get('CUSTOM_GOODBYE_PROMPTS', ['custom_goodbye.wav'])

CUSTOM_WELCOME_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_WELCOME_PROMPTS]
CUSTOM_LISTENING_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_LISTENING_PROMPTS]
CUSTOM_CONTINUE_CONVO_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_PROMPTS]
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS]
CUSTOM_GOODBYE_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_GOODBYE_PROMPTS]
|
|
|
|
##########################################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the functions and define helper variables/functions. Do not edit these variables


##### WHISPER PIPELINE #####

# Callable that transcribes an audio file; called as pipe(path) -> {'text': ...}.
pipe = whisper_pipeline(model_id=MODEL_ID, whisper_lang=LANG)


##### Piper Function For TTS #####

# Download piper-tts directly onto your system or via pip.
# (PEP 8: a named `def` instead of a lambda assignment, same call signature.)
def piper(text):
    """Speak `text` aloud using the configured Piper voice model/config."""
    return pp.piper(text, model=PIPER_MODEL_FILE, config=PIPER_CONFIG_FILE)


##### Llama Model #####

LLM_RESPONSE_PATH = os.path.join(LLM_MEDIA_DIR, 'llm_response.txt')  # File to store all the responses

# Define the API endpoint
API_URL = f"{BASE_URL}/api/generate"
OLLAMA_JSON = os.path.join(LLM_MEDIA_DIR, 'llm_context.json')


def converse(text, llm_model=LLM_MODEL, llm_response_path=LLM_RESPONSE_PATH, have_context=False):
    """Send `text` to the Ollama model and return its reply.

    llm_model -- Ollama model name to query.
    llm_response_path -- file where the response is appended/stored.
    have_context -- pass prior conversation context from OLLAMA_JSON when True.
    """
    return ol.converse(text, ping_url=BASE_URL, api_url=API_URL, llm_model=llm_model, llm_response_path=llm_response_path, prompt=PROMPT, context_file=OLLAMA_JSON, have_context=have_context)


##### Porcupine Wake Word Detection #####

# Paths to the wake word model (.ppn) files
WAKE_WORD_PATH_1 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_INITIAL)
WAKE_WORD_PATH_2 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_CONTINUE)
|
|
|
|
##########################################################################################
|
|
|
|
|
|
|
|
##########################################################################################
|
|
##### Pico Voice Wake Word Detection With Porcupine #####
|
|
def porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=None):
    """Create a Porcupine wake-word engine plus a matching PyAudio input stream.

    ACCESS_KEY -- Picovoice access key (defaults to the module-level value).
    WAKE_WORD_PATHS -- list of .ppn keyword file paths to detect.

    Returns (engine, audio_stream, pa); the caller is responsible for stopping
    the stream, terminating PyAudio and deleting the engine.
    """
    # BUG FIX: the default was a mutable `[]` shared across calls; use None and
    # create a fresh list per call instead.
    if WAKE_WORD_PATHS is None:
        WAKE_WORD_PATHS = []

    # Local name `engine` instead of re-using this function's own name, which
    # previously shadowed `porcupine` inside the body.
    engine = pvporcupine.create(access_key=ACCESS_KEY, keyword_paths=WAKE_WORD_PATHS)

    # Set up an input stream whose rate/frame size match what the engine expects.
    pa = pyaudio.PyAudio()
    audio_stream = pa.open(
        rate=engine.sample_rate,
        channels=1,
        format=pyaudio.paInt16,
        input=True,
        frames_per_buffer=engine.frame_length,
    )

    return engine, audio_stream, pa
|
|
|
|
# Pipeline from capturing words to reply
|
|
def speech_to_response(threshold=THRESHOLD, audio_input_file=pp.AUDIO_INPUT_FILE, llm_model=LLM_MODEL, llm_response_path=LLM_RESPONSE_PATH, have_context=False):
    """Record one utterance, transcribe it, query the LLM and speak the reply.

    threshold -- int16 amplitude above which audio counts as speech.
    audio_input_file -- where the capture helper saves the recording.
    have_context -- forward prior conversation context to the LLM when True.
    """
    # Record from the microphone until the signal stays below `threshold`;
    # the helper writes the capture to `audio_input_file`.
    pp.capture_audio_until_silence(threshold=threshold)

    # Transcribe the recording with the Whisper pipeline.
    user_text = pipe(audio_input_file)['text']

    # Ask the LLM for a reply (also logged to `llm_response_path`), echo it to
    # the console, then read it out with Piper.
    reply = converse(user_text, llm_model=llm_model, llm_response_path=llm_response_path, have_context=have_context)
    print(reply)
    piper(reply)
|
|
|
|
# Initial response to wake word
|
|
def convo_initialised():
    """Greet the user after the initial wake word, then handle one request."""
    play_prompt(
        fallback_prompt=WELCOME_PROMPT,
        audio_files=CUSTOM_WELCOME_PROMPTS,
        use_custom=USE_CUSTOM_AUDIO,
    )
    speech_to_response()
|
|
|
|
# Function to continue the conversation
def continue_convo(time_limit=60):
    """Keep the session open for up to `time_limit` seconds of inactivity,
    listening for either the continue wake word or the exit wake word.

    time_limit -- seconds without a wake word before the session ends; the
    timer restarts after every completed exchange.
    """
    play_prompt(fallback_prompt=CONTINUE_CONVO_INSTRUCTIONS_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS, use_custom=USE_CUSTOM_AUDIO)

    # Wake word detection to continue or exit the conversation
    # NOTE(review): the "continue" detector uses WAKE_WORD_PATH_2
    # (WAKEWORD_FILE_CONTINUE) and the "end" detector WAKE_WORD_PATH_1
    # (WAKEWORD_FILE_INITIAL) — the opposite of what the comments next to
    # WAKE_WORD_1/WAKE_WORD_2 suggest. Confirm which .ppn file each env var
    # actually points to.
    # NOTE(review): two engines + two input streams are opened on the same
    # microphone; pvporcupine.create accepts multiple keyword_paths in one
    # engine, which would halve the audio plumbing — left as-is here.
    porcupine_continue, audio_stream_continue, pa_continue = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2])
    porcupine_end, audio_stream_end, pa_end = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_1])

    start = time.time()
    try:
        while (time.time() - start < time_limit):
            # Read a frame of audio from each stream and unpack the raw bytes
            # into int16 samples for Porcupine.
            pcm_continue = audio_stream_continue.read(porcupine_continue.frame_length, exception_on_overflow=False)
            pcm_unpacked_continue = struct.unpack_from("h" * porcupine_continue.frame_length, pcm_continue)

            pcm_end = audio_stream_end.read(porcupine_end.frame_length, exception_on_overflow=False)
            pcm_unpacked_end = struct.unpack_from("h" * porcupine_end.frame_length, pcm_end)

            # Check if the wake word is detected (process returns >= 0 on a hit)
            keyword_index_continue = porcupine_continue.process(pcm_unpacked_continue)
            keyword_index_end = porcupine_end.process(pcm_unpacked_end)

            if keyword_index_continue >= 0:
                # Continue word heard: run one more exchange with context,
                # re-prompt, and restart the inactivity timer.
                play_prompt(fallback_prompt=LISTENING_PROMPT, audio_files=CUSTOM_LISTENING_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
                speech_to_response(have_context=True)
                play_prompt(fallback_prompt=CONTINUE_CONVO_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
                start = time.time()
            elif keyword_index_end >= 0:
                # Exit word heard: leave the loop; cleanup happens in finally.
                break

    except KeyboardInterrupt:
        print("Stopping...")

    finally:
        # Clean up resources: stop/close each stream before terminating its
        # PyAudio instance, then release the Porcupine engines.
        audio_stream_continue.stop_stream()
        audio_stream_continue.close()
        pa_continue.terminate()
        porcupine_continue.delete()

        audio_stream_end.stop_stream()
        audio_stream_end.close()
        pa_end.terminate()
        porcupine_end.delete()

    # Say goodbye whether the loop timed out, was interrupted, or heard the
    # exit wake word.
    play_prompt(fallback_prompt=GOODBYE_PROMPT, audio_files=CUSTOM_GOODBYE_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
##### Main Loop #####
|
|
# Initialize Porcupine with custom wake word
|
|
# NOTE(review): the initial listener is built from WAKE_WORD_PATH_2
# (WAKEWORD_FILE_CONTINUE), while the comments above state that
# WAKEWORD_FILE_INITIAL is the word that initiates the chat — confirm the two
# wake word files are not swapped.
porcupine_initial, audio_stream_initial, pa_initial = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2])

print("Listening for wake word...")
try:
    while True:
        # Read a frame of audio and unpack the raw bytes into int16 samples.
        pcm_initial = audio_stream_initial.read(porcupine_initial.frame_length, exception_on_overflow=False)
        pcm_unpacked_initial = struct.unpack_from("h" * porcupine_initial.frame_length, pcm_initial)

        # Check if the wake word is detected (index >= 0 means a keyword fired)
        keyword_index = porcupine_initial.process(pcm_unpacked_initial)
        if keyword_index >= 0:
            # Greet + answer the first request, then stay in conversation mode
            # until continue_convo times out or hears the exit word.
            convo_initialised()
            continue_convo()
except KeyboardInterrupt:
    print("Stopping...")
finally:
    # Clean up resources: stream first, then PyAudio, then the engine.
    audio_stream_initial.stop_stream()
    audio_stream_initial.close()
    pa_initial.terminate()
    porcupine_initial.delete()
|
|
########################################################################################## |