# (stripped non-Python metadata residue: "274 lines, 14 KiB, Python")
##########################################################################################
|
|
##### import basic libraries #####
|
|
import os, time, random, json, requests, sys, pyaudio
|
|
import pvporcupine
|
|
import struct
|
|
from datetime import datetime
|
|
import language
|
|
from whisper import whisper_pipeline
|
|
import piper as pp
|
|
from piper import play_audio, eng_piper, play_prompt
|
|
import ollama as ol
|
|
from dotenv import load_dotenv
|
|
load_dotenv()
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
# By default this requires at least 6GB VRAM if using a CUDA supported GPU. Requirements can be lowered if the Ollama and Whisper models are changed to a smaller model.
|
|
# You can also consider using Leopard for a small and relatively accurate audio transcription model. Edit the whisper.py file to use Leopard instead of WhisperX. Ensure the function output is just a string.
|
|
# for basic audio transcription, you can also use vosk or google speech to text. Google speech is less accurate and subject to rate limiting/charges. Vosk is relatively inaccurate but uses minimal resources.
|
|
|
|
# for more languages, look at the language.py file and download more languages from there into the piper-tts folder
|
|
# ensure you have ollama installed. If you don't have the models downloaded from Ollama it may take some time depending on which model you choose
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
### Root directory for where you store llm_media, wake_words and piper_tts folders and files
|
|
# By default, it is the same directory as this file
|
|
ROOT_DIR = os.getenv('ROOT_DIR', os.path.dirname(__file__))  # for where you want to keep all your files


### From the ROOT_DIR, create the necessary folders if they don't exist

WAKE_WORDS_DIR = os.path.join(ROOT_DIR, 'wake_words')
LLM_MEDIA_DIR = os.path.join(ROOT_DIR, 'llm_media')
PIPER_TTS_DIR = os.path.join(ROOT_DIR, 'piper-tts')

# The wake-word and piper-tts directories must be populated manually, so creating
# one for the first time is treated as a fatal condition that tells the user what
# to download. exist_ok=True avoids a race/crash if the directory appears between
# the exists() check and the makedirs() call.
if not os.path.exists(WAKE_WORDS_DIR):
    os.makedirs(WAKE_WORDS_DIR, exist_ok=True)
    sys.exit(f"Wake words directory have just been created in {ROOT_DIR}. Please download the wake word files from the Picovoice Console and place them in the wake_words directory.")

if not os.path.exists(LLM_MEDIA_DIR):
    os.makedirs(LLM_MEDIA_DIR, exist_ok=True)

if not os.path.exists(PIPER_TTS_DIR):
    os.makedirs(PIPER_TTS_DIR, exist_ok=True)
    sys.exit(f"Piper TTS directory have just been created in {ROOT_DIR}. Please download the piper-tts files from the Piper TTS repository and place them in the piper-tts directory.")


### Custom prompts

# Holds optional pre-recorded audio files used instead of live TTS (see
# USE_CUSTOM_AUDIO below). Safe to create silently — defaults cover a missing dir.
CUSTOM_PROMPTS_DIR = os.path.join(LLM_MEDIA_DIR, 'custom_prompts')
os.makedirs(CUSTOM_PROMPTS_DIR, exist_ok=True)
|
|
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
##### Environmental Variables #####
|
|
# Please define the following environmental variables in a .env file or in your system environment. The only required ones are: PICOVOICE_ACCESS_KEY, WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE
|
|
# WAKEWORD_FILE_INITIAL is the word you say to initiate the chat and WAKEWORD_FILE_CONTINUE is the word to continue the chat after the initial dialogue.
|
|
|
|
### Llama Variables
|
|
BASE_URL = os.getenv('BASE_URL', 'http://localhost:11434')
LLM_MODEL = os.getenv('LLM_MODEL', 'llama3.2:latest')  # about 3GB of VRAM/RAM required for LLAMA3.2 model


### Threshold for audio peaks (int16 amplitude used by the silence detector)

THRESHOLD = int(os.getenv('THRESHOLD', '1000'))


### WhisperX Variables

MODEL_ID = os.getenv('MODEL_ID', 'openai/whisper-large-v3-turbo')


# Language code for whisper pipeline, defaults to en. Possible languages = ['en', 'fr', 'ru', 'zh', 'vi']
# Can be overridden from the command line with an argument of the form `lang=fr`.
LANG = 'en'
for arg in sys.argv:
    if arg.lower().startswith('lang='):
        # maxsplit=1 so a value that itself contains '=' is not truncated
        LANG = arg.split('=', 1)[1]
|
|
|
|
### Piper variables

# LANG is checked for validity else it defaults to 'en'. Prompt is used for the OLLAMA model at the start.
# NOTE(review): language.get_variables presumably returns the Piper voice model
# and config paths for LANG plus a language-specific default system prompt —
# confirm against language.py, which is not visible here.
PIPER_MODEL_FILE, PIPER_CONFIG_FILE, LANG, PROMPT = language.get_variables(LANG)


# Prompt for the Llama model. Change the environment variable PROMPT to change the prompt.
# The value from language.get_variables above only serves as the fallback default.
PROMPT = os.getenv('PROMPT', PROMPT)
|
|
|
|
### Picovoice

# The access key and the two wake-word model files are mandatory; the spoken
# wake words below are only interpolated into the prompts read out to the user.
ACCESS_KEY = os.getenv('PICOVOICE_ACCESS_KEY')
WAKEWORD_FILE_INITIAL = os.getenv('WAKEWORD_FILE_INITIAL')
WAKEWORD_FILE_CONTINUE = os.getenv('WAKEWORD_FILE_CONTINUE')
WAKE_WORD_1 = os.getenv('WAKE_WORD_1', 'Hey Penguin')  # Wake word to start and continue the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_INITIAL
WAKE_WORD_2 = os.getenv('WAKE_WORD_2', 'Bye Penguin')  # Wake word to end the conversation. You set this up in the Picovoice Console. Corresponds to WAKEWORD_FILE_CONTINUE


# Abort early with setup instructions if any required Picovoice setting is missing.
if not all((ACCESS_KEY, WAKEWORD_FILE_INITIAL, WAKEWORD_FILE_CONTINUE)):
    sys.exit("""Please set the PICOVOICE_ACCESS_KEY environment variable to your Picovoice Access Key. It is free to setup an account and get an access key and everything is done locally after verifying your account.

Then set the WAKEWORD_FILE_INITIAL and WAKEWORD_FILE_CONTINUE environment variables to be the basename of the wake word files in the wake_words directory. You can generate these files from the Picovoice Console.""")
|
|
|
|
##### Voice prompts by Piper for the conversation #####

# You can change these to your own voice prompts

WELCOME_PROMPT = os.getenv('WELCOME_PROMPT', 'Hello, I am Pengames. How can I help you today?')
LISTENING_PROMPT = os.getenv('LISTENING_PROMPT', 'Listening...')  # To know when the bot is listening
GOODBYE_PROMPT = os.getenv('GOODBYE_PROMPT', 'Goodbye for now. Have an amazing day. Big COOL Penguin signing off.')

# BUG FIX: both continue prompts previously read the SAME env var 'CONTINUE_CONVO',
# making independent overrides impossible. The instructions prompt now prefers
# 'CONTINUE_CONVO_INSTRUCTIONS' but still falls back to 'CONTINUE_CONVO' so any
# existing configuration keeps its old behavior.
CONTINUE_CONVO_INSTRUCTIONS_PROMPT = os.getenv('CONTINUE_CONVO_INSTRUCTIONS', os.getenv('CONTINUE_CONVO', f'Do you want to continue the conversation? Call me {WAKE_WORD_1} to continue otherwise say {WAKE_WORD_2} to exit immediately.'))  # For the first continue prompt
CONTINUE_CONVO_PROMPT = os.getenv('CONTINUE_CONVO', f'Do you want to continue the conversation?')  # For subsequent continue prompts after you already know the instructions


##### Custom audio files for responses

# You can also set custom audio outputs for various responses by using the play_audio function in the ollama.py file instead of the piper function in the piper.py file. Put the files in the llm_media/custom_prompts

# Set the env var to the string "True" to play pre-recorded audio files for
# responses; anything else (or unset) falls back to live Piper TTS.
USE_CUSTOM_AUDIO = os.getenv('USE_CUSTOM_AUDIO', 'False') == 'True'

# JSON file mapping prompt categories to lists of audio file names; resolved
# relative to ROOT_DIR.
CUSTOM_JSON_FILE = os.getenv('CUSTOM_JSON_FILE', 'custom_prompts.json')
CUSTOM_JSON_FILE = os.path.join(ROOT_DIR, CUSTOM_JSON_FILE)
|
|
|
|
# Load the optional custom-prompt manifest. The JSON file maps prompt-category
# names to lists of audio file names that live in CUSTOM_PROMPTS_DIR.
# BUG FIX: the file was previously opened unconditionally, crashing at startup
# with FileNotFoundError even when USE_CUSTOM_AUDIO was disabled; a missing file
# now simply means the built-in defaults below are used. (A leftover debug
# print(USE_CUSTOM_AUDIO) was also removed.)
custom_prompts = {}
if os.path.exists(CUSTOM_JSON_FILE):
    with open(CUSTOM_JSON_FILE, 'r') as f:
        custom_prompts = json.load(f)

# Defaults are bare file names; every entry — default or from the JSON — is
# resolved against CUSTOM_PROMPTS_DIR exactly once below. (The original joined
# the defaults twice, which only worked because os.path.join discards the first
# argument when the second is already absolute.)
CUSTOM_WELCOME_PROMPTS = custom_prompts.get('CUSTOM_WELCOME_PROMPTS', ['custom_welcome.wav'])
CUSTOM_LISTENING_PROMPTS = custom_prompts.get('CUSTOM_LISTENING_PROMPTS', ['custom_listening.wav'])
CUSTOM_CONTINUE_CONVO_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_PROMPTS', ['custom_continue_convo.wav'])
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = custom_prompts.get('CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS', ['custom_continue_convo_instructions.wav'])
CUSTOM_GOODBYE_PROMPTS = custom_prompts.get('CUSTOM_GOODBYE_PROMPTS', ['custom_goodbye.wav'])

CUSTOM_WELCOME_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_WELCOME_PROMPTS]
CUSTOM_LISTENING_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_LISTENING_PROMPTS]
CUSTOM_CONTINUE_CONVO_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_PROMPTS]
CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS]
CUSTOM_GOODBYE_PROMPTS = [os.path.join(CUSTOM_PROMPTS_DIR, i) for i in CUSTOM_GOODBYE_PROMPTS]
|
|
|
|
##########################################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the functions and define helper variables/functions. Do not edit these variables


##### WHISPER PIPELINE #####

# Callable that transcribes an audio file; called as pipe(path) -> {'text': ...}.
pipe = whisper_pipeline(model_id=MODEL_ID, whisper_lang=LANG)


##### Piper Function For TTS #####

# Download piper-tts directly onto your system or via pip.
# (PEP 8: a named `def` instead of a lambda assignment, same call signature.)
def piper(text):
    """Speak `text` aloud using the configured Piper voice model/config."""
    return pp.piper(text, model=PIPER_MODEL_FILE, config=PIPER_CONFIG_FILE)


##### Llama Model #####

LLM_RESPONSE_PATH = os.path.join(LLM_MEDIA_DIR, 'llm_response.txt')  # File to store all the responses

# Define the API endpoint
API_URL = f"{BASE_URL}/api/generate"
OLLAMA_JSON = os.path.join(LLM_MEDIA_DIR, 'llm_context.json')


def converse(text, llm_model=LLM_MODEL, llm_response_path=LLM_RESPONSE_PATH, have_context=False):
    """Send `text` to the Ollama model and return its reply.

    llm_model -- Ollama model name to query.
    llm_response_path -- file where the response is appended/stored.
    have_context -- pass prior conversation context from OLLAMA_JSON when True.
    """
    return ol.converse(text, ping_url=BASE_URL, api_url=API_URL, llm_model=llm_model, llm_response_path=llm_response_path, prompt=PROMPT, context_file=OLLAMA_JSON, have_context=have_context)


##### Porcupine Wake Word Detection #####

# Paths to the wake word model (.ppn) files
WAKE_WORD_PATH_1 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_INITIAL)
WAKE_WORD_PATH_2 = os.path.join(WAKE_WORDS_DIR, WAKEWORD_FILE_CONTINUE)
|
|
|
|
##########################################################################################
|
|
|
|
|
|
|
|
##########################################################################################
|
|
##### Pico Voice Wake Word Detection With Porcupine #####
|
|
def porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=None):
    """Create a Porcupine wake-word engine plus a matching PyAudio input stream.

    ACCESS_KEY -- Picovoice access key (defaults to the module-level value).
    WAKE_WORD_PATHS -- list of .ppn keyword file paths to detect.

    Returns (engine, audio_stream, pa); the caller is responsible for stopping
    the stream, terminating PyAudio and deleting the engine.
    """
    # BUG FIX: the default was a mutable `[]` shared across calls; use None and
    # create a fresh list per call instead.
    if WAKE_WORD_PATHS is None:
        WAKE_WORD_PATHS = []

    # Local name `engine` instead of re-using this function's own name, which
    # previously shadowed `porcupine` inside the body.
    engine = pvporcupine.create(access_key=ACCESS_KEY, keyword_paths=WAKE_WORD_PATHS)

    # Set up an input stream whose rate/frame size match what the engine expects.
    pa = pyaudio.PyAudio()
    audio_stream = pa.open(
        rate=engine.sample_rate,
        channels=1,
        format=pyaudio.paInt16,
        input=True,
        frames_per_buffer=engine.frame_length,
    )

    return engine, audio_stream, pa
|
|
|
|
# Pipeline from capturing words to reply
|
|
def speech_to_response(threshold=THRESHOLD, audio_input_file=pp.AUDIO_INPUT_FILE, llm_model=LLM_MODEL, llm_response_path=LLM_RESPONSE_PATH, have_context=False):
    """Record one utterance, transcribe it, query the LLM and speak the reply.

    threshold -- int16 amplitude above which audio counts as speech.
    audio_input_file -- where the capture helper saves the recording.
    have_context -- forward prior conversation context to the LLM when True.
    """
    # Record from the microphone until the signal stays below `threshold`;
    # the helper writes the capture to `audio_input_file`.
    pp.capture_audio_until_silence(threshold=threshold)

    # Transcribe the recording with the Whisper pipeline.
    user_text = pipe(audio_input_file)['text']

    # Ask the LLM for a reply (also logged to `llm_response_path`), echo it to
    # the console, then read it out with Piper.
    reply = converse(user_text, llm_model=llm_model, llm_response_path=llm_response_path, have_context=have_context)
    print(reply)
    piper(reply)
|
|
|
|
# Initial response to wake word
|
|
def convo_initialised():
    """Greet the user after the initial wake word, then handle one request."""
    play_prompt(
        fallback_prompt=WELCOME_PROMPT,
        audio_files=CUSTOM_WELCOME_PROMPTS,
        use_custom=USE_CUSTOM_AUDIO,
    )
    speech_to_response()
|
|
|
|
# Function to continue the conversation
def continue_convo(time_limit=60):
    """Keep the session open for up to `time_limit` seconds of inactivity,
    listening for either the continue wake word or the exit wake word.

    time_limit -- seconds without a wake word before the session ends; the
    timer restarts after every completed exchange.
    """
    play_prompt(fallback_prompt=CONTINUE_CONVO_INSTRUCTIONS_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_INSTRUCTIONS_PROMPTS, use_custom=USE_CUSTOM_AUDIO)

    # Wake word detection to continue or exit the conversation
    # NOTE(review): the "continue" detector uses WAKE_WORD_PATH_2
    # (WAKEWORD_FILE_CONTINUE) and the "end" detector WAKE_WORD_PATH_1
    # (WAKEWORD_FILE_INITIAL) — the opposite of what the comments next to
    # WAKE_WORD_1/WAKE_WORD_2 suggest. Confirm which .ppn file each env var
    # actually points to.
    # NOTE(review): two engines + two input streams are opened on the same
    # microphone; pvporcupine.create accepts multiple keyword_paths in one
    # engine, which would halve the audio plumbing — left as-is here.
    porcupine_continue, audio_stream_continue, pa_continue = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2])
    porcupine_end, audio_stream_end, pa_end = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_1])

    start = time.time()
    try:
        while (time.time() - start < time_limit):
            # Read a frame of audio from each stream and unpack the raw bytes
            # into int16 samples for Porcupine.
            pcm_continue = audio_stream_continue.read(porcupine_continue.frame_length, exception_on_overflow=False)
            pcm_unpacked_continue = struct.unpack_from("h" * porcupine_continue.frame_length, pcm_continue)

            pcm_end = audio_stream_end.read(porcupine_end.frame_length, exception_on_overflow=False)
            pcm_unpacked_end = struct.unpack_from("h" * porcupine_end.frame_length, pcm_end)

            # Check if the wake word is detected (process returns >= 0 on a hit)
            keyword_index_continue = porcupine_continue.process(pcm_unpacked_continue)
            keyword_index_end = porcupine_end.process(pcm_unpacked_end)

            if keyword_index_continue >= 0:
                # Continue word heard: run one more exchange with context,
                # re-prompt, and restart the inactivity timer.
                play_prompt(fallback_prompt=LISTENING_PROMPT, audio_files=CUSTOM_LISTENING_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
                speech_to_response(have_context=True)
                play_prompt(fallback_prompt=CONTINUE_CONVO_PROMPT, audio_files=CUSTOM_CONTINUE_CONVO_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
                start = time.time()
            elif keyword_index_end >= 0:
                # Exit word heard: leave the loop; cleanup happens in finally.
                break

    except KeyboardInterrupt:
        print("Stopping...")

    finally:
        # Clean up resources: stop/close each stream before terminating its
        # PyAudio instance, then release the Porcupine engines.
        audio_stream_continue.stop_stream()
        audio_stream_continue.close()
        pa_continue.terminate()
        porcupine_continue.delete()

        audio_stream_end.stop_stream()
        audio_stream_end.close()
        pa_end.terminate()
        porcupine_end.delete()

    # Say goodbye whether the loop timed out, was interrupted, or heard the
    # exit wake word.
    play_prompt(fallback_prompt=GOODBYE_PROMPT, audio_files=CUSTOM_GOODBYE_PROMPTS, use_custom=USE_CUSTOM_AUDIO)
|
|
##########################################################################################
|
|
|
|
|
|
##########################################################################################
|
|
##### Main Loop #####
|
|
# Initialize Porcupine with custom wake word
|
|
# NOTE(review): the initial listener is built from WAKE_WORD_PATH_2
# (WAKEWORD_FILE_CONTINUE), while the comments above state that
# WAKEWORD_FILE_INITIAL is the word that initiates the chat — confirm the two
# wake word files are not swapped.
porcupine_initial, audio_stream_initial, pa_initial = porcupine(ACCESS_KEY=ACCESS_KEY, WAKE_WORD_PATHS=[WAKE_WORD_PATH_2])

print("Listening for wake word...")
try:
    while True:
        # Read a frame of audio and unpack the raw bytes into int16 samples.
        pcm_initial = audio_stream_initial.read(porcupine_initial.frame_length, exception_on_overflow=False)
        pcm_unpacked_initial = struct.unpack_from("h" * porcupine_initial.frame_length, pcm_initial)

        # Check if the wake word is detected (index >= 0 means a keyword fired)
        keyword_index = porcupine_initial.process(pcm_unpacked_initial)
        if keyword_index >= 0:
            # Greet + answer the first request, then stay in conversation mode
            # until continue_convo times out or hears the exit word.
            convo_initialised()
            continue_convo()
except KeyboardInterrupt:
    print("Stopping...")
finally:
    # Clean up resources: stream first, then PyAudio, then the engine.
    audio_stream_initial.stop_stream()
    audio_stream_initial.close()
    pa_initial.terminate()
    porcupine_initial.delete()
|
|
########################################################################################## |