##########################################################################################
##### Whisper ASR pipeline (Hugging Face Transformers) #####

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


def whisper_pipeline(model_id: str, whisper_lang: str):
    """Build an automatic-speech-recognition pipeline for a Whisper checkpoint.

    Uses fp16 on GPU (fp32 on CPU) and SDPA attention for faster inference.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        attn_implementation="sdpa",
    )
    model.generation_config.language = whisper_lang  # define your language of choice here
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        generate_kwargs={"max_new_tokens": 128},
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe


##########################################################################################
if __name__ == "__main__":
    # Example usage
    import os
    from dotenv import load_dotenv

    load_dotenv()
    ROOT_DIR = os.getenv("ROOT_DIR", os.path.dirname(__file__))

    model_id = "openai/whisper-tiny"
    whisper_lang = "en"
    whisper = whisper_pipeline(model_id, whisper_lang)

    audio_file = os.path.join(ROOT_DIR, "llm_media/input_audio.wav")
    result = whisper(audio_file)["text"]
    print(result)
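
##########################################################################################
# Optional long-form sketch (an assumption, not part of the original script):
# Whisper models process audio in 30-second windows, and the Transformers ASR
# pipeline supports chunked inference for longer recordings. A minimal sketch,
# assuming the whisper_pipeline() factory above and a hypothetical long recording
# at "llm_media/long_audio.wav":
#
#     pipe = whisper_pipeline("openai/whisper-tiny", "en")
#     result = pipe(
#         "llm_media/long_audio.wav",
#         chunk_length_s=30,       # split the audio into 30 s windows
#         batch_size=8,            # transcribe several chunks per forward pass
#         return_timestamps=True,  # per-chunk timestamps under result["chunks"]
#     )
#     print(result["text"])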