Hey, I've introduced the following two modifications for my own use and figured you may want to take a look and see whether it's something you'd like to implement. This is pretty crude and certainly needs some refinement, but it works. The following code is a drop-in replacement (you will probably want to add the relevant config.py settings). The first snippet is for whisperX; the second one adds AllTalk TTS support. AllTalk TTS is a little more demanding than Piper but offers much better voice quality, and whisperX lets you run this app 100% offline. With 12 GB of VRAM I'm running the tiny whisper model, a 7B/8B LLM (currently testing wizardlm2 and llama3 via Ollama), and my custom AllTalk model.
import whisperx as wx
from pydub import AudioSegment
import os
from dotenv import load_dotenv
from config import AUDIO_FILE_DIR
import gc
# Load .env file if present
load_dotenv()
# whisperX runtime settings; NOTE(review): these belong in config.py.
device = "cuda"
batch_size = 16
# int8 keeps VRAM usage low enough to share the GPU with an LLM.
compute_type = "int8"
# Local cache directory for the downloaded whisper model weights.
model_dir = "C:\\test"
language = "en"
# Loaded once at import time and shared by all transcribe_audio() calls.
model = wx.load_model("tiny", device, language=language, compute_type=compute_type, download_root=model_dir)
def transcribe_audio(file_path):
    """Transcribe an audio file with whisperX, then delete the source file.

    Files up to 24 MB are transcribed in a single pass.  Larger files are
    split into 10-minute chunks, each exported to a temporary MP3 and
    transcribed separately, to bound memory use.

    Args:
        file_path: File name relative to AUDIO_FILE_DIR.

    Returns:
        The concatenated transcript text (each segment followed by a space).

    Raises:
        FileNotFoundError: If the audio file does not exist.
        Exception: Wrapping any other error raised during transcription.
    """
    full_path = os.path.join(AUDIO_FILE_DIR, file_path)
    try:
        audio = AudioSegment.from_file(full_path)
        if os.path.getsize(full_path) <= 24 * 1024 * 1024:
            # Small file: transcribe in one pass.
            result = model.transcribe(full_path, batch_size=batch_size)
            parts = [segment['text'] + " " for segment in result['segments']]
        else:
            chunk_size = 10 * 60 * 1000  # pydub slices in milliseconds
            num_chunks = -(-len(audio) // chunk_size)  # ceiling division
            temp_chunk_path = os.path.join(AUDIO_FILE_DIR, "temp_chunk.mp3")
            parts = []
            for i in range(num_chunks):
                chunk = audio[i * chunk_size:(i + 1) * chunk_size]
                chunk.export(temp_chunk_path, format="mp3")
                try:
                    result = model.transcribe(temp_chunk_path, batch_size=batch_size)
                    parts.extend(segment['text'] + " " for segment in result['segments'])
                finally:
                    # Always clean up the temp chunk, even if transcription fails.
                    os.remove(temp_chunk_path)
        # The source audio is consumed once transcribed.
        os.remove(full_path)
        return "".join(parts)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The audio file {file_path} was not found.") from e
    except Exception as e:
        raise Exception(f"An error occurred during the transcription process: {e}") from e
def cleanup_model():
    """Release the module-level whisperX model to free GPU/host memory."""
    global model
    # Drop the only reference, then force a collection pass so the
    # model's buffers are reclaimed promptly.
    del model
    gc.collect()
import os
import soundfile as sf
import sounddevice as sd
from openai import OpenAI
from dotenv import load_dotenv
import subprocess
import threading
import queue
import config
import tempfile
import utils
import requests
import shutil
...
...
def TTS_Alltalk(self, text_to_speak, output_file):
    """Generate speech with a local AllTalk TTS server and save it to a file.

    Args:
        text_to_speak: Text to synthesize; sanitized before it is sent.
        output_file: Destination path for the generated WAV file.

    Returns:
        "success" if audio was generated and copied to output_file,
        otherwise "failed".
    """
    # Strip characters the TTS engine cannot handle.
    text_to_speak = utils.sanitize_text(text_to_speak)
    if not text_to_speak.strip():
        # Nothing left to speak after sanitization.
        return "failed"

    # NOTE(review): endpoint and voice names are hard-coded here;
    # consider moving them to config.py.
    api_url = "http://127.0.0.1:7851/api/tts-generate"
    data = {
        "text_input": text_to_speak,
        "text_filtering": "none",
        "character_voice_gen": "female_03.wav",
        "narrator_enabled": "false",
        "narrator_voice_gen": "arnold.wav",
        "text_not_inside": "character",
        "language": "en",
        "output_file_name": "output",
        "output_file_timestamp": "true",
        "autoplay": "false",
        "autoplay_volume": "0.8"
    }
    try:
        response = requests.post(api_url, data=data)
        response.raise_for_status()
        response_data = response.json()
        if response_data.get("status") == "generate-success":
            # Copy the server-side WAV into the requested output location.
            shutil.copyfile(response_data["output_file_path"], output_file)
            return "success"
        # Bug fix: the original fell through here and implicitly
        # returned None on a non-success status.
        return "failed"
    except (requests.RequestException, OSError, KeyError, ValueError) as e:
        # OSError covers the copy; KeyError/ValueError cover a malformed
        # or non-JSON response body.
        print(f"Error calling TTS API: {e}")
        return "failed"
The latter snippet is not a particularly efficient solution, as there is no real need to copy the AllTalk-generated WAVs over to the AlwaysReddy audio_files directory. It would make more sense to change AUDIO_FILE_DIR in config.py to point to the AllTalk output folder, or to change the output directory in AllTalk to point to AUDIO_FILE_DIR. If you think this may come in handy in any way, please feel free to use this code as you see fit.
Hey, I've introduced the following two modifications for my own use and figured you may want to take a look and see if it's something you'd like to implement. This is pretty crude and needs some refinement for sure but works. The following code is a drop-in replacement (you will probably want to add relevant config.py settings). The first snippet is for whisperX, the second one adds AllTalk TTS support. AllTalk TTS is a little bit more demanding than piper but offers way better voice quality. WhisperX lets you run this app 100% offline. With 12GB VRAM I'm running the tiny whisper model, a 7B/8B LLM (currently testing wizardlm2 and llama3 via Ollama) and my custom AllTalk model.
The latter snippet is not really an efficient solution as there is no need to copy the AllTalk generated wavs over to the AlwaysReddy audio_files directory. It would make more sense to change the AUDIO_FILE_DIR in config.py to point to the AllTalk output folder. Or change the output directory in AllTalk to point to AUDIO_FILE_DIR. If you think this may come in handy in any way, please feel free to use this code as you see fit.