Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from pathlib import Path


def add_duration_column(filename, output_filename):
    """Rewrite a pipe-separated filelist, inserting a durations column.

    Each input line is expected to look like
    ``<wav_path>|<pitch_path>|<transcript>``; the corresponding output line is
    ``<wav_path>|<pitch_path>|durations/<stem>.pt|<transcript>``.

    Args:
        filename: path of the filelist to read.
        output_filename: path the augmented filelist is written to.
    """
    augmented = []
    with open(filename) as f:
        for line in f:
            # maxsplit=2 keeps any '|' characters inside the transcript intact
            file_path, pitch_path, transcript = line.strip().split('|', maxsplit=2)
            # Path(...).stem already drops both directory and extension,
            # so the extra os.path.basename() round-trip is unnecessary.
            name_stem = Path(file_path).stem
            augmented.append('|'.join([file_path,
                                       pitch_path,
                                       f'durations/{name_stem}.pt',
                                       transcript]))

    with open(output_filename, 'w') as f:
        # write(), not writelines(): the payload is a single joined string,
        # and writelines() would merely iterate it character by character.
        f.write('\n'.join(augmented))


if __name__ == '__main__':
    # (input filelist, output filelist) pairs to augment with a durations column.
    conversions = (
        ('filelists/ljs_audio_pitch_text_test.txt',
         'filelists/ljs_audio_pitch_durs_text_test.txt'),
        ('filelists/ljs_audio_pitch_text_train_v3.txt',
         'filelists/ljs_audio_pitch_durs_text_train_v3.txt'),
        ('filelists/ljs_audio_pitch_text_val.txt',
         'filelists/ljs_audio_pitch_durs_text_val.txt'),
    )
    for src_list, dst_list in conversions:
        add_duration_column(src_list, dst_list)
5 changes: 3 additions & 2 deletions PyTorch/SpeechSynthesis/FastPitch/common/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,9 @@ def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
self.n_mel_channels = n_mel_channels
self.sampling_rate = sampling_rate
self.stft_fn = STFT(filter_length, hop_length, win_length)
mel_basis = librosa_mel_fn(
sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length,
n_mels=n_mel_channels,
fmin=mel_fmin, fmax=mel_fmax)
mel_basis = torch.from_numpy(mel_basis).float()
self.register_buffer('mel_basis', mel_basis)

Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/FastPitch/common/stft.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(self, filter_length=800, hop_length=200, win_length=800,
assert(filter_length >= win_length)
# get window and zero center pad it to filter_length
fft_window = get_window(window, win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)
fft_window = pad_center(fft_window, size=filter_length)
fft_window = torch.from_numpy(fft_window).float()

# window the bases
Expand Down
8 changes: 5 additions & 3 deletions PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in valid_symbols]
# In phones extracted from MFA TextGrid
_silences = ['@sp', '@sil']


def get_symbols(symbol_set='english_basic'):
Expand All @@ -17,20 +19,20 @@ def get_symbols(symbol_set='english_basic'):
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences
elif symbol_set == 'english_basic_lowercase':
_pad = '_'
_punctuation = '!\'"(),.:;? '
_special = '-'
_letters = 'abcdefghijklmnopqrstuvwxyz'
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences
elif symbol_set == 'english_expanded':
_punctuation = '!\'",.:;? '
_math = '#%&*+-/[]()'
_special = '_@©°½—₩€$'
_accented = 'áçéêëñöøćž'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet
symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + _silences
else:
raise Exception("{} symbol set does not exist".format(symbol_set))

Expand Down
23 changes: 10 additions & 13 deletions PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ def clean_text(self, text):
def symbols_to_sequence(self, symbols):
    """Map each symbol to its integer id, silently dropping unknown symbols."""
    sequence = []
    for symbol in symbols:
        if symbol in self.symbol_to_id:
            sequence.append(self.symbol_to_id[symbol])
    return sequence

def arpabet_list_to_sequence(self, text):
    """Encode a list of ARPAbet phones, prepending the '@' marker to each."""
    prefixed = [f'@{phone}' for phone in text]
    return self.symbols_to_sequence(prefixed)

def arpabet_to_sequence(self, text):
    """Encode a whitespace-separated ARPAbet string, '@'-prefixing each phone."""
    phones = text.split()
    return self.symbols_to_sequence(['@' + phone for phone in phones])

Expand Down Expand Up @@ -118,9 +121,7 @@ def get_arpabet(self, word):
else:
arpabet = arpabet[0]

arpabet = "{" + arpabet + arpabet_suffix + "}"

return arpabet
return arpabet + arpabet_suffix

def encode_text(self, text, return_all=False):
if self.expand_currency:
Expand All @@ -144,20 +145,16 @@ def encode_text(self, text, return_all=False):
text = text_arpabet
elif self.handle_arpabet == 'word':
words = _words_re.findall(text)
text_arpabet = [
word[1] if word[0] == '' else (
self.get_arpabet(word[0])
if np.random.uniform() < self.p_arpabet
else word[0])
for word in words]
text_arpabet = ''.join(text_arpabet)
text_arpabet = [[word[1]] if word[0] == ''
else self.get_arpabet(word[0]).split(' ')
for word in words]
text_arpabet = [phone for phone_list in text_arpabet
for phone in phone_list if phone != ' ']
text = text_arpabet
elif self.handle_arpabet != '':
raise Exception("{} handle_arpabet is not supported".format(
self.handle_arpabet))

text_encoded = self.text_to_sequence(text)

text_encoded = self.arpabet_list_to_sequence(text)
if return_all:
return text_encoded, text_clean, text_arpabet

Expand Down
36 changes: 36 additions & 0 deletions PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import argparse
import os
import pathlib

from common.utils import load_filepaths_and_text


def create_lab_files(dataset_path, filelist, n_speakers):
    """Write one ``.lab`` transcript file per wav listed in *filelist*.

    For every filelist entry, a file ``<wav_stem>.lab`` containing that
    entry's transcript is written directly under *dataset_path*.

    Args:
        dataset_path: root directory of the dataset; .lab files go here.
        filelist: a single filelist path or a list of filelist paths.
        n_speakers: number of speakers; values > 1 tell the loader that the
            filelists carry a speaker-id column.
    """
    # Normalize a lone path into the list of filenames the loader expects.
    if isinstance(filelist, str):
        filelist = [filelist]

    # The third argument flags whether a speaker-id column is present.
    # NOTE(review): with n_speakers > 1 each entry presumably gains a speaker
    # field, which would break the 2-tuple unpacking below — confirm against
    # load_filepaths_and_text before using multi-speaker filelists here.
    dataset_entries = load_filepaths_and_text(filelist, dataset_path,
                                              (n_speakers > 1))

    for filepath, text in dataset_entries:
        wav_name = pathlib.Path(filepath).stem
        # The '.lab' extension is hardcoded, and files land in dataset_path
        # itself rather than alongside the wavs in a 'wavs' subdirectory.
        lab_filepath = os.path.join(dataset_path, f'{wav_name}.lab')
        with open(lab_filepath, 'w') as f:
            f.write(text)


if __name__ == '__main__':
    # CLI entry point: generate .lab transcript files for every listed wav.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--dataset', type=str, required=True,
                            help='Path to dataset')
    arg_parser.add_argument('--filelist', type=str, required=True, nargs='+',
                            help='List of wavs with transcript')
    arg_parser.add_argument('--n-speakers', type=int, default=1,
                            help='Number of speakers in dataset')
    cli_args = arg_parser.parse_args()

    create_lab_files(cli_args.dataset, cli_args.filelist, cli_args.n_speakers)
1 change: 1 addition & 0 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def parse_fastpitch_args(parent, add_help=False):

energy_pred = parser.add_argument_group('energy predictor parameters')
energy_pred.add_argument('--energy-conditioning', action='store_true')
energy_pred.add_argument('--norm_energy', action='store_true')
energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int,
help='Pitch predictor conv-1D kernel size')
energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int,
Expand Down
220 changes: 0 additions & 220 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py

This file was deleted.

Loading