Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from pathlib import Path


def add_duration_column(filename, output_filename):
    """Rewrite a pipe-separated filelist, inserting a durations column.

    Each input line is expected to look like
    ``<wav_path>|<pitch_path>|<transcript>``; the corresponding output line is
    ``<wav_path>|<pitch_path>|durations/<stem>.pt|<transcript>``.

    Args:
        filename: path of the filelist to read.
        output_filename: path the augmented filelist is written to.
    """
    augmented = []
    with open(filename) as f:
        for line in f:
            # maxsplit=2 keeps any '|' characters inside the transcript intact
            file_path, pitch_path, transcript = line.strip().split('|', maxsplit=2)
            # Path(...).stem already drops both directory and extension,
            # so the extra os.path.basename() round-trip is unnecessary.
            name_stem = Path(file_path).stem
            augmented.append('|'.join([file_path,
                                       pitch_path,
                                       f'durations/{name_stem}.pt',
                                       transcript]))

    with open(output_filename, 'w') as f:
        # write(), not writelines(): the payload is a single joined string,
        # and writelines() would merely iterate it character by character.
        f.write('\n'.join(augmented))


if __name__ == '__main__':
    # (input filelist, output filelist) pairs to augment with a durations column.
    conversions = (
        ('filelists/ljs_audio_pitch_text_test.txt',
         'filelists/ljs_audio_pitch_durs_text_test.txt'),
        ('filelists/ljs_audio_pitch_text_train_v3.txt',
         'filelists/ljs_audio_pitch_durs_text_train_v3.txt'),
        ('filelists/ljs_audio_pitch_text_val.txt',
         'filelists/ljs_audio_pitch_durs_text_val.txt'),
    )
    for src_list, dst_list in conversions:
        add_duration_column(src_list, dst_list)
5 changes: 3 additions & 2 deletions PyTorch/SpeechSynthesis/FastPitch/common/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,9 @@ def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
self.n_mel_channels = n_mel_channels
self.sampling_rate = sampling_rate
self.stft_fn = STFT(filter_length, hop_length, win_length)
mel_basis = librosa_mel_fn(
sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length,
n_mels=n_mel_channels,
fmin=mel_fmin, fmax=mel_fmax)
mel_basis = torch.from_numpy(mel_basis).float()
self.register_buffer('mel_basis', mel_basis)

Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/FastPitch/common/stft.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(self, filter_length=800, hop_length=200, win_length=800,
assert(filter_length >= win_length)
# get window and zero center pad it to filter_length
fft_window = get_window(window, win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)
fft_window = pad_center(fft_window, size=filter_length)
fft_window = torch.from_numpy(fft_window).float()

# window the bases
Expand Down
8 changes: 5 additions & 3 deletions PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in valid_symbols]
# In phones extracted from MFA TextGrid
_silences = ['@sp', '@sil']


def get_symbols(symbol_set='english_basic'):
Expand All @@ -17,20 +19,20 @@ def get_symbols(symbol_set='english_basic'):
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences
elif symbol_set == 'english_basic_lowercase':
_pad = '_'
_punctuation = '!\'"(),.:;? '
_special = '-'
_letters = 'abcdefghijklmnopqrstuvwxyz'
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences
elif symbol_set == 'english_expanded':
_punctuation = '!\'",.:;? '
_math = '#%&*+-/[]()'
_special = '_@©°½—₩€$'
_accented = 'áçéêëñöøćž'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet
symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + _silences
else:
raise Exception("{} symbol set does not exist".format(symbol_set))

Expand Down
23 changes: 10 additions & 13 deletions PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ def clean_text(self, text):
def symbols_to_sequence(self, symbols):
    """Map each symbol to its integer id, silently dropping unknown symbols."""
    sequence = []
    for symbol in symbols:
        if symbol in self.symbol_to_id:
            sequence.append(self.symbol_to_id[symbol])
    return sequence

def arpabet_list_to_sequence(self, text):
    """Encode a list of ARPAbet phones, prepending the '@' marker to each."""
    prefixed = [f'@{phone}' for phone in text]
    return self.symbols_to_sequence(prefixed)

def arpabet_to_sequence(self, text):
    """Encode a whitespace-separated ARPAbet string, '@'-prefixing each phone."""
    phones = text.split()
    return self.symbols_to_sequence(['@' + phone for phone in phones])

Expand Down Expand Up @@ -118,9 +121,7 @@ def get_arpabet(self, word):
else:
arpabet = arpabet[0]

arpabet = "{" + arpabet + arpabet_suffix + "}"

return arpabet
return arpabet + arpabet_suffix

def encode_text(self, text, return_all=False):
if self.expand_currency:
Expand All @@ -144,20 +145,16 @@ def encode_text(self, text, return_all=False):
text = text_arpabet
elif self.handle_arpabet == 'word':
words = _words_re.findall(text)
text_arpabet = [
word[1] if word[0] == '' else (
self.get_arpabet(word[0])
if np.random.uniform() < self.p_arpabet
else word[0])
for word in words]
text_arpabet = ''.join(text_arpabet)
text_arpabet = [[word[1]] if word[0] == ''
else self.get_arpabet(word[0]).split(' ')
for word in words]
text_arpabet = [phone for phone_list in text_arpabet
for phone in phone_list if phone != ' ']
text = text_arpabet
elif self.handle_arpabet != '':
raise Exception("{} handle_arpabet is not supported".format(
self.handle_arpabet))

text_encoded = self.text_to_sequence(text)

text_encoded = self.arpabet_list_to_sequence(text)
if return_all:
return text_encoded, text_clean, text_arpabet

Expand Down
36 changes: 36 additions & 0 deletions PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import argparse
import os
import pathlib

from common.utils import load_filepaths_and_text


def create_lab_files(dataset_path, filelist, n_speakers):
    """Write one ``.lab`` transcript file per wav listed in *filelist*.

    For every filelist entry, a file ``<wav_stem>.lab`` containing that
    entry's transcript is written directly under *dataset_path*.

    Args:
        dataset_path: root directory of the dataset; .lab files go here.
        filelist: a single filelist path or a list of filelist paths.
        n_speakers: number of speakers; values > 1 tell the loader that the
            filelists carry a speaker-id column.
    """
    # Normalize a lone path into the list of filenames the loader expects.
    if isinstance(filelist, str):
        filelist = [filelist]

    # The third argument flags whether a speaker-id column is present.
    # NOTE(review): with n_speakers > 1 each entry presumably gains a speaker
    # field, which would break the 2-tuple unpacking below — confirm against
    # load_filepaths_and_text before using multi-speaker filelists here.
    dataset_entries = load_filepaths_and_text(filelist, dataset_path,
                                              (n_speakers > 1))

    for filepath, text in dataset_entries:
        wav_name = pathlib.Path(filepath).stem
        # The '.lab' extension is hardcoded, and files land in dataset_path
        # itself rather than alongside the wavs in a 'wavs' subdirectory.
        lab_filepath = os.path.join(dataset_path, f'{wav_name}.lab')
        with open(lab_filepath, 'w') as f:
            f.write(text)


if __name__ == '__main__':
    # CLI entry point: generate .lab transcript files for every listed wav.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--dataset', type=str, required=True,
                            help='Path to dataset')
    arg_parser.add_argument('--filelist', type=str, required=True, nargs='+',
                            help='List of wavs with transcript')
    arg_parser.add_argument('--n-speakers', type=int, default=1,
                            help='Number of speakers in dataset')
    cli_args = arg_parser.parse_args()

    create_lab_files(cli_args.dataset, cli_args.filelist, cli_args.n_speakers)
1 change: 1 addition & 0 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def parse_fastpitch_args(parent, add_help=False):

energy_pred = parser.add_argument_group('energy predictor parameters')
energy_pred.add_argument('--energy-conditioning', action='store_true')
energy_pred.add_argument('--norm_energy', action='store_true')
energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int,
help='Pitch predictor conv-1D kernel size')
energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int,
Expand Down
220 changes: 0 additions & 220 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py

This file was deleted.

Loading