From 880e6a522febe41e05ebaab9b0ab3dce53733bb1 Mon Sep 17 00:00:00 2001 From: evdv Date: Thu, 17 Feb 2022 19:06:49 +0000 Subject: [PATCH 01/21] WIP extracting/loading durations --- .../FastPitch/fastpitch/data_function.py | 13 +++++++++++-- .../SpeechSynthesis/FastPitch/prepare_dataset.py | 7 ++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index a007db86f..9df3ab0b1 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -342,6 +342,14 @@ def __call__(self, batch): text = batch[ids_sorted_decreasing[i]][0] text_padded[i, :text.size(0)] = text + dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype) + dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32) + for i in range(len(ids_sorted_decreasing)): + dur = batch[ids_sorted_decreasing[i]][3] + dur_padded[i, :dur.shape[0]] = dur + dur_lens[i] = dur.shape[0] + assert dur_lens[i] == input_lengths[i] + # Right zero-pad mel-spec num_mels = batch[0][1].size(0) max_target_len = max([x[1].size(1) for x in batch]) @@ -386,16 +394,17 @@ def __call__(self, batch): audiopaths = [batch[i][7] for i in ids_sorted_decreasing] - return (text_padded, input_lengths, mel_padded, output_lengths, len_x, + return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x, pitch_padded, energy_padded, speaker, attn_prior_padded, audiopaths) def batch_to_gpu(batch): - (text_padded, input_lengths, mel_padded, output_lengths, len_x, + (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x, pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch text_padded = to_gpu(text_padded).long() + durs_padded = to_gpu(durs_padded).long() input_lengths = to_gpu(input_lengths).long() mel_padded = to_gpu(mel_padded).float() output_lengths = to_gpu(output_lengths).long() diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index d93065b42..ed6ba6566 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -50,6 +50,8 @@ def parse_args(parser): help='Calculate spectrograms from .wav files') parser.add_argument('--extract-pitch', action='store_true', help='Extract pitch') + parser.add_argument('--extract-durations', action='store_true', + help='Extract durations') parser.add_argument('--save-alignment-priors', action='store_true', help='Pre-calculate diagonal matrices of alignment of text to audio') parser.add_argument('--log-file', type=str, default='preproc_log.json', @@ -99,6 +101,9 @@ def main(): if args.extract_pitch: Path(args.dataset_path, 'pitch').mkdir(parents=False, exist_ok=True) + if args.extract_durs: + Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True) + if args.save_alignment_priors: Path(args.dataset_path, 'alignment_priors').mkdir(parents=False, exist_ok=True) @@ -142,7 +147,7 @@ def main(): for i, batch in enumerate(tqdm.tqdm(data_loader)): tik = time.time() - _, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch + _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch # Ensure filenames are unique for p in fpaths: From 832ba35e4d13d02bc8f357e0a071b1abce1c6962 Mon Sep 17 00:00:00 2001 From: evdv Date: Fri, 18 Feb 2022 14:27:55 +0000 Subject: [PATCH 02/21] WIP --- .../FastPitch/create_lab_files.py | 0 .../FastPitch/prepare_dataset.py | 65 ++++++++++++++++++- .../FastPitch/requirements.txt | 1 + .../FastPitch/scripts/prepare_dataset.sh | 8 +++ 4 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py diff --git a/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py b/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py new file mode 100644 index 000000000..e69de29bb diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index ed6ba6566..6fdd0b23c 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -26,13 +26,16 @@ # ***************************************************************************** import argparse +import os import time from pathlib import Path +import numpy as np import torch import tqdm import dllogger as DLLogger from dllogger import StdOutBackend, JSONStreamBackend, Verbosity +from tgt.io import read_textgrid from torch.utils.data import DataLoader from fastpitch.data_function import TTSCollate, TTSDataset @@ -44,6 +47,8 @@ def parse_args(parser): """ parser.add_argument('-d', '--dataset-path', type=str, default='./', help='Path to dataset') + parser.add_argument('--textgrid-path', type=str, + help='Path to TextGrids') parser.add_argument('--wav-text-filelists', required=True, nargs='+', type=str, help='Files with audio paths and text') parser.add_argument('--extract-mels', action='store_true', @@ -51,7 +56,7 @@ def parse_args(parser): parser.add_argument('--extract-pitch', action='store_true', help='Extract pitch') parser.add_argument('--extract-durations', action='store_true', - help='Extract durations') + help='Extract durations (from alignment dir)') parser.add_argument('--save-alignment-priors', action='store_true', help='Pre-calculate diagonal matrices of alignment of text to audio') parser.add_argument('--log-file', type=str, default='preproc_log.json', @@ -82,6 +87,36 @@ def parse_args(parser): return parser +def parse_textgrid(tier, sampling_rate, hop_length): + # From Dan Wells + # Latest MFA replaces silence phones with "" in output TextGrids + sil_phones = ['sil', 'sp', 'spn', ''] + start_time = tier[0].start_time + end_time = tier[-1].end_time + phones = [] + durations = [] + for index, label in enumerate(tier._objects): + p_start, p_end, phone = label.start_time, label.end_time, label.text + if phone not in sil_phones: + phones.append(phone) + else: + if (index == 0) or (index == len(tier) - 1): + # leading or trailing silence + phones.append('sil') + else: + # short pause between words + phones.append('sp') + + durations.append(int(np.ceil(p_end * sampling_rate / hop_length) + - np.ceil(p_start * sampling_rate / hop_length))) + return phones, durations, start_time, end_time + + +def check_durations(durs, mel_len, filepath): + assert (sum(durs) == mel_len, + f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens') + + def main(): parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing') parser = parse_args(parser) @@ -102,6 +137,8 @@ def main(): Path(args.dataset_path, 'pitch').mkdir(parents=False, exist_ok=True) if args.extract_durs: + if not args.textgrid_path: + args.textgridPath = os.path.join(args.dataset_path, 'TextGrid') Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True) if args.save_alignment_priors: @@ -147,6 +184,10 @@ def main(): for i, batch in enumerate(tqdm.tqdm(data_loader)): tik = time.time() + # From TTSCollate __call__ + # (text_padded, dur_padded, input_lengths, mel_padded, + # output_lengths, len_x, pitch_padded, energy_padded, speaker, + # attn_prior_padded, audiopaths) _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch # Ensure filenames are unique @@ -168,6 +209,28 @@ def main(): fpath = Path(args.dataset_path, 'pitch', fname) torch.save(p[:mel_lens[j]], fpath) + if args.extract_durations: + # From Dan Wells + for j, _ in range(len(mel_lens)): + filename = fpaths[j] + tgt_path = Path(args.textgrid_path, f'{filename}.TextGrid') + try: + textgrid = read_textgrid(tgt_path, + include_empty_intervals=True) + except FileNotFoundError: + print(f'{filename}.wav TextGrid missing: {tgt_path}') + raise + _, durs, _, _ = parse_textgrid( + textgrid.get_tier_by_name('phones'), + args.sampling_rate, + args.hop_length) + + check_durations(durs, mel_lens[j], filename) + + dur_path = Path(args.dataset_path, + 'durations', f'{filename}.pt') + torch.save(torch.LongTensor(durs).cpu().int(), dur_path) + if args.save_alignment_priors: for j, prior in enumerate(attn_prior): fname = Path(fpaths[j]).with_suffix('.pt').name diff --git a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt index e6d7b1751..33b7548c1 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt +++ b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt @@ -3,5 +3,6 @@ numpy inflect librosa==0.8.0 scipy +tgt tensorboardX==2.0 git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh index 43525ef48..408d75a63 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh @@ -3,13 +3,21 @@ set -e : ${DATA_DIR:=LJSpeech-1.1} +: ${ALIGNMENT_DIR:=${DATA_DIR}/mfa_alignments} : ${ARGS="--extract-mels"} +mfa model download acoustic english +mfa model download dictionary english +mfa validate $DATA_DIR english english +mfa align $DATA_DIR english english $ALIGNMENT_DIR + python prepare_dataset.py \ --wav-text-filelists filelists/ljs_audio_text.txt \ --n-workers 16 \ --batch-size 1 \ --dataset-path $DATA_DIR \ + --textgrid-path $ALIGNMENT_DIR \ --extract-pitch \ + --extract-durations\ --f0-method pyin \ $ARGS From 6b7b290676d187f110e65f713be113255076ad0a Mon Sep 17 00:00:00 2001 From: evdv Date: Wed, 23 Feb 2022 10:58:59 +0000 Subject: [PATCH 03/21] WIP, issue with zero-padding durations --- .../FastPitch/common/layers.py | 5 +-- .../SpeechSynthesis/FastPitch/common/stft.py | 2 +- .../FastPitch/create_lab_files.py | 36 +++++++++++++++++++ .../FastPitch/fastpitch/data_function.py | 7 ++++ .../FastPitch/prepare_dataset.py | 12 ++++--- .../FastPitch/scripts/prepare_dataset.sh | 29 +++++++++++---- 6 files changed, 77 insertions(+), 14 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/layers.py b/PyTorch/SpeechSynthesis/FastPitch/common/layers.py index d3ec68f6d..80c059b87 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/common/layers.py +++ b/PyTorch/SpeechSynthesis/FastPitch/common/layers.py @@ -93,8 +93,9 @@ def __init__(self, filter_length=1024, hop_length=256, win_length=1024, self.n_mel_channels = n_mel_channels self.sampling_rate = sampling_rate self.stft_fn = STFT(filter_length, hop_length, win_length) - mel_basis = librosa_mel_fn( - sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) + mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length, + n_mels=n_mel_channels, + fmin=mel_fmin, fmax=mel_fmax) mel_basis = torch.from_numpy(mel_basis).float() self.register_buffer('mel_basis', mel_basis) diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/stft.py b/PyTorch/SpeechSynthesis/FastPitch/common/stft.py index 4084dc68e..bc140c142 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/common/stft.py +++ b/PyTorch/SpeechSynthesis/FastPitch/common/stft.py @@ -64,7 +64,7 @@ def __init__(self, filter_length=800, hop_length=200, win_length=800, assert(filter_length >= win_length) # get window and zero center pad it to filter_length fft_window = get_window(window, win_length, fftbins=True) - fft_window = pad_center(fft_window, filter_length) + fft_window = pad_center(fft_window, size=filter_length) fft_window = torch.from_numpy(fft_window).float() # window the bases diff --git a/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py b/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py index e69de29bb..48da1f631 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py +++ b/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py @@ -0,0 +1,36 @@ +import argparse +import os +import pathlib + +from common.utils import load_filepaths_and_text + + +def create_lab_files(dataset_path, filelist, n_speakers): + # Expect a list of filenames + if type(filelist) is str: + filelist = [filelist] + + # difficulty: dealing with 'are there speaker codes are not'? + dataset_entries = load_filepaths_and_text(filelist, dataset_path, + (n_speakers > 1)) + + for filepath, text in dataset_entries: + wav_name = pathlib.Path(filepath).stem + # lab extension is hardcoded + # so is the use of the wavs subdirectory + lab_filepath = os.path.join(dataset_path, f'{wav_name}.lab') + with open(lab_filepath, 'w') as f: + f.write(text) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--dataset', type=str, required=True, + help='Path to dataset') + parser.add_argument('--filelist', type=str, required=True, nargs='+', + help='List of wavs with transcript') + parser.add_argument('--n-speakers', type=int, default=1, + help='Number of speakers in dataset') + args = parser.parse_args() + + create_lab_files(args.dataset, args.filelist, args.n_speakers) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 9df3ab0b1..544a626ba 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -329,23 +329,30 @@ class TTSCollate: """Zero-pads model inputs and targets based on number of frames per step""" def __call__(self, batch): + print('new call') + # print(batch) """Collate training batch from normalized text and mel-spec""" # Right zero-pad all one-hot text sequences to max input length input_lengths, ids_sorted_decreasing = torch.sort( torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True) max_input_len = input_lengths[0] + print(max_input_len) text_padded = torch.LongTensor(len(batch), max_input_len) text_padded.zero_() for i in range(len(ids_sorted_decreasing)): text = batch[ids_sorted_decreasing[i]][0] text_padded[i, :text.size(0)] = text + #print(text) dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype) dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32) + #print(dur_lens) for i in range(len(ids_sorted_decreasing)): dur = batch[ids_sorted_decreasing[i]][3] + # error + print(i, dur.shape) dur_padded[i, :dur.shape[0]] = dur dur_lens[i] = dur.shape[0] assert dur_lens[i] == input_lengths[i] diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index 6fdd0b23c..355c595b8 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -113,8 +113,8 @@ def parse_textgrid(tier, sampling_rate, hop_length): def check_durations(durs, mel_len, filepath): - assert (sum(durs) == mel_len, - f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens') + assert sum(durs) == mel_len, \ + f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens' def main(): @@ -136,7 +136,7 @@ def main(): if args.extract_pitch: Path(args.dataset_path, 'pitch').mkdir(parents=False, exist_ok=True) - if args.extract_durs: + if args.extract_durations: if not args.textgrid_path: args.textgridPath = os.path.join(args.dataset_path, 'TextGrid') Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True) @@ -181,6 +181,8 @@ def main(): drop_last=False) all_filenames = set() + print('pre-loop') + print(data_loader.dataset) for i, batch in enumerate(tqdm.tqdm(data_loader)): tik = time.time() @@ -189,14 +191,14 @@ def main(): # output_lengths, len_x, pitch_padded, energy_padded, speaker, # attn_prior_padded, audiopaths) _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch - + print(f'batch: {fpaths}') # Ensure filenames are unique for p in fpaths: fname = Path(p).name if fname in all_filenames: raise ValueError(f'Filename is not unique: {fname}') all_filenames.add(fname) - + print('filename check complete') if args.extract_mels: for j, mel in enumerate(mels): fname = Path(fpaths[j]).with_suffix('.pt').name diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh index 408d75a63..e9f9b0c9f 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh @@ -2,18 +2,35 @@ set -e +while getopts "ln:" opt; do + case $opt in + l ) LABELS="true";; + n ) NSPEAKERS=$OPTARG;; + \?) echo "Invalid option: -"$OPTARG"" >&2 + exit 1;; + esac + done + +: ${NSPEAKERS:=1} # default value : ${DATA_DIR:=LJSpeech-1.1} +: ${WAV_DIR:=${DATA_DIR}/wavs} # should already exist +: ${FILELIST:=filelists/ljs_audio_text.txt} : ${ALIGNMENT_DIR:=${DATA_DIR}/mfa_alignments} : ${ARGS="--extract-mels"} -mfa model download acoustic english -mfa model download dictionary english -mfa validate $DATA_DIR english english -mfa align $DATA_DIR english english $ALIGNMENT_DIR +if [ "$LABELS" = "true" ] +then + python ./create_lab_files.py --dataset ${WAV_DIR} --filelist ${FILELIST} --n-speakers ${NSPEAKERS} +fi + +# mfa model download acoustic english +# mfa model download dictionary english +# mfa validate ${WAV_DIR} english english +# mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR} python prepare_dataset.py \ - --wav-text-filelists filelists/ljs_audio_text.txt \ - --n-workers 16 \ + --wav-text-filelists ${FILELIST} \ + --n-workers 8 \ --batch-size 1 \ --dataset-path $DATA_DIR \ --textgrid-path $ALIGNMENT_DIR \ From e8484e00ed2a9aadcf96fc69af62058a840af0b1 Mon Sep 17 00:00:00 2001 From: evdv Date: Sat, 5 Mar 2022 18:57:54 +0000 Subject: [PATCH 04/21] Trying to figure out why I can't get the batches anymore --- .../FastPitch/fastpitch/data_function.py | 21 +++++++++++-------- .../FastPitch/prepare_dataset.py | 6 +++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 544a626ba..4e549f0d0 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -225,6 +225,8 @@ def __getitem__(self, index): if len(pitch.size()) == 1: pitch = pitch[None, :] + print('getting a batch') + # this is a batch return (text, mel, len(text), pitch, energy, speaker, attn_prior, audiopath) @@ -327,36 +329,37 @@ def get_pitch(self, index, mel_len=None): class TTSCollate: """Zero-pads model inputs and targets based on number of frames per step""" - + # (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, + # len_x, pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch def __call__(self, batch): - print('new call') - # print(batch) + print('COLLATE GETS CALLED') """Collate training batch from normalized text and mel-spec""" # Right zero-pad all one-hot text sequences to max input length input_lengths, ids_sorted_decreasing = torch.sort( torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True) max_input_len = input_lengths[0] - print(max_input_len) text_padded = torch.LongTensor(len(batch), max_input_len) text_padded.zero_() for i in range(len(ids_sorted_decreasing)): text = batch[ids_sorted_decreasing[i]][0] text_padded[i, :text.size(0)] = text - #print(text) dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype) + print('dur padded orig', dur_padded.shape) dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32) - #print(dur_lens) + print('start loop?') for i in range(len(ids_sorted_decreasing)): dur = batch[ids_sorted_decreasing[i]][3] - # error - print(i, dur.shape) + # ERROR + # print(i, dur_padded[0].shape, dur[0].shape) + print(i) dur_padded[i, :dur.shape[0]] = dur + print('new shape: ', dur_padded.shape) dur_lens[i] = dur.shape[0] assert dur_lens[i] == input_lengths[i] - + print('end loop?') # Right zero-pad mel-spec num_mels = batch[0][1].size(0) max_target_len = max([x[1].size(1) for x in batch]) diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index 355c595b8..645f6b97d 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -27,6 +27,7 @@ import argparse import os +import sys import time from pathlib import Path @@ -181,8 +182,7 @@ def main(): drop_last=False) all_filenames = set() - print('pre-loop') - print(data_loader.dataset) + print('TIME TO LOOP') for i, batch in enumerate(tqdm.tqdm(data_loader)): tik = time.time() @@ -191,7 +191,7 @@ def main(): # output_lengths, len_x, pitch_padded, energy_padded, speaker, # attn_prior_padded, audiopaths) _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch - print(f'batch: {fpaths}') + print(f'BATCH: {fpaths}') # Ensure filenames are unique for p in fpaths: fname = Path(p).name From 8c4919de55e86b4d6c52235826ec9c9e21862fbd Mon Sep 17 00:00:00 2001 From: evdv Date: Tue, 8 Mar 2022 22:47:37 +0000 Subject: [PATCH 05/21] Remove alignment, but units of textgrid and text processing not the same --- .../FastPitch/common/text/text_processing.py | 31 ++- .../FastPitch/fastpitch/attention.py | 220 ------------------ .../FastPitch/fastpitch/attn_loss_function.py | 54 ----- .../FastPitch/fastpitch/data_function.py | 207 ++++++++-------- .../FastPitch/fastpitch/loss_function.py | 24 +- .../FastPitch/fastpitch/model.py | 71 +----- .../FastPitch/prepare_dataset.py | 106 ++------- .../FastPitch/scripts/prepare_dataset.sh | 3 +- PyTorch/SpeechSynthesis/FastPitch/train.py | 34 +-- 9 files changed, 160 insertions(+), 590 deletions(-) delete mode 100644 PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py delete mode 100644 PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py index b700df1f4..e13b3ed1c 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py +++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py @@ -77,6 +77,9 @@ def clean_text(self, text): def symbols_to_sequence(self, symbols): return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id] + def arpabet_list_to_sequence(self, text): + return self.symbols_to_sequence(['@' + s for s in text]) + def arpabet_to_sequence(self, text): return self.symbols_to_sequence(['@' + s for s in text.split()]) @@ -118,7 +121,7 @@ def get_arpabet(self, word): else: arpabet = arpabet[0] - arpabet = "{" + arpabet + arpabet_suffix + "}" + # arpabet = "{" + arpabet + arpabet_suffix + "}" return arpabet @@ -144,20 +147,26 @@ def encode_text(self, text, return_all=False): text = text_arpabet elif self.handle_arpabet == 'word': words = _words_re.findall(text) - text_arpabet = [ - word[1] if word[0] == '' else ( - self.get_arpabet(word[0]) - if np.random.uniform() < self.p_arpabet - else word[0]) - for word in words] - text_arpabet = ''.join(text_arpabet) + text_arpabet = [[word[1]] if word[0] == '' + else self.get_arpabet(word[0]).split(' ') + for word in words] + text_arpabet = [phone for phone_list in text_arpabet for phone in phone_list if phone != ' '] + # text_arpabet = [ + # word[1] if word[0] == '' else ( + # self.get_arpabet(word[0]) + # if np.random.uniform() < self.p_arpabet + # else word[0]) + # for word in words] + print('ARPABET: ', text_arpabet[:10]) + #text_arpabet = ''.join(text_arpabet) text = text_arpabet elif self.handle_arpabet != '': raise Exception("{} handle_arpabet is not supported".format( self.handle_arpabet)) - - text_encoded = self.text_to_sequence(text) - + # text_encoded = self.arpabet_to_sequence(text) + # text_encoded = self.text_to_sequence(text) + text_encoded = self.arpabet_list_to_sequence(text) + print(len(text_encoded)) if return_all: return text_encoded, text_clean, text_arpabet diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py deleted file mode 100644 index 59a7397d6..000000000 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - - -class ConvNorm(torch.nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, - padding=None, dilation=1, bias=True, w_init_gain='linear'): - super(ConvNorm, self).__init__() - if padding is None: - assert(kernel_size % 2 == 1) - padding = int(dilation * (kernel_size - 1) / 2) - - self.conv = torch.nn.Conv1d(in_channels, out_channels, - kernel_size=kernel_size, stride=stride, - padding=padding, dilation=dilation, - bias=bias) - - torch.nn.init.xavier_uniform_( - self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) - - def forward(self, signal): - conv_signal = self.conv(signal) - return conv_signal - - -class Invertible1x1ConvLUS(torch.nn.Module): - def __init__(self, c): - super(Invertible1x1ConvLUS, self).__init__() - # Sample a random orthonormal matrix to initialize weights - W, _ = torch.linalg.qr(torch.randn(c, c)) - # Ensure determinant is 1.0 not -1.0 - if torch.det(W) < 0: - W[:, 0] = -1*W[:, 0] - p, lower, upper = torch.lu_unpack(*torch.lu(W)) - - self.register_buffer('p', p) - # diagonals of lower will always be 1s anyway - lower = torch.tril(lower, -1) - lower_diag = torch.diag(torch.eye(c, c)) - self.register_buffer('lower_diag', lower_diag) - self.lower = nn.Parameter(lower) - self.upper_diag = nn.Parameter(torch.diag(upper)) - self.upper = nn.Parameter(torch.triu(upper, 1)) - - def forward(self, z, reverse=False): - U = torch.triu(self.upper, 1) + torch.diag(self.upper_diag) - L = torch.tril(self.lower, -1) + torch.diag(self.lower_diag) - W = torch.mm(self.p, torch.mm(L, U)) - if reverse: - if not hasattr(self, 'W_inverse'): - # Reverse computation - W_inverse = W.float().inverse() - if z.type() == 'torch.cuda.HalfTensor': - W_inverse = W_inverse.half() - - self.W_inverse = W_inverse[..., None] - z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) - return z - else: - W = W[..., None] - z = F.conv1d(z, W, bias=None, stride=1, padding=0) - log_det_W = torch.sum(torch.log(torch.abs(self.upper_diag))) - return z, log_det_W - - -class ConvAttention(torch.nn.Module): - def __init__(self, n_mel_channels=80, n_speaker_dim=128, - n_text_channels=512, n_att_channels=80, temperature=1.0, - n_mel_convs=2, align_query_enc_type='3xconv', - use_query_proj=True): - super(ConvAttention, self).__init__() - self.temperature = temperature - self.att_scaling_factor = np.sqrt(n_att_channels) - self.softmax = torch.nn.Softmax(dim=3) - self.log_softmax = torch.nn.LogSoftmax(dim=3) - self.query_proj = Invertible1x1ConvLUS(n_mel_channels) - self.attn_proj = torch.nn.Conv2d(n_att_channels, 1, kernel_size=1) - self.align_query_enc_type = align_query_enc_type - self.use_query_proj = bool(use_query_proj) - - self.key_proj = nn.Sequential( - ConvNorm(n_text_channels, - n_text_channels * 2, - kernel_size=3, - bias=True, - w_init_gain='relu'), - torch.nn.ReLU(), - ConvNorm(n_text_channels * 2, - n_att_channels, - kernel_size=1, - bias=True)) - - self.align_query_enc_type = align_query_enc_type - - if align_query_enc_type == "inv_conv": - self.query_proj = Invertible1x1ConvLUS(n_mel_channels) - elif align_query_enc_type == "3xconv": - self.query_proj = nn.Sequential( - ConvNorm(n_mel_channels, - n_mel_channels * 2, - kernel_size=3, - bias=True, - w_init_gain='relu'), - torch.nn.ReLU(), - ConvNorm(n_mel_channels * 2, - n_mel_channels, - kernel_size=1, - bias=True), - torch.nn.ReLU(), - ConvNorm(n_mel_channels, - n_att_channels, - kernel_size=1, - bias=True)) - else: - raise ValueError("Unknown query encoder type specified") - - def run_padded_sequence(self, sorted_idx, unsort_idx, lens, padded_data, - recurrent_model): - """Sorts input data by previded ordering (and un-ordering) and runs the - packed data through the recurrent model - - Args: - sorted_idx (torch.tensor): 1D sorting index - unsort_idx (torch.tensor): 1D unsorting index (inverse of sorted_idx) - lens: lengths of input data (sorted in descending order) - padded_data (torch.tensor): input sequences (padded) - recurrent_model (nn.Module): recurrent model to run data through - Returns: - hidden_vectors (torch.tensor): outputs of the RNN, in the original, - unsorted, ordering - """ - - # sort the data by decreasing length using provided index - # we assume batch index is in dim=1 - padded_data = padded_data[:, sorted_idx] - padded_data = nn.utils.rnn.pack_padded_sequence(padded_data, lens) - hidden_vectors = recurrent_model(padded_data)[0] - hidden_vectors, _ = nn.utils.rnn.pad_packed_sequence(hidden_vectors) - # unsort the results at dim=1 and return - hidden_vectors = hidden_vectors[:, unsort_idx] - return hidden_vectors - - def encode_query(self, query, query_lens): - query = query.permute(2, 0, 1) # seq_len, batch, feature dim - lens, ids = torch.sort(query_lens, descending=True) - original_ids = [0] * lens.size(0) - for i in range(len(ids)): - original_ids[ids[i]] = i - - query_encoded = self.run_padded_sequence(ids, original_ids, lens, - query, self.query_lstm) - query_encoded = query_encoded.permute(1, 2, 0) - return query_encoded - - def forward(self, queries, keys, query_lens, mask=None, key_lens=None, - keys_encoded=None, attn_prior=None): - """Attention mechanism for flowtron parallel - Unlike in Flowtron, we have no restrictions such as causality etc, - since we only need this during training. - - Args: - queries (torch.tensor): B x C x T1 tensor - (probably going to be mel data) - keys (torch.tensor): B x C2 x T2 tensor (text data) - query_lens: lengths for sorting the queries in descending order - mask (torch.tensor): uint8 binary mask for variable length entries - (should be in the T2 domain) - Output: - attn (torch.tensor): B x 1 x T1 x T2 attention mask. - Final dim T2 should sum to 1 - """ - keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 - - # Beware can only do this since query_dim = attn_dim = n_mel_channels - if self.use_query_proj: - if self.align_query_enc_type == "inv_conv": - queries_enc, log_det_W = self.query_proj(queries) - elif self.align_query_enc_type == "3xconv": - queries_enc = self.query_proj(queries) - log_det_W = 0.0 - else: - queries_enc, log_det_W = self.query_proj(queries) - else: - queries_enc, log_det_W = queries, 0.0 - - # different ways of computing attn, - # one is isotopic gaussians (per phoneme) - # Simplistic Gaussian Isotopic Attention - - # B x n_attn_dims x T1 x T2 - attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2 - # compute log likelihood from a gaussian - attn = -0.0005 * attn.sum(1, keepdim=True) - if attn_prior is not None: - attn = self.log_softmax(attn) + torch.log(attn_prior[:, None]+1e-8) - - attn_logprob = attn.clone() - - if mask is not None: - attn.data.masked_fill_(mask.permute(0, 2, 1).unsqueeze(2), - -float("inf")) - - attn = self.softmax(attn) # Softmax along T2 - return attn, attn_logprob diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py deleted file mode 100644 index a653504fd..000000000 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class AttentionCTCLoss(torch.nn.Module): - def __init__(self, blank_logprob=-1): - super(AttentionCTCLoss, self).__init__() - self.log_softmax = torch.nn.LogSoftmax(dim=3) - self.blank_logprob = blank_logprob - self.CTCLoss = nn.CTCLoss(zero_infinity=True) - - def forward(self, attn_logprob, in_lens, out_lens): - key_lens = in_lens - query_lens = out_lens - attn_logprob_padded = F.pad(input=attn_logprob, - pad=(1, 0, 0, 0, 0, 0, 0, 0), - value=self.blank_logprob) - cost_total = 0.0 - for bid in range(attn_logprob.shape[0]): - target_seq = torch.arange(1, key_lens[bid]+1).unsqueeze(0) - curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2) - curr_logprob = curr_logprob[:query_lens[bid], :, :key_lens[bid]+1] - curr_logprob = self.log_softmax(curr_logprob[None])[0] - ctc_cost = self.CTCLoss( - curr_logprob, target_seq, input_lengths=query_lens[bid:bid+1], - target_lengths=key_lens[bid:bid+1]) - cost_total += ctc_cost - cost = cost_total/attn_logprob.shape[0] - return cost - - -class AttentionBinarizationLoss(torch.nn.Module): - def __init__(self): - super(AttentionBinarizationLoss, self).__init__() - - def forward(self, hard_attention, soft_attention, eps=1e-12): - log_sum = torch.log(torch.clamp(soft_attention[hard_attention == 1], - min=eps)).sum() - return -log_sum / hard_attention.sum() diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 4e549f0d0..ab96cbc41 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -25,57 +25,54 @@ # # ***************************************************************************** -import functools -import json import re +from functools import lru_cache from pathlib import Path import librosa import numpy as np import torch import torch.nn.functional as F -from scipy import ndimage -from scipy.stats import betabinom import common.layers as layers +from common.text import cmudict from common.text.text_processing import TextProcessing from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu +from tgt.io import read_textgrid + + +def check_durations(durs, mel_len, filepath): + assert sum(durs) == mel_len, \ + f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens' + + +def parse_textgrid(tier, sampling_rate, hop_length): + # From Dan Wells + # Latest MFA replaces silence phones with "" in output TextGrids + sil_phones = ['sil', 'sp', 'spn', ''] + start_time = tier[0].start_time + end_time = tier[-1].end_time + phones = [] + durations = [] + for index, label in enumerate(tier._objects): + p_start, p_end, phone = label.start_time, label.end_time, label.text + # if p_start > end_time: + # phones.append('') + end_time = p_end + if phone not in sil_phones: + phones.append(phone) + else: + if (index == 0) or (index == len(tier) - 1): + # leading or trailing silence + phones.append('sil') + else: + # short pause between words + phones.append('sp') - -class BetaBinomialInterpolator: - """Interpolates alignment prior matrices to save computation. - - Calculating beta-binomial priors is costly. Instead cache popular sizes - and use img interpolation to get priors faster. - """ - def __init__(self, round_mel_len_to=100, round_text_len_to=20): - self.round_mel_len_to = round_mel_len_to - self.round_text_len_to = round_text_len_to - self.bank = functools.lru_cache(beta_binomial_prior_distribution) - - def round(self, val, to): - return max(1, int(np.round((val + 1) / to))) * to - - def __call__(self, w, h): - bw = self.round(w, to=self.round_mel_len_to) - bh = self.round(h, to=self.round_text_len_to) - ret = ndimage.zoom(self.bank(bw, bh).T, zoom=(w / bw, h / bh), order=1) - assert ret.shape[0] == w, ret.shape - assert ret.shape[1] == h, ret.shape - return ret - - -def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling=1.0): - P = phoneme_count - M = mel_count - x = np.arange(0, P) - mel_text_probs = [] - for i in range(1, M+1): - a, b = scaling * i, scaling * (M + 1 - i) - rv = betabinom(P, a, b) - mel_i_prob = rv.pmf(x) - mel_text_probs.append(mel_i_prob) - return torch.tensor(np.array(mel_text_probs)) + durations.append(int(np.ceil(p_end * sampling_rate / hop_length) + - np.ceil(p_start * sampling_rate / hop_length))) + print('PHONES', phones[:15]) + return phones, durations, start_time, end_time def estimate_pitch(wav, mel_len, method='pyin', normalize_mean=None, @@ -128,38 +125,25 @@ class TTSDataset(torch.utils.data.Dataset): 2) normalizes text and converts them to sequences of one-hot vectors 3) computes mel-spectrograms from audio files. """ - def __init__(self, - dataset_path, - audiopaths_and_text, - text_cleaners, - n_mel_channels, - symbol_set='english_basic', - p_arpabet=1.0, - n_speakers=1, - load_mel_from_disk=True, - load_pitch_from_disk=True, - pitch_mean=214.72203, # LJSpeech defaults - pitch_std=65.72038, - max_wav_value=None, - sampling_rate=None, - filter_length=None, - hop_length=None, - win_length=None, - mel_fmin=None, - mel_fmax=None, - prepend_space_to_text=False, + def __init__(self, dataset_path, audiopaths_and_text, text_cleaners, + n_mel_channels, symbol_set='english_basic', p_arpabet=1.0, + cmu_dict='cmudict/cmudict-0.7b', + n_speakers=1, load_mel_from_disk=True, + load_pitch_from_disk=True, pitch_mean=214.72203, + pitch_std=65.72038, max_wav_value=None, sampling_rate=None, + filter_length=None, hop_length=None, win_length=None, + mel_fmin=None, mel_fmax=None, prepend_space_to_text=False, append_space_to_text=False, - pitch_online_dir=None, - betabinomial_online_dir=None, - use_betabinomial_interpolator=True, - pitch_online_method='pyin', - **ignored): + dur_online_dir=None, textgrid_path=None, + pitch_online_dir=None, pitch_online_method='pyin', **ignored): # Expect a list of filenames if type(audiopaths_and_text) is str: audiopaths_and_text = [audiopaths_and_text] + self.hop_length = hop_length self.dataset_path = dataset_path + self.textgrid_path = textgrid_path self.audiopaths_and_text = load_filepaths_and_text( audiopaths_and_text, dataset_path, has_speakers=(n_speakers > 1)) @@ -178,16 +162,14 @@ def __init__(self, assert p_arpabet == 0.0 or p_arpabet == 1.0, ( 'Only 0.0 and 1.0 p_arpabet is currently supported. ' 'Variable probability breaks caching of betabinomial matrices.') + if p_arpabet > 0.0: + cmudict.initialize(cmu_dict, keep_ambiguous=True) - self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet) + self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet, handle_arpabet='word', handle_arpabet_ambiguous='random') self.n_speakers = n_speakers self.pitch_tmp_dir = pitch_online_dir + self.dur_tmp_dir = dur_online_dir self.f0_method = pitch_online_method - self.betabinomial_tmp_dir = betabinomial_online_dir - self.use_betabinomial_interpolator = use_betabinomial_interpolator - - if use_betabinomial_interpolator: - self.betabinomial_interpolator = BetaBinomialInterpolator() expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1)) @@ -217,22 +199,23 @@ def __getitem__(self, index): text = self.get_text(text) pitch = self.get_pitch(index, mel.size(-1)) energy = torch.norm(mel.float(), dim=0, p=2) - attn_prior = self.get_prior(index, mel.shape[1], text.shape[0]) - + dur = self.get_dur(index) + print('get batch dur: ', len(dur)) assert pitch.size(-1) == mel.size(-1) # No higher formants? if len(pitch.size()) == 1: pitch = pitch[None, :] - print('getting a batch') # this is a batch - return (text, mel, len(text), pitch, energy, speaker, attn_prior, + # FastPitch 1.0: (text, mel, len_text, dur, pitch, speaker) + return (text, mel, len(text), pitch, energy, speaker, dur, audiopath) def __len__(self): return len(self.audiopaths_and_text) + @lru_cache() def get_mel(self, filename): if not self.load_mel_from_disk: audio, sampling_rate = load_wav_to_torch(filename) @@ -263,30 +246,37 @@ def get_text(self, text): if self.append_space_to_text: text = text + space + print('TEXT: ', len(text)) return torch.LongTensor(text) - def get_prior(self, index, mel_len, text_len): - - if self.use_betabinomial_interpolator: - return torch.from_numpy(self.betabinomial_interpolator(mel_len, - text_len)) + def get_dur(self, index): + audiopath, *fields = self.audiopaths_and_text[index] + name = Path(audiopath).stem - if self.betabinomial_tmp_dir is not None: - audiopath, *_ = self.audiopaths_and_text[index] - fname = Path(audiopath).relative_to(self.dataset_path) if self.dataset_path else Path(audiopath) - fname = fname.with_suffix('.pt') - cached_fpath = Path(self.betabinomial_tmp_dir, fname) + path = Path(self.dataset_path, 'durations') if self.dataset_path else Path(audiopath) + fname = Path(path, name).with_suffix('.pt') + if self.dur_tmp_dir is not None: + cached_fpath = Path(self.dur_tmp_dir, fname) if cached_fpath.is_file(): return torch.load(cached_fpath) - attn_prior = beta_binomial_prior_distribution(text_len, mel_len) + tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid') + try: + textgrid = read_textgrid(tgt_path, include_empty_intervals=True) + except FileNotFoundError: + print(f'{name}.wav TextGrid missing: {tgt_path}') + raise + _, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'), + self.sampling_rate, + self.hop_length) - if self.betabinomial_tmp_dir is not None: - cached_fpath.parent.mkdir(parents=True, exist_ok=True) - torch.save(attn_prior, cached_fpath) + check_durations(durs, self.get_mel(audiopath).size(1), name) + + if self.dur_tmp_dir is not None and not cached_fpath.is_file(): + return torch.save(durs, cached_fpath) - return attn_prior + return durs def get_pitch(self, index, mel_len=None): audiopath, *fields = self.audiopaths_and_text[index] @@ -330,9 +320,8 @@ def get_pitch(self, index, mel_len=None): class TTSCollate: """Zero-pads model inputs and targets based on number of frames per step""" # (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, - # len_x, pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch + # len_x, pitch_padded, energy_padded, speaker, DUR, audiopaths) = batch def __call__(self, batch): - print('COLLATE GETS CALLED') """Collate training batch from normalized text and mel-spec""" # Right zero-pad all one-hot text sequences to max input length input_lengths, ids_sorted_decreasing = torch.sort( @@ -346,20 +335,16 @@ def __call__(self, batch): text = batch[ids_sorted_decreasing[i]][0] text_padded[i, :text.size(0)] = text - dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype) - print('dur padded orig', dur_padded.shape) + dur_padded = torch.zeros_like(text_padded, dtype=torch.int32) + dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32) - print('start loop?') for i in range(len(ids_sorted_decreasing)): - dur = batch[ids_sorted_decreasing[i]][3] - # ERROR - # print(i, dur_padded[0].shape, dur[0].shape) - print(i) - dur_padded[i, :dur.shape[0]] = dur - print('new shape: ', dur_padded.shape) - dur_lens[i] = dur.shape[0] + dur = batch[ids_sorted_decreasing[i]][6] + # ERROR some mismatch between phones in transcript vs phones form text preprocessing + print('TEXT LEN', dur_padded.shape, 'DUR LEN', len(dur)) + dur_padded[i, :len(dur)] = torch.Tensor(dur) + dur_lens[i] = len(dur) assert dur_lens[i] == input_lengths[i] - print('end loop?') # Right zero-pad mel-spec num_mels = batch[0][1].size(0) max_target_len = max([x[1].size(1) for x in batch]) @@ -391,13 +376,6 @@ def __call__(self, batch): else: speaker = None - attn_prior_padded = torch.zeros(len(batch), max_target_len, - max_input_len) - attn_prior_padded.zero_() - for i in range(len(ids_sorted_decreasing)): - prior = batch[ids_sorted_decreasing[i]][6] - attn_prior_padded[i, :prior.size(0), :prior.size(1)] = prior - # Count number of items - characters in text len_x = [x[2] for x in batch] len_x = torch.Tensor(len_x) @@ -405,28 +383,27 @@ def __call__(self, batch): audiopaths = [batch[i][7] for i in ids_sorted_decreasing] return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x, - pitch_padded, energy_padded, speaker, attn_prior_padded, - audiopaths) + pitch_padded, energy_padded, speaker, audiopaths) def batch_to_gpu(batch): (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x, - pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch + pitch_padded, energy_padded, speaker, dur_lens, audiopaths) = batch text_padded = to_gpu(text_padded).long() durs_padded = to_gpu(durs_padded).long() + dur_lens = to_gpu(dur_lens).long() input_lengths = to_gpu(input_lengths).long() mel_padded = to_gpu(mel_padded).float() output_lengths = to_gpu(output_lengths).long() pitch_padded = to_gpu(pitch_padded).float() energy_padded = to_gpu(energy_padded).float() - attn_prior = to_gpu(attn_prior).float() if speaker is not None: speaker = to_gpu(speaker).long() # Alignments act as both inputs and targets - pass shallow copies x = [text_padded, input_lengths, mel_padded, output_lengths, - pitch_padded, energy_padded, speaker, attn_prior, audiopaths] - y = [mel_padded, input_lengths, output_lengths] + pitch_padded, energy_padded, speaker, durs_padded, audiopaths] + y = [mel_padded, durs_padded, dur_lens, output_lengths] len_x = torch.sum(output_lengths) return (x, y, len_x) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py index 0cd3775e5..5b789a9a2 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py @@ -30,29 +30,24 @@ from torch import nn from common.utils import mask_from_lens -from fastpitch.attn_loss_function import AttentionCTCLoss class FastPitchLoss(nn.Module): def __init__(self, dur_predictor_loss_scale=1.0, - pitch_predictor_loss_scale=1.0, attn_loss_scale=1.0, + pitch_predictor_loss_scale=1.0, energy_predictor_loss_scale=0.1): super(FastPitchLoss, self).__init__() self.dur_predictor_loss_scale = dur_predictor_loss_scale self.pitch_predictor_loss_scale = pitch_predictor_loss_scale self.energy_predictor_loss_scale = energy_predictor_loss_scale - self.attn_loss_scale = attn_loss_scale - self.attn_ctc_loss = AttentionCTCLoss() def forward(self, model_out, targets, is_training=True, meta_agg='mean'): (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, - energy_pred, energy_tgt, attn_soft, attn_hard, attn_dur, - attn_logprob) = model_out - - (mel_tgt, in_lens, out_lens) = targets - - dur_tgt = attn_dur - dur_lens = in_lens + energy_pred, energy_tgt) = model_out + # model_out = (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) + #(mel_tgt, in_lens, out_lens) = targets + mel_tgt, dur_tgt, dur_lens, pitch_tgt = targets + #dur_lens = in_lens mel_tgt.requires_grad = False # (B,H,T) => (B,T,H) @@ -83,21 +78,16 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'): else: energy_loss = 0 - # Attention loss - attn_loss = self.attn_ctc_loss(attn_logprob, in_lens, out_lens) - loss = (mel_loss + dur_pred_loss * self.dur_predictor_loss_scale + pitch_loss * self.pitch_predictor_loss_scale - + energy_loss * self.energy_predictor_loss_scale - + attn_loss * self.attn_loss_scale) + + energy_loss * self.energy_predictor_loss_scale) meta = { 'loss': loss.clone().detach(), 'mel_loss': mel_loss.clone().detach(), 'duration_predictor_loss': dur_pred_loss.clone().detach(), 'pitch_loss': pitch_loss.clone().detach(), - 'attn_loss': attn_loss.clone().detach(), 'dur_error': (torch.abs(dur_pred - dur_tgt).sum() / dur_mask.sum()).detach(), } diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py index 34fca4dff..ac0b188fe 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py @@ -34,7 +34,6 @@ from common.layers import ConvReLUNorm from common.utils import mask_from_lens from fastpitch.alignment import b_mas, mas_width1 -from fastpitch.attention import ConvAttention from fastpitch.transformer import FFTransformer @@ -204,45 +203,11 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx, self.proj = nn.Linear(out_fft_output_size, n_mel_channels, bias=True) - self.attention = ConvAttention( - n_mel_channels, 0, symbols_embedding_dim, - use_query_proj=True, align_query_enc_type='3xconv') - - def binarize_attention(self, attn, in_lens, out_lens): - """For training purposes only. Binarizes attention with MAS. - These will no longer recieve a gradient. - - Args: - attn: B x 1 x max_mel_len x max_text_len - """ - b_size = attn.shape[0] - with torch.no_grad(): - attn_cpu = attn.data.cpu().numpy() - attn_out = torch.zeros_like(attn) - for ind in range(b_size): - hard_attn = mas_width1( - attn_cpu[ind, 0, :out_lens[ind], :in_lens[ind]]) - attn_out[ind, 0, :out_lens[ind], :in_lens[ind]] = torch.tensor( - hard_attn, device=attn.get_device()) - return attn_out - - def binarize_attention_parallel(self, attn, in_lens, out_lens): - """For training purposes only. Binarizes attention with MAS. - These will no longer recieve a gradient. - - Args: - attn: B x 1 x max_mel_len x max_text_len - """ - with torch.no_grad(): - attn_cpu = attn.data.cpu().numpy() - attn_out = b_mas(attn_cpu, in_lens.cpu().numpy(), - out_lens.cpu().numpy(), width=1) - return torch.from_numpy(attn_out).to(attn.get_device()) - - def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75): - - (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense, - speaker, attn_prior, audiopaths) = inputs + def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, max_duration=75): + # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs + # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs + (inputs, input_lens, mel_tgt, mel_lens, dur_tgt, pitch_dense, energy_dense, + speaker, audiopaths) = inputs mel_max_len = mel_tgt.size(2) @@ -256,26 +221,6 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75): # Input FFT enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb) - # Alignment - text_emb = self.encoder.word_emb(inputs) - - # make sure to do the alignments before folding - attn_mask = mask_from_lens(input_lens)[..., None] == 0 - # attn_mask should be 1 for unused timesteps in the text_enc_w_spkvec tensor - - attn_soft, attn_logprob = self.attention( - mel_tgt, text_emb.permute(0, 2, 1), mel_lens, attn_mask, - key_lens=input_lens, keys_encoded=enc_out, attn_prior=attn_prior) - - attn_hard = self.binarize_attention_parallel( - attn_soft, input_lens, mel_lens) - - # Viterbi --> durations - attn_hard_dur = attn_hard.sum(2)[:, 0, :] - dur_tgt = attn_hard_dur - - assert torch.all(torch.eq(dur_tgt.sum(dim=1), mel_lens)) - # Predict durations log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration) @@ -308,14 +253,14 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75): energy_tgt = None len_regulated, dec_lens = regulate_len( - dur_tgt, enc_out, pace, mel_max_len) + dur_tgt if use_gt_durations else dur_pred, + enc_out, pace, mel_max_len) # Output FFT dec_out, dec_mask = self.decoder(len_regulated, dec_lens) mel_out = self.proj(dec_out) return (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, - pitch_tgt, energy_pred, energy_tgt, attn_soft, attn_hard, - attn_hard_dur, attn_logprob) + pitch_tgt, energy_pred, energy_tgt) def infer(self, inputs, pace=1.0, dur_tgt=None, pitch_tgt=None, energy_tgt=None, pitch_transform=None, max_duration=75, diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index 645f6b97d..f523c9f6d 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -27,16 +27,13 @@ import argparse import os -import sys import time from pathlib import Path -import numpy as np import torch import tqdm import dllogger as DLLogger from dllogger import StdOutBackend, JSONStreamBackend, Verbosity -from tgt.io import read_textgrid from torch.utils.data import DataLoader from fastpitch.data_function import TTSCollate, TTSDataset @@ -58,8 +55,8 @@ def parse_args(parser): help='Extract pitch') parser.add_argument('--extract-durations', action='store_true', help='Extract durations (from alignment dir)') - parser.add_argument('--save-alignment-priors', action='store_true', - help='Pre-calculate diagonal matrices of alignment of text to audio') + parser.add_argument('--durs-online-dir', type=str, + help='Durations tmp dir') parser.add_argument('--log-file', type=str, default='preproc_log.json', help='Filename for logging') parser.add_argument('--n-speakers', type=int, default=1) @@ -88,36 +85,6 @@ def parse_args(parser): return parser -def parse_textgrid(tier, sampling_rate, hop_length): - # From Dan Wells - # Latest MFA replaces silence phones with "" in output TextGrids - sil_phones = ['sil', 'sp', 'spn', ''] - start_time = tier[0].start_time - end_time = tier[-1].end_time - phones = [] - durations = [] - for index, label in enumerate(tier._objects): - p_start, p_end, phone = label.start_time, label.end_time, label.text - if phone not in sil_phones: - phones.append(phone) - else: - if (index == 0) or (index == len(tier) - 1): - # leading or trailing silence - phones.append('sil') - else: - # short pause between words - phones.append('sp') - - durations.append(int(np.ceil(p_end * sampling_rate / hop_length) - - np.ceil(p_start * sampling_rate / hop_length))) - return phones, durations, start_time, end_time - - -def check_durations(durs, mel_len, filepath): - assert sum(durs) == mel_len, \ - f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens' - - def main(): parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing') parser = parse_args(parser) @@ -139,37 +106,32 @@ def main(): if args.extract_durations: if not args.textgrid_path: - args.textgridPath = os.path.join(args.dataset_path, 'TextGrid') - Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True) - - if args.save_alignment_priors: - Path(args.dataset_path, 'alignment_priors').mkdir(parents=False, exist_ok=True) + args.textgrid_path = os.path.join(args.dataset_path, 'TextGrid') + durs_path = Path(args.dataset_path, 'durations') + durs_path.mkdir(parents=False, exist_ok=True) + if args.durs_online_dir: + Path(args.durs_online_dir, durs_path).mkdir(parents=True, exist_ok=True) for filelist in args.wav_text_filelists: print(f'Processing {filelist}...') - dataset = TTSDataset( - args.dataset_path, - filelist, - text_cleaners=['english_cleaners_v2'], - n_mel_channels=args.n_mel_channels, - p_arpabet=0.0, - n_speakers=args.n_speakers, - load_mel_from_disk=False, - load_pitch_from_disk=False, - pitch_mean=None, - pitch_std=None, - max_wav_value=args.max_wav_value, - sampling_rate=args.sampling_rate, - filter_length=args.filter_length, - hop_length=args.hop_length, - win_length=args.win_length, - mel_fmin=args.mel_fmin, - mel_fmax=args.mel_fmax, - betabinomial_online_dir=None, - pitch_online_dir=None, - pitch_online_method=args.f0_method) + dataset = TTSDataset(args.dataset_path, filelist, + text_cleaners=['english_cleaners_v2'], + n_mel_channels=args.n_mel_channels, p_arpabet=1.0, + n_speakers=args.n_speakers, + load_mel_from_disk=False, + load_pitch_from_disk=False, pitch_mean=None, + pitch_std=None, max_wav_value=args.max_wav_value, + sampling_rate=args.sampling_rate, + filter_length=args.filter_length, + hop_length=args.hop_length, + win_length=args.win_length, mel_fmin=args.mel_fmin, + mel_fmax=args.mel_fmax, + pitch_online_dir=None, + dur_online_dir=None, + textgrid_path=args.textgrid_path, + pitch_online_method=args.f0_method) data_loader = DataLoader( dataset, @@ -213,32 +175,12 @@ def main(): if args.extract_durations: # From Dan Wells - for j, _ in range(len(mel_lens)): + for j, _ in range(len(durs)): filename = fpaths[j] - tgt_path = Path(args.textgrid_path, f'{filename}.TextGrid') - try: - textgrid = read_textgrid(tgt_path, - include_empty_intervals=True) - except FileNotFoundError: - print(f'{filename}.wav TextGrid missing: {tgt_path}') - raise - _, durs, _, _ = parse_textgrid( - textgrid.get_tier_by_name('phones'), - args.sampling_rate, - args.hop_length) - - check_durations(durs, mel_lens[j], filename) - dur_path = Path(args.dataset_path, 'durations', f'{filename}.pt') torch.save(torch.LongTensor(durs).cpu().int(), dur_path) - if args.save_alignment_priors: - for j, prior in enumerate(attn_prior): - fname = Path(fpaths[j]).with_suffix('.pt').name - fpath = Path(args.dataset_path, 'alignment_priors', fname) - torch.save(prior[:mel_lens[j], :input_lens[j]], fpath) - if __name__ == '__main__': main() diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh index e9f9b0c9f..a1baf1c3c 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh @@ -30,11 +30,12 @@ fi python prepare_dataset.py \ --wav-text-filelists ${FILELIST} \ - --n-workers 8 \ + --n-workers 2 \ --batch-size 1 \ --dataset-path $DATA_DIR \ --textgrid-path $ALIGNMENT_DIR \ --extract-pitch \ --extract-durations\ + --durs-online-dir "/tmp/" \ --f0-method pyin \ $ARGS diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 90cfb4443..fb59e9fba 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -50,7 +50,6 @@ import models from common.text import cmudict from common.utils import BenchmarkStats, prepare_tmp -from fastpitch.attn_loss_function import AttentionBinarizationLoss from fastpitch.data_function import batch_to_gpu, TTSCollate, TTSDataset from fastpitch.loss_function import FastPitchLoss from fastpitch.model import regulate_len @@ -129,7 +128,8 @@ def parse_args(parser): help='Type of text cleaners for input text') data.add_argument('--symbol-set', type=str, default='english_basic', help='Define symbol set for input text') - data.add_argument('--p-arpabet', type=float, default=0.0, + # should be 1.0 to work with MFA textgrids, which contain only phones + data.add_argument('--p-arpabet', type=float, default=1.0, help='Probability of using arpabets instead of graphemes ' 'for each word; set 0 for pure grapheme training') data.add_argument('--heteronyms-path', type=str, default='cmudict/heteronyms', @@ -514,7 +514,6 @@ def main(): model_config = models.get_model_config('FastPitch', args) model = models.get_model('FastPitch', model_config, device) - attention_kl_loss = AttentionBinarizationLoss() if args.local_rank == 0: wandb.init(project=args.project, @@ -574,8 +573,7 @@ def main(): criterion = FastPitchLoss( dur_predictor_loss_scale=args.dur_predictor_loss_scale, - pitch_predictor_loss_scale=args.pitch_predictor_loss_scale, - attn_loss_scale=args.attn_loss_scale) + pitch_predictor_loss_scale=args.pitch_predictor_loss_scale) collate_fn = TTSCollate() @@ -640,26 +638,10 @@ def main(): x, y, num_frames = batch_to_gpu(batch) with torch.cuda.amp.autocast(enabled=args.amp): - y_pred = model(x) + # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) + y_pred = model(x, use_gt_durations=True) + # y = mel_padded, input_lengths, output_lengths loss, meta = criterion(y_pred, y) - - if (args.kl_loss_start_epoch is not None - and epoch >= args.kl_loss_start_epoch): - - if args.kl_loss_start_epoch == epoch and epoch_iter == 1: - print('Begin hard_attn loss') - - _, _, _, _, _, _, _, _, attn_soft, attn_hard, _, _ = y_pred - binarization_loss = attention_kl_loss(attn_hard, attn_soft) - kl_weight = min((epoch - args.kl_loss_start_epoch) / args.kl_loss_warmup_epochs, 1.0) * args.kl_loss_weight - meta['kl_loss'] = binarization_loss.clone().detach() * kl_weight - loss += kl_weight * binarization_loss - - else: - meta['kl_loss'] = torch.zeros_like(loss) - kl_weight = 0 - binarization_loss = 0 - loss /= args.grad_accumulation meta = {k: v / args.grad_accumulation @@ -702,12 +684,12 @@ def main(): apply_multi_tensor_ema(args.ema_decay, *mt_ema_params) iter_mel_loss = iter_meta['mel_loss'].item() - iter_kl_loss = iter_meta['kl_loss'].item() iter_time = time.perf_counter() - iter_start_time epoch_frames_per_sec += iter_num_frames / iter_time epoch_loss += iter_loss epoch_num_frames += iter_num_frames epoch_mel_loss += iter_mel_loss + if epoch_iter % 5 == 0: log({ 'epoch': epoch, @@ -716,8 +698,6 @@ def main(): 'total_steps': total_iter, 'loss/loss': iter_loss, 'mel-loss/mel_loss': iter_mel_loss, - 'kl_loss': iter_kl_loss, - 'kl_weight': kl_weight, 'frames per s': iter_num_frames / iter_time, 'took': iter_time, 'lrate': optimizer.param_groups[0]['lr'], From 83958506424d0ab983b43d3e43a148dab0b5aaec Mon Sep 17 00:00:00 2001 From: evdv Date: Wed, 9 Mar 2022 19:45:50 +0000 Subject: [PATCH 06/21] Replace preprocessed text with phone sequence from MFA TextGrid --- .../FastPitch/common/text/symbols.py | 8 ++++--- .../FastPitch/common/text/text_processing.py | 18 +++------------ .../FastPitch/fastpitch/data_function.py | 23 +++++++++---------- .../FastPitch/prepare_dataset.py | 15 +++++------- .../FastPitch/scripts/prepare_dataset.sh | 4 ++-- 5 files changed, 27 insertions(+), 41 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py index cfdb5755a..7262b1284 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py +++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py @@ -9,6 +9,8 @@ # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): _arpabet = ['@' + s for s in valid_symbols] +# In phones extracted from MFA TextGrid +_silences = ['@sp', '@sil'] def get_symbols(symbol_set='english_basic'): @@ -17,20 +19,20 @@ def get_symbols(symbol_set='english_basic'): _punctuation = '!\'(),.:;? ' _special = '-' _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' - symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences elif symbol_set == 'english_basic_lowercase': _pad = '_' _punctuation = '!\'"(),.:;? ' _special = '-' _letters = 'abcdefghijklmnopqrstuvwxyz' - symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences elif symbol_set == 'english_expanded': _punctuation = '!\'",.:;? ' _math = '#%&*+-/[]()' _special = '_@©°½—₩€$' _accented = 'áçéêëñöøćž' _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' - symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + _silences else: raise Exception("{} symbol set does not exist".format(symbol_set)) diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py index e13b3ed1c..8a7e3e638 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py +++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py @@ -121,9 +121,7 @@ def get_arpabet(self, word): else: arpabet = arpabet[0] - # arpabet = "{" + arpabet + arpabet_suffix + "}" - - return arpabet + return arpabet + arpabet_suffix def encode_text(self, text, return_all=False): if self.expand_currency: @@ -150,23 +148,13 @@ def encode_text(self, text, return_all=False): text_arpabet = [[word[1]] if word[0] == '' else self.get_arpabet(word[0]).split(' ') for word in words] - text_arpabet = [phone for phone_list in text_arpabet for phone in phone_list if phone != ' '] - # text_arpabet = [ - # word[1] if word[0] == '' else ( - # self.get_arpabet(word[0]) - # if np.random.uniform() < self.p_arpabet - # else word[0]) - # for word in words] - print('ARPABET: ', text_arpabet[:10]) - #text_arpabet = ''.join(text_arpabet) + text_arpabet = [phone for phone_list in text_arpabet + for phone in phone_list if phone != ' '] text = text_arpabet elif self.handle_arpabet != '': raise Exception("{} handle_arpabet is not supported".format( self.handle_arpabet)) - # text_encoded = self.arpabet_to_sequence(text) - # text_encoded = self.text_to_sequence(text) text_encoded = self.arpabet_list_to_sequence(text) - print(len(text_encoded)) if return_all: return text_encoded, text_clean, text_arpabet diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index ab96cbc41..dcd4baa6a 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -71,7 +71,7 @@ def parse_textgrid(tier, sampling_rate, hop_length): durations.append(int(np.ceil(p_end * sampling_rate / hop_length) - np.ceil(p_start * sampling_rate / hop_length))) - print('PHONES', phones[:15]) + return phones, durations, start_time, end_time @@ -196,11 +196,10 @@ def __getitem__(self, index): speaker = None mel = self.get_mel(audiopath) - text = self.get_text(text) pitch = self.get_pitch(index, mel.size(-1)) energy = torch.norm(mel.float(), dim=0, p=2) - dur = self.get_dur(index) - print('get batch dur: ', len(dur)) + dur, phones = self.get_dur(index) + text = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones)) assert pitch.size(-1) == mel.size(-1) # No higher formants? @@ -237,7 +236,7 @@ def get_mel(self, filename): return melspec def get_text(self, text): - text = self.tp.encode_text(text) + text, text_clean, text_arpabet = self.tp.encode_text(text, return_all=True) space = [self.tp.encode_text("A A")[1]] if self.prepend_space_to_text: @@ -246,8 +245,7 @@ def get_text(self, text): if self.append_space_to_text: text = text + space - print('TEXT: ', len(text)) - return torch.LongTensor(text) + return torch.LongTensor(text), text_arpabet def get_dur(self, index): audiopath, *fields = self.audiopaths_and_text[index] @@ -267,7 +265,7 @@ def get_dur(self, index): except FileNotFoundError: print(f'{name}.wav TextGrid missing: {tgt_path}') raise - _, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'), + phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'), self.sampling_rate, self.hop_length) @@ -276,7 +274,7 @@ def get_dur(self, index): if self.dur_tmp_dir is not None and not cached_fpath.is_file(): return torch.save(durs, cached_fpath) - return durs + return durs, phones def get_pitch(self, index, mel_len=None): audiopath, *fields = self.audiopaths_and_text[index] @@ -326,7 +324,7 @@ def __call__(self, batch): # Right zero-pad all one-hot text sequences to max input length input_lengths, ids_sorted_decreasing = torch.sort( torch.LongTensor([len(x[0]) for x in batch]), - dim=0, descending=True) + dim=0, descending=False) max_input_len = input_lengths[0] text_padded = torch.LongTensor(len(batch), max_input_len) @@ -340,8 +338,9 @@ def __call__(self, batch): dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32) for i in range(len(ids_sorted_decreasing)): dur = batch[ids_sorted_decreasing[i]][6] - # ERROR some mismatch between phones in transcript vs phones form text preprocessing - print('TEXT LEN', dur_padded.shape, 'DUR LEN', len(dur)) + # With MFA durations: + # some mismatch between phones in transcript vs phones from text preprocessing + # for now using phones from texgrid as input dur_padded[i, :len(dur)] = torch.Tensor(dur) dur_lens[i] = len(dur) assert dur_lens[i] == input_lengths[i] diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index f523c9f6d..5b0a4bef7 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -144,23 +144,20 @@ def main(): drop_last=False) all_filenames = set() - print('TIME TO LOOP') for i, batch in enumerate(tqdm.tqdm(data_loader)): tik = time.time() # From TTSCollate __call__ # (text_padded, dur_padded, input_lengths, mel_padded, - # output_lengths, len_x, pitch_padded, energy_padded, speaker, - # attn_prior_padded, audiopaths) - _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch - print(f'BATCH: {fpaths}') + # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths) + _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths = batch # Ensure filenames are unique for p in fpaths: fname = Path(p).name if fname in all_filenames: raise ValueError(f'Filename is not unique: {fname}') all_filenames.add(fname) - print('filename check complete') + if args.extract_mels: for j, mel in enumerate(mels): fname = Path(fpaths[j]).with_suffix('.pt').name @@ -175,11 +172,11 @@ def main(): if args.extract_durations: # From Dan Wells - for j, _ in range(len(durs)): - filename = fpaths[j] + for j, d in enumerate(durs): + filename = Path(fpaths[j]).stem dur_path = Path(args.dataset_path, 'durations', f'{filename}.pt') - torch.save(torch.LongTensor(durs).cpu().int(), dur_path) + torch.save(d, dur_path) if __name__ == '__main__': diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh index a1baf1c3c..ae544f25d 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh @@ -30,8 +30,8 @@ fi python prepare_dataset.py \ --wav-text-filelists ${FILELIST} \ - --n-workers 2 \ - --batch-size 1 \ + --n-workers 4 \ + --batch-size 1 \ # don't change this --dataset-path $DATA_DIR \ --textgrid-path $ALIGNMENT_DIR \ --extract-pitch \ From ab11e60e8946cac5105b724cbeb245609797fbee Mon Sep 17 00:00:00 2001 From: evdv Date: Wed, 9 Mar 2022 19:50:54 +0000 Subject: [PATCH 07/21] Fix up descending keyword --- .../SpeechSynthesis/FastPitch/fastpitch/data_function.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index dcd4baa6a..74f41342a 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -266,8 +266,8 @@ def get_dur(self, index): print(f'{name}.wav TextGrid missing: {tgt_path}') raise phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'), - self.sampling_rate, - self.hop_length) + self.sampling_rate, + self.hop_length) check_durations(durs, self.get_mel(audiopath).size(1), name) @@ -324,7 +324,7 @@ def __call__(self, batch): # Right zero-pad all one-hot text sequences to max input length input_lengths, ids_sorted_decreasing = torch.sort( torch.LongTensor([len(x[0]) for x in batch]), - dim=0, descending=False) + dim=0, descending=True) max_input_len = input_lengths[0] text_padded = torch.LongTensor(len(batch), max_input_len) From 7f66533224ac757f6dc641a3a7d5b9651aa85d82 Mon Sep 17 00:00:00 2001 From: evdv Date: Thu, 10 Mar 2022 13:56:39 +0000 Subject: [PATCH 08/21] Save transcriptions from textGrids to be used as inputs (often kept separate for no good reason so far --- .../FastPitch/add_durations_lj_filelist.py | 26 +++++++++++ .../FastPitch/fastpitch/data_function.py | 46 ++++++++++++------- .../FastPitch/fastpitch/model.py | 4 +- .../FastPitch/prepare_dataset.py | 14 +++++- .../FastPitch/scripts/prepare_dataset.sh | 3 +- .../FastPitch/scripts/train.sh | 24 +++++----- PyTorch/SpeechSynthesis/FastPitch/train.py | 9 ++-- 7 files changed, 86 insertions(+), 40 deletions(-) create mode 100644 PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py diff --git a/PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py b/PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py new file mode 100644 index 000000000..56bd9e378 --- /dev/null +++ b/PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py @@ -0,0 +1,26 @@ +import os +from pathlib import Path + + +def add_duration_column(filename, output_filename): + all_info = [] + with open(filename) as f: + for line in f: + file_path, pitch_path, transcript = line.strip().split('|', maxsplit=2) + name_stem = Path(os.path.basename(file_path)).stem + # stop hard-coding which columns already exist (no mels or speakers) + all_info.append('|'.join([file_path, + pitch_path, + f'durations/{name_stem}.pt', + transcript])) + + with open(output_filename, 'w') as f: + f.writelines('\n'.join(all_info)) + + +if __name__ == '__main__': + filelists = {'filelists/ljs_audio_pitch_text_test.txt': 'filelists/ljs_audio_pitch_durs_text_test.txt', + 'filelists/ljs_audio_pitch_text_train_v3.txt': 'filelists/ljs_audio_pitch_durs_text_train_v3.txt', + 'filelists/ljs_audio_pitch_text_val.txt': 'filelists/ljs_audio_pitch_durs_text_val.txt'} + for file_name, output_name in filelists.items(): + add_duration_column(file_name, output_name) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 74f41342a..638e116fb 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -133,7 +133,7 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners, pitch_std=65.72038, max_wav_value=None, sampling_rate=None, filter_length=None, hop_length=None, win_length=None, mel_fmin=None, mel_fmax=None, prepend_space_to_text=False, - append_space_to_text=False, + append_space_to_text=False, load_durs_from_disk=False, dur_online_dir=None, textgrid_path=None, pitch_online_dir=None, pitch_online_method='pyin', **ignored): @@ -155,6 +155,7 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners, filter_length, hop_length, win_length, n_mel_channels, sampling_rate, mel_fmin, mel_fmax) self.load_pitch_from_disk = load_pitch_from_disk + self.load_durs_from_disk = load_durs_from_disk self.prepend_space_to_text = prepend_space_to_text self.append_space_to_text = append_space_to_text @@ -171,8 +172,8 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners, self.dur_tmp_dir = dur_online_dir self.f0_method = pitch_online_method - expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1)) - + expected_columns = (2 + int(load_durs_from_disk) + int(load_pitch_from_disk) + (n_speakers > 1)) + print(load_durs_from_disk, load_pitch_from_disk, expected_columns) assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None) if len(self.audiopaths_and_text[0]) < expected_columns: @@ -199,7 +200,7 @@ def __getitem__(self, index): pitch = self.get_pitch(index, mel.size(-1)) energy = torch.norm(mel.float(), dim=0, p=2) dur, phones = self.get_dur(index) - text = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones)) + text = phones assert pitch.size(-1) == mel.size(-1) # No higher formants? @@ -209,7 +210,7 @@ def __getitem__(self, index): # this is a batch # FastPitch 1.0: (text, mel, len_text, dur, pitch, speaker) return (text, mel, len(text), pitch, energy, speaker, dur, - audiopath) + audiopath, phones) def __len__(self): return len(self.audiopaths_and_text) @@ -251,13 +252,23 @@ def get_dur(self, index): audiopath, *fields = self.audiopaths_and_text[index] name = Path(audiopath).stem + # TODO: check what happens here with absolute vs relative paths path = Path(self.dataset_path, 'durations') if self.dataset_path else Path(audiopath) fname = Path(path, name).with_suffix('.pt') if self.dur_tmp_dir is not None: - cached_fpath = Path(self.dur_tmp_dir, fname) - if cached_fpath.is_file(): - return torch.load(cached_fpath) + cached_durpath = Path(self.dur_tmp_dir, fname) + cached_phonepath = Path(self.dur_tmp_dir, name + '_phones').with_suffix('.pt') + if cached_durpath.is_file(): + # assume if one exists the other does too + return torch.load(cached_durpath), torch.load(cached_phonepath) + + if self.load_durs_from_disk: + duration_path = fields[1] # assume durations come after pitch + # assume phone_path is known from duration_path + phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt') + print(duration_path, phone_path) + return torch.load(duration_path), torch.load(phone_path) tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid') try: @@ -268,11 +279,11 @@ def get_dur(self, index): phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'), self.sampling_rate, self.hop_length) - + phones = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones)) check_durations(durs, self.get_mel(audiopath).size(1), name) - if self.dur_tmp_dir is not None and not cached_fpath.is_file(): - return torch.save(durs, cached_fpath) + if self.dur_tmp_dir is not None and not cached_durpath.is_file() and not cached_phonepath.is_file(): + return torch.save(durs, cached_durpath), torch.save(phones, cached_phonepath) return durs, phones @@ -317,8 +328,7 @@ def get_pitch(self, index, mel_len=None): class TTSCollate: """Zero-pads model inputs and targets based on number of frames per step""" - # (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, - # len_x, pitch_padded, energy_padded, speaker, DUR, audiopaths) = batch + # (text, mel, len(text), pitch, energy, speaker, dur, audiopath, phones) = batch def __call__(self, batch): """Collate training batch from normalized text and mel-spec""" # Right zero-pad all one-hot text sequences to max input length @@ -361,12 +371,15 @@ def __call__(self, batch): pitch_padded = torch.zeros(mel_padded.size(0), n_formants, mel_padded.size(2), dtype=batch[0][3].dtype) energy_padded = torch.zeros_like(pitch_padded[:, 0, :]) + phones_padded = torch.zeros_like(pitch_padded[:, 0, :]) for i in range(len(ids_sorted_decreasing)): pitch = batch[ids_sorted_decreasing[i]][3] energy = batch[ids_sorted_decreasing[i]][4] + phones = batch[ids_sorted_decreasing[i]][8] pitch_padded[i, :, :pitch.shape[1]] = pitch energy_padded[i, :energy.shape[0]] = energy + phones_padded[i, :phones.shape[0]] = phones if batch[0][5] is not None: speaker = torch.zeros_like(input_lengths) @@ -382,12 +395,12 @@ def __call__(self, batch): audiopaths = [batch[i][7] for i in ids_sorted_decreasing] return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x, - pitch_padded, energy_padded, speaker, audiopaths) + pitch_padded, energy_padded, speaker, audiopaths, phones_padded) def batch_to_gpu(batch): (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x, - pitch_padded, energy_padded, speaker, dur_lens, audiopaths) = batch + pitch_padded, energy_padded, speaker, dur_lens, audiopaths, phones_padded) = batch text_padded = to_gpu(text_padded).long() durs_padded = to_gpu(durs_padded).long() @@ -397,12 +410,13 @@ def batch_to_gpu(batch): output_lengths = to_gpu(output_lengths).long() pitch_padded = to_gpu(pitch_padded).float() energy_padded = to_gpu(energy_padded).float() + phones_padded = to_gpu(phones_padded).long() if speaker is not None: speaker = to_gpu(speaker).long() # Alignments act as both inputs and targets - pass shallow copies x = [text_padded, input_lengths, mel_padded, output_lengths, - pitch_padded, energy_padded, speaker, durs_padded, audiopaths] + pitch_padded, energy_padded, speaker, durs_padded, audiopaths, phones_padded] y = [mel_padded, durs_padded, dur_lens, output_lengths] len_x = torch.sum(output_lengths) return (x, y, len_x) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py index ac0b188fe..ee2981a05 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py @@ -207,7 +207,7 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs (inputs, input_lens, mel_tgt, mel_lens, dur_tgt, pitch_dense, energy_dense, - speaker, audiopaths) = inputs + speaker, audiopaths, phones_padded) = inputs mel_max_len = mel_tgt.size(2) @@ -219,7 +219,7 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma spk_emb.mul_(self.speaker_emb_weight) # Input FFT - enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb) + enc_out, enc_mask = self.encoder(phones_padded, conditioning=spk_emb) # Predict durations log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1) diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index 5b0a4bef7..94794eebd 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -27,6 +27,7 @@ import argparse import os +import sys import time from pathlib import Path @@ -88,8 +89,10 @@ def parse_args(parser): def main(): parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing') parser = parse_args(parser) + print(sys.argv) args, unk_args = parser.parse_known_args() if len(unk_args) > 0: + print(unk_args) raise ValueError(f'Invalid options {unk_args}') DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, Path(args.dataset_path, args.log_file)), @@ -149,8 +152,8 @@ def main(): # From TTSCollate __call__ # (text_padded, dur_padded, input_lengths, mel_padded, - # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths) - _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths = batch + # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths, phones) + _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch # Ensure filenames are unique for p in fpaths: fname = Path(p).name @@ -174,9 +177,16 @@ def main(): # From Dan Wells for j, d in enumerate(durs): filename = Path(fpaths[j]).stem + # TODO remove hardcoding dataset path? dur_path = Path(args.dataset_path, 'durations', f'{filename}.pt') torch.save(d, dur_path) + for j, p in enumerate(phones): + filename = Path(fpaths[j]).stem + # save phones too + phones_path = Path(args.dataset_path, + 'durations', f'{filename}_phones.pt') + torch.save(p, phones_path) if __name__ == '__main__': diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh index ae544f25d..b69a8aac0 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh @@ -28,10 +28,11 @@ fi # mfa validate ${WAV_DIR} english english # mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR} +# don't change batch size python prepare_dataset.py \ --wav-text-filelists ${FILELIST} \ --n-workers 4 \ - --batch-size 1 \ # don't change this + --batch-size 1 \ --dataset-path $DATA_DIR \ --textgrid-path $ALIGNMENT_DIR \ --extract-pitch \ diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh index ba041a33f..1c4e816ef 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh @@ -2,13 +2,13 @@ export OMP_NUM_THREADS=1 -: ${NUM_GPUS:=8} -: ${BATCH_SIZE:=16} +: ${NUM_GPUS:=1} +: ${BATCH_SIZE:=8} : ${GRAD_ACCUMULATION:=2} -: ${OUTPUT_DIR:="./output"} +: ${OUTPUT_DIR:="./output_mfa"} : ${DATASET_PATH:=LJSpeech-1.1} -: ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_text_train_v3.txt} -: ${VAL_FILELIST:=filelists/ljs_audio_pitch_text_val.txt} +: ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_durs_text_train_v3.txt} +: ${VAL_FILELIST:=filelists/ljs_audio_pitch_durs_text_val.txt} : ${AMP:=false} : ${SEED:=""} @@ -18,7 +18,6 @@ export OMP_NUM_THREADS=1 : ${EPOCHS:=1000} : ${EPOCHS_PER_CHECKPOINT:=100} : ${WARMUP_STEPS:=1000} -: ${KL_LOSS_WARMUP:=100} # Train a mixed phoneme/grapheme model : ${PHONE:=true} @@ -28,8 +27,9 @@ export OMP_NUM_THREADS=1 # Add dummy space prefix/suffix is audio is not precisely trimmed : ${APPEND_SPACES:=false} -: ${LOAD_PITCH_FROM_DISK:=true} -: ${LOAD_MEL_FROM_DISK:=false} +: ${LOAD_PITCH_FROM_DISK:=TRUE} +: ${LOAD_DURS_FROM_DISK:=TRUE} +: ${LOAD_MEL_FROM_DISK:=FALSE} # For multispeaker models, add speaker ID = {0, 1, ...} as the last filelist column : ${NSPEAKERS:=1} @@ -60,9 +60,6 @@ ARGS+=" --grad-clip-thresh 1000.0" ARGS+=" --dur-predictor-loss-scale 0.1" ARGS+=" --pitch-predictor-loss-scale 0.1" -# Autoalign & new features -ARGS+=" --kl-loss-start-epoch 0" -ARGS+=" --kl-loss-warmup-epochs $KL_LOSS_WARMUP" ARGS+=" --text-cleaners $TEXT_CLEANERS" ARGS+=" --n-speakers $NSPEAKERS" @@ -72,8 +69,9 @@ ARGS+=" --n-speakers $NSPEAKERS" [ "$PHONE" = "true" ] && ARGS+=" --p-arpabet 1.0" [ "$ENERGY" = "true" ] && ARGS+=" --energy-conditioning" [ "$SEED" != "" ] && ARGS+=" --seed $SEED" -[ "$LOAD_MEL_FROM_DISK" = true ] && ARGS+=" --load-mel-from-disk" -[ "$LOAD_PITCH_FROM_DISK" = true ] && ARGS+=" --load-pitch-from-disk" +[ "$LOAD_MEL_FROM_DISK" = TRUE ] && ARGS+=" --load-mel-from-disk" +[ "$LOAD_DURS_FROM_DISK" = TRUE ] && ARGS+=" --load-durs-from-disk" +[ "$LOAD_PITCH_FROM_DISK" = TRUE ] && ARGS+=" --load-pitch-from-disk" [ "$PITCH_ONLINE_DIR" != "" ] && ARGS+=" --pitch-online-dir $PITCH_ONLINE_DIR" # e.g., /dev/shm/pitch [ "$PITCH_ONLINE_METHOD" != "" ] && ARGS+=" --pitch-online-method $PITCH_ONLINE_METHOD" [ "$APPEND_SPACES" = true ] && ARGS+=" --prepend-space-to-text" diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index fb59e9fba..7ec53c509 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -89,12 +89,6 @@ def parse_args(parser): help='Discounting factor for training weights EMA') train.add_argument('--grad-accumulation', type=int, default=1, help='Training steps to accumulate gradients for') - train.add_argument('--kl-loss-start-epoch', type=int, default=250, - help='Start adding the hard attention loss term') - train.add_argument('--kl-loss-warmup-epochs', type=int, default=100, - help='Gradually increase the hard attention loss term') - train.add_argument('--kl-loss-weight', type=float, default=1.0, - help='Gradually increase the hard attention loss term') train.add_argument('--benchmark-epochs-num', type=int, default=20, help='Number of epochs for calculating final stats') @@ -145,6 +139,8 @@ def parse_args(parser): cond.add_argument('--n-speakers', type=int, default=1, help='Number of speakers in the dataset. ' 'n_speakers > 1 enables speaker embeddings') + cond.add_argument('--load-durs-from-disk', action='store_true', + help='Use durations cached on disk with prepare_dataset.py') cond.add_argument('--load-pitch-from-disk', action='store_true', help='Use pitch cached on disk with prepare_dataset.py') cond.add_argument('--pitch-online-method', default='pyin', @@ -580,6 +576,7 @@ def main(): if args.local_rank == 0: prepare_tmp(args.pitch_online_dir) + print(args) trainset = TTSDataset(audiopaths_and_text=args.training_files, **vars(args)) valset = TTSDataset(audiopaths_and_text=args.validation_files, **vars(args)) From d2bdd343d4ceb555eb28ddbca601222743c38982 Mon Sep 17 00:00:00 2001 From: evdv Date: Fri, 11 Mar 2022 13:53:20 +0000 Subject: [PATCH 09/21] Actually save phones correctly --- .../FastPitch/fastpitch/data_function.py | 41 +++++++++++++++---- .../FastPitch/prepare_dataset.py | 12 ++++-- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 638e116fb..f07ec8f77 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -201,6 +201,7 @@ def __getitem__(self, index): energy = torch.norm(mel.float(), dim=0, p=2) dur, phones = self.get_dur(index) text = phones + print('MOAR TEXT LEN: ', len(text)) assert pitch.size(-1) == mel.size(-1) # No higher formants? @@ -249,6 +250,7 @@ def get_text(self, text): return torch.LongTensor(text), text_arpabet def get_dur(self, index): + print('GET DUR') audiopath, *fields = self.audiopaths_and_text[index] name = Path(audiopath).stem @@ -267,8 +269,11 @@ def get_dur(self, index): duration_path = fields[1] # assume durations come after pitch # assume phone_path is known from duration_path phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt') - print(duration_path, phone_path) - return torch.load(duration_path), torch.load(phone_path) + a = torch.load(duration_path) + b = torch.load(phone_path) + print('PHONES', phone_path, b[:10]) + print('LOADING LENS: ', len(a), len(b)) + return a, b tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid') try: @@ -279,12 +284,15 @@ def get_dur(self, index): phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'), self.sampling_rate, self.hop_length) - phones = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones)) + phones = torch.Tensor(self.tp.arpabet_list_to_sequence(phones)) check_durations(durs, self.get_mel(audiopath).size(1), name) + durs = torch.Tensor(durs) if self.dur_tmp_dir is not None and not cached_durpath.is_file() and not cached_phonepath.is_file(): + print('HOWMANYPHONES', len(phones)) + print('cached_phonepath: ', cached_phonepath) return torch.save(durs, cached_durpath), torch.save(phones, cached_phonepath) - + print('HOWMANYPHONES', len(phones)) return durs, phones def get_pitch(self, index, mel_len=None): @@ -329,18 +337,29 @@ def get_pitch(self, index, mel_len=None): class TTSCollate: """Zero-pads model inputs and targets based on number of frames per step""" # (text, mel, len(text), pitch, energy, speaker, dur, audiopath, phones) = batch + # 0: text + # 1: mel + # 2: len_text + # 3: pitch + # 4: energy + # 5: speaker + # 6: dur + # 7: audiopath + # 8: phones def __call__(self, batch): """Collate training batch from normalized text and mel-spec""" # Right zero-pad all one-hot text sequences to max input length input_lengths, ids_sorted_decreasing = torch.sort( torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True) + max_input_len = input_lengths[0] text_padded = torch.LongTensor(len(batch), max_input_len) text_padded.zero_() for i in range(len(ids_sorted_decreasing)): text = batch[ids_sorted_decreasing[i]][0] + print('LEN TEXT AS WE SAVE: ', text.size(0)) text_padded[i, :text.size(0)] = text dur_padded = torch.zeros_like(text_padded, dtype=torch.int32) @@ -351,9 +370,12 @@ def __call__(self, batch): # With MFA durations: # some mismatch between phones in transcript vs phones from text preprocessing # for now using phones from texgrid as input - dur_padded[i, :len(dur)] = torch.Tensor(dur) + # PREP DATASET: DUR = LIST, TRAIN: DUR = TENSOR + dur_padded[i, :len(dur)] = dur dur_lens[i] = len(dur) + print('LENS: ', dur_lens[i], input_lengths[i]) assert dur_lens[i] == input_lengths[i] + # Right zero-pad mel-spec num_mels = batch[0][1].size(0) max_target_len = max([x[1].size(1) for x in batch]) @@ -371,16 +393,21 @@ def __call__(self, batch): pitch_padded = torch.zeros(mel_padded.size(0), n_formants, mel_padded.size(2), dtype=batch[0][3].dtype) energy_padded = torch.zeros_like(pitch_padded[:, 0, :]) - phones_padded = torch.zeros_like(pitch_padded[:, 0, :]) - + phones_padded = torch.zeros_like(text_padded, dtype=int) + print('PHONES PAD SETUP: ', phones_padded.shape) for i in range(len(ids_sorted_decreasing)): pitch = batch[ids_sorted_decreasing[i]][3] energy = batch[ids_sorted_decreasing[i]][4] phones = batch[ids_sorted_decreasing[i]][8] + print('BATCH OF PHONES: ', phones.shape[0]) pitch_padded[i, :, :pitch.shape[1]] = pitch energy_padded[i, :energy.shape[0]] = energy + print('ADD TO PAD', i, phones.shape) phones_padded[i, :phones.shape[0]] = phones + print('PHONES PADDED SHAPE: ', phones_padded.shape) + print('ENERGY PADDED SHAPE: ', energy_padded.shape) + if batch[0][5] is not None: speaker = torch.zeros_like(input_lengths) for i in range(len(ids_sorted_decreasing)): diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index 94794eebd..f0bf0728d 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -150,10 +150,13 @@ def main(): for i, batch in enumerate(tqdm.tqdm(data_loader)): tik = time.time() - # From TTSCollate __call__ - # (text_padded, dur_padded, input_lengths, mel_padded, - # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths, phones) - _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch + # DATASET GETITEM + # (text, mel, len(text), pitch, energy, speaker, dur, audiopath, phones) + # TTSCOLLATE CALL + # (text_padded, dur_padded, input_lengths, mel_padded, + # output_lengths, len_x, pitch_padded, energy_padded, speaker, + # audiopaths, phones_padded) + text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch # Ensure filenames are unique for p in fpaths: fname = Path(p).name @@ -182,6 +185,7 @@ def main(): 'durations', f'{filename}.pt') torch.save(d, dur_path) for j, p in enumerate(phones): + print('LEN PHONES BEFORE SAVING: ', len(p)) filename = Path(fpaths[j]).stem # save phones too phones_path = Path(args.dataset_path, From a95012b8053c864af2c9e94897164774385035ac Mon Sep 17 00:00:00 2001 From: evdv Date: Fri, 11 Mar 2022 20:04:41 +0000 Subject: [PATCH 10/21] Get training to work with saved MFA durs --- .../FastPitch/fastpitch/data_function.py | 28 +++++-------------- .../FastPitch/fastpitch/loss_function.py | 5 ++-- .../FastPitch/fastpitch/model.py | 12 ++++++-- .../FastPitch/prepare_dataset.py | 3 +- PyTorch/SpeechSynthesis/FastPitch/train.py | 3 +- 5 files changed, 22 insertions(+), 29 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index f07ec8f77..58fe19c30 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -173,7 +173,6 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners, self.f0_method = pitch_online_method expected_columns = (2 + int(load_durs_from_disk) + int(load_pitch_from_disk) + (n_speakers > 1)) - print(load_durs_from_disk, load_pitch_from_disk, expected_columns) assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None) if len(self.audiopaths_and_text[0]) < expected_columns: @@ -201,7 +200,6 @@ def __getitem__(self, index): energy = torch.norm(mel.float(), dim=0, p=2) dur, phones = self.get_dur(index) text = phones - print('MOAR TEXT LEN: ', len(text)) assert pitch.size(-1) == mel.size(-1) # No higher formants? @@ -250,7 +248,6 @@ def get_text(self, text): return torch.LongTensor(text), text_arpabet def get_dur(self, index): - print('GET DUR') audiopath, *fields = self.audiopaths_and_text[index] name = Path(audiopath).stem @@ -269,11 +266,7 @@ def get_dur(self, index): duration_path = fields[1] # assume durations come after pitch # assume phone_path is known from duration_path phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt') - a = torch.load(duration_path) - b = torch.load(phone_path) - print('PHONES', phone_path, b[:10]) - print('LOADING LENS: ', len(a), len(b)) - return a, b + return torch.load(duration_path), torch.load(phone_path) tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid') try: @@ -289,10 +282,8 @@ def get_dur(self, index): durs = torch.Tensor(durs) if self.dur_tmp_dir is not None and not cached_durpath.is_file() and not cached_phonepath.is_file(): - print('HOWMANYPHONES', len(phones)) - print('cached_phonepath: ', cached_phonepath) return torch.save(durs, cached_durpath), torch.save(phones, cached_phonepath) - print('HOWMANYPHONES', len(phones)) + return durs, phones def get_pitch(self, index, mel_len=None): @@ -304,8 +295,11 @@ def get_pitch(self, index, mel_len=None): spk = 0 if self.load_pitch_from_disk: + print('WE RE LOADING PITCH') pitchpath = fields[0] + print('PATH: ', pitchpath) pitch = torch.load(pitchpath) + print('AND ITS SIZE: ', pitch.shape) if self.pitch_mean is not None: assert self.pitch_std is not None pitch = normalize_pitch(pitch, self.pitch_mean, self.pitch_std) @@ -359,7 +353,6 @@ def __call__(self, batch): text_padded.zero_() for i in range(len(ids_sorted_decreasing)): text = batch[ids_sorted_decreasing[i]][0] - print('LEN TEXT AS WE SAVE: ', text.size(0)) text_padded[i, :text.size(0)] = text dur_padded = torch.zeros_like(text_padded, dtype=torch.int32) @@ -373,7 +366,6 @@ def __call__(self, batch): # PREP DATASET: DUR = LIST, TRAIN: DUR = TENSOR dur_padded[i, :len(dur)] = dur dur_lens[i] = len(dur) - print('LENS: ', dur_lens[i], input_lengths[i]) assert dur_lens[i] == input_lengths[i] # Right zero-pad mel-spec @@ -394,20 +386,14 @@ def __call__(self, batch): mel_padded.size(2), dtype=batch[0][3].dtype) energy_padded = torch.zeros_like(pitch_padded[:, 0, :]) phones_padded = torch.zeros_like(text_padded, dtype=int) - print('PHONES PAD SETUP: ', phones_padded.shape) for i in range(len(ids_sorted_decreasing)): pitch = batch[ids_sorted_decreasing[i]][3] energy = batch[ids_sorted_decreasing[i]][4] phones = batch[ids_sorted_decreasing[i]][8] - print('BATCH OF PHONES: ', phones.shape[0]) pitch_padded[i, :, :pitch.shape[1]] = pitch energy_padded[i, :energy.shape[0]] = energy - print('ADD TO PAD', i, phones.shape) phones_padded[i, :phones.shape[0]] = phones - print('PHONES PADDED SHAPE: ', phones_padded.shape) - print('ENERGY PADDED SHAPE: ', energy_padded.shape) - if batch[0][5] is not None: speaker = torch.zeros_like(input_lengths) for i in range(len(ids_sorted_decreasing)): @@ -422,12 +408,12 @@ def __call__(self, batch): audiopaths = [batch[i][7] for i in ids_sorted_decreasing] return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x, - pitch_padded, energy_padded, speaker, audiopaths, phones_padded) + pitch_padded, energy_padded, dur_lens, speaker, audiopaths, phones_padded) def batch_to_gpu(batch): (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x, - pitch_padded, energy_padded, speaker, dur_lens, audiopaths, phones_padded) = batch + pitch_padded, energy_padded, dur_lens, speaker, audiopaths, phones_padded) = batch text_padded = to_gpu(text_padded).long() durs_padded = to_gpu(durs_padded).long() diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py index 5b789a9a2..00bcbfd71 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py @@ -46,7 +46,8 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'): energy_pred, energy_tgt) = model_out # model_out = (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) #(mel_tgt, in_lens, out_lens) = targets - mel_tgt, dur_tgt, dur_lens, pitch_tgt = targets + # mel_padded, durs_padded, dur_lens, + mel_tgt, dur_tgt, dur_lens, output_lengths = targets #dur_lens = in_lens mel_tgt.requires_grad = False @@ -65,7 +66,7 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'): loss_fn = F.mse_loss mel_loss = loss_fn(mel_out, mel_tgt, reduction='none') mel_loss = (mel_loss * mel_mask).sum() / mel_mask.sum() - + print('SHAPES IN LOSS FUNCTION: ', pitch_tgt.shape, pitch_pred.shape) ldiff = pitch_tgt.size(2) - pitch_pred.size(2) pitch_pred = F.pad(pitch_pred, (0, ldiff, 0, 0, 0, 0), value=0.0) pitch_loss = F.mse_loss(pitch_tgt, pitch_pred, reduction='none') diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py index ee2981a05..636451b45 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py @@ -206,9 +206,14 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx, def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, max_duration=75): # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs - (inputs, input_lens, mel_tgt, mel_lens, dur_tgt, pitch_dense, energy_dense, - speaker, audiopaths, phones_padded) = inputs - + print('NUMBER OF INPUTS', len(inputs)) + # text_padded, input_lengths, mel_padded, output_lengths, + # pitch_padded, energy_padded, speaker, durs_padded, audiopaths, \ + # phones_padded + (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense, + speaker, dur_tgt, audiopaths, phones_padded) = inputs + # text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, \ + # phones = batch mel_max_len = mel_tgt.size(2) # Calculate speaker embedding @@ -230,6 +235,7 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma # Average pitch over characters pitch_tgt = average_pitch(pitch_dense, dur_tgt) + print('DENSE PITCH SHAPE: ', pitch_tgt.shape, pitch_pred.shape) if use_gt_pitch and pitch_tgt is not None: pitch_emb = self.pitch_emb(pitch_tgt) diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index f0bf0728d..f918ef74d 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -89,7 +89,6 @@ def parse_args(parser): def main(): parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing') parser = parse_args(parser) - print(sys.argv) args, unk_args = parser.parse_known_args() if len(unk_args) > 0: print(unk_args) @@ -172,6 +171,7 @@ def main(): if args.extract_pitch: for j, p in enumerate(pitch): + print('SIZE OF P: ', len(p), type(p)) fname = Path(fpaths[j]).with_suffix('.pt').name fpath = Path(args.dataset_path, 'pitch', fname) torch.save(p[:mel_lens[j]], fpath) @@ -185,7 +185,6 @@ def main(): 'durations', f'{filename}.pt') torch.save(d, dur_path) for j, p in enumerate(phones): - print('LEN PHONES BEFORE SAVING: ', len(p)) filename = Path(fpaths[j]).stem # save phones too phones_path = Path(args.dataset_path, diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 7ec53c509..813c241dd 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -576,7 +576,6 @@ def main(): if args.local_rank == 0: prepare_tmp(args.pitch_online_dir) - print(args) trainset = TTSDataset(audiopaths_and_text=args.training_files, **vars(args)) valset = TTSDataset(audiopaths_and_text=args.validation_files, **vars(args)) @@ -633,10 +632,12 @@ def main(): model.zero_grad(set_to_none=True) x, y, num_frames = batch_to_gpu(batch) + print('NUMBER OF X: ', len(x)) with torch.cuda.amp.autocast(enabled=args.amp): # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) y_pred = model(x, use_gt_durations=True) + print('LEN Y: ', len(y_pred), len(y)) # y = mel_padded, input_lengths, output_lengths loss, meta = criterion(y_pred, y) loss /= args.grad_accumulation From 2ade2cd2ce1874f5f7d13a6b4c087cceb269a89b Mon Sep 17 00:00:00 2001 From: evdv Date: Fri, 11 Mar 2022 20:16:43 +0000 Subject: [PATCH 11/21] Remove print statements --- .../SpeechSynthesis/FastPitch/fastpitch/data_function.py | 3 --- .../SpeechSynthesis/FastPitch/fastpitch/loss_function.py | 1 - PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py | 8 +------- PyTorch/SpeechSynthesis/FastPitch/train.py | 2 -- 4 files changed, 1 insertion(+), 13 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 58fe19c30..2f9047993 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -295,11 +295,8 @@ def get_pitch(self, index, mel_len=None): spk = 0 if self.load_pitch_from_disk: - print('WE RE LOADING PITCH') pitchpath = fields[0] - print('PATH: ', pitchpath) pitch = torch.load(pitchpath) - print('AND ITS SIZE: ', pitch.shape) if self.pitch_mean is not None: assert self.pitch_std is not None pitch = normalize_pitch(pitch, self.pitch_mean, self.pitch_std) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py index 00bcbfd71..53d0439db 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py @@ -66,7 +66,6 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'): loss_fn = F.mse_loss mel_loss = loss_fn(mel_out, mel_tgt, reduction='none') mel_loss = (mel_loss * mel_mask).sum() / mel_mask.sum() - print('SHAPES IN LOSS FUNCTION: ', pitch_tgt.shape, pitch_pred.shape) ldiff = pitch_tgt.size(2) - pitch_pred.size(2) pitch_pred = F.pad(pitch_pred, (0, ldiff, 0, 0, 0, 0), value=0.0) pitch_loss = F.mse_loss(pitch_tgt, pitch_pred, reduction='none') diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py index 636451b45..29ce2a42a 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py @@ -206,14 +206,9 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx, def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, max_duration=75): # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs - print('NUMBER OF INPUTS', len(inputs)) - # text_padded, input_lengths, mel_padded, output_lengths, - # pitch_padded, energy_padded, speaker, durs_padded, audiopaths, \ - # phones_padded (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense, speaker, dur_tgt, audiopaths, phones_padded) = inputs - # text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, \ - # phones = batch + mel_max_len = mel_tgt.size(2) # Calculate speaker embedding @@ -235,7 +230,6 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma # Average pitch over characters pitch_tgt = average_pitch(pitch_dense, dur_tgt) - print('DENSE PITCH SHAPE: ', pitch_tgt.shape, pitch_pred.shape) if use_gt_pitch and pitch_tgt is not None: pitch_emb = self.pitch_emb(pitch_tgt) diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 813c241dd..3ddd1ceb8 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -632,12 +632,10 @@ def main(): model.zero_grad(set_to_none=True) x, y, num_frames = batch_to_gpu(batch) - print('NUMBER OF X: ', len(x)) with torch.cuda.amp.autocast(enabled=args.amp): # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) y_pred = model(x, use_gt_durations=True) - print('LEN Y: ', len(y_pred), len(y)) # y = mel_padded, input_lengths, output_lengths loss, meta = criterion(y_pred, y) loss /= args.grad_accumulation From 13f871066bfd84bded38a734c503ab51316817d2 Mon Sep 17 00:00:00 2001 From: evdv Date: Fri, 11 Mar 2022 20:43:26 +0000 Subject: [PATCH 12/21] Fix up setup steps --- PyTorch/SpeechSynthesis/FastPitch/install.sh | 2 ++ .../SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/install.sh b/PyTorch/SpeechSynthesis/FastPitch/install.sh index b788fdcc3..2e6b138b7 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/install.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/install.sh @@ -42,6 +42,7 @@ conda uninstall pytorch ## Then we reinstall and this for some reason downgrades the gcc to 7 and then installing apex works/ conda install pytorch torchvision cudatoolkit=10.2 -c pytorch +conda install -c conda-forge montreal-forced-aligner ## Apex cd /disk/scratch1/${USER}/FastPitches/PyTorch/SpeechSynthesis/FastPitch/ @@ -58,6 +59,7 @@ pip install wandb pip install llvmlite==0.35.0 ## Ignore warning around here pip install numba==0.49.1 +pip install tgt ## for logging ## if needed, create a free account here: https://app.wandb.ai/login?signup=true diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh index b69a8aac0..9b2b3925b 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh @@ -23,10 +23,10 @@ then python ./create_lab_files.py --dataset ${WAV_DIR} --filelist ${FILELIST} --n-speakers ${NSPEAKERS} fi -# mfa model download acoustic english -# mfa model download dictionary english -# mfa validate ${WAV_DIR} english english -# mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR} +mfa model download acoustic english +mfa model download dictionary english +mfa validate ${WAV_DIR} english english +mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR} # don't change batch size python prepare_dataset.py \ From 93f83423f44d0c4ac4cd0af5dd58482c1c2bcc32 Mon Sep 17 00:00:00 2001 From: evdv Date: Sun, 13 Mar 2022 11:13:28 +0000 Subject: [PATCH 13/21] MFA training fixups --- .../SpeechSynthesis/FastPitch/fastpitch/data_function.py | 2 +- PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py | 3 +-- .../SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh | 8 ++++---- PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh | 7 +++++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 2f9047993..fcbf7cb05 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -268,7 +268,7 @@ def get_dur(self, index): phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt') return torch.load(duration_path), torch.load(phone_path) - tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid') + tgt_path = Path(self.textgrid_path, f'{name}.TextGrid') try: textgrid = read_textgrid(tgt_path, include_empty_intervals=True) except FileNotFoundError: diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py index f918ef74d..cb17fcd14 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py +++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py @@ -155,7 +155,7 @@ def main(): # (text_padded, dur_padded, input_lengths, mel_padded, # output_lengths, len_x, pitch_padded, energy_padded, speaker, # audiopaths, phones_padded) - text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch + text, durs, input_lens, mels, mel_lens, _, pitch, _, _, _, fpaths, phones = batch # Ensure filenames are unique for p in fpaths: fname = Path(p).name @@ -171,7 +171,6 @@ def main(): if args.extract_pitch: for j, p in enumerate(pitch): - print('SIZE OF P: ', len(p), type(p)) fname = Path(fpaths[j]).with_suffix('.pt').name fpath = Path(args.dataset_path, 'pitch', fname) torch.save(p[:mel_lens[j]], fpath) diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh index 9b2b3925b..c7f9f5846 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh @@ -23,10 +23,10 @@ then python ./create_lab_files.py --dataset ${WAV_DIR} --filelist ${FILELIST} --n-speakers ${NSPEAKERS} fi -mfa model download acoustic english -mfa model download dictionary english -mfa validate ${WAV_DIR} english english -mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR} +#mfa model download acoustic english --temp_directory /disk/scratch1/evdv/tmp/MFA +#mfa model download dictionary english --temp_directory /disk/scratch1/evdv/tmp/MFA +#mfa validate ${WAV_DIR} english english --temp_directory /disk/scratch1/evdv/tmp/MFA +#mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR} --temp_directory /disk/scratch1/evdv/tmp/MFA # don't change batch size python prepare_dataset.py \ diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh index 1c4e816ef..6a62ab5d7 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh @@ -1,9 +1,12 @@ #!/usr/bin/env bash export OMP_NUM_THREADS=1 +export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/ +export WANDB_SHOW_RUN=true +export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb -: ${NUM_GPUS:=1} -: ${BATCH_SIZE:=8} +: ${NUM_GPUS:=2} +: ${BATCH_SIZE:=16} : ${GRAD_ACCUMULATION:=2} : ${OUTPUT_DIR:="./output_mfa"} : ${DATASET_PATH:=LJSpeech-1.1} From c2bafd5b2c0f4195bdbb37d2ec58f2873a103d22 Mon Sep 17 00:00:00 2001 From: evdv Date: Sun, 13 Mar 2022 11:27:08 +0000 Subject: [PATCH 14/21] Update keys used for spectrogram plotting during validation --- PyTorch/SpeechSynthesis/FastPitch/train.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 3ddd1ceb8..97cf740bf 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -362,20 +362,20 @@ def plot_batch_mels(pred_tgt_lists, rank): def log_validation_batch(x, y_pred, rank): + # x = [text_padded, input_lengths, mel_padded, output_lengths, + # pitch_padded, energy_padded, speaker, durs_padded, audiopaths, phones_padded] + # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred x_fields = ['text_padded', 'input_lengths', 'mel_padded', 'output_lengths', 'pitch_padded', 'energy_padded', - 'speaker', 'attn_prior', 'audiopaths'] - y_pred_fields = ['mel_out', 'dec_mask', 'dur_pred', 'log_dur_pred', - 'pitch_pred', 'pitch_tgt', 'energy_pred', - 'energy_tgt', 'attn_soft', 'attn_hard', - 'attn_hard_dur', 'attn_logprob'] + 'speaker', 'durs_padded', 'audiopaths', 'phones_padded'] + y_pred_fields = ['mel_out', 'dec_mask', 'dur_pred', 'pitch_pred', 'energy_pred'] validation_dict = dict(zip(x_fields + y_pred_fields, list(x) + list(y_pred))) log(validation_dict, rank) # something in here returns a warning - pred_specs_keys = ['mel_out', 'pitch_pred', 'energy_pred', 'attn_hard_dur'] - tgt_specs_keys = ['mel_padded', 'pitch_tgt', 'energy_tgt', 'attn_hard_dur'] + pred_specs_keys = ['mel_out', 'pitch_pred', 'energy_pred', 'durs_padded'] + tgt_specs_keys = ['mel_padded', 'pitch_padded', 'energy_padded', 'durs_padded'] plot_batch_mels([[validation_dict[key] for key in pred_specs_keys], [validation_dict[key] for key in tgt_specs_keys]], rank) @@ -396,7 +396,12 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run, val_meta = defaultdict(float) val_num_frames = 0 for i, batch in enumerate(val_loader): + # x = [text_padded, input_lengths, mel_padded, output_lengths, + # pitch_padded, energy_padded, speaker, durs_padded, audiopaths, phones_padded] + # y = [mel_padded, durs_padded, dur_lens, output_lengths] + # len_x = torch.sum(output_lengths) x, y, num_frames = batch_to_gpu(batch) + # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred y_pred = model(x) if i % 5 == 0: From 1755903717fa6fda46e2e45f013c35baff79f679 Mon Sep 17 00:00:00 2001 From: evdv Date: Mon, 14 Mar 2022 14:46:29 +0000 Subject: [PATCH 15/21] Fix spectrogram plotting --- .../FastPitch/fastpitch/loss_function.py | 6 +--- .../FastPitch/fastpitch/model.py | 1 + .../FastPitch/scripts/train.sh | 8 ++--- PyTorch/SpeechSynthesis/FastPitch/train.py | 31 +++++++++++++------ 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py index 53d0439db..dc2361cbe 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py @@ -42,13 +42,9 @@ def __init__(self, dur_predictor_loss_scale=1.0, self.energy_predictor_loss_scale = energy_predictor_loss_scale def forward(self, model_out, targets, is_training=True, meta_agg='mean'): - (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, - energy_pred, energy_tgt) = model_out + (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) = model_out # model_out = (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) - #(mel_tgt, in_lens, out_lens) = targets - # mel_padded, durs_padded, dur_lens, mel_tgt, dur_tgt, dur_lens, output_lengths = targets - #dur_lens = in_lens mel_tgt.requires_grad = False # (B,H,T) => (B,T,H) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py index 29ce2a42a..0f883e7c5 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py @@ -54,6 +54,7 @@ def regulate_len(durations, enc_out, pace: float = 1.0, mult = ((reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_)) mult = mult.to(dtype) + print('THESE SHAPES WILL BE MATMULLED: ', mult.shape, enc_out.shape) enc_rep = torch.matmul(mult, enc_out) if mel_max_len is not None: diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh index 6a62ab5d7..0088bc1f4 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh @@ -1,12 +1,12 @@ #!/usr/bin/env bash export OMP_NUM_THREADS=1 -export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/ +#export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/ export WANDB_SHOW_RUN=true -export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb +#export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb -: ${NUM_GPUS:=2} -: ${BATCH_SIZE:=16} +: ${NUM_GPUS:=1} +: ${BATCH_SIZE:=2} : ${GRAD_ACCUMULATION:=2} : ${OUTPUT_DIR:="./output_mfa"} : ${DATASET_PATH:=LJSpeech-1.1} diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 97cf740bf..59b6e3815 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -332,19 +332,27 @@ def plot_batch_mels(pred_tgt_lists, rank): regulated_features = [] # prediction: mel, pitch, energy # target: mel, pitch, energy - for mel_pitch_energy in pred_tgt_lists: + for i, mel_pitch_energy in enumerate(pred_tgt_lists): + if i == 0: + print('PREDICTION') + elif i == 1: + print('TARGET') mels = mel_pitch_energy[0] if mels.size(dim=2) == 80: # tgt and pred mel have diff dimension order mels = mels.permute(0, 2, 1) - mel_lens = mel_pitch_energy[-1] + mel_lens = mel_pitch_energy[-1].squeeze() + pitch = mel_pitch_energy[1].squeeze().unsqueeze(dim=-1) + energy = mel_pitch_energy[2].squeeze().unsqueeze(dim=-1) # reverse regulation for plotting: for every mel frame get pitch+energy - new_pitch = regulate_len(mel_lens, - mel_pitch_energy[1].permute(0, 2, 1))[0] - new_energy = regulate_len(mel_lens, - mel_pitch_energy[2].unsqueeze(dim=-1))[0] + if i == 0: + energy = regulate_len(mel_lens, energy)[0] + pitch = regulate_len(mel_lens, pitch)[0] + + print('PITCH: ', pitch.shape) + print('ENERGY', energy.shape) regulated_features.append([mels, - new_pitch.squeeze(axis=2), - new_energy.squeeze(axis=2)]) + pitch.squeeze(axis=2), + energy.squeeze(axis=2)]) batch_sizes = [feature.size(dim=0) for pred_tgt in regulated_features @@ -404,8 +412,8 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run, # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred y_pred = model(x) - if i % 5 == 0: - log_validation_batch(x, y_pred, rank) + #if i % 5 == 0: + log_validation_batch(x, y_pred, rank) loss, meta = criterion(y_pred, y, is_training=False, meta_agg='sum') @@ -709,6 +717,9 @@ def main(): iter_num_frames = 0 iter_meta = {} iter_start_time = time.perf_counter() + # for debugging only + # validate(model, criterion, valset, args.batch_size, collate_fn, + # distributed_run, batch_to_gpu, args.local_rank) # Finished epoch epoch_loss /= epoch_iter From d0fa41cdd1b01b4fcfd3b82555c055adb004941d Mon Sep 17 00:00:00 2001 From: evdv Date: Mon, 14 Mar 2022 17:00:53 +0000 Subject: [PATCH 16/21] Add pitch, energy, and duration losses --- PyTorch/SpeechSynthesis/FastPitch/train.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 59b6e3815..faf7a4382 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -434,6 +434,9 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run, log({ 'loss/validation-loss': val_meta['loss'].item(), 'mel-loss/validation-mel-loss': val_meta['mel_loss'].item(), + 'pitch-loss/validation-pitch-loss': val_meta['pitch_loss'].item(), + 'energy-loss/validation-energy-loss': val_meta['energy_loss'].item(), + 'dur-loss/validation-dur-error': val_meta['duration_predictor_loss'].item(), 'validation-frames per s': num_frames.item() / val_meta['took'], 'validation-took': val_meta['took'], }, rank) @@ -617,6 +620,9 @@ def main(): epoch_loss = 0.0 epoch_mel_loss = 0.0 + epoch_pitch_loss = 0.0 + epoch_energy_loss = 0.0 + epoch_dur_loss = 0.0 epoch_num_frames = 0 epoch_frames_per_sec = 0.0 @@ -693,11 +699,17 @@ def main(): apply_multi_tensor_ema(args.ema_decay, *mt_ema_params) iter_mel_loss = iter_meta['mel_loss'].item() + iter_pitch_loss = iter_meta['pitch_loss'].item() + iter_energy_loss = iter_meta['energy_loss'].item() + iter_dur_loss = iter_meta['duration_predictor_loss'].item() iter_time = time.perf_counter() - iter_start_time epoch_frames_per_sec += iter_num_frames / iter_time epoch_loss += iter_loss epoch_num_frames += iter_num_frames epoch_mel_loss += iter_mel_loss + epoch_pitch_loss += iter_pitch_loss + epoch_energy_loss += iter_energy_loss + epoch_dur_loss += iter_dur_loss if epoch_iter % 5 == 0: log({ @@ -707,6 +719,9 @@ def main(): 'total_steps': total_iter, 'loss/loss': iter_loss, 'mel-loss/mel_loss': iter_mel_loss, + 'pitch-loss/pitch_loss': iter_pitch_loss, + 'energy-loss/energy_loss': iter_energy_loss, + 'dur-loss/dur_loss': iter_dur_loss, 'frames per s': iter_num_frames / iter_time, 'took': iter_time, 'lrate': optimizer.param_groups[0]['lr'], @@ -730,6 +745,9 @@ def main(): 'epoch': epoch, 'loss/epoch_loss': epoch_loss, 'mel-loss/epoch_mel_loss': epoch_mel_loss, + 'pitch-loss/epoch_pitch_loss': epoch_pitch_loss, + 'energy-loss/epoch_energy_loss': epoch_energy_loss, + 'dur-loss/epoch_dur_loss': epoch_dur_loss, 'epoch_frames per s': epoch_num_frames / epoch_time, 'epoch_took': epoch_time, }, args.local_rank) From 20a375cb617aa0e9412688da473b1c03e49ba804 Mon Sep 17 00:00:00 2001 From: evdv Date: Wed, 16 Mar 2022 15:29:13 +0000 Subject: [PATCH 17/21] Still figuring out the issue --- .../FastPitch/fastpitch/model.py | 1 - ...ini_ljs_audio_pitch_durs_text_train_v3.txt | 20 +++++++++++++++++++ .../mini_ljs_audio_pitch_durs_text_val.txt | 16 +++++++++++++++ .../FastPitch/scripts/train.sh | 9 ++++----- PyTorch/SpeechSynthesis/FastPitch/train.py | 6 ------ 5 files changed, 40 insertions(+), 12 deletions(-) create mode 100644 PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt create mode 100644 PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py index 0f883e7c5..29ce2a42a 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py @@ -54,7 +54,6 @@ def regulate_len(durations, enc_out, pace: float = 1.0, mult = ((reps_cumsum[:, :, :-1] <= range_) & (reps_cumsum[:, :, 1:] > range_)) mult = mult.to(dtype) - print('THESE SHAPES WILL BE MATMULLED: ', mult.shape, enc_out.shape) enc_rep = torch.matmul(mult, enc_out) if mel_max_len is not None: diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt new file mode 100644 index 000000000..db6e29b47 --- /dev/null +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt @@ -0,0 +1,20 @@ +wavs/LJ050-0234.wav|pitch/LJ050-0234.pt|durations/LJ050-0234.pt|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels. +wavs/LJ019-0373.wav|pitch/LJ019-0373.pt|durations/LJ019-0373.pt|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors. +wavs/LJ050-0207.wav|pitch/LJ050-0207.pt|durations/LJ050-0207.pt|Although Chief Rowley does not complain about the pay scale for Secret Service agents, +wavs/LJ048-0203.wav|pitch/LJ048-0203.pt|durations/LJ048-0203.pt|The three officers confirm that their primary concern was crowd and traffic control, +wavs/LJ003-0182.wav|pitch/LJ003-0182.pt|durations/LJ003-0182.pt|The tried and the untried, young and old, were herded together +wavs/LJ044-0166.wav|pitch/LJ044-0166.pt|durations/LJ044-0166.pt|According to Marina Oswald, he thought that would help him when he got to Cuba. +wavs/LJ019-0208.wav|pitch/LJ019-0208.pt|durations/LJ019-0208.pt|The proposal made was to purchase some fifty thousand square feet between Newgate, Warwick Lane, and the Sessions House, +wavs/LJ021-0146.wav|pitch/LJ021-0146.pt|durations/LJ021-0146.pt|I shall seek assurances of the making and maintenance of agreements, which can be mutually relied upon, +wavs/LJ013-0214.wav|pitch/LJ013-0214.pt|durations/LJ013-0214.pt|who took a carving-knife from the sideboard in the dining-room, went upstairs to Lord William's bedroom, and drew the knife across his throat. +wavs/LJ011-0256.wav|pitch/LJ011-0256.pt|durations/LJ011-0256.pt|By this time the neighbors were aroused, and several people came to the scene of the affray. +wavs/LJ014-0083.wav|pitch/LJ014-0083.pt|durations/LJ014-0083.pt|which, having possessed herself of the murdered man's keys, she rifled from end to end. +wavs/LJ035-0121.wav|pitch/LJ035-0121.pt|durations/LJ035-0121.pt|This is the period during which Oswald would have descended the stairs. In all likelihood +wavs/LJ049-0118.wav|pitch/LJ049-0118.pt|durations/LJ049-0118.pt|Enactment of this statute would mean that the investigation of any of the acts covered and of the possibility of a further attempt +wavs/LJ006-0132.wav|pitch/LJ006-0132.pt|durations/LJ006-0132.pt|All the wardsmen alike were more or less irresponsible. +wavs/LJ049-0084.wav|pitch/LJ049-0084.pt|durations/LJ049-0084.pt|Murder of the President has never been covered by Federal law, however, so that once it became reasonably clear that the killing was the act of a single person, +wavs/LJ012-0052.wav|pitch/LJ012-0052.pt|durations/LJ012-0052.pt|He claimed to be admitted to bail, and was taken from Newgate on a writ of habeas before one of the judges sitting at Westminster. +wavs/LJ011-0203.wav|pitch/LJ011-0203.pt|durations/LJ011-0203.pt|Monsieur le Maire was appealed to, and decided to leave it to the young lady, who at once abandoned Wakefield. +wavs/LJ019-0141.wav|pitch/LJ019-0141.pt|durations/LJ019-0141.pt|The old wards, day rooms and sleeping rooms combined, of which the reader has already heard so much, +wavs/LJ003-0322.wav|pitch/LJ003-0322.pt|durations/LJ003-0322.pt|except for the use of the debtors, or as medical comforts for the infirmary. +wavs/LJ027-0028.wav|pitch/LJ027-0028.pt|durations/LJ027-0028.pt|Such structures or organs are most often found internally. diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt new file mode 100644 index 000000000..eda515c7e --- /dev/null +++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt @@ -0,0 +1,16 @@ +wavs/LJ016-0288.wav|pitch/LJ016-0288.pt|durations/LJ016-0288.pt|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. +wavs/LJ028-0275.wav|pitch/LJ028-0275.pt|durations/LJ028-0275.pt|At last, in the twentieth month, +wavs/LJ019-0273.wav|pitch/LJ019-0273.pt|durations/LJ019-0273.pt|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. +wavs/LJ021-0145.wav|pitch/LJ021-0145.pt|durations/LJ021-0145.pt|From those willing to join in establishing this hoped-for period of peace, +wavs/LJ009-0076.wav|pitch/LJ009-0076.pt|durations/LJ009-0076.pt|We come to the sermon. +wavs/LJ048-0194.wav|pitch/LJ048-0194.pt|durations/LJ048-0194.pt|during the morning of November twenty-two prior to the motorcade. +wavs/LJ049-0050.wav|pitch/LJ049-0050.pt|durations/LJ049-0050.pt|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. +wavs/LJ022-0023.wav|pitch/LJ022-0023.pt|durations/LJ022-0023.pt|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. +wavs/LJ034-0053.wav|pitch/LJ034-0053.pt|durations/LJ034-0053.pt|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. +wavs/LJ035-0129.wav|pitch/LJ035-0129.pt|durations/LJ035-0129.pt|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. +wavs/LJ039-0075.wav|pitch/LJ039-0075.pt|durations/LJ039-0075.pt|once you know that you must put the crosshairs on the target and that is all that is necessary. +wavs/LJ046-0184.wav|pitch/LJ046-0184.pt|durations/LJ046-0184.pt|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. +wavs/LJ003-0111.wav|pitch/LJ003-0111.pt|durations/LJ003-0111.pt|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. +wavs/LJ037-0234.wav|pitch/LJ037-0234.pt|durations/LJ037-0234.pt|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, +wavs/LJ047-0044.wav|pitch/LJ047-0044.pt|durations/LJ047-0044.pt|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies +wavs/LJ028-0081.wav|pitch/LJ028-0081.pt|durations/LJ028-0081.pt|Years later, when the archaeologists could readily distinguish the false from the true, diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh index 0088bc1f4..819a21fb6 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh @@ -2,7 +2,6 @@ export OMP_NUM_THREADS=1 #export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/ -export WANDB_SHOW_RUN=true #export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb : ${NUM_GPUS:=1} @@ -11,16 +10,16 @@ export WANDB_SHOW_RUN=true : ${OUTPUT_DIR:="./output_mfa"} : ${DATASET_PATH:=LJSpeech-1.1} : ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_durs_text_train_v3.txt} -: ${VAL_FILELIST:=filelists/ljs_audio_pitch_durs_text_val.txt} +: ${VAL_FILELIST:=filelists/mini_ljs_audio_pitch_durs_text_val.txt} : ${AMP:=false} : ${SEED:=""} : ${LEARNING_RATE:=0.1} # Adjust these when the amount of data changes -: ${EPOCHS:=1000} -: ${EPOCHS_PER_CHECKPOINT:=100} -: ${WARMUP_STEPS:=1000} +: ${EPOCHS:=50} +: ${EPOCHS_PER_CHECKPOINT:=10} +: ${WARMUP_STEPS:=10} # Train a mixed phoneme/grapheme model : ${PHONE:=true} diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index faf7a4382..396f86fc5 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -333,10 +333,6 @@ def plot_batch_mels(pred_tgt_lists, rank): # prediction: mel, pitch, energy # target: mel, pitch, energy for i, mel_pitch_energy in enumerate(pred_tgt_lists): - if i == 0: - print('PREDICTION') - elif i == 1: - print('TARGET') mels = mel_pitch_energy[0] if mels.size(dim=2) == 80: # tgt and pred mel have diff dimension order mels = mels.permute(0, 2, 1) @@ -348,8 +344,6 @@ def plot_batch_mels(pred_tgt_lists, rank): energy = regulate_len(mel_lens, energy)[0] pitch = regulate_len(mel_lens, pitch)[0] - print('PITCH: ', pitch.shape) - print('ENERGY', energy.shape) regulated_features.append([mels, pitch.squeeze(axis=2), energy.squeeze(axis=2)]) From c71ace3367fb0824bb57e62be13a4a8fd647ccbb Mon Sep 17 00:00:00 2001 From: evdv Date: Thu, 17 Mar 2022 14:13:45 +0000 Subject: [PATCH 18/21] Fix up some settings --- .../SpeechSynthesis/FastPitch/fastpitch/data_function.py | 3 +++ PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh | 1 + PyTorch/SpeechSynthesis/FastPitch/train.py | 6 ++++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index fcbf7cb05..2d498192f 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -235,6 +235,7 @@ def get_mel(self, filename): return melspec + @lru_cache() def get_text(self, text): text, text_clean, text_arpabet = self.tp.encode_text(text, return_all=True) space = [self.tp.encode_text("A A")[1]] @@ -247,6 +248,7 @@ def get_text(self, text): return torch.LongTensor(text), text_arpabet + @lru_cache() def get_dur(self, index): audiopath, *fields = self.audiopaths_and_text[index] name = Path(audiopath).stem @@ -286,6 +288,7 @@ def get_dur(self, index): return durs, phones + @lru_cache() def get_pitch(self, index, mel_len=None): audiopath, *fields = self.audiopaths_and_text[index] diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh index 819a21fb6..db09b0edd 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh @@ -75,6 +75,7 @@ ARGS+=" --n-speakers $NSPEAKERS" [ "$LOAD_DURS_FROM_DISK" = TRUE ] && ARGS+=" --load-durs-from-disk" [ "$LOAD_PITCH_FROM_DISK" = TRUE ] && ARGS+=" --load-pitch-from-disk" [ "$PITCH_ONLINE_DIR" != "" ] && ARGS+=" --pitch-online-dir $PITCH_ONLINE_DIR" # e.g., /dev/shm/pitch +[ "$DUR_ONLINE_DIR" != "" ] && ARGS+=" --dur-online-dir $DUR_ONLINE_DIR" # e.g., /dev/shm/dur [ "$PITCH_ONLINE_METHOD" != "" ] && ARGS+=" --pitch-online-method $PITCH_ONLINE_METHOD" [ "$APPEND_SPACES" = true ] && ARGS+=" --prepend-space-to-text" [ "$APPEND_SPACES" = true ] && ARGS+=" --append-space-to-text" diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 396f86fc5..fee177209 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -148,6 +148,8 @@ def parse_args(parser): help='Calculate pitch on the fly during trainig') cond.add_argument('--pitch-online-dir', type=str, default=None, help='A directory for storing pitch calculated on-line') + cond.add_argument('--dur-online-dir', type=str, default=None, + help='A directory for storing durations calculated on-line') cond.add_argument('--pitch-mean', type=float, default=214.72203, help='Normalization value for pitch') cond.add_argument('--pitch-std', type=float, default=65.72038, @@ -406,8 +408,8 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run, # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred y_pred = model(x) - #if i % 5 == 0: - log_validation_batch(x, y_pred, rank) + if i % 5 == 0: + log_validation_batch(x, y_pred, rank) loss, meta = criterion(y_pred, y, is_training=False, meta_agg='sum') From 913ee6c463b6006c2a3ccfadab2aee9ee907d94c Mon Sep 17 00:00:00 2001 From: evdv Date: Thu, 17 Mar 2022 15:27:54 +0000 Subject: [PATCH 19/21] Train with normalised energy --- .../SpeechSynthesis/FastPitch/fastpitch/arg_parser.py | 1 + .../FastPitch/fastpitch/data_function.py | 10 +++++++++- PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py | 6 ++++-- PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh | 2 +- PyTorch/SpeechSynthesis/FastPitch/train.py | 2 ++ 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py index 4e5b13764..cb1d8a581 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py @@ -110,6 +110,7 @@ def parse_fastpitch_args(parent, add_help=False): energy_pred = parser.add_argument_group('energy predictor parameters') energy_pred.add_argument('--energy-conditioning', action='store_true') + energy_pred.add_argument('--norm_energy', action='store_true') energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int, help='Pitch predictor conv-1D kernel size') energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int, diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py index 2d498192f..001c90f22 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py @@ -130,7 +130,8 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners, cmu_dict='cmudict/cmudict-0.7b', n_speakers=1, load_mel_from_disk=True, load_pitch_from_disk=True, pitch_mean=214.72203, - pitch_std=65.72038, max_wav_value=None, sampling_rate=None, + pitch_std=65.72038, energy_mean=51.796032, energy_std=9.861213, + max_wav_value=None, sampling_rate=None, filter_length=None, hop_length=None, win_length=None, mel_fmin=None, mel_fmax=None, prepend_space_to_text=False, append_space_to_text=False, load_durs_from_disk=False, @@ -185,6 +186,8 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners, to_tensor = lambda x: torch.Tensor([x]) if type(x) is float else x self.pitch_mean = to_tensor(pitch_mean) self.pitch_std = to_tensor(pitch_std) + self.energy_mean = to_tensor(energy_mean) + self.energy_std = to_tensor(energy_std) def __getitem__(self, index): # Separate filename and text @@ -198,6 +201,11 @@ def __getitem__(self, index): mel = self.get_mel(audiopath) pitch = self.get_pitch(index, mel.size(-1)) energy = torch.norm(mel.float(), dim=0, p=2) + if self.energy_mean is not None: + assert self.energy_std is not None + norm_energy = normalize_pitch(energy.unsqueeze(dim=0), self.energy_mean, self.energy_std) + energy = norm_energy.squeeze() + dur, phones = self.get_dur(index) text = phones assert pitch.size(-1) == mel.size(-1) diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py index 29ce2a42a..b8f02300b 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py +++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py @@ -125,7 +125,7 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx, energy_predictor_kernel_size, energy_predictor_filter_size, p_energy_predictor_dropout, energy_predictor_n_layers, energy_embedding_kernel_size, - n_speakers, speaker_emb_weight, pitch_conditioning_formants=1): + n_speakers, speaker_emb_weight, pitch_conditioning_formants=1, norm_energy=True): super(FastPitch, self).__init__() self.encoder = FFTransformer( @@ -186,6 +186,7 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx, self.register_buffer('pitch_std', torch.zeros(1)) self.energy_conditioning = energy_conditioning + self.norm_energy = norm_energy if energy_conditioning: self.energy_predictor = TemporalPredictor( in_fft_output_size, @@ -243,7 +244,8 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma # Average energy over characters energy_tgt = average_pitch(energy_dense.unsqueeze(1), dur_tgt) - energy_tgt = torch.log(1.0 + energy_tgt) + if not self.norm_energy: + energy_tgt = torch.log(1.0 + energy_tgt) energy_emb = self.energy_emb(energy_tgt) energy_tgt = energy_tgt.squeeze(1) diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh index db09b0edd..1b2f0ce4d 100755 --- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh +++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh @@ -7,7 +7,7 @@ export OMP_NUM_THREADS=1 : ${NUM_GPUS:=1} : ${BATCH_SIZE:=2} : ${GRAD_ACCUMULATION:=2} -: ${OUTPUT_DIR:="./output_mfa"} +: ${OUTPUT_DIR:="./output_mfa/norm"} : ${DATASET_PATH:=LJSpeech-1.1} : ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_durs_text_train_v3.txt} : ${VAL_FILELIST:=filelists/mini_ljs_audio_pitch_durs_text_val.txt} diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index fee177209..13079a87d 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -653,6 +653,8 @@ def main(): y_pred = model(x, use_gt_durations=True) # y = mel_padded, input_lengths, output_lengths loss, meta = criterion(y_pred, y) + print(loss) + print(meta) loss /= args.grad_accumulation meta = {k: v / args.grad_accumulation From b57ef295ac98de3face4da3ca2e8530eca78a965 Mon Sep 17 00:00:00 2001 From: evdv Date: Thu, 17 Mar 2022 17:18:33 +0000 Subject: [PATCH 20/21] Remove loss print --- PyTorch/SpeechSynthesis/FastPitch/train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 13079a87d..fee177209 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -653,8 +653,6 @@ def main(): y_pred = model(x, use_gt_durations=True) # y = mel_padded, input_lengths, output_lengths loss, meta = criterion(y_pred, y) - print(loss) - print(meta) loss /= args.grad_accumulation meta = {k: v / args.grad_accumulation From 035bcf6c177218a63a0e687a74ea3551cb47ac85 Mon Sep 17 00:00:00 2001 From: evdv Date: Thu, 17 Mar 2022 23:50:51 +0000 Subject: [PATCH 21/21] Remove all contents from the training data loader loop thing --- PyTorch/SpeechSynthesis/FastPitch/train.py | 251 +++++++++++---------- 1 file changed, 126 insertions(+), 125 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index fee177209..3b682713d 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -634,131 +634,132 @@ def main(): epoch_iter = 0 num_iters = len(train_loader) // args.grad_accumulation for batch in train_loader: - - if accumulated_steps == 0: - if epoch_iter == num_iters: - break - total_iter += 1 - epoch_iter += 1 - - adjust_learning_rate(total_iter, optimizer, args.learning_rate, - args.warmup_steps) - - model.zero_grad(set_to_none=True) - - x, y, num_frames = batch_to_gpu(batch) - - with torch.cuda.amp.autocast(enabled=args.amp): - # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) - y_pred = model(x, use_gt_durations=True) - # y = mel_padded, input_lengths, output_lengths - loss, meta = criterion(y_pred, y) - loss /= args.grad_accumulation - - meta = {k: v / args.grad_accumulation - for k, v in meta.items()} - - if args.amp: - scaler.scale(loss).backward() - else: - loss.backward() - - if distributed_run: - reduced_loss = reduce_tensor(loss.data, args.world_size).item() - reduced_num_frames = reduce_tensor(num_frames.data, 1).item() - meta = {k: reduce_tensor(v, args.world_size) for k, v in meta.items()} - else: - reduced_loss = loss.item() - reduced_num_frames = num_frames.item() - if np.isnan(reduced_loss): - raise Exception("loss is NaN") - - accumulated_steps += 1 - iter_loss += reduced_loss - iter_num_frames += reduced_num_frames - iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0) for k in meta} - - if accumulated_steps % args.grad_accumulation == 0: - - if args.amp: - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_( - model.parameters(), args.grad_clip_thresh) - scaler.step(optimizer) - scaler.update() - else: - torch.nn.utils.clip_grad_norm_( - model.parameters(), args.grad_clip_thresh) - optimizer.step() - - if args.ema_decay > 0.0: - apply_multi_tensor_ema(args.ema_decay, *mt_ema_params) - - iter_mel_loss = iter_meta['mel_loss'].item() - iter_pitch_loss = iter_meta['pitch_loss'].item() - iter_energy_loss = iter_meta['energy_loss'].item() - iter_dur_loss = iter_meta['duration_predictor_loss'].item() - iter_time = time.perf_counter() - iter_start_time - epoch_frames_per_sec += iter_num_frames / iter_time - epoch_loss += iter_loss - epoch_num_frames += iter_num_frames - epoch_mel_loss += iter_mel_loss - epoch_pitch_loss += iter_pitch_loss - epoch_energy_loss += iter_energy_loss - epoch_dur_loss += iter_dur_loss - - if epoch_iter % 5 == 0: - log({ - 'epoch': epoch, - 'epoch_iter': epoch_iter, - 'num_iters': num_iters, - 'total_steps': total_iter, - 'loss/loss': iter_loss, - 'mel-loss/mel_loss': iter_mel_loss, - 'pitch-loss/pitch_loss': iter_pitch_loss, - 'energy-loss/energy_loss': iter_energy_loss, - 'dur-loss/dur_loss': iter_dur_loss, - 'frames per s': iter_num_frames / iter_time, - 'took': iter_time, - 'lrate': optimizer.param_groups[0]['lr'], - }, args.local_rank) - - accumulated_steps = 0 - iter_loss = 0 - iter_num_frames = 0 - iter_meta = {} - iter_start_time = time.perf_counter() - # for debugging only - # validate(model, criterion, valset, args.batch_size, collate_fn, - # distributed_run, batch_to_gpu, args.local_rank) - - # Finished epoch - epoch_loss /= epoch_iter - epoch_mel_loss /= epoch_iter - epoch_time = time.perf_counter() - epoch_start_time - - log({ - 'epoch': epoch, - 'loss/epoch_loss': epoch_loss, - 'mel-loss/epoch_mel_loss': epoch_mel_loss, - 'pitch-loss/epoch_pitch_loss': epoch_pitch_loss, - 'energy-loss/epoch_energy_loss': epoch_energy_loss, - 'dur-loss/epoch_dur_loss': epoch_dur_loss, - 'epoch_frames per s': epoch_num_frames / epoch_time, - 'epoch_took': epoch_time, - }, args.local_rank) - bmark_stats.update(epoch_num_frames, epoch_loss, epoch_mel_loss, - epoch_time) - - validate(model, criterion, valset, args.batch_size, collate_fn, - distributed_run, batch_to_gpu, args.local_rank) - - if args.ema_decay > 0: - validate(ema_model, criterion, valset, args.batch_size, collate_fn, - distributed_run, batch_to_gpu, args.local_rank) - - maybe_save_checkpoint(args, model, ema_model, optimizer, scaler, epoch, - total_iter, model_config) + print(batch[-1]) + # + # if accumulated_steps == 0: + # if epoch_iter == num_iters: + # break + # total_iter += 1 + # epoch_iter += 1 + # + # adjust_learning_rate(total_iter, optimizer, args.learning_rate, + # args.warmup_steps) + # + # model.zero_grad(set_to_none=True) + # + # x, y, num_frames = batch_to_gpu(batch) + # + # with torch.cuda.amp.autocast(enabled=args.amp): + # # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) + # y_pred = model(x, use_gt_durations=True) + # # y = mel_padded, input_lengths, output_lengths + # loss, meta = criterion(y_pred, y) + # loss /= args.grad_accumulation + # + # meta = {k: v / args.grad_accumulation + # for k, v in meta.items()} + # + # if args.amp: + # scaler.scale(loss).backward() + # else: + # loss.backward() + # + # if distributed_run: + # reduced_loss = reduce_tensor(loss.data, args.world_size).item() + # reduced_num_frames = reduce_tensor(num_frames.data, 1).item() + # meta = {k: reduce_tensor(v, args.world_size) for k, v in meta.items()} + # else: + # reduced_loss = loss.item() + # reduced_num_frames = num_frames.item() + # if np.isnan(reduced_loss): + # raise Exception("loss is NaN") + # + # accumulated_steps += 1 + # iter_loss += reduced_loss + # iter_num_frames += reduced_num_frames + # iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0) for k in meta} + # + # if accumulated_steps % args.grad_accumulation == 0: + # + # if args.amp: + # scaler.unscale_(optimizer) + # torch.nn.utils.clip_grad_norm_( + # model.parameters(), args.grad_clip_thresh) + # scaler.step(optimizer) + # scaler.update() + # else: + # torch.nn.utils.clip_grad_norm_( + # model.parameters(), args.grad_clip_thresh) + # optimizer.step() + # + # if args.ema_decay > 0.0: + # apply_multi_tensor_ema(args.ema_decay, *mt_ema_params) + # + # iter_mel_loss = iter_meta['mel_loss'].item() + # iter_pitch_loss = iter_meta['pitch_loss'].item() + # iter_energy_loss = iter_meta['energy_loss'].item() + # iter_dur_loss = iter_meta['duration_predictor_loss'].item() + # iter_time = time.perf_counter() - iter_start_time + # epoch_frames_per_sec += iter_num_frames / iter_time + # epoch_loss += iter_loss + # epoch_num_frames += iter_num_frames + # epoch_mel_loss += iter_mel_loss + # epoch_pitch_loss += iter_pitch_loss + # epoch_energy_loss += iter_energy_loss + # epoch_dur_loss += iter_dur_loss + # + # if epoch_iter % 5 == 0: + # log({ + # 'epoch': epoch, + # 'epoch_iter': epoch_iter, + # 'num_iters': num_iters, + # 'total_steps': total_iter, + # 'loss/loss': iter_loss, + # 'mel-loss/mel_loss': iter_mel_loss, + # 'pitch-loss/pitch_loss': iter_pitch_loss, + # 'energy-loss/energy_loss': iter_energy_loss, + # 'dur-loss/dur_loss': iter_dur_loss, + # 'frames per s': iter_num_frames / iter_time, + # 'took': iter_time, + # 'lrate': optimizer.param_groups[0]['lr'], + # }, args.local_rank) + # + # accumulated_steps = 0 + # iter_loss = 0 + # iter_num_frames = 0 + # iter_meta = {} + # iter_start_time = time.perf_counter() + # # for debugging only + # # validate(model, criterion, valset, args.batch_size, collate_fn, + # # distributed_run, batch_to_gpu, args.local_rank) + # + # # Finished epoch + # epoch_loss /= epoch_iter + # epoch_mel_loss /= epoch_iter + # epoch_time = time.perf_counter() - epoch_start_time + # + # log({ + # 'epoch': epoch, + # 'loss/epoch_loss': epoch_loss, + # 'mel-loss/epoch_mel_loss': epoch_mel_loss, + # 'pitch-loss/epoch_pitch_loss': epoch_pitch_loss, + # 'energy-loss/epoch_energy_loss': epoch_energy_loss, + # 'dur-loss/epoch_dur_loss': epoch_dur_loss, + # 'epoch_frames per s': epoch_num_frames / epoch_time, + # 'epoch_took': epoch_time, + # }, args.local_rank) + # bmark_stats.update(epoch_num_frames, epoch_loss, epoch_mel_loss, + # epoch_time) + # + # validate(model, criterion, valset, args.batch_size, collate_fn, + # distributed_run, batch_to_gpu, args.local_rank) + # + # if args.ema_decay > 0: + # validate(ema_model, criterion, valset, args.batch_size, collate_fn, + # distributed_run, batch_to_gpu, args.local_rank) + # + # maybe_save_checkpoint(args, model, ema_model, optimizer, scaler, epoch, + # total_iter, model_config) # Finished training if len(bmark_stats) > 0: