From 880e6a522febe41e05ebaab9b0ab3dce53733bb1 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Thu, 17 Feb 2022 19:06:49 +0000
Subject: [PATCH 01/21] WIP extracting/loading durations

---
 .../FastPitch/fastpitch/data_function.py            | 13 +++++++++++--
 .../SpeechSynthesis/FastPitch/prepare_dataset.py    |  7 ++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index a007db86f..9df3ab0b1 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -342,6 +342,14 @@ def __call__(self, batch):
             text = batch[ids_sorted_decreasing[i]][0]
             text_padded[i, :text.size(0)] = text
 
+        dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype)
+        dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32)
+        for i in range(len(ids_sorted_decreasing)):
+            dur = batch[ids_sorted_decreasing[i]][3]
+            dur_padded[i, :dur.shape[0]] = dur
+            dur_lens[i] = dur.shape[0]
+            assert dur_lens[i] == input_lengths[i]
+
         # Right zero-pad mel-spec
         num_mels = batch[0][1].size(0)
         max_target_len = max([x[1].size(1) for x in batch])
@@ -386,16 +394,17 @@ def __call__(self, batch):
 
         audiopaths = [batch[i][7] for i in ids_sorted_decreasing]
 
-        return (text_padded, input_lengths, mel_padded, output_lengths, len_x,
+        return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x,
                 pitch_padded, energy_padded, speaker, attn_prior_padded,
                 audiopaths)
 
 
 def batch_to_gpu(batch):
-    (text_padded, input_lengths, mel_padded, output_lengths, len_x,
+    (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x,
      pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch
 
     text_padded = to_gpu(text_padded).long()
+    durs_padded = to_gpu(durs_padded).long()
     input_lengths = to_gpu(input_lengths).long()
     mel_padded = to_gpu(mel_padded).float()
     output_lengths = to_gpu(output_lengths).long()
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index d93065b42..ed6ba6566 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -50,6 +50,8 @@ def parse_args(parser):
                         help='Calculate spectrograms from .wav files')
     parser.add_argument('--extract-pitch', action='store_true',
                         help='Extract pitch')
+    parser.add_argument('--extract-durations', action='store_true',
+                        help='Extract durations')
     parser.add_argument('--save-alignment-priors', action='store_true',
                         help='Pre-calculate diagonal matrices of alignment of text to audio')
     parser.add_argument('--log-file', type=str, default='preproc_log.json',
@@ -99,6 +101,9 @@ def main():
     if args.extract_pitch:
         Path(args.dataset_path, 'pitch').mkdir(parents=False, exist_ok=True)
 
+    if args.extract_durs:
+        Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True)
+
     if args.save_alignment_priors:
         Path(args.dataset_path, 'alignment_priors').mkdir(parents=False, exist_ok=True)
 
@@ -142,7 +147,7 @@ def main():
         for i, batch in enumerate(tqdm.tqdm(data_loader)):
             tik = time.time()
 
-            _, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch
+            _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch
 
             # Ensure filenames are unique
             for p in fpaths:

From 832ba35e4d13d02bc8f357e0a071b1abce1c6962 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Fri, 18 Feb 2022 14:27:55 +0000
Subject: [PATCH 02/21] WIP

---
 .../FastPitch/create_lab_files.py             |  0
 .../FastPitch/prepare_dataset.py              | 65 ++++++++++++++++++-
 .../FastPitch/requirements.txt                |  1 +
 .../FastPitch/scripts/prepare_dataset.sh      |  8 +++
 4 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py

diff --git a/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py b/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index ed6ba6566..6fdd0b23c 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -26,13 +26,16 @@
 # *****************************************************************************
 
 import argparse
+import os
 import time
 from pathlib import Path
 
+import numpy as np
 import torch
 import tqdm
 import dllogger as DLLogger
 from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
+from tgt.io import read_textgrid
 from torch.utils.data import DataLoader
 
 from fastpitch.data_function import TTSCollate, TTSDataset
@@ -44,6 +47,8 @@ def parse_args(parser):
     """
     parser.add_argument('-d', '--dataset-path', type=str,
                         default='./', help='Path to dataset')
+    parser.add_argument('--textgrid-path', type=str,
+                        help='Path to TextGrids')
     parser.add_argument('--wav-text-filelists', required=True, nargs='+',
                         type=str, help='Files with audio paths and text')
     parser.add_argument('--extract-mels', action='store_true',
@@ -51,7 +56,7 @@ def parse_args(parser):
     parser.add_argument('--extract-pitch', action='store_true',
                         help='Extract pitch')
     parser.add_argument('--extract-durations', action='store_true',
-                        help='Extract durations')
+                        help='Extract durations (from alignment dir)')
     parser.add_argument('--save-alignment-priors', action='store_true',
                         help='Pre-calculate diagonal matrices of alignment of text to audio')
     parser.add_argument('--log-file', type=str, default='preproc_log.json',
@@ -82,6 +87,36 @@ def parse_args(parser):
     return parser
 
 
+def parse_textgrid(tier, sampling_rate, hop_length):
+    # From Dan Wells
+    # Latest MFA replaces silence phones with "" in output TextGrids
+    sil_phones = ['sil', 'sp', 'spn', '']
+    start_time = tier[0].start_time
+    end_time = tier[-1].end_time
+    phones = []
+    durations = []
+    for index, label in enumerate(tier._objects):
+        p_start, p_end, phone = label.start_time, label.end_time, label.text
+        if phone not in sil_phones:
+            phones.append(phone)
+        else:
+            if (index == 0) or (index == len(tier) - 1):
+                # leading or trailing silence
+                phones.append('sil')
+            else:
+                # short pause between words
+                phones.append('sp')
+
+        durations.append(int(np.ceil(p_end * sampling_rate / hop_length)
+                             - np.ceil(p_start * sampling_rate / hop_length)))
+    return phones, durations, start_time, end_time
+
+
+def check_durations(durs, mel_len, filepath):
+    assert (sum(durs) == mel_len,
+            f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens')
+
+
 def main():
     parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing')
     parser = parse_args(parser)
@@ -102,6 +137,8 @@ def main():
         Path(args.dataset_path, 'pitch').mkdir(parents=False, exist_ok=True)
 
     if args.extract_durs:
+        if not args.textgrid_path:
+            args.textgridPath = os.path.join(args.dataset_path, 'TextGrid')
         Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True)
 
     if args.save_alignment_priors:
@@ -147,6 +184,10 @@ def main():
         for i, batch in enumerate(tqdm.tqdm(data_loader)):
             tik = time.time()
 
+            # From TTSCollate __call__
+            # (text_padded, dur_padded, input_lengths, mel_padded,
+            # output_lengths, len_x, pitch_padded, energy_padded, speaker,
+            # attn_prior_padded, audiopaths)
             _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch
 
             # Ensure filenames are unique
@@ -168,6 +209,28 @@ def main():
                     fpath = Path(args.dataset_path, 'pitch', fname)
                     torch.save(p[:mel_lens[j]], fpath)
 
+            if args.extract_durations:
+                # From Dan Wells
+                for j, _ in range(len(mel_lens)):
+                    filename = fpaths[j]
+                    tgt_path = Path(args.textgrid_path, f'{filename}.TextGrid')
+                    try:
+                        textgrid = read_textgrid(tgt_path,
+                                                 include_empty_intervals=True)
+                    except FileNotFoundError:
+                        print(f'{filename}.wav TextGrid missing: {tgt_path}')
+                        raise
+                    _, durs, _, _ = parse_textgrid(
+                        textgrid.get_tier_by_name('phones'),
+                        args.sampling_rate,
+                        args.hop_length)
+
+                    check_durations(durs, mel_lens[j], filename)
+
+                    dur_path = Path(args.dataset_path,
+                                    'durations', f'{filename}.pt')
+                    torch.save(torch.LongTensor(durs).cpu().int(), dur_path)
+
             if args.save_alignment_priors:
                 for j, prior in enumerate(attn_prior):
                     fname = Path(fpaths[j]).with_suffix('.pt').name
diff --git a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
index e6d7b1751..33b7548c1 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
+++ b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
@@ -3,5 +3,6 @@ numpy
 inflect
 librosa==0.8.0
 scipy
+tgt
 tensorboardX==2.0
 git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index 43525ef48..408d75a63 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -3,13 +3,21 @@
 set -e
 
 : ${DATA_DIR:=LJSpeech-1.1}
+: ${ALIGNMENT_DIR:=${DATA_DIR}/mfa_alignments}
 : ${ARGS="--extract-mels"}
 
+mfa model download acoustic english
+mfa model download dictionary english
+mfa validate $DATA_DIR english english
+mfa align $DATA_DIR english english $ALIGNMENT_DIR
+
 python prepare_dataset.py \
     --wav-text-filelists filelists/ljs_audio_text.txt \
     --n-workers 16 \
     --batch-size 1 \
     --dataset-path $DATA_DIR \
+    --textgrid-path $ALIGNMENT_DIR \
     --extract-pitch \
+    --extract-durations\
     --f0-method pyin \
     $ARGS

From 6b7b290676d187f110e65f713be113255076ad0a Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Wed, 23 Feb 2022 10:58:59 +0000
Subject: [PATCH 03/21] WIP, issue with zero-padding durations

---
 .../FastPitch/common/layers.py                |  5 +--
 .../SpeechSynthesis/FastPitch/common/stft.py  |  2 +-
 .../FastPitch/create_lab_files.py             | 36 +++++++++++++++++++
 .../FastPitch/fastpitch/data_function.py      |  7 ++++
 .../FastPitch/prepare_dataset.py              | 12 ++++---
 .../FastPitch/scripts/prepare_dataset.sh      | 29 +++++++++++----
 6 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/layers.py b/PyTorch/SpeechSynthesis/FastPitch/common/layers.py
index d3ec68f6d..80c059b87 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/common/layers.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/common/layers.py
@@ -93,8 +93,9 @@ def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
         self.n_mel_channels = n_mel_channels
         self.sampling_rate = sampling_rate
         self.stft_fn = STFT(filter_length, hop_length, win_length)
-        mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+        mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length,
+                                   n_mels=n_mel_channels,
+                                   fmin=mel_fmin, fmax=mel_fmax)
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 
diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/stft.py b/PyTorch/SpeechSynthesis/FastPitch/common/stft.py
index 4084dc68e..bc140c142 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/common/stft.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/common/stft.py
@@ -64,7 +64,7 @@ def __init__(self, filter_length=800, hop_length=200, win_length=800,
             assert(filter_length >= win_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
-            fft_window = pad_center(fft_window, filter_length)
+            fft_window = pad_center(fft_window, size=filter_length)
             fft_window = torch.from_numpy(fft_window).float()
 
             # window the bases
diff --git a/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py b/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py
index e69de29bb..48da1f631 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/create_lab_files.py
@@ -0,0 +1,36 @@
+import argparse
+import os
+import pathlib
+
+from common.utils import load_filepaths_and_text
+
+
+def create_lab_files(dataset_path, filelist, n_speakers):
+    # Expect a list of filenames
+    if type(filelist) is str:
+        filelist = [filelist]
+
+    # difficulty: dealing with 'are there speaker codes or not?'
+    dataset_entries = load_filepaths_and_text(filelist, dataset_path,
+                                              (n_speakers > 1))
+
+    for filepath, text in dataset_entries:
+        wav_name = pathlib.Path(filepath).stem
+        # lab extension is hardcoded
+        # so is the use of the wavs subdirectory
+        lab_filepath = os.path.join(dataset_path, f'{wav_name}.lab')
+        with open(lab_filepath, 'w') as f:
+            f.write(text)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset', type=str, required=True,
+                        help='Path to dataset')
+    parser.add_argument('--filelist', type=str, required=True, nargs='+',
+                        help='List of wavs with transcript')
+    parser.add_argument('--n-speakers', type=int, default=1,
+                        help='Number of speakers in dataset')
+    args = parser.parse_args()
+
+    create_lab_files(args.dataset, args.filelist, args.n_speakers)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 9df3ab0b1..544a626ba 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -329,23 +329,30 @@ class TTSCollate:
     """Zero-pads model inputs and targets based on number of frames per step"""
 
     def __call__(self, batch):
+        print('new call')
+        # print(batch)
         """Collate training batch from normalized text and mel-spec"""
         # Right zero-pad all one-hot text sequences to max input length
         input_lengths, ids_sorted_decreasing = torch.sort(
             torch.LongTensor([len(x[0]) for x in batch]),
             dim=0, descending=True)
         max_input_len = input_lengths[0]
+        print(max_input_len)
 
         text_padded = torch.LongTensor(len(batch), max_input_len)
         text_padded.zero_()
         for i in range(len(ids_sorted_decreasing)):
             text = batch[ids_sorted_decreasing[i]][0]
             text_padded[i, :text.size(0)] = text
+            #print(text)
 
         dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype)
         dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32)
+        #print(dur_lens)
         for i in range(len(ids_sorted_decreasing)):
             dur = batch[ids_sorted_decreasing[i]][3]
+            # error
+            print(i, dur.shape)
             dur_padded[i, :dur.shape[0]] = dur
             dur_lens[i] = dur.shape[0]
             assert dur_lens[i] == input_lengths[i]
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index 6fdd0b23c..355c595b8 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -113,8 +113,8 @@ def parse_textgrid(tier, sampling_rate, hop_length):
 
 
 def check_durations(durs, mel_len, filepath):
-    assert (sum(durs) == mel_len,
-            f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens')
+    assert sum(durs) == mel_len, \
+            f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens'
 
 
 def main():
@@ -136,7 +136,7 @@ def main():
     if args.extract_pitch:
         Path(args.dataset_path, 'pitch').mkdir(parents=False, exist_ok=True)
 
-    if args.extract_durs:
+    if args.extract_durations:
         if not args.textgrid_path:
             args.textgridPath = os.path.join(args.dataset_path, 'TextGrid')
         Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True)
@@ -181,6 +181,8 @@ def main():
             drop_last=False)
 
         all_filenames = set()
+        print('pre-loop')
+        print(data_loader.dataset)
         for i, batch in enumerate(tqdm.tqdm(data_loader)):
             tik = time.time()
 
@@ -189,14 +191,14 @@ def main():
             # output_lengths, len_x, pitch_padded, energy_padded, speaker,
             # attn_prior_padded, audiopaths)
             _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch
-
+            print(f'batch: {fpaths}')
             # Ensure filenames are unique
             for p in fpaths:
                 fname = Path(p).name
                 if fname in all_filenames:
                     raise ValueError(f'Filename is not unique: {fname}')
                 all_filenames.add(fname)
-
+            print('filename check complete')
             if args.extract_mels:
                 for j, mel in enumerate(mels):
                     fname = Path(fpaths[j]).with_suffix('.pt').name
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index 408d75a63..e9f9b0c9f 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -2,18 +2,35 @@
 
 set -e
 
+while getopts "ln:" opt; do
+      case $opt in
+        l ) LABELS="true";;
+        n ) NSPEAKERS=$OPTARG;;
+        \?) echo "Invalid option: -"$OPTARG"" >&2
+            exit 1;;
+      esac
+    done
+
+: ${NSPEAKERS:=1}  # default value
 : ${DATA_DIR:=LJSpeech-1.1}
+: ${WAV_DIR:=${DATA_DIR}/wavs}  # should already exist
+: ${FILELIST:=filelists/ljs_audio_text.txt}
 : ${ALIGNMENT_DIR:=${DATA_DIR}/mfa_alignments}
 : ${ARGS="--extract-mels"}
 
-mfa model download acoustic english
-mfa model download dictionary english
-mfa validate $DATA_DIR english english
-mfa align $DATA_DIR english english $ALIGNMENT_DIR
+if [ "$LABELS" = "true" ]
+then
+  python ./create_lab_files.py --dataset ${WAV_DIR} --filelist ${FILELIST} --n-speakers ${NSPEAKERS}
+fi
+
+# mfa model download acoustic english
+# mfa model download dictionary english
+# mfa validate ${WAV_DIR} english english
+# mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR}
 
 python prepare_dataset.py \
-    --wav-text-filelists filelists/ljs_audio_text.txt \
-    --n-workers 16 \
+    --wav-text-filelists ${FILELIST} \
+    --n-workers 8 \
     --batch-size 1 \
     --dataset-path $DATA_DIR \
     --textgrid-path $ALIGNMENT_DIR \

From e8484e00ed2a9aadcf96fc69af62058a840af0b1 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Sat, 5 Mar 2022 18:57:54 +0000
Subject: [PATCH 04/21] Trying to figure out why I can't get the batches
 anymore

---
 .../FastPitch/fastpitch/data_function.py      | 21 +++++++++++--------
 .../FastPitch/prepare_dataset.py              |  6 +++---
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 544a626ba..4e549f0d0 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -225,6 +225,8 @@ def __getitem__(self, index):
         if len(pitch.size()) == 1:
             pitch = pitch[None, :]
 
+        print('getting a batch')
+        # this is a batch
         return (text, mel, len(text), pitch, energy, speaker, attn_prior,
                 audiopath)
 
@@ -327,36 +329,37 @@ def get_pitch(self, index, mel_len=None):
 
 class TTSCollate:
     """Zero-pads model inputs and targets based on number of frames per step"""
-
+    # (text_padded, durs_padded, input_lengths, mel_padded, output_lengths,
+    # len_x, pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch
     def __call__(self, batch):
-        print('new call')
-        # print(batch)
+        print('COLLATE GETS CALLED')
         """Collate training batch from normalized text and mel-spec"""
         # Right zero-pad all one-hot text sequences to max input length
         input_lengths, ids_sorted_decreasing = torch.sort(
             torch.LongTensor([len(x[0]) for x in batch]),
             dim=0, descending=True)
         max_input_len = input_lengths[0]
-        print(max_input_len)
 
         text_padded = torch.LongTensor(len(batch), max_input_len)
         text_padded.zero_()
         for i in range(len(ids_sorted_decreasing)):
             text = batch[ids_sorted_decreasing[i]][0]
             text_padded[i, :text.size(0)] = text
-            #print(text)
 
         dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype)
+        print('dur padded orig', dur_padded.shape)
         dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32)
-        #print(dur_lens)
+        print('start loop?')
         for i in range(len(ids_sorted_decreasing)):
             dur = batch[ids_sorted_decreasing[i]][3]
-            # error
-            print(i, dur.shape)
+            # ERROR
+            # print(i, dur_padded[0].shape, dur[0].shape)
+            print(i)
             dur_padded[i, :dur.shape[0]] = dur
+            print('new shape: ', dur_padded.shape)
             dur_lens[i] = dur.shape[0]
             assert dur_lens[i] == input_lengths[i]
-
+        print('end loop?')
         # Right zero-pad mel-spec
         num_mels = batch[0][1].size(0)
         max_target_len = max([x[1].size(1) for x in batch])
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index 355c595b8..645f6b97d 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -27,6 +27,7 @@
 
 import argparse
 import os
+import sys
 import time
 from pathlib import Path
 
@@ -181,8 +182,7 @@ def main():
             drop_last=False)
 
         all_filenames = set()
-        print('pre-loop')
-        print(data_loader.dataset)
+        print('TIME TO LOOP')
         for i, batch in enumerate(tqdm.tqdm(data_loader)):
             tik = time.time()
 
@@ -191,7 +191,7 @@ def main():
             # output_lengths, len_x, pitch_padded, energy_padded, speaker,
             # attn_prior_padded, audiopaths)
             _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch
-            print(f'batch: {fpaths}')
+            print(f'BATCH: {fpaths}')
             # Ensure filenames are unique
             for p in fpaths:
                 fname = Path(p).name

From 8c4919de55e86b4d6c52235826ec9c9e21862fbd Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Tue, 8 Mar 2022 22:47:37 +0000
Subject: [PATCH 05/21] Remove alignment, but units of textgrid and text
 processing are not the same

---
 .../FastPitch/common/text/text_processing.py  |  31 ++-
 .../FastPitch/fastpitch/attention.py          | 220 ------------------
 .../FastPitch/fastpitch/attn_loss_function.py |  54 -----
 .../FastPitch/fastpitch/data_function.py      | 207 ++++++++--------
 .../FastPitch/fastpitch/loss_function.py      |  24 +-
 .../FastPitch/fastpitch/model.py              |  71 +-----
 .../FastPitch/prepare_dataset.py              | 106 ++-------
 .../FastPitch/scripts/prepare_dataset.sh      |   3 +-
 PyTorch/SpeechSynthesis/FastPitch/train.py    |  34 +--
 9 files changed, 160 insertions(+), 590 deletions(-)
 delete mode 100644 PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py
 delete mode 100644 PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py

diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
index b700df1f4..e13b3ed1c 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
@@ -77,6 +77,9 @@ def clean_text(self, text):
     def symbols_to_sequence(self, symbols):
         return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id]
 
+    def arpabet_list_to_sequence(self, text):
+        return self.symbols_to_sequence(['@' + s for s in text])
+
     def arpabet_to_sequence(self, text):
         return self.symbols_to_sequence(['@' + s for s in text.split()])
 
@@ -118,7 +121,7 @@ def get_arpabet(self, word):
         else:
             arpabet = arpabet[0]
 
-        arpabet = "{" + arpabet + arpabet_suffix + "}"
+        # arpabet = "{" + arpabet + arpabet_suffix + "}"
 
         return arpabet
 
@@ -144,20 +147,26 @@ def encode_text(self, text, return_all=False):
                     text = text_arpabet
             elif self.handle_arpabet == 'word':
                 words = _words_re.findall(text)
-                text_arpabet = [
-                    word[1] if word[0] == '' else (
-                        self.get_arpabet(word[0])
-                        if np.random.uniform() < self.p_arpabet
-                        else word[0])
-                    for word in words]
-                text_arpabet = ''.join(text_arpabet)
+                text_arpabet = [[word[1]] if word[0] == ''
+                                else self.get_arpabet(word[0]).split(' ')
+                                for word in words]
+                text_arpabet = [phone for phone_list in text_arpabet for phone in phone_list if phone != ' ']
+                # text_arpabet = [
+                #     word[1] if word[0] == '' else (
+                #         self.get_arpabet(word[0])
+                #         if np.random.uniform() < self.p_arpabet
+                #         else word[0])
+                #     for word in words]
+                print('ARPABET: ', text_arpabet[:10])
+                #text_arpabet = ''.join(text_arpabet)
                 text = text_arpabet
             elif self.handle_arpabet != '':
                 raise Exception("{} handle_arpabet is not supported".format(
                     self.handle_arpabet))
-
-        text_encoded = self.text_to_sequence(text)
-
+        # text_encoded = self.arpabet_to_sequence(text)
+        # text_encoded = self.text_to_sequence(text)
+        text_encoded = self.arpabet_list_to_sequence(text)
+        print(len(text_encoded))
         if return_all:
             return text_encoded, text_clean, text_arpabet
 
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py
deleted file mode 100644
index 59a7397d6..000000000
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attention.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-
-class ConvNorm(torch.nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
-                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
-        super(ConvNorm, self).__init__()
-        if padding is None:
-            assert(kernel_size % 2 == 1)
-            padding = int(dilation * (kernel_size - 1) / 2)
-
-        self.conv = torch.nn.Conv1d(in_channels, out_channels,
-                                    kernel_size=kernel_size, stride=stride,
-                                    padding=padding, dilation=dilation,
-                                    bias=bias)
-
-        torch.nn.init.xavier_uniform_(
-            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
-
-    def forward(self, signal):
-        conv_signal = self.conv(signal)
-        return conv_signal
-
-
-class Invertible1x1ConvLUS(torch.nn.Module):
-    def __init__(self, c):
-        super(Invertible1x1ConvLUS, self).__init__()
-        # Sample a random orthonormal matrix to initialize weights
-        W, _ = torch.linalg.qr(torch.randn(c, c))
-        # Ensure determinant is 1.0 not -1.0
-        if torch.det(W) < 0:
-            W[:, 0] = -1*W[:, 0]
-        p, lower, upper = torch.lu_unpack(*torch.lu(W))
-
-        self.register_buffer('p', p)
-        # diagonals of lower will always be 1s anyway
-        lower = torch.tril(lower, -1)
-        lower_diag = torch.diag(torch.eye(c, c))
-        self.register_buffer('lower_diag', lower_diag)
-        self.lower = nn.Parameter(lower)
-        self.upper_diag = nn.Parameter(torch.diag(upper))
-        self.upper = nn.Parameter(torch.triu(upper, 1))
-
-    def forward(self, z, reverse=False):
-        U = torch.triu(self.upper, 1) + torch.diag(self.upper_diag)
-        L = torch.tril(self.lower, -1) + torch.diag(self.lower_diag)
-        W = torch.mm(self.p, torch.mm(L, U))
-        if reverse:
-            if not hasattr(self, 'W_inverse'):
-                # Reverse computation
-                W_inverse = W.float().inverse()
-                if z.type() == 'torch.cuda.HalfTensor':
-                    W_inverse = W_inverse.half()
-
-                self.W_inverse = W_inverse[..., None]
-            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
-            return z
-        else:
-            W = W[..., None]
-            z = F.conv1d(z, W, bias=None, stride=1, padding=0)
-            log_det_W = torch.sum(torch.log(torch.abs(self.upper_diag)))
-            return z, log_det_W
-
-
-class ConvAttention(torch.nn.Module):
-    def __init__(self, n_mel_channels=80, n_speaker_dim=128,
-                 n_text_channels=512, n_att_channels=80, temperature=1.0,
-                 n_mel_convs=2, align_query_enc_type='3xconv',
-                 use_query_proj=True):
-        super(ConvAttention, self).__init__()
-        self.temperature = temperature
-        self.att_scaling_factor = np.sqrt(n_att_channels)
-        self.softmax = torch.nn.Softmax(dim=3)
-        self.log_softmax = torch.nn.LogSoftmax(dim=3)
-        self.query_proj = Invertible1x1ConvLUS(n_mel_channels)
-        self.attn_proj = torch.nn.Conv2d(n_att_channels, 1, kernel_size=1)
-        self.align_query_enc_type = align_query_enc_type
-        self.use_query_proj = bool(use_query_proj)
-
-        self.key_proj = nn.Sequential(
-            ConvNorm(n_text_channels,
-                     n_text_channels * 2,
-                     kernel_size=3,
-                     bias=True,
-                     w_init_gain='relu'),
-            torch.nn.ReLU(),
-            ConvNorm(n_text_channels * 2,
-                     n_att_channels,
-                     kernel_size=1,
-                     bias=True))
-
-        self.align_query_enc_type = align_query_enc_type
-
-        if align_query_enc_type == "inv_conv":
-            self.query_proj = Invertible1x1ConvLUS(n_mel_channels)
-        elif align_query_enc_type == "3xconv":
-            self.query_proj = nn.Sequential(
-                ConvNorm(n_mel_channels,
-                         n_mel_channels * 2,
-                         kernel_size=3,
-                         bias=True,
-                         w_init_gain='relu'),
-                torch.nn.ReLU(),
-                ConvNorm(n_mel_channels * 2,
-                         n_mel_channels,
-                         kernel_size=1,
-                         bias=True),
-                torch.nn.ReLU(),
-                ConvNorm(n_mel_channels,
-                         n_att_channels,
-                         kernel_size=1,
-                         bias=True))
-        else:
-            raise ValueError("Unknown query encoder type specified")
-
-    def run_padded_sequence(self, sorted_idx, unsort_idx, lens, padded_data,
-                            recurrent_model):
-        """Sorts input data by previded ordering (and un-ordering) and runs the
-        packed data through the recurrent model
-
-        Args:
-            sorted_idx (torch.tensor): 1D sorting index
-            unsort_idx (torch.tensor): 1D unsorting index (inverse of sorted_idx)
-            lens: lengths of input data (sorted in descending order)
-            padded_data (torch.tensor): input sequences (padded)
-            recurrent_model (nn.Module): recurrent model to run data through
-        Returns:
-            hidden_vectors (torch.tensor): outputs of the RNN, in the original,
-            unsorted, ordering
-        """
-
-        # sort the data by decreasing length using provided index
-        # we assume batch index is in dim=1
-        padded_data = padded_data[:, sorted_idx]
-        padded_data = nn.utils.rnn.pack_padded_sequence(padded_data, lens)
-        hidden_vectors = recurrent_model(padded_data)[0]
-        hidden_vectors, _ = nn.utils.rnn.pad_packed_sequence(hidden_vectors)
-        # unsort the results at dim=1 and return
-        hidden_vectors = hidden_vectors[:, unsort_idx]
-        return hidden_vectors
-
-    def encode_query(self, query, query_lens):
-        query = query.permute(2, 0, 1)  # seq_len, batch, feature dim
-        lens, ids = torch.sort(query_lens, descending=True)
-        original_ids = [0] * lens.size(0)
-        for i in range(len(ids)):
-            original_ids[ids[i]] = i
-
-        query_encoded = self.run_padded_sequence(ids, original_ids, lens,
-                                                 query, self.query_lstm)
-        query_encoded = query_encoded.permute(1, 2, 0)
-        return query_encoded
-
-    def forward(self, queries, keys, query_lens, mask=None, key_lens=None,
-                keys_encoded=None, attn_prior=None):
-        """Attention mechanism for flowtron parallel
-        Unlike in Flowtron, we have no restrictions such as causality etc,
-        since we only need this during training.
-
-        Args:
-            queries (torch.tensor): B x C x T1 tensor
-                (probably going to be mel data)
-            keys (torch.tensor): B x C2 x T2 tensor (text data)
-            query_lens: lengths for sorting the queries in descending order
-            mask (torch.tensor): uint8 binary mask for variable length entries
-                (should be in the T2 domain)
-        Output:
-            attn (torch.tensor): B x 1 x T1 x T2 attention mask.
-                Final dim T2 should sum to 1
-        """
-        keys_enc = self.key_proj(keys)  # B x n_attn_dims x T2
-
-        # Beware can only do this since query_dim = attn_dim = n_mel_channels
-        if self.use_query_proj:
-            if self.align_query_enc_type == "inv_conv":
-                queries_enc, log_det_W = self.query_proj(queries)
-            elif self.align_query_enc_type == "3xconv":
-                queries_enc = self.query_proj(queries)
-                log_det_W = 0.0
-            else:
-                queries_enc, log_det_W = self.query_proj(queries)
-        else:
-            queries_enc, log_det_W = queries, 0.0
-
-        # different ways of computing attn,
-        # one is isotopic gaussians (per phoneme)
-        # Simplistic Gaussian Isotopic Attention
-
-        # B x n_attn_dims x T1 x T2
-        attn = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2
-        # compute log likelihood from a gaussian
-        attn = -0.0005 * attn.sum(1, keepdim=True)
-        if attn_prior is not None:
-            attn = self.log_softmax(attn) + torch.log(attn_prior[:, None]+1e-8)
-
-        attn_logprob = attn.clone()
-
-        if mask is not None:
-            attn.data.masked_fill_(mask.permute(0, 2, 1).unsqueeze(2),
-                                   -float("inf"))
-
-        attn = self.softmax(attn)  # Softmax along T2
-        return attn, attn_logprob
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py
deleted file mode 100644
index a653504fd..000000000
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class AttentionCTCLoss(torch.nn.Module):
-    def __init__(self, blank_logprob=-1):
-        super(AttentionCTCLoss, self).__init__()
-        self.log_softmax = torch.nn.LogSoftmax(dim=3)
-        self.blank_logprob = blank_logprob
-        self.CTCLoss = nn.CTCLoss(zero_infinity=True)
-
-    def forward(self, attn_logprob, in_lens, out_lens):
-        key_lens = in_lens
-        query_lens = out_lens
-        attn_logprob_padded = F.pad(input=attn_logprob,
-                                    pad=(1, 0, 0, 0, 0, 0, 0, 0),
-                                    value=self.blank_logprob)
-        cost_total = 0.0
-        for bid in range(attn_logprob.shape[0]):
-            target_seq = torch.arange(1, key_lens[bid]+1).unsqueeze(0)
-            curr_logprob = attn_logprob_padded[bid].permute(1, 0, 2)
-            curr_logprob = curr_logprob[:query_lens[bid], :, :key_lens[bid]+1]
-            curr_logprob = self.log_softmax(curr_logprob[None])[0]
-            ctc_cost = self.CTCLoss(
-                curr_logprob, target_seq, input_lengths=query_lens[bid:bid+1],
-                target_lengths=key_lens[bid:bid+1])
-            cost_total += ctc_cost
-        cost = cost_total/attn_logprob.shape[0]
-        return cost
-
-
-class AttentionBinarizationLoss(torch.nn.Module):
-    def __init__(self):
-        super(AttentionBinarizationLoss, self).__init__()
-
-    def forward(self, hard_attention, soft_attention, eps=1e-12):
-        log_sum = torch.log(torch.clamp(soft_attention[hard_attention == 1],
-                            min=eps)).sum()
-        return -log_sum / hard_attention.sum()
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 4e549f0d0..ab96cbc41 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -25,57 +25,54 @@
 #
 # *****************************************************************************
 
-import functools
-import json
 import re
+from functools import lru_cache
 from pathlib import Path
 
 import librosa
 import numpy as np
 import torch
 import torch.nn.functional as F
-from scipy import ndimage
-from scipy.stats import betabinom
 
 import common.layers as layers
+from common.text import cmudict
 from common.text.text_processing import TextProcessing
 from common.utils import load_wav_to_torch, load_filepaths_and_text, to_gpu
+from tgt.io import read_textgrid
+
+
+def check_durations(durs, mel_len, filepath):
+    assert sum(durs) == mel_len, \
+            f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens'
+
+
+def parse_textgrid(tier, sampling_rate, hop_length):
+    # From Dan Wells
+    # Latest MFA replaces silence phones with "" in output TextGrids
+    sil_phones = ['sil', 'sp', 'spn', '']
+    start_time = tier[0].start_time
+    end_time = tier[-1].end_time
+    phones = []
+    durations = []
+    for index, label in enumerate(tier._objects):
+        p_start, p_end, phone = label.start_time, label.end_time, label.text
+        # if p_start > end_time:
+        #     phones.append('')
+        end_time = p_end
+        if phone not in sil_phones:
+            phones.append(phone)
+        else:
+            if (index == 0) or (index == len(tier) - 1):
+                # leading or trailing silence
+                phones.append('sil')
+            else:
+                # short pause between words
+                phones.append('sp')
 
-
-class BetaBinomialInterpolator:
-    """Interpolates alignment prior matrices to save computation.
-
-    Calculating beta-binomial priors is costly. Instead cache popular sizes
-    and use img interpolation to get priors faster.
-    """
-    def __init__(self, round_mel_len_to=100, round_text_len_to=20):
-        self.round_mel_len_to = round_mel_len_to
-        self.round_text_len_to = round_text_len_to
-        self.bank = functools.lru_cache(beta_binomial_prior_distribution)
-
-    def round(self, val, to):
-        return max(1, int(np.round((val + 1) / to))) * to
-
-    def __call__(self, w, h):
-        bw = self.round(w, to=self.round_mel_len_to)
-        bh = self.round(h, to=self.round_text_len_to)
-        ret = ndimage.zoom(self.bank(bw, bh).T, zoom=(w / bw, h / bh), order=1)
-        assert ret.shape[0] == w, ret.shape
-        assert ret.shape[1] == h, ret.shape
-        return ret
-
-
-def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling=1.0):
-    P = phoneme_count
-    M = mel_count
-    x = np.arange(0, P)
-    mel_text_probs = []
-    for i in range(1, M+1):
-        a, b = scaling * i, scaling * (M + 1 - i)
-        rv = betabinom(P, a, b)
-        mel_i_prob = rv.pmf(x)
-        mel_text_probs.append(mel_i_prob)
-    return torch.tensor(np.array(mel_text_probs))
+        durations.append(int(np.ceil(p_end * sampling_rate / hop_length)
+                             - np.ceil(p_start * sampling_rate / hop_length)))
+    print('PHONES', phones[:15])
+    return phones, durations, start_time, end_time
 
 
 def estimate_pitch(wav, mel_len, method='pyin', normalize_mean=None,
@@ -128,38 +125,25 @@ class TTSDataset(torch.utils.data.Dataset):
         2) normalizes text and converts them to sequences of one-hot vectors
         3) computes mel-spectrograms from audio files.
     """
-    def __init__(self,
-                 dataset_path,
-                 audiopaths_and_text,
-                 text_cleaners,
-                 n_mel_channels,
-                 symbol_set='english_basic',
-                 p_arpabet=1.0,
-                 n_speakers=1,
-                 load_mel_from_disk=True,
-                 load_pitch_from_disk=True,
-                 pitch_mean=214.72203,  # LJSpeech defaults
-                 pitch_std=65.72038,
-                 max_wav_value=None,
-                 sampling_rate=None,
-                 filter_length=None,
-                 hop_length=None,
-                 win_length=None,
-                 mel_fmin=None,
-                 mel_fmax=None,
-                 prepend_space_to_text=False,
+    def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
+                 n_mel_channels, symbol_set='english_basic', p_arpabet=1.0,
+                 cmu_dict='cmudict/cmudict-0.7b',
+                 n_speakers=1, load_mel_from_disk=True,
+                 load_pitch_from_disk=True, pitch_mean=214.72203,
+                 pitch_std=65.72038, max_wav_value=None, sampling_rate=None,
+                 filter_length=None, hop_length=None, win_length=None,
+                 mel_fmin=None, mel_fmax=None, prepend_space_to_text=False,
                  append_space_to_text=False,
-                 pitch_online_dir=None,
-                 betabinomial_online_dir=None,
-                 use_betabinomial_interpolator=True,
-                 pitch_online_method='pyin',
-                 **ignored):
+                 dur_online_dir=None, textgrid_path=None,
+                 pitch_online_dir=None, pitch_online_method='pyin', **ignored):
 
         # Expect a list of filenames
         if type(audiopaths_and_text) is str:
             audiopaths_and_text = [audiopaths_and_text]
 
+        self.hop_length = hop_length
         self.dataset_path = dataset_path
+        self.textgrid_path = textgrid_path
         self.audiopaths_and_text = load_filepaths_and_text(
             audiopaths_and_text, dataset_path,
             has_speakers=(n_speakers > 1))
@@ -178,16 +162,14 @@ def __init__(self,
         assert p_arpabet == 0.0 or p_arpabet == 1.0, (
             'Only 0.0 and 1.0 p_arpabet is currently supported. '
             'Variable probability breaks caching of betabinomial matrices.')
+        if p_arpabet > 0.0:
+            cmudict.initialize(cmu_dict, keep_ambiguous=True)
 
-        self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet)
+        self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet, handle_arpabet='word', handle_arpabet_ambiguous='random')
         self.n_speakers = n_speakers
         self.pitch_tmp_dir = pitch_online_dir
+        self.dur_tmp_dir = dur_online_dir
         self.f0_method = pitch_online_method
-        self.betabinomial_tmp_dir = betabinomial_online_dir
-        self.use_betabinomial_interpolator = use_betabinomial_interpolator
-
-        if use_betabinomial_interpolator:
-            self.betabinomial_interpolator = BetaBinomialInterpolator()
 
         expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1))
 
@@ -217,22 +199,23 @@ def __getitem__(self, index):
         text = self.get_text(text)
         pitch = self.get_pitch(index, mel.size(-1))
         energy = torch.norm(mel.float(), dim=0, p=2)
-        attn_prior = self.get_prior(index, mel.shape[1], text.shape[0])
-
+        dur = self.get_dur(index)
+        print('get batch dur: ', len(dur))
         assert pitch.size(-1) == mel.size(-1)
 
         # No higher formants?
         if len(pitch.size()) == 1:
             pitch = pitch[None, :]
 
-        print('getting a batch')
         # this is a batch
-        return (text, mel, len(text), pitch, energy, speaker, attn_prior,
+        # FastPitch 1.0: (text, mel, len_text, dur, pitch, speaker)
+        return (text, mel, len(text), pitch, energy, speaker, dur,
                 audiopath)
 
     def __len__(self):
         return len(self.audiopaths_and_text)
 
+    @lru_cache()
     def get_mel(self, filename):
         if not self.load_mel_from_disk:
             audio, sampling_rate = load_wav_to_torch(filename)
@@ -263,30 +246,37 @@ def get_text(self, text):
         if self.append_space_to_text:
             text = text + space
 
         return torch.LongTensor(text)
 
-    def get_prior(self, index, mel_len, text_len):
-
-        if self.use_betabinomial_interpolator:
-            return torch.from_numpy(self.betabinomial_interpolator(mel_len,
-                                                                   text_len))
+    def get_dur(self, index):
+        """Return phone durations for `index`: cached if available, else from its TextGrid."""
+        audiopath, *fields = self.audiopaths_and_text[index]
+        name = Path(audiopath).stem
 
-        if self.betabinomial_tmp_dir is not None:
-            audiopath, *_ = self.audiopaths_and_text[index]
-            fname = Path(audiopath).relative_to(self.dataset_path) if self.dataset_path else Path(audiopath)
-            fname = fname.with_suffix('.pt')
-            cached_fpath = Path(self.betabinomial_tmp_dir, fname)
+        path = Path(self.dataset_path, 'durations') if self.dataset_path else Path(audiopath)
+        fname = Path(path, name).with_suffix('.pt')
 
+        if self.dur_tmp_dir is not None:
+            cached_fpath = Path(self.dur_tmp_dir, fname)
             if cached_fpath.is_file():
                 return torch.load(cached_fpath)
 
-        attn_prior = beta_binomial_prior_distribution(text_len, mel_len)
+        tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid')
+        try:
+            textgrid = read_textgrid(tgt_path, include_empty_intervals=True)
+        except FileNotFoundError:
+            print(f'{name}.wav TextGrid missing: {tgt_path}')
+            raise
+        phone_tier = textgrid.get_tier_by_name('phones')
+        _, durs, _, _ = parse_textgrid(phone_tier, self.sampling_rate, self.hop_length)
 
-        if self.betabinomial_tmp_dir is not None:
-            cached_fpath.parent.mkdir(parents=True, exist_ok=True)
-            torch.save(attn_prior, cached_fpath)
+        check_durations(durs, self.get_mel(audiopath).size(1), name)
+        if self.dur_tmp_dir is not None and not cached_fpath.is_file():
+            # torch.save returns None, so save and fall through to `return durs`
+            cached_fpath.parent.mkdir(parents=True, exist_ok=True)
+            torch.save(durs, cached_fpath)
 
-        return attn_prior
+        return durs
 
     def get_pitch(self, index, mel_len=None):
         audiopath, *fields = self.audiopaths_and_text[index]
@@ -330,9 +320,8 @@ def get_pitch(self, index, mel_len=None):
 class TTSCollate:
     """Zero-pads model inputs and targets based on number of frames per step"""
     # (text_padded, durs_padded, input_lengths, mel_padded, output_lengths,
-    # len_x, pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch
+    # len_x, pitch_padded, energy_padded, speaker, DUR, audiopaths) = batch
     def __call__(self, batch):
-        print('COLLATE GETS CALLED')
         """Collate training batch from normalized text and mel-spec"""
         # Right zero-pad all one-hot text sequences to max input length
         input_lengths, ids_sorted_decreasing = torch.sort(
@@ -346,20 +335,16 @@ def __call__(self, batch):
             text = batch[ids_sorted_decreasing[i]][0]
             text_padded[i, :text.size(0)] = text
 
-        dur_padded = torch.zeros_like(text_padded, dtype=batch[0][3].dtype)
-        print('dur padded orig', dur_padded.shape)
+        dur_padded = torch.zeros_like(text_padded, dtype=torch.int32)
+
         dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32)
-        print('start loop?')
         for i in range(len(ids_sorted_decreasing)):
-            dur = batch[ids_sorted_decreasing[i]][3]
-            # ERROR
-            # print(i, dur_padded[0].shape, dur[0].shape)
-            print(i)
-            dur_padded[i, :dur.shape[0]] = dur
-            print('new shape: ', dur_padded.shape)
-            dur_lens[i] = dur.shape[0]
+            dur = batch[ids_sorted_decreasing[i]][6]
+            # ERROR some mismatch between phones in transcript vs phones from text preprocessing
+            print('TEXT LEN', dur_padded.shape, 'DUR LEN', len(dur))
+            dur_padded[i, :len(dur)] = torch.Tensor(dur)
+            dur_lens[i] = len(dur)
             assert dur_lens[i] == input_lengths[i]
-        print('end loop?')
         # Right zero-pad mel-spec
         num_mels = batch[0][1].size(0)
         max_target_len = max([x[1].size(1) for x in batch])
@@ -391,13 +376,6 @@ def __call__(self, batch):
         else:
             speaker = None
 
-        attn_prior_padded = torch.zeros(len(batch), max_target_len,
-                                        max_input_len)
-        attn_prior_padded.zero_()
-        for i in range(len(ids_sorted_decreasing)):
-            prior = batch[ids_sorted_decreasing[i]][6]
-            attn_prior_padded[i, :prior.size(0), :prior.size(1)] = prior
-
         # Count number of items - characters in text
         len_x = [x[2] for x in batch]
         len_x = torch.Tensor(len_x)
@@ -405,28 +383,34 @@ def __call__(self, batch):
         audiopaths = [batch[i][7] for i in ids_sorted_decreasing]
 
         return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x,
-                pitch_padded, energy_padded, speaker, attn_prior_padded,
-                audiopaths)
+                pitch_padded, energy_padded, speaker, dur_lens, audiopaths)
 
 
 def batch_to_gpu(batch):
+    """Pass a collated batch through `to_gpu` and split it into inputs and targets.
+
+    `batch` is the 11-tuple returned by `TTSCollate.__call__`. Returns
+    `(x, y, len_x)`: `x` holds the model inputs, `y` the loss targets and
+    `len_x` the sum of `output_lengths`.
+    """
     (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x,
-     pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch
+     pitch_padded, energy_padded, speaker, dur_lens, audiopaths) = batch
 
     text_padded = to_gpu(text_padded).long()
     durs_padded = to_gpu(durs_padded).long()
+    dur_lens = to_gpu(dur_lens).long()
     input_lengths = to_gpu(input_lengths).long()
     mel_padded = to_gpu(mel_padded).float()
     output_lengths = to_gpu(output_lengths).long()
     pitch_padded = to_gpu(pitch_padded).float()
     energy_padded = to_gpu(energy_padded).float()
-    attn_prior = to_gpu(attn_prior).float()
     if speaker is not None:
         speaker = to_gpu(speaker).long()
 
     # Alignments act as both inputs and targets - pass shallow copies
+    # x must follow the order in which the model's forward() unpacks its inputs
     x = [text_padded, input_lengths, mel_padded, output_lengths,
-         pitch_padded, energy_padded, speaker, attn_prior, audiopaths]
-    y = [mel_padded, input_lengths, output_lengths]
+         durs_padded, pitch_padded, energy_padded, speaker, audiopaths]
+    y = [mel_padded, durs_padded, dur_lens, output_lengths]
     len_x = torch.sum(output_lengths)
     return (x, y, len_x)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
index 0cd3775e5..5b789a9a2 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
@@ -30,29 +30,24 @@
 from torch import nn
 
 from common.utils import mask_from_lens
-from fastpitch.attn_loss_function import AttentionCTCLoss
 
 
 class FastPitchLoss(nn.Module):
     def __init__(self, dur_predictor_loss_scale=1.0,
-                 pitch_predictor_loss_scale=1.0, attn_loss_scale=1.0,
+                 pitch_predictor_loss_scale=1.0,
                  energy_predictor_loss_scale=0.1):
         super(FastPitchLoss, self).__init__()
         self.dur_predictor_loss_scale = dur_predictor_loss_scale
         self.pitch_predictor_loss_scale = pitch_predictor_loss_scale
         self.energy_predictor_loss_scale = energy_predictor_loss_scale
-        self.attn_loss_scale = attn_loss_scale
-        self.attn_ctc_loss = AttentionCTCLoss()
 
     def forward(self, model_out, targets, is_training=True, meta_agg='mean'):
         (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt,
-         energy_pred, energy_tgt, attn_soft, attn_hard, attn_dur,
-         attn_logprob) = model_out
-
-        (mel_tgt, in_lens, out_lens) = targets
-
-        dur_tgt = attn_dur
-        dur_lens = in_lens
+         energy_pred, energy_tgt) = model_out
+        # targets is the `y` list built by batch_to_gpu:
+        # [mel_padded, durs_padded, dur_lens, output_lengths]. Its last entry
+        # holds mel lengths, so it must not overwrite pitch_tgt from model_out.
+        mel_tgt, dur_tgt, dur_lens, _out_lens = targets
 
         mel_tgt.requires_grad = False
         # (B,H,T) => (B,T,H)
@@ -83,21 +78,16 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'):
         else:
             energy_loss = 0
 
-        # Attention loss
-        attn_loss = self.attn_ctc_loss(attn_logprob, in_lens, out_lens)
-
         loss = (mel_loss
                 + dur_pred_loss * self.dur_predictor_loss_scale
                 + pitch_loss * self.pitch_predictor_loss_scale
-                + energy_loss * self.energy_predictor_loss_scale
-                + attn_loss * self.attn_loss_scale)
+                + energy_loss * self.energy_predictor_loss_scale)
 
         meta = {
             'loss': loss.clone().detach(),
             'mel_loss': mel_loss.clone().detach(),
             'duration_predictor_loss': dur_pred_loss.clone().detach(),
             'pitch_loss': pitch_loss.clone().detach(),
-            'attn_loss': attn_loss.clone().detach(),
             'dur_error': (torch.abs(dur_pred - dur_tgt).sum()
                           / dur_mask.sum()).detach(),
         }
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
index 34fca4dff..ac0b188fe 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
@@ -34,7 +34,6 @@
 from common.layers import ConvReLUNorm
 from common.utils import mask_from_lens
 from fastpitch.alignment import b_mas, mas_width1
-from fastpitch.attention import ConvAttention
 from fastpitch.transformer import FFTransformer
 
 
@@ -204,45 +203,11 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx,
 
         self.proj = nn.Linear(out_fft_output_size, n_mel_channels, bias=True)
 
-        self.attention = ConvAttention(
-            n_mel_channels, 0, symbols_embedding_dim,
-            use_query_proj=True, align_query_enc_type='3xconv')
-
-    def binarize_attention(self, attn, in_lens, out_lens):
-        """For training purposes only. Binarizes attention with MAS.
-           These will no longer recieve a gradient.
-
-        Args:
-            attn: B x 1 x max_mel_len x max_text_len
-        """
-        b_size = attn.shape[0]
-        with torch.no_grad():
-            attn_cpu = attn.data.cpu().numpy()
-            attn_out = torch.zeros_like(attn)
-            for ind in range(b_size):
-                hard_attn = mas_width1(
-                    attn_cpu[ind, 0, :out_lens[ind], :in_lens[ind]])
-                attn_out[ind, 0, :out_lens[ind], :in_lens[ind]] = torch.tensor(
-                    hard_attn, device=attn.get_device())
-        return attn_out
-
-    def binarize_attention_parallel(self, attn, in_lens, out_lens):
-        """For training purposes only. Binarizes attention with MAS.
-           These will no longer recieve a gradient.
-
-        Args:
-            attn: B x 1 x max_mel_len x max_text_len
-        """
-        with torch.no_grad():
-            attn_cpu = attn.data.cpu().numpy()
-            attn_out = b_mas(attn_cpu, in_lens.cpu().numpy(),
-                             out_lens.cpu().numpy(), width=1)
-        return torch.from_numpy(attn_out).to(attn.get_device())
-
-    def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
-
-        (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense,
-         speaker, attn_prior, audiopaths) = inputs
+    def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, max_duration=75):
+        # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs
+        # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs
+        (inputs, input_lens, mel_tgt, mel_lens, dur_tgt, pitch_dense, energy_dense,
+         speaker, audiopaths) = inputs
 
         mel_max_len = mel_tgt.size(2)
 
@@ -256,26 +221,6 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
         # Input FFT
         enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb)
 
-        # Alignment
-        text_emb = self.encoder.word_emb(inputs)
-
-        # make sure to do the alignments before folding
-        attn_mask = mask_from_lens(input_lens)[..., None] == 0
-        # attn_mask should be 1 for unused timesteps in the text_enc_w_spkvec tensor
-
-        attn_soft, attn_logprob = self.attention(
-            mel_tgt, text_emb.permute(0, 2, 1), mel_lens, attn_mask,
-            key_lens=input_lens, keys_encoded=enc_out, attn_prior=attn_prior)
-
-        attn_hard = self.binarize_attention_parallel(
-            attn_soft, input_lens, mel_lens)
-
-        # Viterbi --> durations
-        attn_hard_dur = attn_hard.sum(2)[:, 0, :]
-        dur_tgt = attn_hard_dur
-
-        assert torch.all(torch.eq(dur_tgt.sum(dim=1), mel_lens))
-
         # Predict durations
         log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1)
         dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration)
@@ -308,14 +253,14 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
             energy_tgt = None
 
         len_regulated, dec_lens = regulate_len(
-            dur_tgt, enc_out, pace, mel_max_len)
+            dur_tgt if use_gt_durations else dur_pred,
+            enc_out, pace, mel_max_len)
 
         # Output FFT
         dec_out, dec_mask = self.decoder(len_regulated, dec_lens)
         mel_out = self.proj(dec_out)
         return (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred,
-                pitch_tgt, energy_pred, energy_tgt, attn_soft, attn_hard,
-                attn_hard_dur, attn_logprob)
+                pitch_tgt, energy_pred, energy_tgt)
 
     def infer(self, inputs, pace=1.0, dur_tgt=None, pitch_tgt=None,
               energy_tgt=None, pitch_transform=None, max_duration=75,
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index 645f6b97d..f523c9f6d 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -27,16 +27,13 @@
 
 import argparse
 import os
-import sys
 import time
 from pathlib import Path
 
-import numpy as np
 import torch
 import tqdm
 import dllogger as DLLogger
 from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
-from tgt.io import read_textgrid
 from torch.utils.data import DataLoader
 
 from fastpitch.data_function import TTSCollate, TTSDataset
@@ -58,8 +55,8 @@ def parse_args(parser):
                         help='Extract pitch')
     parser.add_argument('--extract-durations', action='store_true',
                         help='Extract durations (from alignment dir)')
-    parser.add_argument('--save-alignment-priors', action='store_true',
-                        help='Pre-calculate diagonal matrices of alignment of text to audio')
+    parser.add_argument('--durs-online-dir', type=str,
+                        help='Durations tmp dir')
     parser.add_argument('--log-file', type=str, default='preproc_log.json',
                          help='Filename for logging')
     parser.add_argument('--n-speakers', type=int, default=1)
@@ -88,36 +85,6 @@ def parse_args(parser):
     return parser
 
 
-def parse_textgrid(tier, sampling_rate, hop_length):
-    # From Dan Wells
-    # Latest MFA replaces silence phones with "" in output TextGrids
-    sil_phones = ['sil', 'sp', 'spn', '']
-    start_time = tier[0].start_time
-    end_time = tier[-1].end_time
-    phones = []
-    durations = []
-    for index, label in enumerate(tier._objects):
-        p_start, p_end, phone = label.start_time, label.end_time, label.text
-        if phone not in sil_phones:
-            phones.append(phone)
-        else:
-            if (index == 0) or (index == len(tier) - 1):
-                # leading or trailing silence
-                phones.append('sil')
-            else:
-                # short pause between words
-                phones.append('sp')
-
-        durations.append(int(np.ceil(p_end * sampling_rate / hop_length)
-                             - np.ceil(p_start * sampling_rate / hop_length)))
-    return phones, durations, start_time, end_time
-
-
-def check_durations(durs, mel_len, filepath):
-    assert sum(durs) == mel_len, \
-            f'Length mismatch: {filepath}, {sum(durs)} durs != {mel_len} lens'
-
-
 def main():
     parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing')
     parser = parse_args(parser)
@@ -139,37 +106,32 @@ def main():
 
     if args.extract_durations:
         if not args.textgrid_path:
-            args.textgridPath = os.path.join(args.dataset_path, 'TextGrid')
-        Path(args.dataset_path, 'durations').mkdir(parents=False, exist_ok=True)
-
-    if args.save_alignment_priors:
-        Path(args.dataset_path, 'alignment_priors').mkdir(parents=False, exist_ok=True)
+            args.textgrid_path = os.path.join(args.dataset_path, 'TextGrid')
+        durs_path = Path(args.dataset_path, 'durations')
+        durs_path.mkdir(parents=False, exist_ok=True)
+        if args.durs_online_dir:
+            Path(args.durs_online_dir, durs_path).mkdir(parents=True, exist_ok=True)
 
     for filelist in args.wav_text_filelists:
 
         print(f'Processing {filelist}...')
 
-        dataset = TTSDataset(
-            args.dataset_path,
-            filelist,
-            text_cleaners=['english_cleaners_v2'],
-            n_mel_channels=args.n_mel_channels,
-            p_arpabet=0.0,
-            n_speakers=args.n_speakers,
-            load_mel_from_disk=False,
-            load_pitch_from_disk=False,
-            pitch_mean=None,
-            pitch_std=None,
-            max_wav_value=args.max_wav_value,
-            sampling_rate=args.sampling_rate,
-            filter_length=args.filter_length,
-            hop_length=args.hop_length,
-            win_length=args.win_length,
-            mel_fmin=args.mel_fmin,
-            mel_fmax=args.mel_fmax,
-            betabinomial_online_dir=None,
-            pitch_online_dir=None,
-            pitch_online_method=args.f0_method)
+        dataset = TTSDataset(args.dataset_path, filelist,
+                             text_cleaners=['english_cleaners_v2'],
+                             n_mel_channels=args.n_mel_channels, p_arpabet=1.0,
+                             n_speakers=args.n_speakers,
+                             load_mel_from_disk=False,
+                             load_pitch_from_disk=False, pitch_mean=None,
+                             pitch_std=None, max_wav_value=args.max_wav_value,
+                             sampling_rate=args.sampling_rate,
+                             filter_length=args.filter_length,
+                             hop_length=args.hop_length,
+                             win_length=args.win_length, mel_fmin=args.mel_fmin,
+                             mel_fmax=args.mel_fmax,
+                             pitch_online_dir=None,
+                             dur_online_dir=None,
+                             textgrid_path=args.textgrid_path,
+                             pitch_online_method=args.f0_method)
 
         data_loader = DataLoader(
             dataset,
@@ -213,32 +175,12 @@ def main():
 
             if args.extract_durations:
                 # From Dan Wells
-                for j, _ in range(len(mel_lens)):
+                for j, _ in range(len(durs)):
                     filename = fpaths[j]
-                    tgt_path = Path(args.textgrid_path, f'{filename}.TextGrid')
-                    try:
-                        textgrid = read_textgrid(tgt_path,
-                                                 include_empty_intervals=True)
-                    except FileNotFoundError:
-                        print(f'{filename}.wav TextGrid missing: {tgt_path}')
-                        raise
-                    _, durs, _, _ = parse_textgrid(
-                        textgrid.get_tier_by_name('phones'),
-                        args.sampling_rate,
-                        args.hop_length)
-
-                    check_durations(durs, mel_lens[j], filename)
-
                     dur_path = Path(args.dataset_path,
                                     'durations', f'{filename}.pt')
                     torch.save(torch.LongTensor(durs).cpu().int(), dur_path)
 
-            if args.save_alignment_priors:
-                for j, prior in enumerate(attn_prior):
-                    fname = Path(fpaths[j]).with_suffix('.pt').name
-                    fpath = Path(args.dataset_path, 'alignment_priors', fname)
-                    torch.save(prior[:mel_lens[j], :input_lens[j]], fpath)
-
 
 if __name__ == '__main__':
     main()
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index e9f9b0c9f..a1baf1c3c 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -30,11 +30,12 @@ fi
 
 python prepare_dataset.py \
     --wav-text-filelists ${FILELIST} \
-    --n-workers 8 \
+    --n-workers 2 \
     --batch-size 1 \
     --dataset-path $DATA_DIR \
     --textgrid-path $ALIGNMENT_DIR \
     --extract-pitch \
     --extract-durations\
+    --durs-online-dir "/tmp/" \
     --f0-method pyin \
     $ARGS
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 90cfb4443..fb59e9fba 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -50,7 +50,6 @@
 import models
 from common.text import cmudict
 from common.utils import BenchmarkStats, prepare_tmp
-from fastpitch.attn_loss_function import AttentionBinarizationLoss
 from fastpitch.data_function import batch_to_gpu, TTSCollate, TTSDataset
 from fastpitch.loss_function import FastPitchLoss
 from fastpitch.model import regulate_len
@@ -129,7 +128,8 @@ def parse_args(parser):
                       help='Type of text cleaners for input text')
     data.add_argument('--symbol-set', type=str, default='english_basic',
                       help='Define symbol set for input text')
-    data.add_argument('--p-arpabet', type=float, default=0.0,
+    # should be 1.0 to work with MFA textgrids, which contain only phones
+    data.add_argument('--p-arpabet', type=float, default=1.0,
                       help='Probability of using arpabets instead of graphemes '
                            'for each word; set 0 for pure grapheme training')
     data.add_argument('--heteronyms-path', type=str, default='cmudict/heteronyms',
@@ -514,7 +514,6 @@ def main():
 
     model_config = models.get_model_config('FastPitch', args)
     model = models.get_model('FastPitch', model_config, device)
-    attention_kl_loss = AttentionBinarizationLoss()
 
     if args.local_rank == 0:
         wandb.init(project=args.project,
@@ -574,8 +573,7 @@ def main():
 
     criterion = FastPitchLoss(
         dur_predictor_loss_scale=args.dur_predictor_loss_scale,
-        pitch_predictor_loss_scale=args.pitch_predictor_loss_scale,
-        attn_loss_scale=args.attn_loss_scale)
+        pitch_predictor_loss_scale=args.pitch_predictor_loss_scale)
 
     collate_fn = TTSCollate()
 
@@ -640,26 +638,10 @@ def main():
             x, y, num_frames = batch_to_gpu(batch)
 
             with torch.cuda.amp.autocast(enabled=args.amp):
-                y_pred = model(x)
+                # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt)
+                y_pred = model(x, use_gt_durations=True)
+                # y = mel_padded, input_lengths, output_lengths
                 loss, meta = criterion(y_pred, y)
-
-                if (args.kl_loss_start_epoch is not None
-                        and epoch >= args.kl_loss_start_epoch):
-
-                    if args.kl_loss_start_epoch == epoch and epoch_iter == 1:
-                        print('Begin hard_attn loss')
-
-                    _, _, _, _, _, _, _, _, attn_soft, attn_hard, _, _ = y_pred
-                    binarization_loss = attention_kl_loss(attn_hard, attn_soft)
-                    kl_weight = min((epoch - args.kl_loss_start_epoch) / args.kl_loss_warmup_epochs, 1.0) * args.kl_loss_weight
-                    meta['kl_loss'] = binarization_loss.clone().detach() * kl_weight
-                    loss += kl_weight * binarization_loss
-
-                else:
-                    meta['kl_loss'] = torch.zeros_like(loss)
-                    kl_weight = 0
-                    binarization_loss = 0
-
                 loss /= args.grad_accumulation
 
             meta = {k: v / args.grad_accumulation
@@ -702,12 +684,12 @@ def main():
                     apply_multi_tensor_ema(args.ema_decay, *mt_ema_params)
 
                 iter_mel_loss = iter_meta['mel_loss'].item()
-                iter_kl_loss = iter_meta['kl_loss'].item()
                 iter_time = time.perf_counter() - iter_start_time
                 epoch_frames_per_sec += iter_num_frames / iter_time
                 epoch_loss += iter_loss
                 epoch_num_frames += iter_num_frames
                 epoch_mel_loss += iter_mel_loss
+
                 if epoch_iter % 5 == 0:
                     log({
                         'epoch': epoch,
@@ -716,8 +698,6 @@ def main():
                         'total_steps': total_iter,
                         'loss/loss': iter_loss,
                         'mel-loss/mel_loss': iter_mel_loss,
-                        'kl_loss': iter_kl_loss,
-                        'kl_weight': kl_weight,
                         'frames per s': iter_num_frames / iter_time,
                         'took': iter_time,
                         'lrate': optimizer.param_groups[0]['lr'],

From 83958506424d0ab983b43d3e43a148dab0b5aaec Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Wed, 9 Mar 2022 19:45:50 +0000
Subject: [PATCH 06/21] Replace preprocessed text with phone sequence from MFA
 TextGrid

---
 .../FastPitch/common/text/symbols.py          |  8 ++++---
 .../FastPitch/common/text/text_processing.py  | 18 +++------------
 .../FastPitch/fastpitch/data_function.py      | 23 +++++++++----------
 .../FastPitch/prepare_dataset.py              | 15 +++++-------
 .../FastPitch/scripts/prepare_dataset.sh      |  4 ++--
 5 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py
index cfdb5755a..7262b1284 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/symbols.py
@@ -9,6 +9,8 @@
 
 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
 _arpabet = ['@' + s for s in valid_symbols]
+# In phones extracted from MFA TextGrid
+_silences = ['@sp', '@sil']
 
 
 def get_symbols(symbol_set='english_basic'):
@@ -17,20 +19,20 @@ def get_symbols(symbol_set='english_basic'):
         _punctuation = '!\'(),.:;? '
         _special = '-'
         _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
-        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
+        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences
     elif symbol_set == 'english_basic_lowercase':
         _pad = '_'
         _punctuation = '!\'"(),.:;? '
         _special = '-'
         _letters = 'abcdefghijklmnopqrstuvwxyz'
-        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
+        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet + _silences
     elif symbol_set == 'english_expanded':
         _punctuation = '!\'",.:;? '
         _math = '#%&*+-/[]()'
         _special = '_@©°½—₩€$'
         _accented = 'áçéêëñöøćž'
         _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
-        symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet
+        symbols = list(_punctuation + _math + _special + _accented + _letters) + _arpabet + _silences
     else:
         raise Exception("{} symbol set does not exist".format(symbol_set))
 
diff --git a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
index e13b3ed1c..8a7e3e638 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/common/text/text_processing.py
@@ -121,9 +121,7 @@ def get_arpabet(self, word):
         else:
             arpabet = arpabet[0]
 
-        # arpabet = "{" + arpabet + arpabet_suffix + "}"
-
-        return arpabet
+        return arpabet + arpabet_suffix
 
     def encode_text(self, text, return_all=False):
         if self.expand_currency:
@@ -150,23 +148,13 @@ def encode_text(self, text, return_all=False):
                 text_arpabet = [[word[1]] if word[0] == ''
                                 else self.get_arpabet(word[0]).split(' ')
                                 for word in words]
-                text_arpabet = [phone for phone_list in text_arpabet for phone in phone_list if phone != ' ']
-                # text_arpabet = [
-                #     word[1] if word[0] == '' else (
-                #         self.get_arpabet(word[0])
-                #         if np.random.uniform() < self.p_arpabet
-                #         else word[0])
-                #     for word in words]
-                print('ARPABET: ', text_arpabet[:10])
-                #text_arpabet = ''.join(text_arpabet)
+                text_arpabet = [phone for phone_list in text_arpabet
+                                for phone in phone_list if phone != ' ']
                 text = text_arpabet
             elif self.handle_arpabet != '':
                 raise Exception("{} handle_arpabet is not supported".format(
                     self.handle_arpabet))
-        # text_encoded = self.arpabet_to_sequence(text)
-        # text_encoded = self.text_to_sequence(text)
         text_encoded = self.arpabet_list_to_sequence(text)
-        print(len(text_encoded))
         if return_all:
             return text_encoded, text_clean, text_arpabet
 
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index ab96cbc41..dcd4baa6a 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -71,7 +71,7 @@ def parse_textgrid(tier, sampling_rate, hop_length):
 
         durations.append(int(np.ceil(p_end * sampling_rate / hop_length)
                              - np.ceil(p_start * sampling_rate / hop_length)))
-    print('PHONES', phones[:15])
+
     return phones, durations, start_time, end_time
 
 
@@ -196,11 +196,10 @@ def __getitem__(self, index):
             speaker = None
 
         mel = self.get_mel(audiopath)
-        text = self.get_text(text)
         pitch = self.get_pitch(index, mel.size(-1))
         energy = torch.norm(mel.float(), dim=0, p=2)
-        dur = self.get_dur(index)
-        print('get batch dur: ', len(dur))
+        dur, phones = self.get_dur(index)
+        text = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones))
         assert pitch.size(-1) == mel.size(-1)
 
         # No higher formants?
@@ -237,7 +236,7 @@ def get_mel(self, filename):
         return melspec
 
     def get_text(self, text):
-        text = self.tp.encode_text(text)
+        text, text_clean, text_arpabet = self.tp.encode_text(text, return_all=True)
         space = [self.tp.encode_text("A A")[1]]
 
         if self.prepend_space_to_text:
@@ -246,8 +245,7 @@ def get_text(self, text):
         if self.append_space_to_text:
             text = text + space
 
-        print('TEXT: ', len(text))
-        return torch.LongTensor(text)
+        return torch.LongTensor(text), text_arpabet
 
     def get_dur(self, index):
         audiopath, *fields = self.audiopaths_and_text[index]
@@ -267,7 +265,7 @@ def get_dur(self, index):
         except FileNotFoundError:
             print(f'{name}.wav TextGrid missing: {tgt_path}')
             raise
-        _, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'),
+        phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'),
                                        self.sampling_rate,
                                        self.hop_length)
 
@@ -276,7 +274,7 @@ def get_dur(self, index):
         if self.dur_tmp_dir is not None and not cached_fpath.is_file():
             return torch.save(durs, cached_fpath)
 
-        return durs
+        return durs, phones
 
     def get_pitch(self, index, mel_len=None):
         audiopath, *fields = self.audiopaths_and_text[index]
@@ -326,7 +324,7 @@ def __call__(self, batch):
         # Right zero-pad all one-hot text sequences to max input length
         input_lengths, ids_sorted_decreasing = torch.sort(
             torch.LongTensor([len(x[0]) for x in batch]),
-            dim=0, descending=True)
+            dim=0, descending=False)
         max_input_len = input_lengths[0]
 
         text_padded = torch.LongTensor(len(batch), max_input_len)
@@ -340,8 +338,9 @@ def __call__(self, batch):
         dur_lens = torch.zeros(dur_padded.size(0), dtype=torch.int32)
         for i in range(len(ids_sorted_decreasing)):
             dur = batch[ids_sorted_decreasing[i]][6]
-            # ERROR some mismatch between phones in transcript vs phones form text preprocessing
-            print('TEXT LEN', dur_padded.shape, 'DUR LEN', len(dur))
+            # With MFA durations:
+            # some mismatch between phones in transcript vs phones from text preprocessing
+            # for now using phones from texgrid as input
             dur_padded[i, :len(dur)] = torch.Tensor(dur)
             dur_lens[i] = len(dur)
             assert dur_lens[i] == input_lengths[i]
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index f523c9f6d..5b0a4bef7 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -144,23 +144,20 @@ def main():
             drop_last=False)
 
         all_filenames = set()
-        print('TIME TO LOOP')
         for i, batch in enumerate(tqdm.tqdm(data_loader)):
             tik = time.time()
 
             # From TTSCollate __call__
             # (text_padded, dur_padded, input_lengths, mel_padded,
-            # output_lengths, len_x, pitch_padded, energy_padded, speaker,
-            # attn_prior_padded, audiopaths)
-            _, durs, input_lens, mels, mel_lens, _, pitch, _, _, attn_prior, fpaths = batch
-            print(f'BATCH: {fpaths}')
+            # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths)
+            _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths = batch
             # Ensure filenames are unique
             for p in fpaths:
                 fname = Path(p).name
                 if fname in all_filenames:
                     raise ValueError(f'Filename is not unique: {fname}')
                 all_filenames.add(fname)
-            print('filename check complete')
+
             if args.extract_mels:
                 for j, mel in enumerate(mels):
                     fname = Path(fpaths[j]).with_suffix('.pt').name
@@ -175,11 +172,11 @@ def main():
 
             if args.extract_durations:
                 # From Dan Wells
-                for j, _ in range(len(durs)):
-                    filename = fpaths[j]
+                for j, d in enumerate(durs):
+                    filename = Path(fpaths[j]).stem
                     dur_path = Path(args.dataset_path,
                                     'durations', f'{filename}.pt')
-                    torch.save(torch.LongTensor(durs).cpu().int(), dur_path)
+                    torch.save(d, dur_path)
 
 
 if __name__ == '__main__':
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index a1baf1c3c..ae544f25d 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -30,8 +30,8 @@ fi
 
 python prepare_dataset.py \
     --wav-text-filelists ${FILELIST} \
-    --n-workers 2 \
-    --batch-size 1 \
+    --n-workers 4 \
+    --batch-size 1 \  # don't change this
     --dataset-path $DATA_DIR \
     --textgrid-path $ALIGNMENT_DIR \
     --extract-pitch \

From ab11e60e8946cac5105b724cbeb245609797fbee Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Wed, 9 Mar 2022 19:50:54 +0000
Subject: [PATCH 07/21] Fix up descending keyword

---
 .../SpeechSynthesis/FastPitch/fastpitch/data_function.py    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index dcd4baa6a..74f41342a 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -266,8 +266,8 @@ def get_dur(self, index):
             print(f'{name}.wav TextGrid missing: {tgt_path}')
             raise
         phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'),
-                                       self.sampling_rate,
-                                       self.hop_length)
+                                            self.sampling_rate,
+                                            self.hop_length)
 
         check_durations(durs, self.get_mel(audiopath).size(1), name)
 
@@ -324,7 +324,7 @@ def __call__(self, batch):
         # Right zero-pad all one-hot text sequences to max input length
         input_lengths, ids_sorted_decreasing = torch.sort(
             torch.LongTensor([len(x[0]) for x in batch]),
-            dim=0, descending=False)
+            dim=0, descending=True)
         max_input_len = input_lengths[0]
 
         text_padded = torch.LongTensor(len(batch), max_input_len)

From 7f66533224ac757f6dc641a3a7d5b9651aa85d82 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Thu, 10 Mar 2022 13:56:39 +0000
Subject: [PATCH 08/21] Save transcriptions from TextGrids to be used as inputs
 (often kept separate for no good reason so far)

---
 .../FastPitch/add_durations_lj_filelist.py    | 26 +++++++++++
 .../FastPitch/fastpitch/data_function.py      | 46 ++++++++++++-------
 .../FastPitch/fastpitch/model.py              |  4 +-
 .../FastPitch/prepare_dataset.py              | 14 +++++-
 .../FastPitch/scripts/prepare_dataset.sh      |  3 +-
 .../FastPitch/scripts/train.sh                | 24 +++++-----
 PyTorch/SpeechSynthesis/FastPitch/train.py    |  9 ++--
 7 files changed, 86 insertions(+), 40 deletions(-)
 create mode 100644 PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py

diff --git a/PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py b/PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py
new file mode 100644
index 000000000..56bd9e378
--- /dev/null
+++ b/PyTorch/SpeechSynthesis/FastPitch/add_durations_lj_filelist.py
@@ -0,0 +1,26 @@
+import os
+from pathlib import Path
+
+
+def add_duration_column(filename, output_filename):
+    all_info = []
+    with open(filename) as f:
+        for line in f:
+            file_path, pitch_path, transcript = line.strip().split('|', maxsplit=2)
+            name_stem = Path(os.path.basename(file_path)).stem
+            # stop hard-coding which columns already exist (no mels or speakers)
+            all_info.append('|'.join([file_path,
+                                      pitch_path,
+                                      f'durations/{name_stem}.pt',
+                                      transcript]))
+
+    with open(output_filename, 'w') as f:
+        f.writelines('\n'.join(all_info))
+
+
+if __name__ == '__main__':
+    filelists = {'filelists/ljs_audio_pitch_text_test.txt': 'filelists/ljs_audio_pitch_durs_text_test.txt',
+                 'filelists/ljs_audio_pitch_text_train_v3.txt': 'filelists/ljs_audio_pitch_durs_text_train_v3.txt',
+                 'filelists/ljs_audio_pitch_text_val.txt': 'filelists/ljs_audio_pitch_durs_text_val.txt'}
+    for file_name, output_name in filelists.items():
+        add_duration_column(file_name, output_name)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 74f41342a..638e116fb 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -133,7 +133,7 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
                  pitch_std=65.72038, max_wav_value=None, sampling_rate=None,
                  filter_length=None, hop_length=None, win_length=None,
                  mel_fmin=None, mel_fmax=None, prepend_space_to_text=False,
-                 append_space_to_text=False,
+                 append_space_to_text=False, load_durs_from_disk=False,
                  dur_online_dir=None, textgrid_path=None,
                  pitch_online_dir=None, pitch_online_method='pyin', **ignored):
 
@@ -155,6 +155,7 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
                 filter_length, hop_length, win_length,
                 n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
         self.load_pitch_from_disk = load_pitch_from_disk
+        self.load_durs_from_disk = load_durs_from_disk
 
         self.prepend_space_to_text = prepend_space_to_text
         self.append_space_to_text = append_space_to_text
@@ -171,8 +172,8 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
         self.dur_tmp_dir = dur_online_dir
         self.f0_method = pitch_online_method
 
-        expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1))
-
+        expected_columns = (2 + int(load_durs_from_disk) + int(load_pitch_from_disk) + (n_speakers > 1))
+        print(load_durs_from_disk, load_pitch_from_disk, expected_columns)
         assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None)
 
         if len(self.audiopaths_and_text[0]) < expected_columns:
@@ -199,7 +200,7 @@ def __getitem__(self, index):
         pitch = self.get_pitch(index, mel.size(-1))
         energy = torch.norm(mel.float(), dim=0, p=2)
         dur, phones = self.get_dur(index)
-        text = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones))
+        text = phones
         assert pitch.size(-1) == mel.size(-1)
 
         # No higher formants?
@@ -209,7 +210,7 @@ def __getitem__(self, index):
         # this is a batch
         # FastPitch 1.0: (text, mel, len_text, dur, pitch, speaker)
         return (text, mel, len(text), pitch, energy, speaker, dur,
-                audiopath)
+                audiopath, phones)
 
     def __len__(self):
         return len(self.audiopaths_and_text)
@@ -251,13 +252,23 @@ def get_dur(self, index):
         audiopath, *fields = self.audiopaths_and_text[index]
         name = Path(audiopath).stem
 
+        # TODO: check what happens here with absolute vs relative paths
         path = Path(self.dataset_path, 'durations') if self.dataset_path else Path(audiopath)
         fname = Path(path, name).with_suffix('.pt')
 
         if self.dur_tmp_dir is not None:
-            cached_fpath = Path(self.dur_tmp_dir, fname)
-            if cached_fpath.is_file():
-                return torch.load(cached_fpath)
+            cached_durpath = Path(self.dur_tmp_dir, fname)
+            cached_phonepath = Path(self.dur_tmp_dir, name + '_phones').with_suffix('.pt')
+            if cached_durpath.is_file():
+                # assume if one exists the other does too
+                return torch.load(cached_durpath), torch.load(cached_phonepath)
+
+        if self.load_durs_from_disk:
+            duration_path = fields[1]  # assume durations come after pitch
+            # assume phone_path is known from duration_path
+            phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt')
+            print(duration_path, phone_path)
+            return torch.load(duration_path), torch.load(phone_path)
 
         tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid')
         try:
@@ -268,11 +279,11 @@ def get_dur(self, index):
         phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'),
                                             self.sampling_rate,
                                             self.hop_length)
-
+        phones = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones))
         check_durations(durs, self.get_mel(audiopath).size(1), name)
 
-        if self.dur_tmp_dir is not None and not cached_fpath.is_file():
-            return torch.save(durs, cached_fpath)
+        if self.dur_tmp_dir is not None and not cached_durpath.is_file() and not cached_phonepath.is_file():
+            return torch.save(durs, cached_durpath), torch.save(phones, cached_phonepath)
 
         return durs, phones
 
@@ -317,8 +328,7 @@ def get_pitch(self, index, mel_len=None):
 
 class TTSCollate:
     """Zero-pads model inputs and targets based on number of frames per step"""
-    # (text_padded, durs_padded, input_lengths, mel_padded, output_lengths,
-    # len_x, pitch_padded, energy_padded, speaker, DUR, audiopaths) = batch
+    # (text, mel, len(text), pitch, energy, speaker, dur, audiopath, phones) = batch
     def __call__(self, batch):
         """Collate training batch from normalized text and mel-spec"""
         # Right zero-pad all one-hot text sequences to max input length
@@ -361,12 +371,15 @@ def __call__(self, batch):
         pitch_padded = torch.zeros(mel_padded.size(0), n_formants,
                                    mel_padded.size(2), dtype=batch[0][3].dtype)
         energy_padded = torch.zeros_like(pitch_padded[:, 0, :])
+        phones_padded = torch.zeros_like(pitch_padded[:, 0, :])
 
         for i in range(len(ids_sorted_decreasing)):
             pitch = batch[ids_sorted_decreasing[i]][3]
             energy = batch[ids_sorted_decreasing[i]][4]
+            phones = batch[ids_sorted_decreasing[i]][8]
             pitch_padded[i, :, :pitch.shape[1]] = pitch
             energy_padded[i, :energy.shape[0]] = energy
+            phones_padded[i, :phones.shape[0]] = phones
 
         if batch[0][5] is not None:
             speaker = torch.zeros_like(input_lengths)
@@ -382,12 +395,12 @@ def __call__(self, batch):
         audiopaths = [batch[i][7] for i in ids_sorted_decreasing]
 
         return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x,
-                pitch_padded, energy_padded, speaker, audiopaths)
+                pitch_padded, energy_padded, speaker, audiopaths, phones_padded)
 
 
 def batch_to_gpu(batch):
     (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x,
-     pitch_padded, energy_padded, speaker, dur_lens, audiopaths) = batch
+     pitch_padded, energy_padded, speaker, dur_lens, audiopaths, phones_padded) = batch
 
     text_padded = to_gpu(text_padded).long()
     durs_padded = to_gpu(durs_padded).long()
@@ -397,12 +410,13 @@ def batch_to_gpu(batch):
     output_lengths = to_gpu(output_lengths).long()
     pitch_padded = to_gpu(pitch_padded).float()
     energy_padded = to_gpu(energy_padded).float()
+    phones_padded = to_gpu(phones_padded).long()
     if speaker is not None:
         speaker = to_gpu(speaker).long()
 
     # Alignments act as both inputs and targets - pass shallow copies
     x = [text_padded, input_lengths, mel_padded, output_lengths,
-         pitch_padded, energy_padded, speaker, durs_padded, audiopaths]
+         pitch_padded, energy_padded, speaker, durs_padded, audiopaths, phones_padded]
     y = [mel_padded, durs_padded, dur_lens, output_lengths]
     len_x = torch.sum(output_lengths)
     return (x, y, len_x)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
index ac0b188fe..ee2981a05 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
@@ -207,7 +207,7 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma
         # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs
         # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs
         (inputs, input_lens, mel_tgt, mel_lens, dur_tgt, pitch_dense, energy_dense,
-         speaker, audiopaths) = inputs
+         speaker, audiopaths, phones_padded) = inputs
 
         mel_max_len = mel_tgt.size(2)
 
@@ -219,7 +219,7 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma
             spk_emb.mul_(self.speaker_emb_weight)
 
         # Input FFT
-        enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb)
+        enc_out, enc_mask = self.encoder(phones_padded, conditioning=spk_emb)
 
         # Predict durations
         log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index 5b0a4bef7..94794eebd 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -27,6 +27,7 @@
 
 import argparse
 import os
+import sys
 import time
 from pathlib import Path
 
@@ -88,8 +89,10 @@ def parse_args(parser):
 def main():
     parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing')
     parser = parse_args(parser)
+    print(sys.argv)
     args, unk_args = parser.parse_known_args()
     if len(unk_args) > 0:
+        print(unk_args)
         raise ValueError(f'Invalid options {unk_args}')
 
     DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, Path(args.dataset_path, args.log_file)),
@@ -149,8 +152,8 @@ def main():
 
             # From TTSCollate __call__
             # (text_padded, dur_padded, input_lengths, mel_padded,
-            # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths)
-            _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths = batch
+            # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths, phones)
+            _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch
             # Ensure filenames are unique
             for p in fpaths:
                 fname = Path(p).name
@@ -174,9 +177,16 @@ def main():
                 # From Dan Wells
                 for j, d in enumerate(durs):
                     filename = Path(fpaths[j]).stem
+                    # TODO remove hardcoding dataset path?
                     dur_path = Path(args.dataset_path,
                                     'durations', f'{filename}.pt')
                     torch.save(d, dur_path)
+                for j, p in enumerate(phones):
+                    filename = Path(fpaths[j]).stem
+                    # save phones too
+                    phones_path = Path(args.dataset_path,
+                                       'durations', f'{filename}_phones.pt')
+                    torch.save(p, phones_path)
 
 
 if __name__ == '__main__':
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index ae544f25d..b69a8aac0 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -28,10 +28,11 @@ fi
 # mfa validate ${WAV_DIR} english english
 # mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR}
 
+# don't change batch size
 python prepare_dataset.py \
     --wav-text-filelists ${FILELIST} \
     --n-workers 4 \
-    --batch-size 1 \  # don't change this
+    --batch-size 1 \
     --dataset-path $DATA_DIR \
     --textgrid-path $ALIGNMENT_DIR \
     --extract-pitch \
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
index ba041a33f..1c4e816ef 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
@@ -2,13 +2,13 @@
 
 export OMP_NUM_THREADS=1
 
-: ${NUM_GPUS:=8}
-: ${BATCH_SIZE:=16}
+: ${NUM_GPUS:=1}
+: ${BATCH_SIZE:=8}
 : ${GRAD_ACCUMULATION:=2}
-: ${OUTPUT_DIR:="./output"}
+: ${OUTPUT_DIR:="./output_mfa"}
 : ${DATASET_PATH:=LJSpeech-1.1}
-: ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_text_train_v3.txt}
-: ${VAL_FILELIST:=filelists/ljs_audio_pitch_text_val.txt}
+: ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_durs_text_train_v3.txt}
+: ${VAL_FILELIST:=filelists/ljs_audio_pitch_durs_text_val.txt}
 : ${AMP:=false}
 : ${SEED:=""}
 
@@ -18,7 +18,6 @@ export OMP_NUM_THREADS=1
 : ${EPOCHS:=1000}
 : ${EPOCHS_PER_CHECKPOINT:=100}
 : ${WARMUP_STEPS:=1000}
-: ${KL_LOSS_WARMUP:=100}
 
 # Train a mixed phoneme/grapheme model
 : ${PHONE:=true}
@@ -28,8 +27,9 @@ export OMP_NUM_THREADS=1
 # Add dummy space prefix/suffix is audio is not precisely trimmed
 : ${APPEND_SPACES:=false}
 
-: ${LOAD_PITCH_FROM_DISK:=true}
-: ${LOAD_MEL_FROM_DISK:=false}
+: ${LOAD_PITCH_FROM_DISK:=TRUE}
+: ${LOAD_DURS_FROM_DISK:=TRUE}
+: ${LOAD_MEL_FROM_DISK:=FALSE}
 
 # For multispeaker models, add speaker ID = {0, 1, ...} as the last filelist column
 : ${NSPEAKERS:=1}
@@ -60,9 +60,6 @@ ARGS+=" --grad-clip-thresh 1000.0"
 ARGS+=" --dur-predictor-loss-scale 0.1"
 ARGS+=" --pitch-predictor-loss-scale 0.1"
 
-# Autoalign & new features
-ARGS+=" --kl-loss-start-epoch 0"
-ARGS+=" --kl-loss-warmup-epochs $KL_LOSS_WARMUP"
 ARGS+=" --text-cleaners $TEXT_CLEANERS"
 ARGS+=" --n-speakers $NSPEAKERS"
 
@@ -72,8 +69,9 @@ ARGS+=" --n-speakers $NSPEAKERS"
 [ "$PHONE" = "true" ]              && ARGS+=" --p-arpabet 1.0"
 [ "$ENERGY" = "true" ]             && ARGS+=" --energy-conditioning"
 [ "$SEED" != "" ]                  && ARGS+=" --seed $SEED"
-[ "$LOAD_MEL_FROM_DISK" = true ]   && ARGS+=" --load-mel-from-disk"
-[ "$LOAD_PITCH_FROM_DISK" = true ] && ARGS+=" --load-pitch-from-disk"
+[ "$LOAD_MEL_FROM_DISK" = TRUE ]   && ARGS+=" --load-mel-from-disk"
+[ "$LOAD_DURS_FROM_DISK" = TRUE ]  && ARGS+=" --load-durs-from-disk"
+[ "$LOAD_PITCH_FROM_DISK" = TRUE ] && ARGS+=" --load-pitch-from-disk"
 [ "$PITCH_ONLINE_DIR" != "" ]      && ARGS+=" --pitch-online-dir $PITCH_ONLINE_DIR"  # e.g., /dev/shm/pitch
 [ "$PITCH_ONLINE_METHOD" != "" ]   && ARGS+=" --pitch-online-method $PITCH_ONLINE_METHOD"
 [ "$APPEND_SPACES" = true ]        && ARGS+=" --prepend-space-to-text"
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index fb59e9fba..7ec53c509 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -89,12 +89,6 @@ def parse_args(parser):
                        help='Discounting factor for training weights EMA')
     train.add_argument('--grad-accumulation', type=int, default=1,
                        help='Training steps to accumulate gradients for')
-    train.add_argument('--kl-loss-start-epoch', type=int, default=250,
-                       help='Start adding the hard attention loss term')
-    train.add_argument('--kl-loss-warmup-epochs', type=int, default=100,
-                       help='Gradually increase the hard attention loss term')
-    train.add_argument('--kl-loss-weight', type=float, default=1.0,
-                       help='Gradually increase the hard attention loss term')
     train.add_argument('--benchmark-epochs-num', type=int, default=20,
                         help='Number of epochs for calculating final stats')
 
@@ -145,6 +139,8 @@ def parse_args(parser):
     cond.add_argument('--n-speakers', type=int, default=1,
                       help='Number of speakers in the dataset. '
                            'n_speakers > 1 enables speaker embeddings')
+    cond.add_argument('--load-durs-from-disk', action='store_true',
+                      help='Use durations cached on disk with prepare_dataset.py')
     cond.add_argument('--load-pitch-from-disk', action='store_true',
                       help='Use pitch cached on disk with prepare_dataset.py')
     cond.add_argument('--pitch-online-method', default='pyin',
@@ -580,6 +576,7 @@ def main():
     if args.local_rank == 0:
         prepare_tmp(args.pitch_online_dir)
 
+    print(args)
     trainset = TTSDataset(audiopaths_and_text=args.training_files, **vars(args))
     valset = TTSDataset(audiopaths_and_text=args.validation_files, **vars(args))
 

From d2bdd343d4ceb555eb28ddbca601222743c38982 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Fri, 11 Mar 2022 13:53:20 +0000
Subject: [PATCH 09/21] Actually save phones correctly

---
 .../FastPitch/fastpitch/data_function.py      | 41 +++++++++++++++----
 .../FastPitch/prepare_dataset.py              | 12 ++++--
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 638e116fb..f07ec8f77 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -201,6 +201,7 @@ def __getitem__(self, index):
         energy = torch.norm(mel.float(), dim=0, p=2)
         dur, phones = self.get_dur(index)
         text = phones
+        print('MOAR TEXT LEN: ', len(text))
         assert pitch.size(-1) == mel.size(-1)
 
         # No higher formants?
@@ -249,6 +250,7 @@ def get_text(self, text):
         return torch.LongTensor(text), text_arpabet
 
     def get_dur(self, index):
+        print('GET DUR')
         audiopath, *fields = self.audiopaths_and_text[index]
         name = Path(audiopath).stem
 
@@ -267,8 +269,11 @@ def get_dur(self, index):
             duration_path = fields[1]  # assume durations come after pitch
             # assume phone_path is known from duration_path
             phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt')
-            print(duration_path, phone_path)
-            return torch.load(duration_path), torch.load(phone_path)
+            a = torch.load(duration_path)
+            b = torch.load(phone_path)
+            print('PHONES', phone_path, b[:10])
+            print('LOADING LENS: ', len(a), len(b))
+            return a, b
 
         tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid')
         try:
@@ -279,12 +284,15 @@ def get_dur(self, index):
         phones, durs, _, _ = parse_textgrid(textgrid.get_tier_by_name('phones'),
                                             self.sampling_rate,
                                             self.hop_length)
-        phones = torch.LongTensor(self.tp.arpabet_list_to_sequence(phones))
+        phones = torch.Tensor(self.tp.arpabet_list_to_sequence(phones))
         check_durations(durs, self.get_mel(audiopath).size(1), name)
+        durs = torch.Tensor(durs)
 
         if self.dur_tmp_dir is not None and not cached_durpath.is_file() and not cached_phonepath.is_file():
+            print('HOWMANYPHONES', len(phones))
+            print('cached_phonepath: ', cached_phonepath)
             return torch.save(durs, cached_durpath), torch.save(phones, cached_phonepath)
-
+        print('HOWMANYPHONES', len(phones))
         return durs, phones
 
     def get_pitch(self, index, mel_len=None):
@@ -329,18 +337,29 @@ def get_pitch(self, index, mel_len=None):
 class TTSCollate:
     """Zero-pads model inputs and targets based on number of frames per step"""
     # (text, mel, len(text), pitch, energy, speaker, dur, audiopath, phones) = batch
+    # 0: text
+    # 1: mel
+    # 2: len_text
+    # 3: pitch
+    # 4: energy
+    # 5: speaker
+    # 6: dur
+    # 7: audiopath
+    # 8: phones
     def __call__(self, batch):
         """Collate training batch from normalized text and mel-spec"""
         # Right zero-pad all one-hot text sequences to max input length
         input_lengths, ids_sorted_decreasing = torch.sort(
             torch.LongTensor([len(x[0]) for x in batch]),
             dim=0, descending=True)
+
         max_input_len = input_lengths[0]
 
         text_padded = torch.LongTensor(len(batch), max_input_len)
         text_padded.zero_()
         for i in range(len(ids_sorted_decreasing)):
             text = batch[ids_sorted_decreasing[i]][0]
+            print('LEN TEXT AS WE SAVE: ', text.size(0))
             text_padded[i, :text.size(0)] = text
 
         dur_padded = torch.zeros_like(text_padded, dtype=torch.int32)
@@ -351,9 +370,12 @@ def __call__(self, batch):
             # With MFA durations:
             # some mismatch between phones in transcript vs phones from text preprocessing
             # for now using phones from texgrid as input
-            dur_padded[i, :len(dur)] = torch.Tensor(dur)
+            # PREP DATASET: DUR = LIST, TRAIN: DUR = TENSOR
+            dur_padded[i, :len(dur)] = dur
             dur_lens[i] = len(dur)
+            print('LENS: ', dur_lens[i], input_lengths[i])
             assert dur_lens[i] == input_lengths[i]
+
         # Right zero-pad mel-spec
         num_mels = batch[0][1].size(0)
         max_target_len = max([x[1].size(1) for x in batch])
@@ -371,16 +393,21 @@ def __call__(self, batch):
         pitch_padded = torch.zeros(mel_padded.size(0), n_formants,
                                    mel_padded.size(2), dtype=batch[0][3].dtype)
         energy_padded = torch.zeros_like(pitch_padded[:, 0, :])
-        phones_padded = torch.zeros_like(pitch_padded[:, 0, :])
-
+        phones_padded = torch.zeros_like(text_padded, dtype=int)
+        print('PHONES PAD SETUP: ', phones_padded.shape)
         for i in range(len(ids_sorted_decreasing)):
             pitch = batch[ids_sorted_decreasing[i]][3]
             energy = batch[ids_sorted_decreasing[i]][4]
             phones = batch[ids_sorted_decreasing[i]][8]
+            print('BATCH OF PHONES: ', phones.shape[0])
             pitch_padded[i, :, :pitch.shape[1]] = pitch
             energy_padded[i, :energy.shape[0]] = energy
+            print('ADD TO PAD', i, phones.shape)
             phones_padded[i, :phones.shape[0]] = phones
 
+        print('PHONES PADDED SHAPE: ', phones_padded.shape)
+        print('ENERGY PADDED SHAPE: ', energy_padded.shape)
+
         if batch[0][5] is not None:
             speaker = torch.zeros_like(input_lengths)
             for i in range(len(ids_sorted_decreasing)):
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index 94794eebd..f0bf0728d 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -150,10 +150,13 @@ def main():
         for i, batch in enumerate(tqdm.tqdm(data_loader)):
             tik = time.time()
 
-            # From TTSCollate __call__
-            # (text_padded, dur_padded, input_lengths, mel_padded,
-            # output_lengths, len_x, pitch_padded, energy_padded, speaker, audiopaths, phones)
-            _, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch
+            # DATASET GETITEM
+            # (text, mel, len(text), pitch, energy, speaker, dur, audiopath, phones)
+            # TTSCOLLATE CALL
+            #  (text_padded, dur_padded, input_lengths, mel_padded,
+            #  output_lengths, len_x, pitch_padded, energy_padded, speaker,
+            #  audiopaths, phones_padded)
+            text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch
             # Ensure filenames are unique
             for p in fpaths:
                 fname = Path(p).name
@@ -182,6 +185,7 @@ def main():
                                     'durations', f'{filename}.pt')
                     torch.save(d, dur_path)
                 for j, p in enumerate(phones):
+                    print('LEN PHONES BEFORE SAVING: ', len(p))
                     filename = Path(fpaths[j]).stem
                     # save phones too
                     phones_path = Path(args.dataset_path,

From a95012b8053c864af2c9e94897164774385035ac Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Fri, 11 Mar 2022 20:04:41 +0000
Subject: [PATCH 10/21] Get training to work with saved MFA durs

---
 .../FastPitch/fastpitch/data_function.py      | 28 +++++--------------
 .../FastPitch/fastpitch/loss_function.py      |  5 ++--
 .../FastPitch/fastpitch/model.py              | 12 ++++++--
 .../FastPitch/prepare_dataset.py              |  3 +-
 PyTorch/SpeechSynthesis/FastPitch/train.py    |  3 +-
 5 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index f07ec8f77..58fe19c30 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -173,7 +173,6 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
         self.f0_method = pitch_online_method
 
         expected_columns = (2 + int(load_durs_from_disk) + int(load_pitch_from_disk) + (n_speakers > 1))
-        print(load_durs_from_disk, load_pitch_from_disk, expected_columns)
         assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None)
 
         if len(self.audiopaths_and_text[0]) < expected_columns:
@@ -201,7 +200,6 @@ def __getitem__(self, index):
         energy = torch.norm(mel.float(), dim=0, p=2)
         dur, phones = self.get_dur(index)
         text = phones
-        print('MOAR TEXT LEN: ', len(text))
         assert pitch.size(-1) == mel.size(-1)
 
         # No higher formants?
@@ -250,7 +248,6 @@ def get_text(self, text):
         return torch.LongTensor(text), text_arpabet
 
     def get_dur(self, index):
-        print('GET DUR')
         audiopath, *fields = self.audiopaths_and_text[index]
         name = Path(audiopath).stem
 
@@ -269,11 +266,7 @@ def get_dur(self, index):
             duration_path = fields[1]  # assume durations come after pitch
             # assume phone_path is known from duration_path
             phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt')
-            a = torch.load(duration_path)
-            b = torch.load(phone_path)
-            print('PHONES', phone_path, b[:10])
-            print('LOADING LENS: ', len(a), len(b))
-            return a, b
+            return torch.load(duration_path), torch.load(phone_path)
 
         tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid')
         try:
@@ -289,10 +282,8 @@ def get_dur(self, index):
         durs = torch.Tensor(durs)
 
         if self.dur_tmp_dir is not None and not cached_durpath.is_file() and not cached_phonepath.is_file():
-            print('HOWMANYPHONES', len(phones))
-            print('cached_phonepath: ', cached_phonepath)
             return torch.save(durs, cached_durpath), torch.save(phones, cached_phonepath)
-        print('HOWMANYPHONES', len(phones))
+
         return durs, phones
 
     def get_pitch(self, index, mel_len=None):
@@ -304,8 +295,11 @@ def get_pitch(self, index, mel_len=None):
             spk = 0
 
         if self.load_pitch_from_disk:
+            print('WE RE LOADING PITCH')
             pitchpath = fields[0]
+            print('PATH: ', pitchpath)
             pitch = torch.load(pitchpath)
+            print('AND ITS SIZE: ', pitch.shape)
             if self.pitch_mean is not None:
                 assert self.pitch_std is not None
                 pitch = normalize_pitch(pitch, self.pitch_mean, self.pitch_std)
@@ -359,7 +353,6 @@ def __call__(self, batch):
         text_padded.zero_()
         for i in range(len(ids_sorted_decreasing)):
             text = batch[ids_sorted_decreasing[i]][0]
-            print('LEN TEXT AS WE SAVE: ', text.size(0))
             text_padded[i, :text.size(0)] = text
 
         dur_padded = torch.zeros_like(text_padded, dtype=torch.int32)
@@ -373,7 +366,6 @@ def __call__(self, batch):
             # PREP DATASET: DUR = LIST, TRAIN: DUR = TENSOR
             dur_padded[i, :len(dur)] = dur
             dur_lens[i] = len(dur)
-            print('LENS: ', dur_lens[i], input_lengths[i])
             assert dur_lens[i] == input_lengths[i]
 
         # Right zero-pad mel-spec
@@ -394,20 +386,14 @@ def __call__(self, batch):
                                    mel_padded.size(2), dtype=batch[0][3].dtype)
         energy_padded = torch.zeros_like(pitch_padded[:, 0, :])
         phones_padded = torch.zeros_like(text_padded, dtype=int)
-        print('PHONES PAD SETUP: ', phones_padded.shape)
         for i in range(len(ids_sorted_decreasing)):
             pitch = batch[ids_sorted_decreasing[i]][3]
             energy = batch[ids_sorted_decreasing[i]][4]
             phones = batch[ids_sorted_decreasing[i]][8]
-            print('BATCH OF PHONES: ', phones.shape[0])
             pitch_padded[i, :, :pitch.shape[1]] = pitch
             energy_padded[i, :energy.shape[0]] = energy
-            print('ADD TO PAD', i, phones.shape)
             phones_padded[i, :phones.shape[0]] = phones
 
-        print('PHONES PADDED SHAPE: ', phones_padded.shape)
-        print('ENERGY PADDED SHAPE: ', energy_padded.shape)
-
         if batch[0][5] is not None:
             speaker = torch.zeros_like(input_lengths)
             for i in range(len(ids_sorted_decreasing)):
@@ -422,12 +408,12 @@ def __call__(self, batch):
         audiopaths = [batch[i][7] for i in ids_sorted_decreasing]
 
         return (text_padded, dur_padded, input_lengths, mel_padded, output_lengths, len_x,
-                pitch_padded, energy_padded, speaker, audiopaths, phones_padded)
+                pitch_padded, energy_padded, dur_lens, speaker, audiopaths, phones_padded)
 
 
 def batch_to_gpu(batch):
     (text_padded, durs_padded, input_lengths, mel_padded, output_lengths, len_x,
-     pitch_padded, energy_padded, speaker, dur_lens, audiopaths, phones_padded) = batch
+     pitch_padded, energy_padded, dur_lens, speaker, audiopaths, phones_padded) = batch
 
     text_padded = to_gpu(text_padded).long()
     durs_padded = to_gpu(durs_padded).long()
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
index 5b789a9a2..00bcbfd71 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
@@ -46,7 +46,8 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'):
          energy_pred, energy_tgt) = model_out
         # model_out = (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt)
         #(mel_tgt, in_lens, out_lens) = targets
-        mel_tgt, dur_tgt, dur_lens, pitch_tgt = targets
+        # mel_padded, durs_padded, dur_lens,
+        mel_tgt, dur_tgt, dur_lens, output_lengths = targets
         #dur_lens = in_lens
 
         mel_tgt.requires_grad = False
@@ -65,7 +66,7 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'):
         loss_fn = F.mse_loss
         mel_loss = loss_fn(mel_out, mel_tgt, reduction='none')
         mel_loss = (mel_loss * mel_mask).sum() / mel_mask.sum()
-
+        print('SHAPES IN LOSS FUNCTION: ', pitch_tgt.shape, pitch_pred.shape)
         ldiff = pitch_tgt.size(2) - pitch_pred.size(2)
         pitch_pred = F.pad(pitch_pred, (0, ldiff, 0, 0, 0, 0), value=0.0)
         pitch_loss = F.mse_loss(pitch_tgt, pitch_pred, reduction='none')
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
index ee2981a05..636451b45 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
@@ -206,9 +206,14 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx,
     def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, max_duration=75):
         # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs
         # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs
-        (inputs, input_lens, mel_tgt, mel_lens, dur_tgt, pitch_dense, energy_dense,
-         speaker, audiopaths, phones_padded) = inputs
-
+        print('NUMBER OF INPUTS', len(inputs))
+        # text_padded, input_lengths, mel_padded, output_lengths,
+        # pitch_padded, energy_padded, speaker, durs_padded, audiopaths, \
+        # phones_padded
+        (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense,
+          speaker, dur_tgt, audiopaths, phones_padded) = inputs
+        # text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, \
+        # phones = batch
         mel_max_len = mel_tgt.size(2)
 
         # Calculate speaker embedding
@@ -230,6 +235,7 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma
 
         # Average pitch over characters
         pitch_tgt = average_pitch(pitch_dense, dur_tgt)
+        print('DENSE PITCH SHAPE: ', pitch_tgt.shape, pitch_pred.shape)
 
         if use_gt_pitch and pitch_tgt is not None:
             pitch_emb = self.pitch_emb(pitch_tgt)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index f0bf0728d..f918ef74d 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -89,7 +89,6 @@ def parse_args(parser):
 def main():
     parser = argparse.ArgumentParser(description='FastPitch Data Pre-processing')
     parser = parse_args(parser)
-    print(sys.argv)
     args, unk_args = parser.parse_known_args()
     if len(unk_args) > 0:
         print(unk_args)
@@ -172,6 +171,7 @@ def main():
 
             if args.extract_pitch:
                 for j, p in enumerate(pitch):
+                    print('SIZE OF P: ', len(p), type(p))
                     fname = Path(fpaths[j]).with_suffix('.pt').name
                     fpath = Path(args.dataset_path, 'pitch', fname)
                     torch.save(p[:mel_lens[j]], fpath)
@@ -185,7 +185,6 @@ def main():
                                     'durations', f'{filename}.pt')
                     torch.save(d, dur_path)
                 for j, p in enumerate(phones):
-                    print('LEN PHONES BEFORE SAVING: ', len(p))
                     filename = Path(fpaths[j]).stem
                     # save phones too
                     phones_path = Path(args.dataset_path,
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 7ec53c509..813c241dd 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -576,7 +576,6 @@ def main():
     if args.local_rank == 0:
         prepare_tmp(args.pitch_online_dir)
 
-    print(args)
     trainset = TTSDataset(audiopaths_and_text=args.training_files, **vars(args))
     valset = TTSDataset(audiopaths_and_text=args.validation_files, **vars(args))
 
@@ -633,10 +632,12 @@ def main():
                 model.zero_grad(set_to_none=True)
 
             x, y, num_frames = batch_to_gpu(batch)
+            print('NUMBER OF X: ', len(x))
 
             with torch.cuda.amp.autocast(enabled=args.amp):
                 # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt)
                 y_pred = model(x, use_gt_durations=True)
+                print('LEN Y: ', len(y_pred), len(y))
                 # y = mel_padded, input_lengths, output_lengths
                 loss, meta = criterion(y_pred, y)
                 loss /= args.grad_accumulation

From 2ade2cd2ce1874f5f7d13a6b4c087cceb269a89b Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Fri, 11 Mar 2022 20:16:43 +0000
Subject: [PATCH 11/21] Remove print statements

---
 .../SpeechSynthesis/FastPitch/fastpitch/data_function.py  | 3 ---
 .../SpeechSynthesis/FastPitch/fastpitch/loss_function.py  | 1 -
 PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py      | 8 +-------
 PyTorch/SpeechSynthesis/FastPitch/train.py                | 2 --
 4 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 58fe19c30..2f9047993 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -295,11 +295,8 @@ def get_pitch(self, index, mel_len=None):
             spk = 0
 
         if self.load_pitch_from_disk:
-            print('WE RE LOADING PITCH')
             pitchpath = fields[0]
-            print('PATH: ', pitchpath)
             pitch = torch.load(pitchpath)
-            print('AND ITS SIZE: ', pitch.shape)
             if self.pitch_mean is not None:
                 assert self.pitch_std is not None
                 pitch = normalize_pitch(pitch, self.pitch_mean, self.pitch_std)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
index 00bcbfd71..53d0439db 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
@@ -66,7 +66,6 @@ def forward(self, model_out, targets, is_training=True, meta_agg='mean'):
         loss_fn = F.mse_loss
         mel_loss = loss_fn(mel_out, mel_tgt, reduction='none')
         mel_loss = (mel_loss * mel_mask).sum() / mel_mask.sum()
-        print('SHAPES IN LOSS FUNCTION: ', pitch_tgt.shape, pitch_pred.shape)
         ldiff = pitch_tgt.size(2) - pitch_pred.size(2)
         pitch_pred = F.pad(pitch_pred, (0, ldiff, 0, 0, 0, 0), value=0.0)
         pitch_loss = F.mse_loss(pitch_tgt, pitch_pred, reduction='none')
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
index 636451b45..29ce2a42a 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
@@ -206,14 +206,9 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx,
     def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, max_duration=75):
         # was FP1.0 : inputs, _, mel_tgt, _, DUR_TGT, _, pitch_tgt, speaker = inputs
         # will be: inputs, input_lens, mel_tgt, mel_lens, DUR_TGT, pitch_dense, energy_dense, speaker, audiopaths = inputs
-        print('NUMBER OF INPUTS', len(inputs))
-        # text_padded, input_lengths, mel_padded, output_lengths,
-        # pitch_padded, energy_padded, speaker, durs_padded, audiopaths, \
-        # phones_padded
         (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense,
           speaker, dur_tgt, audiopaths, phones_padded) = inputs
-        # text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, \
-        # phones = batch
+
         mel_max_len = mel_tgt.size(2)
 
         # Calculate speaker embedding
@@ -235,7 +230,6 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma
 
         # Average pitch over characters
         pitch_tgt = average_pitch(pitch_dense, dur_tgt)
-        print('DENSE PITCH SHAPE: ', pitch_tgt.shape, pitch_pred.shape)
 
         if use_gt_pitch and pitch_tgt is not None:
             pitch_emb = self.pitch_emb(pitch_tgt)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 813c241dd..3ddd1ceb8 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -632,12 +632,10 @@ def main():
                 model.zero_grad(set_to_none=True)
 
             x, y, num_frames = batch_to_gpu(batch)
-            print('NUMBER OF X: ', len(x))
 
             with torch.cuda.amp.autocast(enabled=args.amp):
                 # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt)
                 y_pred = model(x, use_gt_durations=True)
-                print('LEN Y: ', len(y_pred), len(y))
                 # y = mel_padded, input_lengths, output_lengths
                 loss, meta = criterion(y_pred, y)
                 loss /= args.grad_accumulation

From 13f871066bfd84bded38a734c503ab51316817d2 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Fri, 11 Mar 2022 20:43:26 +0000
Subject: [PATCH 12/21] Fix up setup steps

---
 PyTorch/SpeechSynthesis/FastPitch/install.sh              | 2 ++
 .../SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh  | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/install.sh b/PyTorch/SpeechSynthesis/FastPitch/install.sh
index b788fdcc3..2e6b138b7 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/install.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/install.sh
@@ -42,6 +42,7 @@ conda uninstall pytorch
 
 ## Then we reinstall and this for some reason downgrades the gcc to 7 and then installing apex works/
 conda install pytorch torchvision cudatoolkit=10.2 -c pytorch
+conda install -c conda-forge montreal-forced-aligner
 
 ## Apex
 cd /disk/scratch1/${USER}/FastPitches/PyTorch/SpeechSynthesis/FastPitch/
@@ -58,6 +59,7 @@ pip install wandb
 pip install llvmlite==0.35.0
 ## Ignore warning around here
 pip install numba==0.49.1
+pip install tgt
 
 ## for logging
 ## if needed, create a free account here: https://app.wandb.ai/login?signup=true
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index b69a8aac0..9b2b3925b 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -23,10 +23,10 @@ then
   python ./create_lab_files.py --dataset ${WAV_DIR} --filelist ${FILELIST} --n-speakers ${NSPEAKERS}
 fi
 
-# mfa model download acoustic english
-# mfa model download dictionary english
-# mfa validate ${WAV_DIR} english english
-# mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR}
+mfa model download acoustic english
+mfa model download dictionary english
+mfa validate ${WAV_DIR} english english
+mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR}
 
 # don't change batch size
 python prepare_dataset.py \

From 93f83423f44d0c4ac4cd0af5dd58482c1c2bcc32 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Sun, 13 Mar 2022 11:13:28 +0000
Subject: [PATCH 13/21] MFA training fixups

---
 .../SpeechSynthesis/FastPitch/fastpitch/data_function.py  | 2 +-
 PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py      | 3 +--
 .../SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh  | 8 ++++----
 PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh        | 7 +++++--
 4 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 2f9047993..fcbf7cb05 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -268,7 +268,7 @@ def get_dur(self, index):
             phone_path = Path(Path(duration_path).parent, name + '_phones').with_suffix('.pt')
             return torch.load(duration_path), torch.load(phone_path)
 
-        tgt_path = Path(self.textgrid_path, 'wavs', f'{name}.TextGrid')
+        tgt_path = Path(self.textgrid_path, f'{name}.TextGrid')
         try:
             textgrid = read_textgrid(tgt_path, include_empty_intervals=True)
         except FileNotFoundError:
diff --git a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
index f918ef74d..cb17fcd14 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/prepare_dataset.py
@@ -155,7 +155,7 @@ def main():
             #  (text_padded, dur_padded, input_lengths, mel_padded,
             #  output_lengths, len_x, pitch_padded, energy_padded, speaker,
             #  audiopaths, phones_padded)
-            text, durs, input_lens, mels, mel_lens, _, pitch, _, _, fpaths, phones = batch
+            text, durs, input_lens, mels, mel_lens, _, pitch, _, _, _, fpaths, phones = batch
             # Ensure filenames are unique
             for p in fpaths:
                 fname = Path(p).name
@@ -171,7 +171,6 @@ def main():
 
             if args.extract_pitch:
                 for j, p in enumerate(pitch):
-                    print('SIZE OF P: ', len(p), type(p))
                     fname = Path(fpaths[j]).with_suffix('.pt').name
                     fpath = Path(args.dataset_path, 'pitch', fname)
                     torch.save(p[:mel_lens[j]], fpath)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
index 9b2b3925b..c7f9f5846 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/prepare_dataset.sh
@@ -23,10 +23,10 @@ then
   python ./create_lab_files.py --dataset ${WAV_DIR} --filelist ${FILELIST} --n-speakers ${NSPEAKERS}
 fi
 
-mfa model download acoustic english
-mfa model download dictionary english
-mfa validate ${WAV_DIR} english english
-mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR}
+#mfa model download acoustic english --temp_directory /disk/scratch1/evdv/tmp/MFA
+#mfa model download dictionary english  --temp_directory /disk/scratch1/evdv/tmp/MFA
+#mfa validate ${WAV_DIR} english english  --temp_directory /disk/scratch1/evdv/tmp/MFA
+#mfa align ${WAV_DIR} english english ${ALIGNMENT_DIR}  --temp_directory /disk/scratch1/evdv/tmp/MFA
 
 # don't change batch size
 python prepare_dataset.py \
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
index 1c4e816ef..6a62ab5d7 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
@@ -1,9 +1,12 @@
 #!/usr/bin/env bash
 
 export OMP_NUM_THREADS=1
+export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/
+export WANDB_SHOW_RUN=true
+export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb
 
-: ${NUM_GPUS:=1}
-: ${BATCH_SIZE:=8}
+: ${NUM_GPUS:=2}
+: ${BATCH_SIZE:=16}
 : ${GRAD_ACCUMULATION:=2}
 : ${OUTPUT_DIR:="./output_mfa"}
 : ${DATASET_PATH:=LJSpeech-1.1}

From c2bafd5b2c0f4195bdbb37d2ec58f2873a103d22 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Sun, 13 Mar 2022 11:27:08 +0000
Subject: [PATCH 14/21] Update keys used for spectrogram plotting during
 validation

---
 PyTorch/SpeechSynthesis/FastPitch/train.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 3ddd1ceb8..97cf740bf 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -362,20 +362,20 @@ def plot_batch_mels(pred_tgt_lists, rank):
 
 
 def log_validation_batch(x, y_pred, rank):
+    # x = [text_padded, input_lengths, mel_padded, output_lengths,
+    #      pitch_padded, energy_padded, speaker, durs_padded, audiopaths, phones_padded]
+    # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred
     x_fields = ['text_padded', 'input_lengths', 'mel_padded',
                 'output_lengths', 'pitch_padded', 'energy_padded',
-                'speaker', 'attn_prior', 'audiopaths']
-    y_pred_fields = ['mel_out', 'dec_mask', 'dur_pred', 'log_dur_pred',
-                     'pitch_pred', 'pitch_tgt', 'energy_pred',
-                     'energy_tgt', 'attn_soft', 'attn_hard',
-                     'attn_hard_dur', 'attn_logprob']
+                'speaker', 'durs_padded', 'audiopaths', 'phones_padded']
+    y_pred_fields = ['mel_out', 'dec_mask', 'dur_pred', 'pitch_pred', 'energy_pred']
 
     validation_dict = dict(zip(x_fields + y_pred_fields,
                                list(x) + list(y_pred)))
     log(validation_dict, rank)  # something in here returns a warning
 
-    pred_specs_keys = ['mel_out', 'pitch_pred', 'energy_pred', 'attn_hard_dur']
-    tgt_specs_keys = ['mel_padded', 'pitch_tgt', 'energy_tgt', 'attn_hard_dur']
+    pred_specs_keys = ['mel_out', 'pitch_pred', 'energy_pred', 'durs_padded']
+    tgt_specs_keys = ['mel_padded', 'pitch_padded', 'energy_padded', 'durs_padded']
     plot_batch_mels([[validation_dict[key] for key in pred_specs_keys],
                      [validation_dict[key] for key in tgt_specs_keys]], rank)
 
@@ -396,7 +396,12 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run,
         val_meta = defaultdict(float)
         val_num_frames = 0
         for i, batch in enumerate(val_loader):
+            # x = [text_padded, input_lengths, mel_padded, output_lengths,
+            #      pitch_padded, energy_padded, speaker, durs_padded, audiopaths, phones_padded]
+            # y = [mel_padded, durs_padded, dur_lens, output_lengths]
+            # len_x = torch.sum(output_lengths)
             x, y, num_frames = batch_to_gpu(batch)
+            # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred
             y_pred = model(x)
 
             if i % 5 == 0:

From 1755903717fa6fda46e2e45f013c35baff79f679 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Mon, 14 Mar 2022 14:46:29 +0000
Subject: [PATCH 15/21] Fix spectrogram plotting

---
 .../FastPitch/fastpitch/loss_function.py      |  6 +---
 .../FastPitch/fastpitch/model.py              |  1 +
 .../FastPitch/scripts/train.sh                |  8 ++---
 PyTorch/SpeechSynthesis/FastPitch/train.py    | 31 +++++++++++++------
 4 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
index 53d0439db..dc2361cbe 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/loss_function.py
@@ -42,13 +42,9 @@ def __init__(self, dur_predictor_loss_scale=1.0,
         self.energy_predictor_loss_scale = energy_predictor_loss_scale
 
     def forward(self, model_out, targets, is_training=True, meta_agg='mean'):
-        (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt,
-         energy_pred, energy_tgt) = model_out
+        (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt) = model_out
         # model_out = (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt)
-        #(mel_tgt, in_lens, out_lens) = targets
-        # mel_padded, durs_padded, dur_lens,
         mel_tgt, dur_tgt, dur_lens, output_lengths = targets
-        #dur_lens = in_lens
 
         mel_tgt.requires_grad = False
         # (B,H,T) => (B,T,H)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
index 29ce2a42a..0f883e7c5 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
@@ -54,6 +54,7 @@ def regulate_len(durations, enc_out, pace: float = 1.0,
     mult = ((reps_cumsum[:, :, :-1] <= range_) &
             (reps_cumsum[:, :, 1:] > range_))
     mult = mult.to(dtype)
+    print('THESE SHAPES WILL BE MATMULLED: ', mult.shape, enc_out.shape)
     enc_rep = torch.matmul(mult, enc_out)
 
     if mel_max_len is not None:
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
index 6a62ab5d7..0088bc1f4 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
@@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 
 export OMP_NUM_THREADS=1
-export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/
+#export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/
 export WANDB_SHOW_RUN=true
-export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb
+#export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb
 
-: ${NUM_GPUS:=2}
-: ${BATCH_SIZE:=16}
+: ${NUM_GPUS:=1}
+: ${BATCH_SIZE:=2}
 : ${GRAD_ACCUMULATION:=2}
 : ${OUTPUT_DIR:="./output_mfa"}
 : ${DATASET_PATH:=LJSpeech-1.1}
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 97cf740bf..59b6e3815 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -332,19 +332,27 @@ def plot_batch_mels(pred_tgt_lists, rank):
     regulated_features = []
     # prediction: mel, pitch, energy
     # target: mel, pitch, energy
-    for mel_pitch_energy in pred_tgt_lists:
+    for i, mel_pitch_energy in enumerate(pred_tgt_lists):
+        if i == 0:
+            print('PREDICTION')
+        elif i == 1:
+            print('TARGET')
         mels = mel_pitch_energy[0]
         if mels.size(dim=2) == 80:  # tgt and pred mel have diff dimension order
             mels = mels.permute(0, 2, 1)
-        mel_lens = mel_pitch_energy[-1]
+        mel_lens = mel_pitch_energy[-1].squeeze()
+        pitch = mel_pitch_energy[1].squeeze().unsqueeze(dim=-1)
+        energy = mel_pitch_energy[2].squeeze().unsqueeze(dim=-1)
         # reverse regulation for plotting: for every mel frame get pitch+energy
-        new_pitch = regulate_len(mel_lens,
-                                 mel_pitch_energy[1].permute(0, 2, 1))[0]
-        new_energy = regulate_len(mel_lens,
-                                  mel_pitch_energy[2].unsqueeze(dim=-1))[0]
+        if i == 0:
+            energy = regulate_len(mel_lens, energy)[0]
+            pitch = regulate_len(mel_lens, pitch)[0]
+
+        print('PITCH: ', pitch.shape)
+        print('ENERGY', energy.shape)
         regulated_features.append([mels,
-                                   new_pitch.squeeze(axis=2),
-                                   new_energy.squeeze(axis=2)])
+                                   pitch.squeeze(axis=2),
+                                   energy.squeeze(axis=2)])
 
     batch_sizes = [feature.size(dim=0)
                    for pred_tgt in regulated_features
@@ -404,8 +412,8 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run,
             # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred
             y_pred = model(x)
 
-            if i % 5 == 0:
-                log_validation_batch(x, y_pred, rank)
+            #if i % 5 == 0:
+            log_validation_batch(x, y_pred, rank)
 
             loss, meta = criterion(y_pred, y, is_training=False, meta_agg='sum')
 
@@ -709,6 +717,9 @@ def main():
                 iter_num_frames = 0
                 iter_meta = {}
                 iter_start_time = time.perf_counter()
+                # for debugging only
+                # validate(model, criterion, valset, args.batch_size, collate_fn,
+                #          distributed_run, batch_to_gpu, args.local_rank)
 
         # Finished epoch
         epoch_loss /= epoch_iter

From d0fa41cdd1b01b4fcfd3b82555c055adb004941d Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Mon, 14 Mar 2022 17:00:53 +0000
Subject: [PATCH 16/21] Add pitch, energy, and duration losses

---
 PyTorch/SpeechSynthesis/FastPitch/train.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 59b6e3815..faf7a4382 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -434,6 +434,9 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run,
     log({
         'loss/validation-loss': val_meta['loss'].item(),
         'mel-loss/validation-mel-loss': val_meta['mel_loss'].item(),
+        'pitch-loss/validation-pitch-loss': val_meta['pitch_loss'].item(),
+        'energy-loss/validation-energy-loss': val_meta['energy_loss'].item(),
+        'dur-loss/validation-dur-error': val_meta['duration_predictor_loss'].item(),
         'validation-frames per s': num_frames.item() / val_meta['took'],
         'validation-took': val_meta['took'],
     }, rank)
@@ -617,6 +620,9 @@ def main():
 
         epoch_loss = 0.0
         epoch_mel_loss = 0.0
+        epoch_pitch_loss = 0.0
+        epoch_energy_loss = 0.0
+        epoch_dur_loss = 0.0
         epoch_num_frames = 0
         epoch_frames_per_sec = 0.0
 
@@ -693,11 +699,17 @@ def main():
                     apply_multi_tensor_ema(args.ema_decay, *mt_ema_params)
 
                 iter_mel_loss = iter_meta['mel_loss'].item()
+                iter_pitch_loss = iter_meta['pitch_loss'].item()
+                iter_energy_loss = iter_meta['energy_loss'].item()
+                iter_dur_loss = iter_meta['duration_predictor_loss'].item()
                 iter_time = time.perf_counter() - iter_start_time
                 epoch_frames_per_sec += iter_num_frames / iter_time
                 epoch_loss += iter_loss
                 epoch_num_frames += iter_num_frames
                 epoch_mel_loss += iter_mel_loss
+                epoch_pitch_loss += iter_pitch_loss
+                epoch_energy_loss += iter_energy_loss
+                epoch_dur_loss += iter_dur_loss
 
                 if epoch_iter % 5 == 0:
                     log({
@@ -707,6 +719,9 @@ def main():
                         'total_steps': total_iter,
                         'loss/loss': iter_loss,
                         'mel-loss/mel_loss': iter_mel_loss,
+                        'pitch-loss/pitch_loss': iter_pitch_loss,
+                        'energy-loss/energy_loss': iter_energy_loss,
+                        'dur-loss/dur_loss': iter_dur_loss,
                         'frames per s': iter_num_frames / iter_time,
                         'took': iter_time,
                         'lrate': optimizer.param_groups[0]['lr'],
@@ -730,6 +745,9 @@ def main():
             'epoch': epoch,
             'loss/epoch_loss': epoch_loss,
             'mel-loss/epoch_mel_loss': epoch_mel_loss,
+            'pitch-loss/epoch_pitch_loss': epoch_pitch_loss,
+            'energy-loss/epoch_energy_loss': epoch_energy_loss,
+            'dur-loss/epoch_dur_loss': epoch_dur_loss,
             'epoch_frames per s': epoch_num_frames / epoch_time,
             'epoch_took': epoch_time,
         }, args.local_rank)

From 20a375cb617aa0e9412688da473b1c03e49ba804 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Wed, 16 Mar 2022 15:29:13 +0000
Subject: [PATCH 17/21] Still figuring out the issue

---
 .../FastPitch/fastpitch/model.py              |  1 -
 ...ini_ljs_audio_pitch_durs_text_train_v3.txt | 20 +++++++++++++++++++
 .../mini_ljs_audio_pitch_durs_text_val.txt    | 16 +++++++++++++++
 .../FastPitch/scripts/train.sh                |  9 ++++-----
 PyTorch/SpeechSynthesis/FastPitch/train.py    |  6 ------
 5 files changed, 40 insertions(+), 12 deletions(-)
 create mode 100644 PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt
 create mode 100644 PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
index 0f883e7c5..29ce2a42a 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
@@ -54,7 +54,6 @@ def regulate_len(durations, enc_out, pace: float = 1.0,
     mult = ((reps_cumsum[:, :, :-1] <= range_) &
             (reps_cumsum[:, :, 1:] > range_))
     mult = mult.to(dtype)
-    print('THESE SHAPES WILL BE MATMULLED: ', mult.shape, enc_out.shape)
     enc_rep = torch.matmul(mult, enc_out)
 
     if mel_max_len is not None:
diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt
new file mode 100644
index 000000000..db6e29b47
--- /dev/null
+++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_train_v3.txt
@@ -0,0 +1,20 @@
+wavs/LJ050-0234.wav|pitch/LJ050-0234.pt|durations/LJ050-0234.pt|It has used other Treasury law enforcement agents on special experiments in building and route surveys in places to which the President frequently travels.
+wavs/LJ019-0373.wav|pitch/LJ019-0373.pt|durations/LJ019-0373.pt|to avail himself of his powers, as it was difficult to bring home the derelictions of duties and evasion of the acts. Too much was left to the inspectors.
+wavs/LJ050-0207.wav|pitch/LJ050-0207.pt|durations/LJ050-0207.pt|Although Chief Rowley does not complain about the pay scale for Secret Service agents,
+wavs/LJ048-0203.wav|pitch/LJ048-0203.pt|durations/LJ048-0203.pt|The three officers confirm that their primary concern was crowd and traffic control,
+wavs/LJ003-0182.wav|pitch/LJ003-0182.pt|durations/LJ003-0182.pt|The tried and the untried, young and old, were herded together
+wavs/LJ044-0166.wav|pitch/LJ044-0166.pt|durations/LJ044-0166.pt|According to Marina Oswald, he thought that would help him when he got to Cuba.
+wavs/LJ019-0208.wav|pitch/LJ019-0208.pt|durations/LJ019-0208.pt|The proposal made was to purchase some fifty thousand square feet between Newgate, Warwick Lane, and the Sessions House,
+wavs/LJ021-0146.wav|pitch/LJ021-0146.pt|durations/LJ021-0146.pt|I shall seek assurances of the making and maintenance of agreements, which can be mutually relied upon,
+wavs/LJ013-0214.wav|pitch/LJ013-0214.pt|durations/LJ013-0214.pt|who took a carving-knife from the sideboard in the dining-room, went upstairs to Lord William's bedroom, and drew the knife across his throat.
+wavs/LJ011-0256.wav|pitch/LJ011-0256.pt|durations/LJ011-0256.pt|By this time the neighbors were aroused, and several people came to the scene of the affray.
+wavs/LJ014-0083.wav|pitch/LJ014-0083.pt|durations/LJ014-0083.pt|which, having possessed herself of the murdered man's keys, she rifled from end to end.
+wavs/LJ035-0121.wav|pitch/LJ035-0121.pt|durations/LJ035-0121.pt|This is the period during which Oswald would have descended the stairs. In all likelihood
+wavs/LJ049-0118.wav|pitch/LJ049-0118.pt|durations/LJ049-0118.pt|Enactment of this statute would mean that the investigation of any of the acts covered and of the possibility of a further attempt
+wavs/LJ006-0132.wav|pitch/LJ006-0132.pt|durations/LJ006-0132.pt|All the wardsmen alike were more or less irresponsible.
+wavs/LJ049-0084.wav|pitch/LJ049-0084.pt|durations/LJ049-0084.pt|Murder of the President has never been covered by Federal law, however, so that once it became reasonably clear that the killing was the act of a single person,
+wavs/LJ012-0052.wav|pitch/LJ012-0052.pt|durations/LJ012-0052.pt|He claimed to be admitted to bail, and was taken from Newgate on a writ of habeas before one of the judges sitting at Westminster.
+wavs/LJ011-0203.wav|pitch/LJ011-0203.pt|durations/LJ011-0203.pt|Monsieur le Maire was appealed to, and decided to leave it to the young lady, who at once abandoned Wakefield.
+wavs/LJ019-0141.wav|pitch/LJ019-0141.pt|durations/LJ019-0141.pt|The old wards, day rooms and sleeping rooms combined, of which the reader has already heard so much,
+wavs/LJ003-0322.wav|pitch/LJ003-0322.pt|durations/LJ003-0322.pt|except for the use of the debtors, or as medical comforts for the infirmary.
+wavs/LJ027-0028.wav|pitch/LJ027-0028.pt|durations/LJ027-0028.pt|Such structures or organs are most often found internally.
diff --git a/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt
new file mode 100644
index 000000000..eda515c7e
--- /dev/null
+++ b/PyTorch/SpeechSynthesis/FastPitch/filelists/mini_ljs_audio_pitch_durs_text_val.txt
@@ -0,0 +1,16 @@
+wavs/LJ016-0288.wav|pitch/LJ016-0288.pt|durations/LJ016-0288.pt|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells.
+wavs/LJ028-0275.wav|pitch/LJ028-0275.pt|durations/LJ028-0275.pt|At last, in the twentieth month,
+wavs/LJ019-0273.wav|pitch/LJ019-0273.pt|durations/LJ019-0273.pt|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline.
+wavs/LJ021-0145.wav|pitch/LJ021-0145.pt|durations/LJ021-0145.pt|From those willing to join in establishing this hoped-for period of peace,
+wavs/LJ009-0076.wav|pitch/LJ009-0076.pt|durations/LJ009-0076.pt|We come to the sermon.
+wavs/LJ048-0194.wav|pitch/LJ048-0194.pt|durations/LJ048-0194.pt|during the morning of November twenty-two prior to the motorcade.
+wavs/LJ049-0050.wav|pitch/LJ049-0050.pt|durations/LJ049-0050.pt|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy.
+wavs/LJ022-0023.wav|pitch/LJ022-0023.pt|durations/LJ022-0023.pt|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read.
+wavs/LJ034-0053.wav|pitch/LJ034-0053.pt|durations/LJ034-0053.pt|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald.
+wavs/LJ035-0129.wav|pitch/LJ035-0129.pt|durations/LJ035-0129.pt|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him.
+wavs/LJ039-0075.wav|pitch/LJ039-0075.pt|durations/LJ039-0075.pt|once you know that you must put the crosshairs on the target and that is all that is necessary.
+wavs/LJ046-0184.wav|pitch/LJ046-0184.pt|durations/LJ046-0184.pt|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes.
+wavs/LJ003-0111.wav|pitch/LJ003-0111.pt|durations/LJ003-0111.pt|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity.
+wavs/LJ037-0234.wav|pitch/LJ037-0234.pt|durations/LJ037-0234.pt|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male,
+wavs/LJ047-0044.wav|pitch/LJ047-0044.pt|durations/LJ047-0044.pt|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies
+wavs/LJ028-0081.wav|pitch/LJ028-0081.pt|durations/LJ028-0081.pt|Years later, when the archaeologists could readily distinguish the false from the true,
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
index 0088bc1f4..819a21fb6 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
@@ -2,7 +2,6 @@
 
 export OMP_NUM_THREADS=1
 #export MPLCONFIGDIR=/disk/scratch1/evdv/tmp/
-export WANDB_SHOW_RUN=true
 #export WANDB_CONFIG_DIR=/disk/scratch1/evdv/tmp/.config/wandb
 
 : ${NUM_GPUS:=1}
@@ -11,16 +10,16 @@ export WANDB_SHOW_RUN=true
 : ${OUTPUT_DIR:="./output_mfa"}
 : ${DATASET_PATH:=LJSpeech-1.1}
 : ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_durs_text_train_v3.txt}
-: ${VAL_FILELIST:=filelists/ljs_audio_pitch_durs_text_val.txt}
+: ${VAL_FILELIST:=filelists/mini_ljs_audio_pitch_durs_text_val.txt}
 : ${AMP:=false}
 : ${SEED:=""}
 
 : ${LEARNING_RATE:=0.1}
 
 # Adjust these when the amount of data changes
-: ${EPOCHS:=1000}
-: ${EPOCHS_PER_CHECKPOINT:=100}
-: ${WARMUP_STEPS:=1000}
+: ${EPOCHS:=50}
+: ${EPOCHS_PER_CHECKPOINT:=10}
+: ${WARMUP_STEPS:=10}
 
 # Train a mixed phoneme/grapheme model
 : ${PHONE:=true}
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index faf7a4382..396f86fc5 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -333,10 +333,6 @@ def plot_batch_mels(pred_tgt_lists, rank):
     # prediction: mel, pitch, energy
     # target: mel, pitch, energy
     for i, mel_pitch_energy in enumerate(pred_tgt_lists):
-        if i == 0:
-            print('PREDICTION')
-        elif i == 1:
-            print('TARGET')
         mels = mel_pitch_energy[0]
         if mels.size(dim=2) == 80:  # tgt and pred mel have diff dimension order
             mels = mels.permute(0, 2, 1)
@@ -348,8 +344,6 @@ def plot_batch_mels(pred_tgt_lists, rank):
             energy = regulate_len(mel_lens, energy)[0]
             pitch = regulate_len(mel_lens, pitch)[0]
 
-        print('PITCH: ', pitch.shape)
-        print('ENERGY', energy.shape)
         regulated_features.append([mels,
                                    pitch.squeeze(axis=2),
                                    energy.squeeze(axis=2)])

From c71ace3367fb0824bb57e62be13a4a8fd647ccbb Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Thu, 17 Mar 2022 14:13:45 +0000
Subject: [PATCH 18/21] Fix up some settings

---
 .../SpeechSynthesis/FastPitch/fastpitch/data_function.py    | 3 +++
 PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh          | 1 +
 PyTorch/SpeechSynthesis/FastPitch/train.py                  | 6 ++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index fcbf7cb05..2d498192f 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -235,6 +235,7 @@ def get_mel(self, filename):
 
         return melspec
 
+    @lru_cache()
     def get_text(self, text):
         text, text_clean, text_arpabet = self.tp.encode_text(text, return_all=True)
         space = [self.tp.encode_text("A A")[1]]
@@ -247,6 +248,7 @@ def get_text(self, text):
 
         return torch.LongTensor(text), text_arpabet
 
+    @lru_cache()
     def get_dur(self, index):
         audiopath, *fields = self.audiopaths_and_text[index]
         name = Path(audiopath).stem
@@ -286,6 +288,7 @@ def get_dur(self, index):
 
         return durs, phones
 
+    @lru_cache()
     def get_pitch(self, index, mel_len=None):
         audiopath, *fields = self.audiopaths_and_text[index]
 
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
index 819a21fb6..db09b0edd 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
@@ -75,6 +75,7 @@ ARGS+=" --n-speakers $NSPEAKERS"
 [ "$LOAD_DURS_FROM_DISK" = TRUE ]  && ARGS+=" --load-durs-from-disk"
 [ "$LOAD_PITCH_FROM_DISK" = TRUE ] && ARGS+=" --load-pitch-from-disk"
 [ "$PITCH_ONLINE_DIR" != "" ]      && ARGS+=" --pitch-online-dir $PITCH_ONLINE_DIR"  # e.g., /dev/shm/pitch
+[ "$DUR_ONLINE_DIR" != "" ]        && ARGS+=" --dur-online-dir $DUR_ONLINE_DIR"  # e.g., /dev/shm/dur
 [ "$PITCH_ONLINE_METHOD" != "" ]   && ARGS+=" --pitch-online-method $PITCH_ONLINE_METHOD"
 [ "$APPEND_SPACES" = true ]        && ARGS+=" --prepend-space-to-text"
 [ "$APPEND_SPACES" = true ]        && ARGS+=" --append-space-to-text"
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 396f86fc5..fee177209 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -148,6 +148,8 @@ def parse_args(parser):
                       help='Calculate pitch on the fly during trainig')
     cond.add_argument('--pitch-online-dir', type=str, default=None,
                       help='A directory for storing pitch calculated on-line')
+    cond.add_argument('--dur-online-dir', type=str, default=None,
+                      help='A directory for storing durations calculated on-line')
     cond.add_argument('--pitch-mean', type=float, default=214.72203,
                       help='Normalization value for pitch')
     cond.add_argument('--pitch-std', type=float, default=65.72038,
@@ -406,8 +408,8 @@ def validate(model, criterion, valset, batch_size, collate_fn, distributed_run,
             # y_pred = mel_out, dec_lens, dur_pred, pitch_pred, energy_pred
             y_pred = model(x)
 
-            #if i % 5 == 0:
-            log_validation_batch(x, y_pred, rank)
+            if i % 5 == 0:
+                log_validation_batch(x, y_pred, rank)
 
             loss, meta = criterion(y_pred, y, is_training=False, meta_agg='sum')
 

From 913ee6c463b6006c2a3ccfadab2aee9ee907d94c Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Thu, 17 Mar 2022 15:27:54 +0000
Subject: [PATCH 19/21] Train with normalised energy

---
 .../SpeechSynthesis/FastPitch/fastpitch/arg_parser.py  |  1 +
 .../FastPitch/fastpitch/data_function.py               | 10 +++++++++-
 PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py   |  6 ++++--
 PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh     |  2 +-
 PyTorch/SpeechSynthesis/FastPitch/train.py             |  2 ++
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py
index 4e5b13764..cb1d8a581 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py
@@ -110,6 +110,7 @@ def parse_fastpitch_args(parent, add_help=False):
 
     energy_pred = parser.add_argument_group('energy predictor parameters')
     energy_pred.add_argument('--energy-conditioning', action='store_true')
+    energy_pred.add_argument('--norm_energy', action='store_true')
     energy_pred.add_argument('--energy-predictor-kernel-size', default=3, type=int,
                             help='Pitch predictor conv-1D kernel size')
     energy_pred.add_argument('--energy-predictor-filter-size', default=256, type=int,
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
index 2d498192f..001c90f22 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
@@ -130,7 +130,8 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
                  cmu_dict='cmudict/cmudict-0.7b',
                  n_speakers=1, load_mel_from_disk=True,
                  load_pitch_from_disk=True, pitch_mean=214.72203,
-                 pitch_std=65.72038, max_wav_value=None, sampling_rate=None,
+                 pitch_std=65.72038, energy_mean=51.796032, energy_std=9.861213,
+                 max_wav_value=None, sampling_rate=None,
                  filter_length=None, hop_length=None, win_length=None,
                  mel_fmin=None, mel_fmax=None, prepend_space_to_text=False,
                  append_space_to_text=False, load_durs_from_disk=False,
@@ -185,6 +186,8 @@ def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
         to_tensor = lambda x: torch.Tensor([x]) if type(x) is float else x
         self.pitch_mean = to_tensor(pitch_mean)
         self.pitch_std = to_tensor(pitch_std)
+        self.energy_mean = to_tensor(energy_mean)
+        self.energy_std = to_tensor(energy_std)
 
     def __getitem__(self, index):
         # Separate filename and text
@@ -198,6 +201,11 @@ def __getitem__(self, index):
         mel = self.get_mel(audiopath)
         pitch = self.get_pitch(index, mel.size(-1))
         energy = torch.norm(mel.float(), dim=0, p=2)
+        if self.energy_mean is not None:
+            assert self.energy_std is not None
+            norm_energy = normalize_pitch(energy.unsqueeze(dim=0), self.energy_mean, self.energy_std)
+            energy = norm_energy.squeeze()
+
         dur, phones = self.get_dur(index)
         text = phones
         assert pitch.size(-1) == mel.size(-1)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
index 29ce2a42a..b8f02300b 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
@@ -125,7 +125,7 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx,
                  energy_predictor_kernel_size, energy_predictor_filter_size,
                  p_energy_predictor_dropout, energy_predictor_n_layers,
                  energy_embedding_kernel_size,
-                 n_speakers, speaker_emb_weight, pitch_conditioning_formants=1):
+                 n_speakers, speaker_emb_weight, pitch_conditioning_formants=1, norm_energy=True):
         super(FastPitch, self).__init__()
 
         self.encoder = FFTransformer(
@@ -186,6 +186,7 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx,
         self.register_buffer('pitch_std', torch.zeros(1))
 
         self.energy_conditioning = energy_conditioning
+        self.norm_energy = norm_energy
         if energy_conditioning:
             self.energy_predictor = TemporalPredictor(
                 in_fft_output_size,
@@ -243,7 +244,8 @@ def forward(self, inputs, use_gt_pitch=True, use_gt_durations=True, pace=1.0, ma
 
             # Average energy over characters
             energy_tgt = average_pitch(energy_dense.unsqueeze(1), dur_tgt)
-            energy_tgt = torch.log(1.0 + energy_tgt)
+            if not self.norm_energy:
+                energy_tgt = torch.log(1.0 + energy_tgt)
 
             energy_emb = self.energy_emb(energy_tgt)
             energy_tgt = energy_tgt.squeeze(1)
diff --git a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
index db09b0edd..1b2f0ce4d 100755
--- a/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
+++ b/PyTorch/SpeechSynthesis/FastPitch/scripts/train.sh
@@ -7,7 +7,7 @@ export OMP_NUM_THREADS=1
 : ${NUM_GPUS:=1}
 : ${BATCH_SIZE:=2}
 : ${GRAD_ACCUMULATION:=2}
-: ${OUTPUT_DIR:="./output_mfa"}
+: ${OUTPUT_DIR:="./output_mfa/norm"}
 : ${DATASET_PATH:=LJSpeech-1.1}
 : ${TRAIN_FILELIST:=filelists/ljs_audio_pitch_durs_text_train_v3.txt}
 : ${VAL_FILELIST:=filelists/mini_ljs_audio_pitch_durs_text_val.txt}
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index fee177209..13079a87d 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -653,6 +653,8 @@ def main():
                 y_pred = model(x, use_gt_durations=True)
                 # y = mel_padded, input_lengths, output_lengths
                 loss, meta = criterion(y_pred, y)
+                print(loss)
+                print(meta)
                 loss /= args.grad_accumulation
 
             meta = {k: v / args.grad_accumulation

From b57ef295ac98de3face4da3ca2e8530eca78a965 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Thu, 17 Mar 2022 17:18:33 +0000
Subject: [PATCH 20/21] Remove loss print

---
 PyTorch/SpeechSynthesis/FastPitch/train.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index 13079a87d..fee177209 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -653,8 +653,6 @@ def main():
                 y_pred = model(x, use_gt_durations=True)
                 # y = mel_padded, input_lengths, output_lengths
                 loss, meta = criterion(y_pred, y)
-                print(loss)
-                print(meta)
                 loss /= args.grad_accumulation
 
             meta = {k: v / args.grad_accumulation

From 035bcf6c177218a63a0e687a74ea3551cb47ac85 Mon Sep 17 00:00:00 2001
From: evdv <emelie.vdvreken@gmail.com>
Date: Thu, 17 Mar 2022 23:50:51 +0000
Subject: [PATCH 21/21] Comment out the training loop body and epoch-end steps,
 leaving only print(batch[-1])

---
 PyTorch/SpeechSynthesis/FastPitch/train.py | 251 +++++++++++----------
 1 file changed, 126 insertions(+), 125 deletions(-)

diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py
index fee177209..3b682713d 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/train.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/train.py
@@ -634,131 +634,132 @@ def main():
         epoch_iter = 0
         num_iters = len(train_loader) // args.grad_accumulation
         for batch in train_loader:
-
-            if accumulated_steps == 0:
-                if epoch_iter == num_iters:
-                    break
-                total_iter += 1
-                epoch_iter += 1
-
-                adjust_learning_rate(total_iter, optimizer, args.learning_rate,
-                                     args.warmup_steps)
-
-                model.zero_grad(set_to_none=True)
-
-            x, y, num_frames = batch_to_gpu(batch)
-
-            with torch.cuda.amp.autocast(enabled=args.amp):
-                # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt)
-                y_pred = model(x, use_gt_durations=True)
-                # y = mel_padded, input_lengths, output_lengths
-                loss, meta = criterion(y_pred, y)
-                loss /= args.grad_accumulation
-
-            meta = {k: v / args.grad_accumulation
-                    for k, v in meta.items()}
-
-            if args.amp:
-                scaler.scale(loss).backward()
-            else:
-                loss.backward()
-
-            if distributed_run:
-                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
-                reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
-                meta = {k: reduce_tensor(v, args.world_size) for k, v in meta.items()}
-            else:
-                reduced_loss = loss.item()
-                reduced_num_frames = num_frames.item()
-            if np.isnan(reduced_loss):
-                raise Exception("loss is NaN")
-
-            accumulated_steps += 1
-            iter_loss += reduced_loss
-            iter_num_frames += reduced_num_frames
-            iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0) for k in meta}
-
-            if accumulated_steps % args.grad_accumulation == 0:
-
-                if args.amp:
-                    scaler.unscale_(optimizer)
-                    torch.nn.utils.clip_grad_norm_(
-                        model.parameters(), args.grad_clip_thresh)
-                    scaler.step(optimizer)
-                    scaler.update()
-                else:
-                    torch.nn.utils.clip_grad_norm_(
-                        model.parameters(), args.grad_clip_thresh)
-                    optimizer.step()
-
-                if args.ema_decay > 0.0:
-                    apply_multi_tensor_ema(args.ema_decay, *mt_ema_params)
-
-                iter_mel_loss = iter_meta['mel_loss'].item()
-                iter_pitch_loss = iter_meta['pitch_loss'].item()
-                iter_energy_loss = iter_meta['energy_loss'].item()
-                iter_dur_loss = iter_meta['duration_predictor_loss'].item()
-                iter_time = time.perf_counter() - iter_start_time
-                epoch_frames_per_sec += iter_num_frames / iter_time
-                epoch_loss += iter_loss
-                epoch_num_frames += iter_num_frames
-                epoch_mel_loss += iter_mel_loss
-                epoch_pitch_loss += iter_pitch_loss
-                epoch_energy_loss += iter_energy_loss
-                epoch_dur_loss += iter_dur_loss
-
-                if epoch_iter % 5 == 0:
-                    log({
-                        'epoch': epoch,
-                        'epoch_iter': epoch_iter,
-                        'num_iters': num_iters,
-                        'total_steps': total_iter,
-                        'loss/loss': iter_loss,
-                        'mel-loss/mel_loss': iter_mel_loss,
-                        'pitch-loss/pitch_loss': iter_pitch_loss,
-                        'energy-loss/energy_loss': iter_energy_loss,
-                        'dur-loss/dur_loss': iter_dur_loss,
-                        'frames per s': iter_num_frames / iter_time,
-                        'took': iter_time,
-                        'lrate': optimizer.param_groups[0]['lr'],
-                    }, args.local_rank)
-
-                accumulated_steps = 0
-                iter_loss = 0
-                iter_num_frames = 0
-                iter_meta = {}
-                iter_start_time = time.perf_counter()
-                # for debugging only
-                # validate(model, criterion, valset, args.batch_size, collate_fn,
-                #          distributed_run, batch_to_gpu, args.local_rank)
-
-        # Finished epoch
-        epoch_loss /= epoch_iter
-        epoch_mel_loss /= epoch_iter
-        epoch_time = time.perf_counter() - epoch_start_time
-
-        log({
-            'epoch': epoch,
-            'loss/epoch_loss': epoch_loss,
-            'mel-loss/epoch_mel_loss': epoch_mel_loss,
-            'pitch-loss/epoch_pitch_loss': epoch_pitch_loss,
-            'energy-loss/epoch_energy_loss': epoch_energy_loss,
-            'dur-loss/epoch_dur_loss': epoch_dur_loss,
-            'epoch_frames per s': epoch_num_frames / epoch_time,
-            'epoch_took': epoch_time,
-        }, args.local_rank)
-        bmark_stats.update(epoch_num_frames, epoch_loss, epoch_mel_loss,
-                           epoch_time)
-
-        validate(model, criterion, valset, args.batch_size, collate_fn,
-                 distributed_run, batch_to_gpu, args.local_rank)
-
-        if args.ema_decay > 0:
-            validate(ema_model, criterion, valset, args.batch_size, collate_fn,
-                     distributed_run, batch_to_gpu, args.local_rank)
-
-        maybe_save_checkpoint(args, model, ema_model, optimizer, scaler, epoch,
-                              total_iter, model_config)
+            print(batch[-1])
+        #
+        #     if accumulated_steps == 0:
+        #         if epoch_iter == num_iters:
+        #             break
+        #         total_iter += 1
+        #         epoch_iter += 1
+        #
+        #         adjust_learning_rate(total_iter, optimizer, args.learning_rate,
+        #                              args.warmup_steps)
+        #
+        #         model.zero_grad(set_to_none=True)
+        #
+        #     x, y, num_frames = batch_to_gpu(batch)
+        #
+        #     with torch.cuda.amp.autocast(enabled=args.amp):
+        #         # (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, energy_pred, energy_tgt)
+        #         y_pred = model(x, use_gt_durations=True)
+        #         # y = mel_padded, input_lengths, output_lengths
+        #         loss, meta = criterion(y_pred, y)
+        #         loss /= args.grad_accumulation
+        #
+        #     meta = {k: v / args.grad_accumulation
+        #             for k, v in meta.items()}
+        #
+        #     if args.amp:
+        #         scaler.scale(loss).backward()
+        #     else:
+        #         loss.backward()
+        #
+        #     if distributed_run:
+        #         reduced_loss = reduce_tensor(loss.data, args.world_size).item()
+        #         reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
+        #         meta = {k: reduce_tensor(v, args.world_size) for k, v in meta.items()}
+        #     else:
+        #         reduced_loss = loss.item()
+        #         reduced_num_frames = num_frames.item()
+        #     if np.isnan(reduced_loss):
+        #         raise Exception("loss is NaN")
+        #
+        #     accumulated_steps += 1
+        #     iter_loss += reduced_loss
+        #     iter_num_frames += reduced_num_frames
+        #     iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0) for k in meta}
+        #
+        #     if accumulated_steps % args.grad_accumulation == 0:
+        #
+        #         if args.amp:
+        #             scaler.unscale_(optimizer)
+        #             torch.nn.utils.clip_grad_norm_(
+        #                 model.parameters(), args.grad_clip_thresh)
+        #             scaler.step(optimizer)
+        #             scaler.update()
+        #         else:
+        #             torch.nn.utils.clip_grad_norm_(
+        #                 model.parameters(), args.grad_clip_thresh)
+        #             optimizer.step()
+        #
+        #         if args.ema_decay > 0.0:
+        #             apply_multi_tensor_ema(args.ema_decay, *mt_ema_params)
+        #
+        #         iter_mel_loss = iter_meta['mel_loss'].item()
+        #         iter_pitch_loss = iter_meta['pitch_loss'].item()
+        #         iter_energy_loss = iter_meta['energy_loss'].item()
+        #         iter_dur_loss = iter_meta['duration_predictor_loss'].item()
+        #         iter_time = time.perf_counter() - iter_start_time
+        #         epoch_frames_per_sec += iter_num_frames / iter_time
+        #         epoch_loss += iter_loss
+        #         epoch_num_frames += iter_num_frames
+        #         epoch_mel_loss += iter_mel_loss
+        #         epoch_pitch_loss += iter_pitch_loss
+        #         epoch_energy_loss += iter_energy_loss
+        #         epoch_dur_loss += iter_dur_loss
+        #
+        #         if epoch_iter % 5 == 0:
+        #             log({
+        #                 'epoch': epoch,
+        #                 'epoch_iter': epoch_iter,
+        #                 'num_iters': num_iters,
+        #                 'total_steps': total_iter,
+        #                 'loss/loss': iter_loss,
+        #                 'mel-loss/mel_loss': iter_mel_loss,
+        #                 'pitch-loss/pitch_loss': iter_pitch_loss,
+        #                 'energy-loss/energy_loss': iter_energy_loss,
+        #                 'dur-loss/dur_loss': iter_dur_loss,
+        #                 'frames per s': iter_num_frames / iter_time,
+        #                 'took': iter_time,
+        #                 'lrate': optimizer.param_groups[0]['lr'],
+        #             }, args.local_rank)
+        #
+        #         accumulated_steps = 0
+        #         iter_loss = 0
+        #         iter_num_frames = 0
+        #         iter_meta = {}
+        #         iter_start_time = time.perf_counter()
+        #         # for debugging only
+        #         # validate(model, criterion, valset, args.batch_size, collate_fn,
+        #         #          distributed_run, batch_to_gpu, args.local_rank)
+        #
+        # # Finished epoch
+        # epoch_loss /= epoch_iter
+        # epoch_mel_loss /= epoch_iter
+        # epoch_time = time.perf_counter() - epoch_start_time
+        #
+        # log({
+        #     'epoch': epoch,
+        #     'loss/epoch_loss': epoch_loss,
+        #     'mel-loss/epoch_mel_loss': epoch_mel_loss,
+        #     'pitch-loss/epoch_pitch_loss': epoch_pitch_loss,
+        #     'energy-loss/epoch_energy_loss': epoch_energy_loss,
+        #     'dur-loss/epoch_dur_loss': epoch_dur_loss,
+        #     'epoch_frames per s': epoch_num_frames / epoch_time,
+        #     'epoch_took': epoch_time,
+        # }, args.local_rank)
+        # bmark_stats.update(epoch_num_frames, epoch_loss, epoch_mel_loss,
+        #                    epoch_time)
+        #
+        # validate(model, criterion, valset, args.batch_size, collate_fn,
+        #          distributed_run, batch_to_gpu, args.local_rank)
+        #
+        # if args.ema_decay > 0:
+        #     validate(ema_model, criterion, valset, args.batch_size, collate_fn,
+        #              distributed_run, batch_to_gpu, args.local_rank)
+        #
+        # maybe_save_checkpoint(args, model, ema_model, optimizer, scaler, epoch,
+        #                       total_iter, model_config)
 
     # Finished training
     if len(bmark_stats) > 0: