Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 7 additions & 16 deletions PyTorch/SpeechSynthesis/FastPitch/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,11 @@
import warnings
from pathlib import Path
from typing import Optional

import librosa
import numpy as np

import torch
from scipy.io.wavfile import read

from csv import DictReader

class BenchmarkStats:
""" Tracks statistics used for benchmarking. """
Expand Down Expand Up @@ -67,23 +65,16 @@ def load_wav_to_torch(full_path, force_sampling_rate=None):
return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(fnames, dataset_path=None, has_speakers=False, has_conditions=False,
                            split="|"):
    """Read header-led, delimiter-separated metadata files into row dicts.

    Every file in ``fnames`` is parsed with ``csv.DictReader``: its first
    line supplies the column names (e.g. ``mels|pitch|text``, optionally
    followed by further columns) and each subsequent line becomes one dict
    keyed by those names.

    Args:
        fnames: Iterable of paths to the metadata files.
        dataset_path: Accepted for interface compatibility; not applied to
            the returned values by this function.
        has_speakers: Accepted for interface compatibility; unused, because
            columns are identified by the header row.
        has_conditions: Accepted for interface compatibility; unused, for
            the same reason.
        split: Single-character column delimiter handed to ``DictReader``.

    Returns:
        A list with one dict per data row, in file order, accumulated over
        all files in ``fnames``. Empty when ``fnames`` is empty.
    """
    # NOTE(review): dataset_path is not joined onto any path column here --
    # confirm that callers resolve relative paths themselves.
    fpaths_and_text = []
    for fname in fnames:
        # newline='' leaves line-ending handling to the csv module.
        with open(fname, encoding='utf-8', newline='') as f:
            # extend (not rebind) so rows from earlier files are kept.
            fpaths_and_text.extend(DictReader(f, delimiter=split))

    return fpaths_and_text


Expand Down
2 changes: 2 additions & 0 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,5 +126,7 @@ def parse_fastpitch_args(parent, add_help=False):
help='Pitch embedding conv-1D kernel size')
cond.add_argument('--speaker-emb-weight', type=float, default=1.0,
help='Scale speaker embedding')
cond.add_argument('--condition-emb-weight', type=float, default=1.0,
help='Scale condition embedding')

return parser
61 changes: 42 additions & 19 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def __init__(self,
symbol_set='english_basic',
p_arpabet=1.0,
n_speakers=1,
n_conditions=1,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default is 1, but to have conditions there should be more than 1? If 1 is a way of saying there are no conditions, why not 0?

load_mel_from_disk=True,
load_pitch_from_disk=True,
pitch_mean=214.72203, # LJSpeech defaults
Expand All @@ -160,9 +161,10 @@ def __init__(self,
audiopaths_and_text = [audiopaths_and_text]

self.dataset_path = dataset_path
# this now returns a list of dicts
self.audiopaths_and_text = load_filepaths_and_text(
audiopaths_and_text, dataset_path,
has_speakers=(n_speakers > 1))
has_speakers=(n_speakers > 1), has_conditions=(n_conditions > 1))
self.load_mel_from_disk = load_mel_from_disk
if not load_mel_from_disk:
self.max_wav_value = max_wav_value
Expand All @@ -181,6 +183,7 @@ def __init__(self,

self.tp = TextProcessing(symbol_set, text_cleaners, p_arpabet=p_arpabet)
self.n_speakers = n_speakers
self.n_conditions = n_conditions
self.pitch_tmp_dir = pitch_online_dir
self.f0_method = pitch_online_method
self.betabinomial_tmp_dir = betabinomial_online_dir
Expand All @@ -189,13 +192,13 @@ def __init__(self,
if use_betabinomial_interpolator:
self.betabinomial_interpolator = BetaBinomialInterpolator()

expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1))

expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1) + (n_conditions > 1))
print('EXPECTED COLUMNS IS ' + str(expected_columns))
assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None)

if len(self.audiopaths_and_text[0]) < expected_columns:
raise ValueError(f'Expected {expected_columns} columns in audiopaths file. '
'The format is <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]')
'The format is <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>|<condition_id>]')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, I guess we're checking it here?


if len(self.audiopaths_and_text[0]) > expected_columns:
print('WARNING: Audiopaths file has more columns than expected')
Expand All @@ -205,16 +208,26 @@ def __init__(self,
self.pitch_std = to_tensor(pitch_std)

def __getitem__(self, index):
# Separate filename and text
# Indexing items using dictionary entries
audiopath = self.audiopaths_and_text[index]['mels']
text = self.audiopaths_and_text[index]['text']
speaker = None
condition = None
if self.n_speakers > 1:
audiopath, *extra, text, speaker = self.audiopaths_and_text[index]
speaker = int(speaker)
else:
audiopath, *extra, text = self.audiopaths_and_text[index]
speaker = None
speaker = int(self.audiopaths_and_text[index]['speaker'])
if self.n_conditions > 1:
cond = self.audiopaths_and_text[index]['condition']
if cond is None or cond == 'None':
print(audiopath, text, self.audiopaths_and_text[index])
condition = int(self.audiopaths_and_text[index]['condition'])

mel = self.get_mel(audiopath)
if mel.size(1) > 700:
print('MEL LEN: ', mel.size(), audiopath)
text = self.get_text(text)
length = len(text)
if length >= 130:
print('LENGTH: ', len(text), audiopath)
pitch = self.get_pitch(index, mel.size(-1))
energy = torch.norm(mel.float(), dim=0, p=2)
attn_prior = self.get_prior(index, mel.shape[1], text.shape[0])
Expand All @@ -226,7 +239,7 @@ def __getitem__(self, index):
pitch = pitch[None, :]

return (text, mel, len(text), pitch, energy, speaker, attn_prior,
audiopath)
audiopath, condition)

def __len__(self):
return len(self.audiopaths_and_text)
Expand Down Expand Up @@ -287,15 +300,15 @@ def get_prior(self, index, mel_len, text_len):
return attn_prior

def get_pitch(self, index, mel_len=None):
audiopath, *fields = self.audiopaths_and_text[index]
audiopath = self.audiopaths_and_text[index]['mels']

# why do we need the speaker here?
spk = 0
if self.n_speakers > 1:
spk = int(fields[-1])
else:
spk = 0
spk = int(self.audiopaths_and_text[index]['speaker'])

if self.load_pitch_from_disk:
pitchpath = fields[0]
pitchpath = self.audiopaths_and_text[index]['pitch']
pitch = torch.load(pitchpath)
if self.pitch_mean is not None:
assert self.pitch_std is not None
Expand Down Expand Up @@ -386,14 +399,21 @@ def __call__(self, batch):

audiopaths = [batch[i][7] for i in ids_sorted_decreasing]

if batch[0][8] is not None:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I imagine this is the bit that would need updating once the other code is merged?

condition = torch.zeros_like(input_lengths)
for i in range(len(ids_sorted_decreasing)):
condition[i] = batch[ids_sorted_decreasing[i]][8]
else:
condition = None

return (text_padded, input_lengths, mel_padded, output_lengths, len_x,
pitch_padded, energy_padded, speaker, attn_prior_padded,
audiopaths)
audiopaths, condition)


def batch_to_gpu(batch):
(text_padded, input_lengths, mel_padded, output_lengths, len_x,
pitch_padded, energy_padded, speaker, attn_prior, audiopaths) = batch
pitch_padded, energy_padded, speaker, attn_prior, audiopaths, condition) = batch

text_padded = to_gpu(text_padded).long()
input_lengths = to_gpu(input_lengths).long()
Expand All @@ -404,10 +424,13 @@ def batch_to_gpu(batch):
attn_prior = to_gpu(attn_prior).float()
if speaker is not None:
speaker = to_gpu(speaker).long()
if condition is not None:
condition = to_gpu(condition).long()

# Alignments act as both inputs and targets - pass shallow copies
x = [text_padded, input_lengths, mel_padded, output_lengths,
pitch_padded, energy_padded, speaker, attn_prior, audiopaths]
pitch_padded, energy_padded, speaker, attn_prior, audiopaths, condition]
y = [mel_padded, input_lengths, output_lengths]
len_x = torch.sum(output_lengths)
return (x, y, len_x)

41 changes: 32 additions & 9 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx,
energy_predictor_kernel_size, energy_predictor_filter_size,
p_energy_predictor_dropout, energy_predictor_n_layers,
energy_embedding_kernel_size,
n_speakers, speaker_emb_weight, pitch_conditioning_formants=1):
n_speakers, speaker_emb_weight, n_conditions, condition_emb_weight, pitch_conditioning_formants=1):
super(FastPitch, self).__init__()

self.encoder = FFTransformer(
Expand All @@ -149,6 +149,14 @@ def __init__(self, n_mel_channels, n_symbols, padding_idx,
self.speaker_emb = None
self.speaker_emb_weight = speaker_emb_weight

#Have to figure out what symbols_embedding_dim is
if n_conditions > 1:
self.condition_emb = nn.Embedding(n_conditions, symbols_embedding_dim)
else:
self.condition_emb = None
self.condition_emb_weight = condition_emb_weight


self.duration_predictor = TemporalPredictor(
in_fft_output_size,
filter_size=dur_predictor_filter_size,
Expand Down Expand Up @@ -242,7 +250,7 @@ def binarize_attention_parallel(self, attn, in_lens, out_lens):
def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):

(inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense,
speaker, attn_prior, audiopaths) = inputs
speaker, attn_prior, audiopaths, condition) = inputs
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a side note, I am wondering if we should make inputs/outputs some enum or datatype, so that we don't rely on indices to get different things out.


mel_max_len = mel_tgt.size(2)

Expand All @@ -253,8 +261,15 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
spk_emb = self.speaker_emb(speaker).unsqueeze(1)
spk_emb.mul_(self.speaker_emb_weight)

# Calculate discrete condition embedding
if self.condition_emb is None:
cond_emb = 0
else:
cond_emb = self.condition_emb(condition).unsqueeze(1)
cond_emb.mul_(self.condition_emb_weight)

# Input FFT
enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb)
enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb, conditioning_2=cond_emb) #need to add condition conditioning here

# Alignment
text_emb = self.encoder.word_emb(inputs)
Expand All @@ -281,7 +296,7 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration)

# Predict pitch
pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1)
pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1) #maybe we want to condition pitch prediction on the conditioning parameter.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool idea, we should make a ticket for this


# Average pitch over characters
pitch_tgt = average_pitch(pitch_dense, dur_tgt)
Expand All @@ -290,7 +305,7 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
pitch_emb = self.pitch_emb(pitch_tgt)
else:
pitch_emb = self.pitch_emb(pitch_pred)
enc_out = enc_out + pitch_emb.transpose(1, 2)
enc_out = enc_out + pitch_emb.transpose(1, 2) #Adding with encoder output

# Predict energy
if self.energy_conditioning:
Expand All @@ -302,13 +317,13 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):

energy_emb = self.energy_emb(energy_tgt)
energy_tgt = energy_tgt.squeeze(1)
enc_out = enc_out + energy_emb.transpose(1, 2)
enc_out = enc_out + energy_emb.transpose(1, 2) #adding to encoder output
else:
energy_pred = None
energy_tgt = None

len_regulated, dec_lens = regulate_len(
dur_tgt, enc_out, pace, mel_max_len)
dur_tgt, enc_out, pace, mel_max_len) #upsampling

# Output FFT
dec_out, dec_mask = self.decoder(len_regulated, dec_lens)
Expand All @@ -319,7 +334,7 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):

def infer(self, inputs, pace=1.0, dur_tgt=None, pitch_tgt=None,
energy_tgt=None, pitch_transform=None, max_duration=75,
speaker=0):
speaker=0, condition=0):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so because this is the condition index, the default is 0 (despite no condition being n_conditions = 1 and so far there only being able to be 1 condition?)
Just making sure I understand, once again my brain is melting


if self.speaker_emb is None:
spk_emb = 0
Expand All @@ -329,8 +344,16 @@ def infer(self, inputs, pace=1.0, dur_tgt=None, pitch_tgt=None,
spk_emb = self.speaker_emb(speaker).unsqueeze(1)
spk_emb.mul_(self.speaker_emb_weight)

if self.condition_emb is None:
cond_emb = 0
else:
condition = (torch.ones(inputs.size(0)).long().to(inputs.device)
* condition)
cond_emb = self.condition_emb(condition).unsqueeze(1)
cond_emb.mul_(self.condition_emb_weight)

# Input FFT
enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb)
enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb, conditioning_2=cond_emb) #need to add conditioning here but will it take list?

# Predict durations
log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1)
Expand Down
4 changes: 2 additions & 2 deletions PyTorch/SpeechSynthesis/FastPitch/fastpitch/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def __init__(self, n_layer, n_head, d_model, d_head, d_inner, kernel_size,
dropatt=dropatt, pre_lnorm=pre_lnorm)
)

def forward(self, dec_inp, seq_lens=None, conditioning=0):
def forward(self, dec_inp, seq_lens=None, conditioning=0, conditioning_2=0): #here when called we add speaker or other discrete condition
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could make condition a tuple, or rename the conditionings to conditioning_speaker and conditioning_other.

if self.word_emb is None:
inp = dec_inp
mask = mask_from_lens(seq_lens).unsqueeze(2)
Expand All @@ -204,7 +204,7 @@ def forward(self, dec_inp, seq_lens=None, conditioning=0):
pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype)
pos_emb = self.pos_emb(pos_seq) * mask

out = self.drop(inp + pos_emb + conditioning)
out = self.drop(inp + pos_emb + conditioning + conditioning_2) # so here we add more conditioning

for layer in self.layers:
out = layer(out, mask=mask)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mels|pitch|text
wavs/LJ045-0096.wav|pitch/LJ045-0096.pt|Mrs. De Mohrenschildt thought that Oswald,
wavs/LJ049-0022.wav|pitch/LJ049-0022.pt|The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle with a fixed top, even though transparent.
wavs/LJ033-0042.wav|pitch/LJ033-0042.pt|Between the hours of eight and nine p.m. they were occupied with the children in the bedrooms located at the extreme east end of the house.
Expand Down
Loading