athena.data.datasets.tts.speech_synthesis

Audio dataset builder for speech synthesis (TTS).

Module Contents

Classes

SpeechSynthesisDatasetBuilder

SpeechSynthesisDatasetBuilder

class athena.data.datasets.tts.speech_synthesis.SpeechSynthesisDatasetBuilder(config=None)

Bases: athena.data.datasets.base.SpeechBaseDatasetBuilder

SpeechSynthesisDatasetBuilder

property num_class

@property

Returns

the maximum index (max_index) of the vocabulary

Return type

int

property feat_dim

Return the number of feature dimensions.

property sample_type

@property

Returns

sample_type of the dataset:

{
    "utt_id": tf.string,
    "input": tf.int32,
    "input_length": tf.int32,
    "output_length": tf.int32,
    "output": tf.float32,
    "speaker": tf.int32
}

Return type

dict

property sample_shape

@property

Returns

sample_shape of the dataset:

{
    "utt_id": tf.TensorShape([]),
    "input": tf.TensorShape([None]),
    "input_length": tf.TensorShape([]),
    "output_length": tf.TensorShape([]),
    "output": tf.TensorShape([None, feature_dim]),
    "speaker": tf.TensorShape([])
}

Return type

dict

property sample_signature

@property

Returns

sample_signature of the dataset:

{
    "utt_id": tf.TensorSpec(shape=(None), dtype=tf.string),
    "input": tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    "input_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output_length": tf.TensorSpec(shape=(None), dtype=tf.int32),
    "output": tf.TensorSpec(shape=(None, None, feature_dim),
                            dtype=tf.float32),
    "speaker": tf.TensorSpec(shape=(None), dtype=tf.int32)
}

Return type

dict

default_config
preprocess_data(file_path)

Generate a list of tuples (wav_filename, wav_length_ms, transcript, speaker).

__getitem__(index)