# MegaTTS3 / duration_lm / config.yaml
# Tags: Text-to-Speech, Safetensors, English, Chinese
# Uploader: ZiyueJiang; commit 0a454f4 (verified), message: "Upload 15 files"
# Flat hyperparameter mapping. Keys are kept in ASCII-alphabetical order and
# short integer lists are written in flow style.
# NOTE(review): the meaning of each key is defined by the consuming code,
# which is not visible here; comments below only record relations that the
# values themselves show, or are explicitly marked as assumptions.

# NOTE(review): the second row equals (hop_size, fft_size,
# audio_num_mel_bins) as set further down; presumably each row is one such
# triple. Confirm against the loader.
acous_params:
  - [480, 1200, 80]
  - [240, 1200, 160]
amp: false
audio_num_mel_bins: 160  # same value as ref_mel_bins
audio_sample_rate: 24000

balance_sil: true

c_spk_enc: 512
char_dict_size: 15000
conv_use_pos: false

dec0_dilations: [1, 2, 4, 1, 2, 4, 1]
dec0_kernel_size: 3
dec_dilations: [1, 2, 1, 2, 1]
dec_ffn_kernel_size: 9
dec_hidden_size: 2048
dec_inp_add_noise: false
dec_kernel_size: 5
dec_layers: 4
dec_num_heads: 8
dec_post_net_kernel: 3
decoder_rnn_dim: 0
decoder_type: conv
dropout: 0.0
ds_add_pitch_embed: false
dur_alpha: 1.0
dur_code_size: 128
dur_context_enc: true
dur_log: true
dur_model_hidden_size: 512
dur_model_layers: 8
dur_model_type: ar_mse
dur_predictor_kernel: 3
dur_predictor_layers: 2
dur_txt_hs: 512
dur_use_char: true
dur_use_spk: true

enc_dec_norm: ln
enc_dilations: [1, 1, 1, 1]
enc_ffn_kernel_size: 3
enc_hidden_size: 256
enc_kernel_size: 5
enc_layers: 4
enc_post_net_kernel: 3
enc_pre_ln: true
enc_prenet: true
encoder_K: 8
encoder_type: rel_fft

f0_max: 600
f0_min: 60
ffn_act: gelu
ffn_hidden_size: 1024
fft_size: 1200
fg_spk_enc_hidden: 256
flatten_dec: true
fmax: 12000  # numerically audio_sample_rate / 2
fmin: 0
frames_multiple: 8

hidden_size: 512
hop_size: 240

ignore_begin_end_sil: false

lat_for_dur: false
latent_size: 256
layers_in_block: 2
# NOTE(review): nine sizes are listed here but ling_labels names only one
# label; confirm how the consumer pairs the two lists.
ling_label_dict_size: [20, 4, 5, 2, 3, 3, 3, 6, 15]
ling_labels: [tone]
lm_num_layers: 24
lm_use_enc: true
loud_norm: false

max_tokens: 6000
mel_vmax: 0.5
mel_vmin: -6
min_frames: 0
mix_melout_timbre: true
mix_ph_timbre: false
model_type: 1
multistage: false

no_text_enc: false
num_heads: 2

out_wav_norm: true

pad_frames: false
precision: fp16
predict_pitch: false
predictor_dropout: 0.0
predictor_grad: 1.0
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 5
print_nan_grads: true

ref_mel_bins: 160  # same value as audio_num_mel_bins
ref_size_max: 2000
ref_size_min: 1000
remove_sil: false

shuffle_ref: false
split_ref: true

temperature: 0.8
tone_percep_ckpt: ''  # empty string, not null
train_spk_embed_only: false

use_bert_input: false
use_char: true
use_cur_global: false
use_cur_global_dec: true
use_dur_embed: true
use_dur_mask_embed: true
use_finegrained_spk: false
use_global_lat: false
use_gpt: true
use_gt_dur: false
use_gt_f0: false
use_mix_spk_embed: false
use_new_vae: false
use_ph_level_f0: false
use_ph_pos_embed: true
use_pitch_embed: false
use_pitch_embed_dec: false
use_pitch_pred: true
use_pos_embed: false
use_post_ln: false
use_random_spk_embed: false
use_rot_embed: true
use_spk_embed: false
use_spk_enc: false
use_spk_id: false
use_text_postnet: true
use_uv: true
use_vae: true
use_vqvae: true
use_word_encoder: true
use_word_input: false

vae_dur_grad: 0.1
vae_enc_hidden_size: 384
vae_word_conder_layers: 0
vq_stride: 8

w_nonsil: 10.0
w_sil: 1.0
word_dict_size: 10000

z_channels: 64
z_clamp: 2.0