Text-to-Speech
Safetensors
English
Chinese
ZiyueJiang committed on
Commit
0a454f4
·
verified ·
1 Parent(s): 8607901

Upload 15 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ g2p/tokenizer.json filter=lfs diff=lfs merge=lfs -text
aligner_lm/config.yaml ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ acous_params:
2
+ - - 480
3
+ - 1200
4
+ - 80
5
+ - - 240
6
+ - 1200
7
+ - 160
8
+ amp: true
9
+ audio_num_mel_bins: 160
10
+ audio_sample_rate: 24000
11
+ base_config:
12
+ - ./base_config.yaml
13
+ c_spk_enc: 512
14
+ char_dict_size: 15000
15
+ conv_use_pos: false
16
+ dec0_dilations:
17
+ - 1
18
+ - 2
19
+ - 4
20
+ - 1
21
+ - 2
22
+ - 4
23
+ - 1
24
+ dec0_kernel_size: 3
25
+ dec_dilations:
26
+ - 1
27
+ - 2
28
+ - 1
29
+ - 2
30
+ - 1
31
+ dec_ffn_kernel_size: 9
32
+ dec_kernel_size: 5
33
+ dec_layers: 4
34
+ dec_post_net_kernel: 3
35
+ decoder_rnn_dim: 0
36
+ decoder_type: conv
37
+ dropout: 0.0
38
+ dur_alpha: 1.0
39
+ dur_context_enc: true
40
+ dur_log: true
41
+ dur_predictor_kernel: 3
42
+ dur_predictor_layers: 2
43
+ dur_use_char: true
44
+ dur_use_spk: true
45
+ enc_dec_norm: ln
46
+ enc_dilations:
47
+ - 1
48
+ - 1
49
+ - 1
50
+ - 1
51
+ enc_ffn_kernel_size: 5
52
+ enc_kernel_size: 5
53
+ enc_layers: 8
54
+ enc_post_net_kernel: 3
55
+ enc_pre_ln: true
56
+ enc_prenet: true
57
+ encoder_K: 8
58
+ encoder_type: rel_fft
59
+ endless_ds: true
60
+ eval_max_batches: 0
61
+ f0_max: 600
62
+ f0_min: 60
63
+ ffn_act: gelu
64
+ ffn_hidden_size: 1024
65
+ fft_size: 1200
66
+ fg_spk_enc_hidden: 256
67
+ fmax: 12000
68
+ fmin: 0
69
+ frames_multiple: 8
70
+ hidden_size: 512
71
+ hop_size: 240
72
+ keep_c0_init: true
73
+ lat_for_dur: false
74
+ latent_dim: 16
75
+ latent_size: 256
76
+ layers_in_block: 2
77
+ ling_label_dict_size:
78
+ - 20
79
+ - 4
80
+ - 5
81
+ - 2
82
+ - 3
83
+ - 3
84
+ - 3
85
+ - 6
86
+ - 15
87
+ ling_labels:
88
+ - tone
89
+ loud_norm: false
90
+ mel_vmax: 0.5
91
+ mel_vmin: -6
92
+ min_frames: 50
93
+ mix_melout_timbre: true
94
+ mix_ph_timbre: false
95
+ mixed_precision: bf16
96
+ model_type: 1
97
+ multistage: false
98
+ no_text_enc: false
99
+ num_ckpt_keep: 5
100
+ num_heads: 2
101
+ num_spk: 50000
102
+ out_wav_norm: true
103
+ pitch_extractor: reaper
104
+ pitch_key: pitch
105
+ pitch_type: frame
106
+ precision: bf16
107
+ ref_mel_bins: 160
108
+ seed: 1234
109
+ split_ref: true
110
+ use_bert_input: false
111
+ use_cfg: true
112
+ use_char: true
113
+ use_cur_global: false
114
+ use_cur_global_dec: true
115
+ use_dur_embed: true
116
+ use_dur_mask_embed: true
117
+ use_ema: false
118
+ use_expand_ph: true
119
+ use_finegrained_spk: false
120
+ use_gt_dur: false
121
+ use_gt_f0: false
122
+ use_mix_spk_embed: false
123
+ use_new_vae: false
124
+ use_ph_level_f0: false
125
+ use_ph_pos_embed: true
126
+ use_pitch_embed: false
127
+ use_pitch_embed_dec: false
128
+ use_pitch_pred: true
129
+ use_pos_embed: true
130
+ use_qk_norm: true
131
+ use_random_spk_embed: false
132
+ use_seq_cfg: true
133
+ use_spk_embed: false
134
+ use_spk_enc: true
135
+ use_spk_id: false
136
+ use_uv: true
137
+ use_vae: true
138
+ use_vpcfm: true
139
+ use_vqvae: true
140
+ use_word_encoder: true
141
+ use_word_input: false
142
+ vae_dur_grad: 0.1
143
+ vae_enc_hidden_size: 384
144
+ vae_stride: 4
145
+ vae_word_conder_layers: 0
146
+ vq_stride: 8
147
+ vqvae_start_steps: 0
148
+ win_size: 1200
149
+ word_dict_size: 10000
150
+ z_channels: 64
151
+ z_clamp: 2.0
aligner_lm/model_only_last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a00f18ec36f8c1328ddab7a405c8e388790a1c14fdbdd07c546fcacaf5d19296
3
+ size 218434266
duration_lm/config.yaml ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ acous_params:
2
+ - - 480
3
+ - 1200
4
+ - 80
5
+ - - 240
6
+ - 1200
7
+ - 160
8
+ amp: false
9
+ audio_num_mel_bins: 160
10
+ audio_sample_rate: 24000
11
+ balance_sil: true
12
+ c_spk_enc: 512
13
+ char_dict_size: 15000
14
+ conv_use_pos: false
15
+ dec0_dilations:
16
+ - 1
17
+ - 2
18
+ - 4
19
+ - 1
20
+ - 2
21
+ - 4
22
+ - 1
23
+ dec0_kernel_size: 3
24
+ dec_dilations:
25
+ - 1
26
+ - 2
27
+ - 1
28
+ - 2
29
+ - 1
30
+ dec_ffn_kernel_size: 9
31
+ dec_hidden_size: 2048
32
+ dec_inp_add_noise: false
33
+ dec_kernel_size: 5
34
+ dec_layers: 4
35
+ dec_num_heads: 8
36
+ dec_post_net_kernel: 3
37
+ decoder_rnn_dim: 0
38
+ decoder_type: conv
39
+ dropout: 0.0
40
+ ds_add_pitch_embed: false
41
+ dur_alpha: 1.0
42
+ dur_code_size: 128
43
+ dur_context_enc: true
44
+ dur_log: true
45
+ dur_model_hidden_size: 512
46
+ dur_model_layers: 8
47
+ dur_model_type: ar_mse
48
+ dur_predictor_kernel: 3
49
+ dur_predictor_layers: 2
50
+ dur_txt_hs: 512
51
+ dur_use_char: true
52
+ dur_use_spk: true
53
+ enc_dec_norm: ln
54
+ enc_dilations:
55
+ - 1
56
+ - 1
57
+ - 1
58
+ - 1
59
+ enc_ffn_kernel_size: 3
60
+ enc_hidden_size: 256
61
+ enc_kernel_size: 5
62
+ enc_layers: 4
63
+ enc_post_net_kernel: 3
64
+ enc_pre_ln: true
65
+ enc_prenet: true
66
+ encoder_K: 8
67
+ encoder_type: rel_fft
68
+ f0_max: 600
69
+ f0_min: 60
70
+ ffn_act: gelu
71
+ ffn_hidden_size: 1024
72
+ fft_size: 1200
73
+ fg_spk_enc_hidden: 256
74
+ flatten_dec: true
75
+ fmax: 12000
76
+ fmin: 0
77
+ frames_multiple: 8
78
+ hidden_size: 512
79
+ hop_size: 240
80
+ ignore_begin_end_sil: false
81
+ lat_for_dur: false
82
+ latent_size: 256
83
+ layers_in_block: 2
84
+ ling_label_dict_size:
85
+ - 20
86
+ - 4
87
+ - 5
88
+ - 2
89
+ - 3
90
+ - 3
91
+ - 3
92
+ - 6
93
+ - 15
94
+ ling_labels:
95
+ - tone
96
+ lm_num_layers: 24
97
+ lm_use_enc: true
98
+ loud_norm: false
99
+ max_tokens: 6000
100
+ mel_vmax: 0.5
101
+ mel_vmin: -6
102
+ min_frames: 0
103
+ mix_melout_timbre: true
104
+ mix_ph_timbre: false
105
+ model_type: 1
106
+ multistage: false
107
+ no_text_enc: false
108
+ num_heads: 2
109
+ out_wav_norm: true
110
+ pad_frames: false
111
+
112
+ precision: fp16
113
+ predict_pitch: false
114
+ predictor_dropout: 0.0
115
+ predictor_grad: 1.0
116
+ predictor_hidden: -1
117
+ predictor_kernel: 5
118
+ predictor_layers: 5
119
+ print_nan_grads: true
120
+ ref_mel_bins: 160
121
+ ref_size_max: 2000
122
+ ref_size_min: 1000
123
+ remove_sil: false
124
+ shuffle_ref: false
125
+ split_ref: true
126
+ temperature: 0.8
127
+ tone_percep_ckpt: ''
128
+ train_spk_embed_only: false
129
+ use_bert_input: false
130
+ use_char: true
131
+ use_cur_global: false
132
+ use_cur_global_dec: true
133
+ use_dur_embed: true
134
+ use_dur_mask_embed: true
135
+ use_finegrained_spk: false
136
+ use_global_lat: false
137
+ use_gpt: true
138
+ use_gt_dur: false
139
+ use_gt_f0: false
140
+ use_mix_spk_embed: false
141
+ use_new_vae: false
142
+ use_ph_level_f0: false
143
+ use_ph_pos_embed: true
144
+ use_pitch_embed: false
145
+ use_pitch_embed_dec: false
146
+ use_pitch_pred: true
147
+ use_pos_embed: false
148
+ use_post_ln: false
149
+ use_random_spk_embed: false
150
+ use_rot_embed: true
151
+ use_spk_embed: false
152
+ use_spk_enc: false
153
+ use_spk_id: false
154
+ use_text_postnet: true
155
+ use_uv: true
156
+ use_vae: true
157
+ use_vqvae: true
158
+ use_word_encoder: true
159
+ use_word_input: false
160
+ vae_dur_grad: 0.1
161
+ vae_enc_hidden_size: 384
162
+ vae_word_conder_layers: 0
163
+ vq_stride: 8
164
+ w_nonsil: 10.0
165
+ w_sil: 1.0
166
+ word_dict_size: 10000
167
+ z_channels: 64
168
+ z_clamp: 2.0
duration_lm/model_only_last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f21f4205c5d3ec4bef69716a85ca3d37f25c35b429bac500477a2085039b43f
3
+ size 267955084
g2p/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
g2p/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./user/checkpoints/Qwen2-0.5B",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 896,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 24,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.48.3",
26
+ "use_cache": false,
27
+ "use_mrope": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 168896
30
+ }
g2p/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.48.3"
6
+ }
g2p/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step95500
g2p/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
g2p/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f9d70d454ee35d023a9a54552716a8ccf2411c967abc6a857160527046f62a2
3
+ size 1018490136
g2p/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
g2p/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18bf578a236efa19e36ee7be04c327ba4abc23aed0213a31d3199a55ea7d2411
3
+ size 14796960
g2p/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
g2p/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
g2p/vocab.json ADDED
The diff for this file is too large to render. See raw diff