dangtr0408 committed
Commit 0d15013 · 1 Parent(s): 2db5c30

update extension, fix speed has no effect

Files changed (4)
  1. Models/{config.yml → config.yaml} +70 -70
  2. app.py +1 -1
  3. inference.py +3 -1
  4. run.ipynb +7 -7
Models/{config.yml → config.yaml} RENAMED
(contents unchanged by the rename; the file is shown once below)

log_dir: "Models/Finetune_Extend"
save_freq: 1
log_interval: 5
device: "cuda"
epochs: 50
batch_size: 3
max_len: 210 # maximum number of frames
pretrained_model: "Models/Finetune_Extend/current_model.pth"
load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters

data_params:
  train_data: "../../Data_Speech/viVoice/train.txt"
  val_data: "../../Data_Speech/combine/combine_val.txt"
  root_path: "../../Data_Speech/"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 189 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size

  dropout: 0.2

  ASR_params:
    input_dim: 80
    hidden_dim: 256
    n_token: 189 # number of phoneme tokens
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  # config for decoder
  decoder:
    type: 'hifigan' # either hifigan or istftnet
    resblock_kernel_sizes: [3,7,11]
    upsample_rates : [10,5,3,2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_kernel_sizes: [20,10,6,4]

loss_params:
  lambda_mel: 5. # mel reconstruction loss
  lambda_gen: 1. # generator loss

  lambda_mono: 1. # monotonic alignment loss (TMA)
  lambda_s2s: 1. # sequence-to-sequence loss (TMA)

  lambda_F0: 1. # F0 reconstruction loss
  lambda_norm: 1. # norm reconstruction loss
  lambda_dur: 1. # duration loss
  lambda_ce: 20. # duration predictor probability output CE loss

optimizer_params:
  lr: 0.0001 # general learning rate
  ft_lr: 0.00001 # learning rate for acoustic modules
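
The rename only swaps the .yml extension for .yaml; every key and value stays the same. As a quick sanity check, a minimal loading sketch (assuming PyYAML is available; the repository's own config loader may differ):

import yaml

# Read the renamed config; only the filename changed, not the contents.
with open("Models/config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["preprocess_params"]["sr"])    # 24000
print(cfg["optimizer_params"]["ft_lr"])  # 1e-05, learning rate for acoustic modules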
app.py CHANGED
@@ -7,7 +7,7 @@ import traceback
 from inference import StyleTTS2
 repo_dir = './'
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-config_path = os.path.join(repo_dir, "Models", "config.yml")
+config_path = os.path.join(repo_dir, "Models", "config.yaml")
 models_path = os.path.join(repo_dir, "Models", "model.pth")
 model = StyleTTS2(config_path, models_path).eval().to(device)
 voice_path = os.path.join(repo_dir, "reference_audio")
inference.py CHANGED
@@ -261,7 +261,7 @@ class StyleTTS2(torch.nn.Module):
         # cal alignment
         d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
         x, _ = self.predictor.lstm(d)
-        duration = self.predictor.duration_proj(x) / speed
+        duration = self.predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1)

         if prev_d_mean != 0:#Stabilize speaking speed between splits
@@ -270,6 +270,8 @@
         dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
         duration = duration*(1-t) + dur_stats*t
         duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
+
+        duration /= speed

         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
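
This is the "fix speed has no effect" part of the commit. Previously speed divided the raw duration_proj logits before the sigmoid, where already-saturated values barely move, and the stabilization applied afterwards could dilute what little effect remained; after the fix, the summed duration is divided right before rounding, so predicted durations scale by 1/speed as intended. A small numeric sketch with hypothetical values (not code from the repo):

import torch

speed = 1.5
logits = torch.tensor([[4.0, 5.0, 6.0]])  # hypothetical, mostly saturated duration logits

old = torch.sigmoid(logits / speed).sum(axis=-1)  # old placement: ~2.88 vs ~2.97 unscaled, barely faster
new = torch.sigmoid(logits).sum(axis=-1) / speed  # new placement: ~1.98, the unscaled ~2.97 divided by speed

print(old, new)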
run.ipynb CHANGED
@@ -38,12 +38,12 @@
  },
  {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "id": "e7b9c01d",
  "metadata": {},
  "outputs": [],
  "source": [
- "config_path = \"Models/config.yml\"\n",
+ "config_path = \"Models/config.yaml\"\n",
  "models_path = \"Models/model.pth\""
  ]
  },
@@ -63,7 +63,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
  "id": "78396f70",
  "metadata": {},
  "outputs": [
@@ -121,12 +121,12 @@
  " \"id_1\": {\n",
  " \"path\": \"./reference_audio/vn_3.wav\", #Ref audio path\n",
  " \"lang\": \"vi\", #Default language\n",
- " \"speed\": 1.1, #Speaking speed\n",
+ " \"speed\": 1.0, #Speaking speed\n",
  " },\n",
  " \"id_2\": {\n",
  " \"path\": \"./reference_audio/vn_4.wav\",\n",
  " \"lang\": \"vi\",\n",
- " \"speed\": 1.1,\n",
+ " \"speed\": 1.0,\n",
  " },\n",
  "}\n",
  "for id in speakers:\n",
@@ -159,7 +159,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
  "id": "16194211",
  "metadata": {},
  "outputs": [
@@ -192,7 +192,7 @@
  "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
  "stabilize = True #BOOL Stabilize speaking speed.\n",
  "denoise = 0.6 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
- "n_merge = 20 #INT Avoid short sentences by merging when a sentence has fewer than n words"
+ "n_merge = 18 #INT Avoid short sentences by merging when a sentence has fewer than n words"
  ]
  },
  {
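
The last notebook change lowers n_merge from 20 to 18, i.e. a sentence with fewer than 18 words gets merged with a neighbor before synthesis. A hypothetical illustration of that kind of merging (not the repository's actual implementation):

def merge_short_sentences(sentences, n_merge=18):
    # Greedily glue each sentence onto the previous one while the
    # previous one still has fewer than n_merge words.
    merged = []
    for s in sentences:
        if merged and len(merged[-1].split()) < n_merge:
            merged[-1] = merged[-1] + " " + s
        else:
            merged.append(s)
    return merged

print(merge_short_sentences(["Xin chào.", "Tôi là trợ lý ảo.", "Rất vui được gặp bạn."]))
# -> ['Xin chào. Tôi là trợ lý ảo. Rất vui được gặp bạn.']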