Commit 0d15013
Parent(s): 2db5c30

update extension, fix speed has no effect

Files changed:
- Models/{config.yml → config.yaml}  +70 -70
- app.py                             +1 -1
- inference.py                       +3 -1
- run.ipynb                          +7 -7
Models/{config.yml → config.yaml}
RENAMED
@@ -1,71 +1,71 @@
The file content, identical on the old and new side of the rename:

log_dir: "Models/Finetune_Extend"
save_freq: 1
log_interval: 5
device: "cuda"
epochs: 50
batch_size: 3
max_len: 210 # maximum number of frames
pretrained_model: "Models/Finetune_Extend/current_model.pth"
load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters

data_params:
  train_data: "../../Data_Speech/viVoice/train.txt"
  val_data: "../../Data_Speech/combine/combine_val.txt"
  root_path: "../../Data_Speech/"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 189 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size

  dropout: 0.2

  ASR_params:
    input_dim: 80
    hidden_dim: 256
    n_token: 189 # number of phoneme tokens
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  # config for decoder
  decoder:
    type: 'hifigan' # either hifigan or istftnet
    resblock_kernel_sizes: [3,7,11]
    upsample_rates : [10,5,3,2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_kernel_sizes: [20,10,6,4]

loss_params:
  lambda_mel: 5. # mel reconstruction loss
  lambda_gen: 1. # generator loss

  lambda_mono: 1. # monotonic alignment loss (TMA)
  lambda_s2s: 1. # sequence-to-sequence loss (TMA)

  lambda_F0: 1. # F0 reconstruction loss
  lambda_norm: 1. # norm reconstruction loss
  lambda_dur: 1. # duration loss
  lambda_ce: 20. # duration predictor probability output CE loss

optimizer_params:
  lr: 0.0001 # general learning rate
  ft_lr: 0.00001 # learning rate for acoustic modules
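For anyone loading this file directly, here is a minimal sketch of reading the renamed config with PyYAML and pulling out a few of the fields above. It assumes PyYAML is installed, and the key paths follow the nesting shown in the listing (ASR_params, JDC_params, and decoder under model_params); values in the comments are copied from the config.

import yaml

# Load the renamed config; only the filename changed in this commit,
# yaml.safe_load parses .yml and .yaml files the same way.
with open("Models/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

print(config["log_dir"])                          # Models/Finetune_Extend
print(config["preprocess_params"]["sr"])          # 24000
print(config["model_params"]["decoder"]["type"])  # hifigan
print(config["loss_params"]["lambda_ce"])         # 20.0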
app.py
CHANGED
@@ -7,7 +7,7 @@ import traceback
 from inference import StyleTTS2
 repo_dir = './'
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-config_path = os.path.join(repo_dir, "Models", "config.yml")
+config_path = os.path.join(repo_dir, "Models", "config.yaml")
 models_path = os.path.join(repo_dir, "Models", "model.pth")
 model = StyleTTS2(config_path, models_path).eval().to(device)
 voice_path = os.path.join(repo_dir, "reference_audio")
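Note that app.py now expects Models/config.yaml, so an older checkout that still ships config.yml will fail at load time. A hypothetical guard, not part of this commit, that falls back to the old filename:

import os

repo_dir = './'
config_path = os.path.join(repo_dir, "Models", "config.yaml")
if not os.path.exists(config_path):
    # fall back to the pre-rename filename for checkouts that still have config.yml
    config_path = os.path.join(repo_dir, "Models", "config.yml")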
inference.py
CHANGED
@@ -261,7 +261,7 @@ class StyleTTS2(torch.nn.Module):
         # cal alignment
         d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
         x, _ = self.predictor.lstm(d)
-        duration = self.predictor.duration_proj(x)
+        duration = self.predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1)

         if prev_d_mean != 0:#Stabilize speaking speed between splits
@@ -270,6 +270,8 @@ class StyleTTS2(torch.nn.Module):
         dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
         duration = duration*(1-t) + dur_stats*t
         duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
+
+        duration /= speed

         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
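The added "duration /= speed" runs after the sigmoid and the stabilization steps, so it scales the per-phoneme frame counts that actually get rounded into the alignment, which is why the speed setting now takes effect. A self-contained sketch of that arithmetic, using made-up duration values rather than the model's real predictions:

import torch

# hypothetical per-phoneme frame counts after sigmoid/stabilization
duration = torch.tensor([[6.0, 9.0, 4.0, 7.5]])

for speed in (0.8, 1.0, 1.25):
    scaled = duration / speed  # the added step: scale durations before rounding
    pred_dur = torch.round(scaled.squeeze()).clamp(min=1)
    print(speed, pred_dur.tolist(), int(pred_dur.sum()))
# speed > 1 yields fewer total frames (faster speech); speed < 1 yields more.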
run.ipynb
CHANGED
@@ -38,12 +38,12 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "id": "e7b9c01d",
 "metadata": {},
 "outputs": [],
 "source": [
-"config_path = \"Models/config.yml\"\n",
+"config_path = \"Models/config.yaml\"\n",
 "models_path = \"Models/model.pth\""
 ]
 },
@@ -63,7 +63,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "id": "78396f70",
 "metadata": {},
 "outputs": [
@@ -121,12 +121,12 @@
 " \"id_1\": {\n",
 " \"path\": \"./reference_audio/vn_3.wav\", #Ref audio path\n",
 " \"lang\": \"vi\", #Default language\n",
-" \"speed\": 1.
+" \"speed\": 1.0, #Speaking speed\n",
 " },\n",
 " \"id_2\": {\n",
 " \"path\": \"./reference_audio/vn_4.wav\",\n",
 " \"lang\": \"vi\",\n",
-" \"speed\": 1.
+" \"speed\": 1.0,\n",
 " },\n",
 "}\n",
 "for id in speakers:\n",
@@ -159,7 +159,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "id": "16194211",
 "metadata": {},
 "outputs": [
@@ -192,7 +192,7 @@
 "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
 "stabilize = True #BOOL Stabilize speaking speed.\n",
 "denoise = 0.6 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
-"n_merge =
+"n_merge = 18 #INT Avoid short sentences by merging when a sentence has fewer than n words"
 ]
 },
 {