Commit 0d15013
Parent(s): 2db5c30

update extension, fix speed has no effect

Files changed:
- Models/{config.yml → config.yaml}  +70 -70
- app.py                             +1 -1
- inference.py                       +3 -1
- run.ipynb                          +7 -7
Models/{config.yml → config.yaml}
RENAMED
@@ -1,71 +1,71 @@
The file content, identical on the old and new side of the rename:

log_dir: "Models/Finetune_Extend"
save_freq: 1
log_interval: 5
device: "cuda"
epochs: 50
batch_size: 3
max_len: 210 # maximum number of frames
pretrained_model: "Models/Finetune_Extend/current_model.pth"
load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters

data_params:
  train_data: "../../Data_Speech/viVoice/train.txt"
  val_data: "../../Data_Speech/combine/combine_val.txt"
  root_path: "../../Data_Speech/"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 189 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size

  dropout: 0.2

  ASR_params:
    input_dim: 80
    hidden_dim: 256
    n_token: 189 # number of phoneme tokens
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  # config for decoder
  decoder:
    type: 'hifigan' # either hifigan or istftnet
    resblock_kernel_sizes: [3,7,11]
    upsample_rates : [10,5,3,2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_kernel_sizes: [20,10,6,4]

loss_params:
  lambda_mel: 5. # mel reconstruction loss
  lambda_gen: 1. # generator loss

  lambda_mono: 1. # monotonic alignment loss (TMA)
  lambda_s2s: 1. # sequence-to-sequence loss (TMA)

  lambda_F0: 1. # F0 reconstruction loss
  lambda_norm: 1. # norm reconstruction loss
  lambda_dur: 1. # duration loss
  lambda_ce: 20. # duration predictor probability output CE loss

optimizer_params:
  lr: 0.0001 # general learning rate
  ft_lr: 0.00001 # learning rate for acoustic modules
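For anyone loading this file directly, here is a minimal sketch of reading the renamed config with PyYAML and pulling out a few of the fields above. It assumes PyYAML is installed, and the key paths follow the nesting shown in the listing (ASR_params, JDC_params, and decoder under model_params); values in the comments are copied from the config.

import yaml

# Load the renamed config; only the filename changed in this commit,
# yaml.safe_load parses .yml and .yaml files the same way.
with open("Models/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

print(config["log_dir"])                          # Models/Finetune_Extend
print(config["preprocess_params"]["sr"])          # 24000
print(config["model_params"]["decoder"]["type"])  # hifigan
print(config["loss_params"]["lambda_ce"])         # 20.0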
app.py
CHANGED
@@ -7,7 +7,7 @@ import traceback
 from inference import StyleTTS2
 repo_dir = './'
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-config_path = os.path.join(repo_dir, "Models", "config.yml")
+config_path = os.path.join(repo_dir, "Models", "config.yaml")
 models_path = os.path.join(repo_dir, "Models", "model.pth")
 model = StyleTTS2(config_path, models_path).eval().to(device)
 voice_path = os.path.join(repo_dir, "reference_audio")
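Note that app.py now expects Models/config.yaml, so an older checkout that still ships config.yml will fail at load time. A hypothetical guard, not part of this commit, that falls back to the old filename:

import os

repo_dir = './'
config_path = os.path.join(repo_dir, "Models", "config.yaml")
if not os.path.exists(config_path):
    # fall back to the pre-rename filename for checkouts that still have config.yml
    config_path = os.path.join(repo_dir, "Models", "config.yml")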
inference.py
CHANGED
@@ -261,7 +261,7 @@ class StyleTTS2(torch.nn.Module):
         # cal alignment
         d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
         x, _ = self.predictor.lstm(d)
-        duration = self.predictor.duration_proj(x)
+        duration = self.predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1)

         if prev_d_mean != 0:#Stabilize speaking speed between splits
@@ -270,6 +270,8 @@ class StyleTTS2(torch.nn.Module):
         dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
         duration = duration*(1-t) + dur_stats*t
         duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
+
+        duration /= speed

         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
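The added "duration /= speed" runs after the sigmoid and the stabilization steps, so it scales the per-phoneme frame counts that actually get rounded into the alignment, which is why the speed setting now takes effect. A self-contained sketch of that arithmetic, using made-up duration values rather than the model's real predictions:

import torch

# hypothetical per-phoneme frame counts after sigmoid/stabilization
duration = torch.tensor([[6.0, 9.0, 4.0, 7.5]])

for speed in (0.8, 1.0, 1.25):
    scaled = duration / speed  # the added step: scale durations before rounding
    pred_dur = torch.round(scaled.squeeze()).clamp(min=1)
    print(speed, pred_dur.tolist(), int(pred_dur.sum()))
# speed > 1 yields fewer total frames (faster speech); speed < 1 yields more.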
run.ipynb
CHANGED
@@ -38,12 +38,12 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "id": "e7b9c01d",
 "metadata": {},
 "outputs": [],
 "source": [
-"config_path = \"Models/config.yml\"\n",
+"config_path = \"Models/config.yaml\"\n",
 "models_path = \"Models/model.pth\""
 ]
 },
@@ -63,7 +63,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "id": "78396f70",
 "metadata": {},
 "outputs": [
@@ -121,12 +121,12 @@
 " \"id_1\": {\n",
 " \"path\": \"./reference_audio/vn_3.wav\", #Ref audio path\n",
 " \"lang\": \"vi\", #Default language\n",
-" \"speed\": 1.
+" \"speed\": 1.0, #Speaking speed\n",
 " },\n",
 " \"id_2\": {\n",
 " \"path\": \"./reference_audio/vn_4.wav\",\n",
 " \"lang\": \"vi\",\n",
-" \"speed\": 1.
+" \"speed\": 1.0,\n",
 " },\n",
 "}\n",
 "for id in speakers:\n",
@@ -159,7 +159,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "id": "16194211",
 "metadata": {},
 "outputs": [
@@ -192,7 +192,7 @@
 "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
 "stabilize = True #BOOL Stabilize speaking speed.\n",
 "denoise = 0.6 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
-"n_merge =
+"n_merge = 18 #INT Avoid short sentences by merging when a sentence has fewer than n words"
 ]
 },
 {