Spaces: Running on Zero
Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
- .pre-commit-config.yaml +5 -2
- app.py +166 -97
- pyproject.toml +1 -1
- ruff.toml +1 -1
- src/f5_tts/api.py +2 -2
- src/f5_tts/eval/ecapa_tdnn.py +1 -0
- src/f5_tts/eval/eval_infer_batch.py +2 -0
- src/f5_tts/eval/eval_librispeech_test_clean.py +4 -5
- src/f5_tts/eval/eval_seedtts_testset.py +4 -5
- src/f5_tts/infer/infer_cli.py +7 -7
- src/f5_tts/infer/speech_edit.py +3 -1
- src/f5_tts/infer/utils_infer.py +4 -4
- src/f5_tts/model/__init__.py +2 -4
- src/f5_tts/model/backbones/dit.py +4 -5
- src/f5_tts/model/backbones/mmdit.py +3 -4
- src/f5_tts/model/backbones/unett.py +6 -6
- src/f5_tts/model/trainer.py +1 -0
- src/f5_tts/model/utils.py +2 -3
- src/f5_tts/runtime/triton_trtllm/benchmark.py +12 -11
- src/f5_tts/runtime/triton_trtllm/client_grpc.py +0 -1
- src/f5_tts/runtime/triton_trtllm/client_http.py +3 -2
- src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/f5_tts_trtllm.py +6 -7
- src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py +6 -5
- src/f5_tts/runtime/triton_trtllm/patch/__init__.py +3 -2
- src/f5_tts/runtime/triton_trtllm/patch/f5tts/model.py +9 -12
- src/f5_tts/runtime/triton_trtllm/patch/f5tts/modules.py +14 -12
- src/f5_tts/runtime/triton_trtllm/scripts/conv_stft.py +1 -0
- src/f5_tts/runtime/triton_trtllm/scripts/convert_checkpoint.py +0 -1
- src/f5_tts/runtime/triton_trtllm/scripts/export_vocoder_to_onnx.py +4 -3
- src/f5_tts/scripts/count_params_gflops.py +5 -4
- src/f5_tts/socket_client.py +5 -3
- src/f5_tts/socket_server.py +5 -4
- src/f5_tts/train/datasets/prepare_csv_wavs.py +7 -8
- src/f5_tts/train/datasets/prepare_emilia.py +3 -5
- src/f5_tts/train/datasets/prepare_emilia_v2.py +6 -6
- src/f5_tts/train/datasets/prepare_libritts.py +3 -1
- src/f5_tts/train/datasets/prepare_ljspeech.py +3 -1
- src/f5_tts/train/datasets/prepare_wenetspeech4tts.py +2 -1
- src/f5_tts/train/finetune_cli.py +2 -2
- src/f5_tts/train/finetune_gradio.py +5 -5
- src/f5_tts/train/train.py +1 -0
.pre-commit-config.yaml
CHANGED
@@ -3,11 +3,14 @@ repos:
     # Ruff version.
     rev: v0.11.2
     hooks:
-      # Run the linter.
       - id: ruff
+        name: ruff linter
        args: [--fix]
-      # Run the formatter.
       - id: ruff-format
+        name: ruff formatter
+      - id: ruff
+        name: ruff sorter
+        args: [--select, I, --fix]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
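Nearly everything in the per-file diffs below is mechanical fallout from the new "ruff sorter" hook: running ruff's import-sorting rules (`--select I --fix`) regroups imports into stdlib, third-party, and first-party blocks and alphabetizes them case-sensitively. A minimal before/after sketch of what the sorter does, using imports that appear in this commit (the surrounding script is illustrative only):

# Before: the mixed ordering seen on the old side of the diffs below, e.g.
#
#     import sys
#     import os
#     from pypinyin import lazy_pinyin, Style
#
# After `ruff check --select I --fix`: blocks separated and alphabetized.
# Sorting is case-sensitive, so uppercase "Style" lands before "lazy_pinyin".
import os
import sys

from pypinyin import Style, lazy_pinyin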
app.py
CHANGED
@@ -6,6 +6,7 @@ import json
 import re
 import tempfile
 from collections import OrderedDict
+from functools import lru_cache
 from importlib.resources import files
 
 import click
@@ -17,6 +18,7 @@ import torchaudio
 from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+
 try:
     import spaces
 
@@ -32,15 +34,15 @@ def gpu_decorator(func):
         return func
 
 
-from f5_tts.model import DiT, UNetT
 from f5_tts.infer.utils_infer import (
-    load_vocoder,
+    infer_process,
     load_model,
+    load_vocoder,
     preprocess_ref_audio_text,
-    infer_process,
     remove_silence_for_generated_wav,
     save_spectrogram,
 )
+from f5_tts.model import DiT, UNetT
 
 
 DEFAULT_TTS_MODEL = "F5-TTS_v1"
@@ -122,6 +124,7 @@ def load_text_from_file(file):
     return gr.update(value=text)
 
 
+@lru_cache(maxsize=100)
 @gpu_decorator
 def infer(
     ref_audio_orig,
@@ -140,7 +143,11 @@ def infer(
         return gr.update(), gr.update(), ref_text
 
     # Set inference seed
+    if seed < 0 or seed > 2**31 - 1:
+        gr.Warning("Seed must in range 0 ~ 2147483647. Using random seed instead.")
+        seed = np.random.randint(0, 2**31 - 1)
     torch.manual_seed(seed)
+    used_seed = seed
 
     if not gen_text.strip():
         gr.Warning("Please enter text to generate or upload a text file.")
@@ -191,7 +198,7 @@ def infer(
         spectrogram_path = tmp_spectrogram.name
         save_spectrogram(combined_spectrogram, spectrogram_path)
 
-    return (final_sample_rate, final_wave), spectrogram_path, ref_text
+    return (final_sample_rate, final_wave), spectrogram_path, ref_text, used_seed
 
 
 with gr.Blocks() as app_credits:
@@ -277,27 +284,21 @@ with gr.Blocks() as app_tts:
         nfe_slider,
         speed_slider,
     ):
-        # Determine the seed to use
         if randomize_seed:
-            seed = np.random.randint(0, 2**31 - 1)
-        else:
-            seed = seed_input
-        if seed < 0 or seed > 2**31 - 1:
-            gr.Warning("Seed must in range 0 ~ 2147483647. Using random seed instead.")
-            seed = np.random.randint(0, 2**31 - 1)
+            seed_input = np.random.randint(0, 2**31 - 1)
 
-        audio_out, spectrogram_path, ref_text_out = infer(
+        audio_out, spectrogram_path, ref_text_out, used_seed = infer(
             ref_audio_input,
             ref_text_input,
             gen_text_input,
             tts_model_choice,
             remove_silence,
-            seed=seed,
+            seed=seed_input,
             cross_fade_duration=cross_fade_duration_slider,
             nfe_step=nfe_slider,
             speed=speed_slider,
         )
-        return audio_out, spectrogram_path, ref_text_out, seed
+        return audio_out, spectrogram_path, ref_text_out, used_seed
 
     gen_text_file.upload(
         load_text_from_file,
@@ -329,26 +330,34 @@ with gr.Blocks() as app_tts:
 
 
 def parse_speechtypes_text(gen_text):
-    # Pattern to find {speechtype}
-    pattern = r"\{(.*?)\}"
+    # Pattern to find {str} or {"name": str, "seed": int, "speed": float}
+    pattern = r"(\{.*?\})"
 
     # Split the text by the pattern
     tokens = re.split(pattern, gen_text)
 
     segments = []
 
-    current_style = "Regular"
+    current_type_dict = {
+        "name": "Regular",
+        "seed": -1,
+        "speed": 1.0,
+    }
 
    for i in range(len(tokens)):
        if i % 2 == 0:
            # This is text
            text = tokens[i].strip()
            if text:
-                segments.append({"style": current_style, "text": text})
+                current_type_dict["text"] = text
+                segments.append(current_type_dict)
        else:
-            # This is style
-            style = tokens[i].strip()
-            current_style = style
+            # This is type
+            type_str = tokens[i].strip()
+            try:  # if type dict
+                current_type_dict = json.loads(type_str)
+            except json.decoder.JSONDecodeError:
+                current_type_dict = {"name": type_str, "seed": -1, "speed": 1.0}
 
    return segments
 
@@ -366,41 +375,48 @@ with gr.Blocks() as app_multistyle:
    with gr.Row():
        gr.Markdown(
            """
-            **Example Input:**
-            {Regular} Hello, I'd like to order a sandwich please.
-            {Surprised} What do you mean you're out of bread?
-            {Sad} I really wanted a sandwich though...
-            {Angry} You know what, darn you and your little shop!
-            {Whisper} I'll just go back home and cry now.
+            **Example Input:** <br>
+            {Regular} Hello, I'd like to order a sandwich please. <br>
+            {Surprised} What do you mean you're out of bread? <br>
+            {Sad} I really wanted a sandwich though... <br>
+            {Angry} You know what, darn you and your little shop! <br>
+            {Whisper} I'll just go back home and cry now. <br>
            {Shouting} Why me?!
            """
        )
 
        gr.Markdown(
            """
-            **Example Input 2:**
-            {Speaker1_Happy} Hello, I'd like to order a sandwich please.
-            {Speaker2_Regular} Sorry, we're out of bread.
-            {Speaker1_Sad} I really wanted a sandwich though...
-            {Speaker2_Whisper} I'll give you the last one I was hiding.
+            **Example Input 2:** <br>
+            {"name": "Speaker1_Happy", "seed": -1, "speed": 1} Hello, I'd like to order a sandwich please. <br>
+            {"name": "Speaker2_Regular", "seed": -1, "speed": 1} Sorry, we're out of bread. <br>
+            {"name": "Speaker1_Sad", "seed": -1, "speed": 1} I really wanted a sandwich though... <br>
+            {"name": "Speaker2_Whisper", "seed": -1, "speed": 1} I'll give you the last one I was hiding.
            """
        )
 
    gr.Markdown(
-        …
+        'Upload different audio clips for each speech type. The first speech type is mandatory. You can add additional speech types by clicking the "Add Speech Type" button.'
    )
 
    # Regular speech type (mandatory)
-    with gr.Row() as regular_row:
+    with gr.Row(variant="compact") as regular_row:
        with gr.Column(scale=1, min_width=160):
            regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
            regular_insert = gr.Button("Insert Label", variant="secondary")
        with gr.Column(scale=3):
            regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
        with gr.Column(scale=3):
-            regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=…
-            …
+            regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=4)
+            with gr.Row():
+                regular_seed_slider = gr.Slider(
+                    show_label=False, minimum=-1, maximum=999, value=-1, step=1, info="Seed, -1 for random"
+                )
+                regular_speed_slider = gr.Slider(
+                    show_label=False, minimum=0.3, maximum=2.0, value=1.0, step=0.1, info="Adjust the speed"
+                )
+        with gr.Column(scale=1, min_width=160):
+            regular_ref_text_file = gr.File(label="Load Reference Text from File (.txt)", file_types=[".txt"])
 
    # Regular speech type (max 100)
    max_speech_types = 100
@@ -409,32 +425,54 @@ with gr.Blocks() as app_multistyle:
    speech_type_audios = [regular_audio]
    speech_type_ref_texts = [regular_ref_text]
    speech_type_ref_text_files = [regular_ref_text_file]
+    speech_type_seeds = [regular_seed_slider]
+    speech_type_speeds = [regular_speed_slider]
    speech_type_delete_btns = [None]
    speech_type_insert_btns = [regular_insert]
 
    # Additional speech types (99 more)
    for i in range(max_speech_types - 1):
-        with gr.Row(visible=False) as row:
+        with gr.Row(variant="compact", visible=False) as row:
            with gr.Column(scale=1, min_width=160):
                name_input = gr.Textbox(label="Speech Type Name")
-                delete_btn = gr.Button("Delete Type", variant="secondary")
                insert_btn = gr.Button("Insert Label", variant="secondary")
+                delete_btn = gr.Button("Delete Type", variant="stop")
            with gr.Column(scale=3):
                audio_input = gr.Audio(label="Reference Audio", type="filepath")
            with gr.Column(scale=3):
-                ref_text_input = gr.Textbox(label="Reference Text", lines=…
-                …
+                ref_text_input = gr.Textbox(label="Reference Text", lines=4)
+                with gr.Row():
+                    seed_input = gr.Slider(
+                        show_label=False, minimum=-1, maximum=999, value=-1, step=1, info="Seed. -1 for random"
+                    )
+                    speed_input = gr.Slider(
+                        show_label=False, minimum=0.3, maximum=2.0, value=1.0, step=0.1, info="Adjust the speed"
+                    )
+            with gr.Column(scale=1, min_width=160):
+                ref_text_file_input = gr.File(label="Load Reference Text from File (.txt)", file_types=[".txt"])
        speech_type_rows.append(row)
        speech_type_names.append(name_input)
        speech_type_audios.append(audio_input)
        speech_type_ref_texts.append(ref_text_input)
        speech_type_ref_text_files.append(ref_text_file_input)
+        speech_type_seeds.append(seed_input)
+        speech_type_speeds.append(speed_input)
        speech_type_delete_btns.append(delete_btn)
        speech_type_insert_btns.append(insert_btn)
 
+    # Global logic for all speech types
+    for i in range(max_speech_types):
+        speech_type_audios[i].clear(
+            lambda: [None, None],
+            None,
+            [speech_type_ref_texts[i], speech_type_ref_text_files[i]],
+        )
+        speech_type_ref_text_files[i].upload(
+            load_text_from_file,
+            inputs=[speech_type_ref_text_files[i]],
+            outputs=[speech_type_ref_texts[i]],
+        )
+
    # Button to add speech type
    add_speech_type_btn = gr.Button("Add Speech Type")
 
@@ -470,18 +508,6 @@ with gr.Blocks() as app_multistyle:
                speech_type_ref_text_files[i],
            ],
        )
-        speech_type_ref_text_files[i].upload(
-            load_text_from_file,
-            inputs=[speech_type_ref_text_files[i]],
-            outputs=[speech_type_ref_texts[i]],
-        )
-
-    # Update regular speech type ref text file
-    regular_ref_text_file.upload(
-        load_text_from_file,
-        inputs=[regular_ref_text_file],
-        outputs=[regular_ref_text],
-    )
 
    # Text input for the prompt
    with gr.Row():
@@ -495,10 +521,17 @@ with gr.Blocks() as app_multistyle:
    gen_text_file_multistyle = gr.File(label="Load Text to Generate from File (.txt)", file_types=[".txt"], scale=1)
 
    def make_insert_speech_type_fn(index):
-        def insert_speech_type_fn(current_text, speech_type_name):
+        def insert_speech_type_fn(current_text, speech_type_name, speech_type_seed, speech_type_speed):
            current_text = current_text or ""
-            …
-            …
+            if not speech_type_name:
+                gr.Warning("Please enter speech type name before insert.")
+                return current_text
+            speech_type_dict = {
+                "name": speech_type_name,
+                "seed": speech_type_seed,
+                "speed": speech_type_speed,
+            }
+            updated_text = current_text + json.dumps(speech_type_dict) + " "
            return updated_text
 
        return insert_speech_type_fn
@@ -507,16 +540,24 @@ with gr.Blocks() as app_multistyle:
        insert_fn = make_insert_speech_type_fn(i)
        insert_btn.click(
            insert_fn,
-            inputs=[gen_text_input_multistyle, speech_type_names[i]],
+            inputs=[gen_text_input_multistyle, speech_type_names[i], speech_type_seeds[i], speech_type_speeds[i]],
            outputs=gen_text_input_multistyle,
        )
 
-    with gr.Accordion("Advanced Settings", open=False):
-        remove_silence_multistyle = gr.Checkbox(
-            label="Remove Silences",
-            info="Turn on to automatically detect and crop long silences.",
-            value=True,
-        )
+    with gr.Accordion("Advanced Settings", open=True):
+        with gr.Row():
+            with gr.Column():
+                show_cherrypick_multistyle = gr.Checkbox(
+                    label="Show Cherry-pick Interface",
+                    info="Turn on to show interface, picking seeds from previous generations.",
+                    value=False,
+                )
+            with gr.Column():
+                remove_silence_multistyle = gr.Checkbox(
+                    label="Remove Silences",
+                    info="Turn on to automatically detect and crop long silences.",
+                    value=True,
+                )
 
    # Generate button
    generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
@@ -524,6 +565,24 @@ with gr.Blocks() as app_multistyle:
    # Output audio
    audio_output_multistyle = gr.Audio(label="Synthesized Audio")
 
+    # Used seed gallery
+    cherrypick_interface_multistyle = gr.Textbox(
+        label="Cherry-pick Interface",
+        lines=10,
+        max_lines=40,
+        show_copy_button=True,
+        interactive=False,
+        visible=False,
+    )
+
+    # Logic control to show/hide the cherrypick interface
+    show_cherrypick_multistyle.change(
+        lambda is_visible: gr.update(visible=is_visible),
+        show_cherrypick_multistyle,
+        cherrypick_interface_multistyle,
+    )
+
+    # Function to load text to generate from file
    gen_text_file_multistyle.upload(
        load_text_from_file,
        inputs=[gen_text_file_multistyle],
@@ -557,44 +616,60 @@ with gr.Blocks() as app_multistyle:
 
        # For each segment, generate speech
        generated_audio_segments = []
-        current_style = "Regular"
+        current_type_name = "Regular"
+        inference_meta_data = ""
 
        for segment in segments:
-            style = segment["style"]
+            name = segment["name"]
+            seed_input = segment["seed"]
+            speed = segment["speed"]
            text = segment["text"]
 
-            if style in speech_types:
-                current_style = style
+            if name in speech_types:
+                current_type_name = name
            else:
-                gr.Warning(f"Type {style} is not available, will use Regular as default.")
-                current_style = "Regular"
+                gr.Warning(f"Type {name} is not available, will use Regular as default.")
+                current_type_name = "Regular"
 
            try:
-                ref_audio = speech_types[current_style]["audio"]
+                ref_audio = speech_types[current_type_name]["audio"]
            except KeyError:
-                gr.Warning(f"Please provide reference audio for type {current_style}.")
-                return [None] + [speech_types[style]["ref_text"] for style in speech_types]
-            ref_text = speech_types[current_style].get("ref_text", "")
+                gr.Warning(f"Please provide reference audio for type {current_type_name}.")
+                return [None] + [speech_types[name]["ref_text"] for name in speech_types] + [None]
+            ref_text = speech_types[current_type_name].get("ref_text", "")
 
-            …
-            …
+            if seed_input == -1:
+                seed_input = np.random.randint(0, 2**31 - 1)
 
-            # Generate speech for this segment
-            audio_out, _, ref_text_out = infer(
-                ref_audio,
-                …
+            # Generate or retrieve speech for this segment
+            audio_out, _, ref_text_out, used_seed = infer(
+                ref_audio,
+                ref_text,
+                text,
+                tts_model_choice,
+                remove_silence,
+                seed=seed_input,
+                cross_fade_duration=0,
+                speed=speed,
+                show_info=print,  # no pull to top when generating
+            )
            sr, audio_data = audio_out
 
            generated_audio_segments.append(audio_data)
-            speech_types[current_style]["ref_text"] = ref_text_out
+            speech_types[current_type_name]["ref_text"] = ref_text_out
+            inference_meta_data += json.dumps(dict(name=name, seed=used_seed, speed=speed)) + f" {text}\n"
 
        # Concatenate all audio segments
        if generated_audio_segments:
            final_audio_data = np.concatenate(generated_audio_segments)
-            return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
+            return (
+                [(sr, final_audio_data)]
+                + [speech_types[name]["ref_text"] for name in speech_types]
+                + [inference_meta_data]
+            )
        else:
            gr.Warning("No audio generated.")
-            return [None] + [speech_types[style]["ref_text"] for style in speech_types]
+            return [None] + [speech_types[name]["ref_text"] for name in speech_types] + [None]
 
    generate_multistyle_btn.click(
        generate_multistyle_speech,
@@ -607,7 +682,7 @@ with gr.Blocks() as app_multistyle:
        + [
            remove_silence_multistyle,
        ],
-        outputs=[audio_output_multistyle] + speech_type_ref_texts,
+        outputs=[audio_output_multistyle] + speech_type_ref_texts + [cherrypick_interface_multistyle],
    )
 
    # Validation function to disable Generate button if speech types are missing
@@ -624,7 +699,7 @@ with gr.Blocks() as app_multistyle:
 
        # Parse the gen_text to get the speech types used
        segments = parse_speechtypes_text(gen_text)
-        speech_types_in_text = set(segment["style"] for segment in segments)
+        speech_types_in_text = set(segment["name"] for segment in segments)
 
        # Check if all speech types in text are available
        missing_speech_types = speech_types_in_text - speech_types_available
@@ -788,27 +863,21 @@ Have a conversation with an AI using your reference voice!
        if not last_ai_response or conv_state[-1]["role"] != "assistant":
            return None, ref_text, seed_input
 
-        # Determine the seed to use
        if randomize_seed:
-            seed = np.random.randint(0, 2**31 - 1)
-        else:
-            seed = seed_input
-        if seed < 0 or seed > 2**31 - 1:
-            gr.Warning("Seed must in range 0 ~ 2147483647. Using random seed instead.")
-            seed = np.random.randint(0, 2**31 - 1)
+            seed_input = np.random.randint(0, 2**31 - 1)
 
-        audio_result, _, ref_text_out = infer(
+        audio_result, _, ref_text_out, used_seed = infer(
            ref_audio,
            ref_text,
            last_ai_response,
            tts_model_choice,
            remove_silence,
-            seed=seed,
+            seed=seed_input,
            cross_fade_duration=0.15,
            speed=1.0,
            show_info=print,  # show_info=print no pull to top when generating
        )
-        return audio_result, ref_text_out, seed
+        return audio_result, ref_text_out, used_seed
 
    def clear_conversation():
        """Reset the conversation"""
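Two behavioral changes in app.py are worth pulling out of the diff. First, `infer` is now wrapped in `@lru_cache(maxsize=100)`, validates the seed range itself, and returns the seed it actually used, so a cherry-picked generation can be replayed by seed and repeat calls are served from cache. Second, multi-style markup now accepts either a bare `{Style}` marker or a JSON dict carrying a per-segment seed and speed. A self-contained sketch of the new parser follows; the brace-trimming in the fallback branch is an assumption of this sketch (the diff does not show that detail verbatim):

import json
import re


def parse_speechtypes_text(gen_text):
    """Split generation text into per-type segments, as in the updated app.py."""
    tokens = re.split(r"(\{.*?\})", gen_text)
    segments = []
    current_type_dict = {"name": "Regular", "seed": -1, "speed": 1.0}
    for i, token in enumerate(tokens):
        if i % 2 == 0:
            # Even tokens are plain text between markers
            text = token.strip()
            if text:
                current_type_dict["text"] = text
                segments.append(current_type_dict)
        else:
            # Odd tokens are markers: a JSON dict, or a bare style name as fallback
            type_str = token.strip()
            try:
                current_type_dict = json.loads(type_str)
            except json.JSONDecodeError:
                # Assumed here: trim the surrounding braces off a bare {Style} marker
                current_type_dict = {"name": type_str[1:-1], "seed": -1, "speed": 1.0}
    return segments


demo = '{Regular} Hello there. {"name": "Sad", "seed": 42, "speed": 0.8} Goodbye.'
for seg in parse_speechtypes_text(demo):
    print(seg)
# {'name': 'Regular', 'seed': -1, 'speed': 1.0, 'text': 'Hello there.'}
# {'name': 'Sad', 'seed': 42, 'speed': 0.8, 'text': 'Goodbye.'}

Each parsed segment then drives one cached `infer` call, and the name/seed/speed triple plus text is appended to the cherry-pick textbox so a good take can be pasted back in verbatim.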
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.1.2"
+version = "1.1.3"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
ruff.toml
CHANGED
@@ -6,5 +6,5 @@ target-version = "py310"
 dummy-variable-rgx = "^_.*$"
 
 [lint.isort]
-force-single-line = true
+force-single-line = false
 lines-after-imports = 2
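The flip to `force-single-line = false` is what lets the sorter collapse parenthesized one-name-per-line imports into grouped single lines in the hunks below, and `lines-after-imports = 2` explains the stray `+` blank lines inserted after import blocks in many files. Schematically:

# With force-single-line = true, isort would emit one import per name:
#
#     from f5_tts.eval.utils_eval import get_seed_tts_test
#     from f5_tts.eval.utils_eval import run_asr_wer
#     from f5_tts.eval.utils_eval import run_sim
#
# With force-single-line = false (this commit), the names are grouped:
from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer, run_sim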
src/f5_tts/api.py
CHANGED
@@ -9,13 +9,13 @@ from hydra.utils import get_class
 from omegaconf import OmegaConf
 
 from f5_tts.infer.utils_infer import (
+    infer_process,
     load_model,
     load_vocoder,
-    transcribe,
     preprocess_ref_audio_text,
-    infer_process,
     remove_silence_for_generated_wav,
     save_spectrogram,
+    transcribe,
 )
 from f5_tts.model.utils import seed_everything
 
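The api.py hunk is just the sorter at work, but the import list doubles as an index of the public inference helpers. A rough sketch of how they compose, mirroring `f5_tts/api.py` (argument names here are assumed for illustration; treat `src/f5_tts/infer/utils_infer.py` as the authoritative source for signatures):

from f5_tts.infer.utils_infer import (
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
)


def synthesize(model_cls, model_cfg, ckpt_path, ref_audio_path, ref_text, gen_text):
    """Sketch: load weights once, then speak gen_text in the reference voice."""
    vocoder = load_vocoder()  # decodes mel-spectrograms back to waveforms
    model = load_model(model_cls, model_cfg, ckpt_path)
    # Clips the reference audio and fills in a transcript when ref_text is empty:
    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_path, ref_text)
    wav, sr, spec = infer_process(ref_audio, ref_text, gen_text, model, vocoder)
    return wav, sr, spec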
src/f5_tts/eval/ecapa_tdnn.py
CHANGED
@@ -4,6 +4,7 @@
 # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
 
 import os
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
src/f5_tts/eval/eval_infer_batch.py
CHANGED
@@ -1,6 +1,7 @@
 import os
 import sys
 
+
 sys.path.append(os.getcwd())
 
 import argparse
@@ -23,6 +24,7 @@ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
 from f5_tts.model import CFM
 from f5_tts.model.utils import get_tokenizer
 
+
 accelerator = Accelerator()
 device = f"cuda:{accelerator.process_index}"
 
src/f5_tts/eval/eval_librispeech_test_clean.py
CHANGED
@@ -5,17 +5,16 @@ import json
 import os
 import sys
 
+
 sys.path.append(os.getcwd())
 
 import multiprocessing as mp
 from importlib.resources import files
 
 import numpy as np
-from f5_tts.eval.utils_eval import (
-    get_librispeech_test,
-    run_asr_wer,
-    run_sim,
-)
+
+from f5_tts.eval.utils_eval import get_librispeech_test, run_asr_wer, run_sim
+
 
 rel_path = str(files("f5_tts").joinpath("../../"))
 
src/f5_tts/eval/eval_seedtts_testset.py
CHANGED
@@ -5,17 +5,16 @@ import json
 import os
 import sys
 
+
 sys.path.append(os.getcwd())
 
 import multiprocessing as mp
 from importlib.resources import files
 
 import numpy as np
-from f5_tts.eval.utils_eval import (
-    get_seed_tts_test,
-    run_asr_wer,
-    run_sim,
-)
+
+from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer, run_sim
+
 
 rel_path = str(files("f5_tts").joinpath("../../"))
 
src/f5_tts/infer/infer_cli.py
CHANGED
@@ -14,20 +14,20 @@ from hydra.utils import get_class
 from omegaconf import OmegaConf
 
 from f5_tts.infer.utils_infer import (
-    mel_spec_type,
-    target_rms,
-    cross_fade_duration,
-    nfe_step,
     cfg_strength,
-    sway_sampling_coef,
-    speed,
-    fix_duration,
+    cross_fade_duration,
     device,
+    fix_duration,
     infer_process,
     load_model,
     load_vocoder,
+    mel_spec_type,
+    nfe_step,
     preprocess_ref_audio_text,
     remove_silence_for_generated_wav,
+    speed,
+    sway_sampling_coef,
+    target_rms,
 )
 
 
src/f5_tts/infer/speech_edit.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 
+
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
 
 from importlib.resources import files
@@ -7,14 +8,15 @@ from importlib.resources import files
 import torch
 import torch.nn.functional as F
 import torchaudio
+from cached_path import cached_path
 from hydra.utils import get_class
 from omegaconf import OmegaConf
-from cached_path import cached_path
 
 from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder, save_spectrogram
 from f5_tts.model import CFM
 from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
 
+
 device = (
     "cuda"
     if torch.cuda.is_available()
src/f5_tts/infer/utils_infer.py
CHANGED
@@ -4,6 +4,7 @@ import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
 
+
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # for MPS device compatibility
 sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/")
 
@@ -14,6 +15,7 @@ from importlib.resources import files
 
 import matplotlib
 
+
 matplotlib.use("Agg")
 
 import matplotlib.pylab as plt
@@ -27,10 +29,8 @@ from transformers import pipeline
 from vocos import Vocos
 
 from f5_tts.model import CFM
-from f5_tts.model.utils import (
-    get_tokenizer,
-    convert_char_to_pinyin,
-)
+from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
+
 
 _ref_audio_cache = {}
 
src/f5_tts/model/__init__.py
CHANGED
@@ -1,9 +1,7 @@
-from f5_tts.model.cfm import CFM
-
-from f5_tts.model.backbones.unett import UNetT
 from f5_tts.model.backbones.dit import DiT
 from f5_tts.model.backbones.mmdit import MMDiT
-
+from f5_tts.model.backbones.unett import UNetT
+from f5_tts.model.cfm import CFM
 from f5_tts.model.trainer import Trainer
 
 
src/f5_tts/model/backbones/dit.py
CHANGED
@@ -10,19 +10,18 @@ d - dimension
 from __future__ import annotations
 
 import torch
-from torch import nn
 import torch.nn.functional as F
-
+from torch import nn
 from x_transformers.x_transformers import RotaryEmbedding
 
 from f5_tts.model.modules import (
-    TimestepEmbedding,
+    AdaLayerNorm_Final,
     ConvNeXtV2Block,
     ConvPositionEmbedding,
     DiTBlock,
-    AdaLayerNorm_Final,
-    precompute_freqs_cis,
+    TimestepEmbedding,
     get_pos_embed_indices,
+    precompute_freqs_cis,
 )
 
 
src/f5_tts/model/backbones/mmdit.py
CHANGED
@@ -11,16 +11,15 @@ from __future__ import annotations
 
 import torch
 from torch import nn
-
 from x_transformers.x_transformers import RotaryEmbedding
 
 from f5_tts.model.modules import (
-    TimestepEmbedding,
+    AdaLayerNorm_Final,
     ConvPositionEmbedding,
     MMDiTBlock,
-    AdaLayerNorm_Final,
-    precompute_freqs_cis,
+    TimestepEmbedding,
     get_pos_embed_indices,
+    precompute_freqs_cis,
 )
 
 
src/f5_tts/model/backbones/unett.py
CHANGED
@@ -8,24 +8,24 @@ d - dimension
 """
 
 from __future__ import annotations
+
 from typing import Literal
 
 import torch
-from torch import nn
 import torch.nn.functional as F
-
+from torch import nn
 from x_transformers import RMSNorm
 from x_transformers.x_transformers import RotaryEmbedding
 
 from f5_tts.model.modules import (
-    TimestepEmbedding,
-    ConvNeXtV2Block,
-    ConvPositionEmbedding,
     Attention,
     AttnProcessor,
+    ConvNeXtV2Block,
+    ConvPositionEmbedding,
     FeedForward,
-    precompute_freqs_cis,
+    TimestepEmbedding,
     get_pos_embed_indices,
+    precompute_freqs_cis,
 )
 
 
src/f5_tts/model/trainer.py
CHANGED
@@ -19,6 +19,7 @@ from f5_tts.model import CFM
 from f5_tts.model.dataset import DynamicBatchSampler, collate_fn
 from f5_tts.model.utils import default, exists
 
+
 # trainer
 
 
src/f5_tts/model/utils.py
CHANGED
@@ -5,12 +5,11 @@ import random
 from collections import defaultdict
 from importlib.resources import files
 
+import jieba
 import torch
+from pypinyin import Style, lazy_pinyin
 from torch.nn.utils.rnn import pad_sequence
 
-import jieba
-from pypinyin import lazy_pinyin, Style
-
 
 # seed everything
 
src/f5_tts/runtime/triton_trtllm/benchmark.py
CHANGED
@@ -30,26 +30,27 @@ import argparse
 import json
 import os
 import time
-from typing import List, Dict, Union
+from typing import Dict, List, Union
 
+import datasets
+import jieba
+import tensorrt as trt
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
 import torchaudio
-import jieba
-from pypinyin import Style, lazy_pinyin
 from datasets import load_dataset
-import datasets
+from f5_tts_trtllm import F5TTS
 from huggingface_hub import hf_hub_download
+from pypinyin import Style, lazy_pinyin
+from tensorrt_llm._utils import trt_dtype_to_torch
+from tensorrt_llm.logger import logger
+from tensorrt_llm.runtime.session import Session, TensorInfo
+from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, DistributedSampler
 from tqdm import tqdm
 from vocos import Vocos
-
-import tensorrt as trt
-from tensorrt_llm.runtime.session import Session, TensorInfo
-from tensorrt_llm.logger import logger
-from tensorrt_llm._utils import trt_dtype_to_torch
+
 
 torch.manual_seed(0)
 
@@ -381,8 +382,8 @@ def main():
     import sys
 
     sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../../../src/")
-    from f5_tts.model import DiT
     from f5_tts.infer.utils_infer import load_model
+    from f5_tts.model import DiT
 
     F5TTS_model_cfg = dict(
         dim=1024,
src/f5_tts/runtime/triton_trtllm/client_grpc.py
CHANGED
@@ -44,7 +44,6 @@ python3 client_grpc.py \
 import argparse
 import asyncio
 import json
-
 import os
 import time
 import types
src/f5_tts/runtime/triton_trtllm/client_http.py
CHANGED
@@ -23,10 +23,11 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
+
+import numpy as np
 import requests
 import soundfile as sf
-import numpy as np
-import argparse
 
 
 def get_args():
src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/f5_tts_trtllm.py
CHANGED
@@ -1,18 +1,17 @@
-import tensorrt as trt
-import os
 import math
+import os
 import time
-from typing import List, Optional
 from functools import wraps
+from typing import List, Optional
 
+import tensorrt as trt
 import tensorrt_llm
-from tensorrt_llm._utils import str_dtype_to_torch, trt_dtype_to_torch
-from tensorrt_llm.logger import logger
-from tensorrt_llm.runtime.session import Session
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from tensorrt_llm._utils import str_dtype_to_torch, trt_dtype_to_torch
+from tensorrt_llm.logger import logger
+from tensorrt_llm.runtime.session import Session
 
 
 def remove_tensor_padding(input_tensor, input_tensor_lengths=None):
src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
CHANGED
@@ -24,16 +24,17 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import json
+import os
+
+import jieba
 import torch
-from torch.nn.utils.rnn import pad_sequence
 import torch.nn.functional as F
-from torch.utils.dlpack import from_dlpack, to_dlpack
 import torchaudio
-import jieba
 import triton_python_backend_utils as pb_utils
-from pypinyin import Style, lazy_pinyin
-import os
 from f5_tts_trtllm import F5TTS
+from pypinyin import Style, lazy_pinyin
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.dlpack import from_dlpack, to_dlpack
 
 
 def get_tokenizer(vocab_file_path: str):
src/f5_tts/runtime/triton_trtllm/patch/__init__.py
CHANGED
@@ -34,6 +34,7 @@ from .deepseek_v2.model import DeepseekV2ForCausalLM
 from .dit.model import DiT
 from .eagle.model import EagleForCausalLM
 from .enc_dec.model import DecoderModel, EncoderModel, WhisperEncoder
+from .f5tts.model import F5TTS
 from .falcon.config import FalconConfig
 from .falcon.model import FalconForCausalLM, FalconModel
 from .gemma.config import GEMMA2_ARCHITECTURE, GEMMA_ARCHITECTURE, GemmaConfig
@@ -54,12 +55,12 @@ from .modeling_utils import PretrainedConfig, PretrainedModel, SpeculativeDecodingMode
 from .mpt.model import MPTForCausalLM, MPTModel
 from .nemotron_nas.model import DeciLMForCausalLM
 from .opt.model import OPTForCausalLM, OPTModel
-from .phi3.model import Phi3ForCausalLM, Phi3Model
 from .phi.model import PhiForCausalLM, PhiModel
+from .phi3.model import Phi3ForCausalLM, Phi3Model
 from .qwen.model import QWenForCausalLM
 from .recurrentgemma.model import RecurrentGemmaForCausalLM
 from .redrafter.model import ReDrafterForCausalLM
 
 
 __all__ = [
     "BertModel",
src/f5_tts/runtime/triton_trtllm/patch/f5tts/model.py
CHANGED
@@ -1,23 +1,20 @@
 from __future__ import annotations
 
 import os
+import sys
+from collections import OrderedDict
 
 import tensorrt as trt
-from collections import OrderedDict
+from tensorrt_llm._common import default_net
+
 from ..._utils import str_dtype_to_trt
-from ...plugin import current_all_reduce_helper
-from ..modeling_utils import PretrainedConfig, PretrainedModel
 from ...functional import Tensor, concat
-from ...module import Module, ModuleList
-from tensorrt_llm._common import default_net
 from ...layers import Linear
+from ...module import Module, ModuleList
+from ...plugin import current_all_reduce_helper
+from ..modeling_utils import PretrainedConfig, PretrainedModel
+from .modules import AdaLayerNormZero_Final, ConvPositionEmbedding, DiTBlock, TimestepEmbedding
 
-from .modules import (
-    TimestepEmbedding,
-    ConvPositionEmbedding,
-    DiTBlock,
-    AdaLayerNormZero_Final,
-)
 
 current_file_path = os.path.abspath(__file__)
 parent_dir = os.path.dirname(current_file_path)
src/f5_tts/runtime/triton_trtllm/patch/f5tts/modules.py
CHANGED
@@ -3,33 +3,35 @@ from __future__ import annotations
 import math
 from typing import Optional
 
+import numpy as np
 import torch
 import torch.nn.functional as F
-
-import numpy as np
 from tensorrt_llm._common import default_net
 
+from ..._utils import str_dtype_to_trt, trt_dtype_to_np
 from ...functional import (
     Tensor,
+    bert_attention,
+    cast,
     chunk,
     concat,
     constant,
     expand,
+    expand_dims,
+    expand_dims_like,
+    expand_mask,
+    gelu,
+    matmul,
+    permute,
     shape,
     silu,
     slice,
-    permute,
-    expand_mask,
-    expand_dims_like,
-    unsqueeze,
-    matmul,
     softmax,
     squeeze,
-    cast,
-    view,
+    unsqueeze,
+    view,
 )
-from ..._utils import str_dtype_to_trt, trt_dtype_to_np
-from ...layers import LayerNorm, Linear, Conv1d, Mish, RowLinear, ColumnLinear
+from ...layers import ColumnLinear, Conv1d, LayerNorm, Linear, Mish, RowLinear
 from ...module import Module
 
 
src/f5_tts/runtime/triton_trtllm/scripts/conv_stft.py
CHANGED
@@ -40,6 +40,7 @@ import torch as th
 import torch.nn.functional as F
 from scipy.signal import check_COLA, get_window
 
+
 support_clp_op = None
 if th.__version__ >= "1.7.0":
     from torch.fft import rfft as fft
src/f5_tts/runtime/triton_trtllm/scripts/convert_checkpoint.py
CHANGED
@@ -8,7 +8,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import safetensors.torch
 import torch
-
 from tensorrt_llm import str_dtype_to_torch
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.convert_utils import split, split_matrix_tp
src/f5_tts/runtime/triton_trtllm/scripts/export_vocoder_to_onnx.py
CHANGED
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
+
 import torch
 import torch.nn as nn
-from huggingface_hub import hf_hub_download
-
 from conv_stft import STFT
+from huggingface_hub import hf_hub_download
 from vocos import Vocos
 
 
 opset_version = 17
 
src/f5_tts/scripts/count_params_gflops.py
CHANGED
@@ -1,12 +1,13 @@
-import sys
 import os
+import sys
+
 
 sys.path.append(os.getcwd())
 
-from f5_tts.model import CFM, DiT
-
-import torch
 import thop
+import torch
+
+from f5_tts.model import CFM, DiT
 
 
 """ ~155M """
src/f5_tts/socket_client.py
CHANGED
@@ -1,10 +1,12 @@
-import socket
 import asyncio
-import pyaudio
-import numpy as np
 import logging
+import socket
 import time
 
+import numpy as np
+import pyaudio
+
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
src/f5_tts/socket_server.py
CHANGED
@@ -1,7 +1,6 @@
 import argparse
 import gc
 import logging
-import numpy as np
 import queue
 import socket
 import struct
@@ -10,6 +9,7 @@ import traceback
 import wave
 from importlib.resources import files
 
+import numpy as np
 import torch
 import torchaudio
 from huggingface_hub import hf_hub_download
@@ -18,12 +18,13 @@ from omegaconf import OmegaConf
 
 from f5_tts.infer.utils_infer import (
     chunk_text,
-    preprocess_ref_audio_text,
-    load_vocoder,
-    load_model,
     infer_batch_process,
+    load_model,
+    load_vocoder,
+    preprocess_ref_audio_text,
 )
 
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
src/f5_tts/train/datasets/prepare_csv_wavs.py
CHANGED
@@ -1,12 +1,13 @@
+import concurrent.futures
+import multiprocessing
 import os
-import sys
+import shutil
 import signal
 import subprocess  # For invoking ffprobe
-import shutil
-import concurrent.futures
-import multiprocessing
+import sys
 from contextlib import contextmanager
 
+
 sys.path.append(os.getcwd())
 
 import argparse
@@ -16,12 +17,10 @@ from importlib.resources import files
 from pathlib import Path
 
 import torchaudio
-from tqdm import tqdm
 from datasets.arrow_writer import ArrowWriter
+from tqdm import tqdm
 
-from f5_tts.model.utils import (
-    convert_char_to_pinyin,
-)
+from f5_tts.model.utils import convert_char_to_pinyin
 
 
 PRETRAINED_VOCAB_PATH = files("f5_tts").joinpath("../../data/Emilia_ZH_EN_pinyin/vocab.txt")
src/f5_tts/train/datasets/prepare_emilia.py
CHANGED
@@ -7,20 +7,18 @@
 import os
 import sys
 
+
 sys.path.append(os.getcwd())
 
 import json
 from concurrent.futures import ProcessPoolExecutor
 from importlib.resources import files
 from pathlib import Path
-from tqdm import tqdm
 
 from datasets.arrow_writer import ArrowWriter
+from tqdm import tqdm
 
-from f5_tts.model.utils import (
-    repetition_found,
-    convert_char_to_pinyin,
-)
+from f5_tts.model.utils import convert_char_to_pinyin, repetition_found
 
 
 out_zh = {
src/f5_tts/train/datasets/prepare_emilia_v2.py
CHANGED
@@ -1,17 +1,17 @@
 # put in src/f5_tts/train/datasets/prepare_emilia_v2.py
 # prepares Emilia dataset with the new format w/ Emilia-YODAS
 
-import os
 import json
+import os
 from concurrent.futures import ProcessPoolExecutor
+from importlib.resources import files
 from pathlib import Path
-from tqdm import tqdm
+
 from datasets.arrow_writer import ArrowWriter
-from importlib.resources import files
+from tqdm import tqdm
+
+from f5_tts.model.utils import repetition_found
 
-from f5_tts.model.utils import (
-    repetition_found,
-)
 
 # Define filters for exclusion
 out_en = set()
src/f5_tts/train/datasets/prepare_libritts.py
CHANGED
@@ -1,15 +1,17 @@
 import os
 import sys
 
+
 sys.path.append(os.getcwd())
 
 import json
 from concurrent.futures import ProcessPoolExecutor
 from importlib.resources import files
 from pathlib import Path
-from tqdm import tqdm
+
 import soundfile as sf
 from datasets.arrow_writer import ArrowWriter
+from tqdm import tqdm
 
 
 def deal_with_audio_dir(audio_dir):
src/f5_tts/train/datasets/prepare_ljspeech.py
CHANGED
@@ -1,14 +1,16 @@
 import os
 import sys
 
+
 sys.path.append(os.getcwd())
 
 import json
 from importlib.resources import files
 from pathlib import Path
-from tqdm import tqdm
+
 import soundfile as sf
 from datasets.arrow_writer import ArrowWriter
+from tqdm import tqdm
 
 
 def main():
src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
CHANGED
@@ -4,15 +4,16 @@
 import os
 import sys
 
+
 sys.path.append(os.getcwd())
 
 import json
 from concurrent.futures import ProcessPoolExecutor
 from importlib.resources import files
-from tqdm import tqdm
 
 import torchaudio
 from datasets import Dataset
+from tqdm import tqdm
 
 from f5_tts.model.utils import convert_char_to_pinyin
 
src/f5_tts/train/finetune_cli.py
CHANGED
@@ -5,9 +5,9 @@ from importlib.resources import files
 
 from cached_path import cached_path
 
-from f5_tts.model import CFM, UNetT, DiT, Trainer
-from f5_tts.model.utils import get_tokenizer
+from f5_tts.model import CFM, DiT, Trainer, UNetT
 from f5_tts.model.dataset import load_dataset
+from f5_tts.model.utils import get_tokenizer
 
 
 # -------------------------- Dataset Settings --------------------------- #
src/f5_tts/train/finetune_gradio.py
CHANGED
@@ -1,14 +1,12 @@
 import gc
 import json
-import numpy as np
 import os
 import platform
-import psutil
 import queue
 import random
 import re
-import signal
 import shutil
+import signal
 import subprocess
 import sys
 import tempfile
@@ -16,21 +14,23 @@ import threading
 import time
 from glob import glob
 from importlib.resources import files
-from scipy.io import wavfile
 
 import click
 import gradio as gr
 import librosa
+import numpy as np
+import psutil
 import torch
 import torchaudio
 from cached_path import cached_path
 from datasets import Dataset as Dataset_
 from datasets.arrow_writer import ArrowWriter
 from safetensors.torch import load_file, save_file
+from scipy.io import wavfile
 
 from f5_tts.api import F5TTS
-from f5_tts.model.utils import convert_char_to_pinyin
 from f5_tts.infer.utils_infer import transcribe
+from f5_tts.model.utils import convert_char_to_pinyin
 
 
 training_process = None
src/f5_tts/train/train.py
CHANGED
@@ -10,6 +10,7 @@ from f5_tts.model import CFM, Trainer
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
+
 os.chdir(str(files("f5_tts").joinpath("../..")))  # change working directory to root of project (local editable)
 
 
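Taken together, this sync is the one-time formatting commit that typically follows enabling an import sorter in pre-commit: once the hook lands, a single `pre-commit run --all-files` (or the equivalent direct ruff invocation) rewrites every affected file in one pass, which is why the change set spans dozens of files while intending no behavioral difference. Forks tracking this repo may want to run the same sweep before rebasing so that import-block conflicts resolve trivially.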