mrfakename committed on
Commit 7382ceb · verified · 1 Parent(s): c04ba55

Sync from GitHub repo


This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.

Files changed (1)
  1. app.py +177 -338
app.py CHANGED
@@ -90,7 +90,7 @@ chat_tokenizer_state = None


 @gpu_decorator
-def generate_response(messages, model, tokenizer):
+def chat_model_inference(messages, model, tokenizer):
     """Generate response using Qwen"""
     text = tokenizer.apply_chat_template(
         messages,
@@ -112,21 +112,21 @@ def generate_response(messages, model, tokenizer):
     return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


-def read_text_file(file_path):
-    """Read content from a .txt file"""
-    if file_path:
-        with open(file_path, "r", encoding="utf-8") as f:
-            return f.read().strip()
-    return ""
+@gpu_decorator
+def load_text_from_file(file):
+    if file:
+        with open(file, "r", encoding="utf-8") as f:
+            text = f.read().strip()
+    else:
+        text = ""
+    return gr.update(value=text)


 @gpu_decorator
 def infer(
     ref_audio_orig,
     ref_text,
-    ref_text_file,
     gen_text,
-    gen_text_file,
     model,
     remove_silence,
     seed,
@@ -139,20 +139,13 @@ def infer(
         gr.Warning("Please provide reference audio.")
         return gr.update(), gr.update(), ref_text

-    # Use text from file if provided, otherwise use direct text input
-    ref_text = read_text_file(ref_text_file) or ref_text
-    gen_text = read_text_file(gen_text_file) or gen_text
+    # Set inference seed
+    torch.manual_seed(seed)

     if not gen_text.strip():
         gr.Warning("Please enter text to generate or upload a text file.")
         return gr.update(), gr.update(), ref_text

-    # Set random seed for reproducibility
-    torch.manual_seed(seed)
-    np.random.seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-
     ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

     if model == DEFAULT_TTS_MODEL:
@@ -212,36 +205,40 @@ with gr.Blocks() as app_credits:
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
-    gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
-    with gr.Column(scale=1):
-        gen_text_file = gr.File(label="Upload Text File to Generate (.txt)", file_types=[".txt"])
+    with gr.Row(equal_height=True):
+        gen_text_input = gr.Textbox(
+            label="Text to Generate",
+            lines=10,
+            max_lines=40,
+            scale=4,
+        )
+        gen_text_file = gr.File(label="Load Text to Generate from File (.txt)", file_types=[".txt"], scale=1)
     generate_btn = gr.Button("Synthesize", variant="primary")
     with gr.Accordion("Advanced Settings", open=False):
-        with gr.Row():
+        with gr.Row(equal_height=True):
             ref_text_input = gr.Textbox(
                 label="Reference Text",
                 info="Leave blank to automatically transcribe the reference audio. If you enter text or upload a file, it will override automatic transcription.",
                 lines=2,
+                scale=4,
+            )
+            ref_text_file = gr.File(
+                label="Load Reference Text from File (.txt)", file_types=[".txt"], scale=1, height=1
             )
-        with gr.Column(scale=1):
-            ref_text_file = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
-        remove_silence = gr.Checkbox(
-            label="Remove Silences",
-            info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
-            value=False,
-        )
         with gr.Row():
             randomize_seed = gr.Checkbox(
                 label="Randomize Seed",
+                info="Check to use a random seed for each generation. Uncheck to use the seed specified.",
                 value=True,
-                info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
-            )
-            seed_input = gr.Textbox(
-                label="Seed",
-                value="0",
-                placeholder="Enter a seed value",
-                scale=1,
+                scale=3,
             )
+            seed_input = gr.Number(show_label=False, value=0, precision=0, scale=1)
+        with gr.Column(scale=4):
+            remove_silence = gr.Checkbox(
+                label="Remove Silences",
+                info="If undesired long silence(s) produced, turn on to automatically detect and crop.",
+                value=False,
+            )
         speed_slider = gr.Slider(
             label="Speed",
             minimum=0.3,
@@ -253,9 +250,9 @@ with gr.Blocks() as app_tts:
         nfe_slider = gr.Slider(
             label="NFE Steps",
             minimum=4,
-            maximum=71,
+            maximum=64,
             value=32,
-            step=1,
+            step=2,
             info="Set the number of denoising steps.",
         )
         cross_fade_duration_slider = gr.Slider(
@@ -270,25 +267,11 @@ with gr.Blocks() as app_tts:
     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")

-    @gpu_decorator
-    def update_gen_text_from_file(file):
-        """Update the generate text input when a .txt file is uploaded"""
-        text = read_text_file(file)
-        return gr.update(value=text)
-
-    @gpu_decorator
-    def update_ref_text_from_file(file):
-        """Update the reference text input when a .txt file is uploaded"""
-        text = read_text_file(file)
-        return gr.update(value=text)
-
     @gpu_decorator
     def basic_tts(
         ref_audio_input,
         ref_text_input,
-        ref_text_file,
         gen_text_input,
-        gen_text_file,
         remove_silence,
         randomize_seed,
         seed_input,
@@ -298,23 +281,17 @@ with gr.Blocks() as app_tts:
     ):
         # Determine the seed to use
         if randomize_seed:
-            seed = np.random.randint(0, 2**31)
+            seed = np.random.randint(0, 2**31 - 1)
         else:
-            try:
-                seed = int(seed_input)
-                if seed < 0:
-                    gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
-                    seed = np.random.randint(0, 2**31)
-            except ValueError:
-                gr.Warning("Invalid seed value. Using random seed instead.")
-                seed = np.random.randint(0, 2**31)
+            seed = seed_input
+            if seed < 0 or seed > 2**31 - 1:
+                gr.Warning("Seed must in range 0 ~ 2147483647. Using random seed instead.")
+                seed = np.random.randint(0, 2**31 - 1)

         audio_out, spectrogram_path, ref_text_out = infer(
             ref_audio_input,
             ref_text_input,
-            ref_text_file,
             gen_text_input,
-            gen_text_file,
             tts_model_choice,
             remove_silence,
             seed=seed,
@@ -322,16 +299,16 @@ with gr.Blocks() as app_tts:
             nfe_step=nfe_slider,
             speed=speed_slider,
         )
-        return audio_out, spectrogram_path, ref_text_out, str(seed)
+        return audio_out, spectrogram_path, ref_text_out, seed

-    gen_text_file.change(
-        update_gen_text_from_file,
+    gen_text_file.upload(
+        load_text_from_file,
         inputs=[gen_text_file],
         outputs=[gen_text_input],
     )

-    ref_text_file.change(
-        update_ref_text_from_file,
+    ref_text_file.upload(
+        load_text_from_file,
         inputs=[ref_text_file],
         outputs=[ref_text_input],
     )
@@ -341,9 +318,7 @@ with gr.Blocks() as app_tts:
         inputs=[
             ref_audio_input,
             ref_text_input,
-            ref_text_file,
             gen_text_input,
-            gen_text_file,
             remove_silence,
             randomize_seed,
             seed_input,
@@ -419,14 +394,14 @@ with gr.Blocks() as app_multistyle:

     # Regular speech type (mandatory)
     with gr.Row() as regular_row:
-        with gr.Column():
+        with gr.Column(scale=1, min_width=160):
             regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
             regular_insert = gr.Button("Insert Label", variant="secondary")
-        regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
-        with gr.Row():
-            regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
-            with gr.Column(scale=1):
-                regular_ref_text_file = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
+        with gr.Column(scale=3):
+            regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
+            with gr.Row(scale=4, equal_height=True):
+                regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=8, scale=3)
+                regular_ref_text_file = gr.File(label="Load Reference Text from File (.txt)", file_types=[".txt"], scale=1)

     # Regular speech type (max 100)
     max_speech_types = 100
@@ -441,15 +416,17 @@ with gr.Blocks() as app_multistyle:
     # Additional speech types (99 more)
     for i in range(max_speech_types - 1):
         with gr.Row(visible=False) as row:
-            with gr.Column():
+            with gr.Column(scale=1, min_width=160):
                 name_input = gr.Textbox(label="Speech Type Name")
                 delete_btn = gr.Button("Delete Type", variant="secondary")
                 insert_btn = gr.Button("Insert Label", variant="secondary")
-            audio_input = gr.Audio(label="Reference Audio", type="filepath")
-            with gr.Row():
-                ref_text_input = gr.Textbox(label="Reference Text", lines=2)
-                with gr.Column(scale=1):
-                    ref_text_file_input = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
+            with gr.Column(scale=3):
+                audio_input = gr.Audio(label="Reference Audio", type="filepath")
+                with gr.Row(scale=4, equal_height=True):
+                    ref_text_input = gr.Textbox(label="Reference Text", lines=8, scale=3)
+                    ref_text_file_input = gr.File(
+                        label="Load Reference Text from File (.txt)", file_types=[".txt"], scale=1
+                    )
         speech_type_rows.append(row)
         speech_type_names.append(name_input)
         speech_type_audios.append(audio_input)
@@ -481,13 +458,6 @@ with gr.Blocks() as app_multistyle:
     def delete_speech_type_fn():
        return gr.update(visible=False), None, None, None, None

-    # Function to update reference text from file
-    @gpu_decorator
-    def update_ref_text_from_file(file):
-        """Update the reference text input when a .txt file is uploaded"""
-        text = read_text_file(file)
-        return gr.update(value=text)
-
     # Update delete button clicks and ref text file changes
     for i in range(1, len(speech_type_delete_btns)):
         speech_type_delete_btns[i].click(
@@ -500,27 +470,29 @@ with gr.Blocks() as app_multistyle:
                 speech_type_ref_text_files[i],
             ],
         )
-        speech_type_ref_text_files[i].change(
-            update_ref_text_from_file,
+        speech_type_ref_text_files[i].upload(
+            load_text_from_file,
            inputs=[speech_type_ref_text_files[i]],
            outputs=[speech_type_ref_texts[i]],
        )

     # Update regular speech type ref text file
-    regular_ref_text_file.change(
-        update_ref_text_from_file,
+    regular_ref_text_file.upload(
+        load_text_from_file,
         inputs=[regular_ref_text_file],
         outputs=[regular_ref_text],
     )

     # Text input for the prompt
-    gen_text_input_multistyle = gr.Textbox(
-        label="Text to Generate",
-        lines=10,
-        placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
-    )
-    with gr.Column(scale=1):
-        gen_text_file_multistyle = gr.File(label="Upload Text File to Generate (.txt)", file_types=[".txt"])
+    with gr.Row(equal_height=True):
+        gen_text_input_multistyle = gr.Textbox(
+            label="Text to Generate",
+            lines=10,
+            max_lines=40,
+            scale=4,
+            placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
+        )
+        gen_text_file_multistyle = gr.File(label="Load Text to Generate from File (.txt)", file_types=[".txt"], scale=1)

     def make_insert_speech_type_fn(index):
         def insert_speech_type_fn(current_text, speech_type_name):
@@ -542,20 +514,9 @@ with gr.Blocks() as app_multistyle:
     with gr.Accordion("Advanced Settings", open=False):
         remove_silence_multistyle = gr.Checkbox(
             label="Remove Silences",
+            info="Turn on to automatically detect and crop long silences.",
             value=True,
         )
-        with gr.Row():
-            randomize_seed_multistyle = gr.Checkbox(
-                label="Randomize Seed",
-                value=True,
-                info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
-            )
-            seed_input_multistyle = gr.Textbox(
-                label="Seed",
-                value="0",
-                placeholder="Enter a seed value",
-                scale=1,
-            )

     # Generate button
     generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
@@ -563,60 +524,30 @@ with gr.Blocks() as app_multistyle:
     # Output audio
     audio_output_multistyle = gr.Audio(label="Synthesized Audio")

-    @gpu_decorator
-    def update_gen_text_from_file(file):
-        """Update the generate text input when a .txt file is uploaded"""
-        text = read_text_file(file)
-        return gr.update(value=text)
-
-    gen_text_file_multistyle.change(
-        fn=lambda file, text, regular, *names: (
-            update_gen_text_from_file(file),
-            validate_speech_types(text, file, regular, *names),
-        ),
-        inputs=[gen_text_file_multistyle, gen_text_input_multistyle, regular_name] + speech_type_names,
-        outputs=[gen_text_input_multistyle, generate_multistyle_btn],
+    gen_text_file_multistyle.upload(
+        load_text_from_file,
+        inputs=[gen_text_file_multistyle],
+        outputs=[gen_text_input_multistyle],
     )

     @gpu_decorator
     def generate_multistyle_speech(
         gen_text,
-        gen_text_file,
-        randomize_seed,
-        seed_input,
         *args,
     ):
-        # Determine the seed to use
-        if randomize_seed:
-            seed = np.random.randint(0, 2**31)
-        else:
-            try:
-                seed = int(seed_input)
-                if seed < 0:
-                    gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
-                    seed = np.random.randint(0, 2**31)
-            except ValueError:
-                gr.Warning("Invalid seed value. Using random seed instead.")
-                seed = np.random.randint(0, 2**31)
-
         speech_type_names_list = args[:max_speech_types]
         speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
         speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
-        speech_type_ref_text_files_list = args[3 * max_speech_types : 4 * max_speech_types]
-        remove_silence = args[4 * max_speech_types]
+        remove_silence = args[3 * max_speech_types]
         # Collect the speech types and their audios into a dict
         speech_types = OrderedDict()

-        # Use text from file if provided, otherwise use direct text input
-        gen_text = read_text_file(gen_text_file) or gen_text
-
         ref_text_idx = 0
-        for name_input, audio_input, ref_text_input, ref_text_file_input in zip(
-            speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list, speech_type_ref_text_files_list
+        for name_input, audio_input, ref_text_input in zip(
+            speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
         ):
-            ref_text = read_text_file(ref_text_file_input) or ref_text_input
             if name_input and audio_input:
-                speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text}
+                speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
             else:
                 speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
                 ref_text_idx += 1
642
  ref_audio = speech_types[current_style]["audio"]
643
  except KeyError:
644
  gr.Warning(f"Please provide reference audio for type {current_style}.")
645
- return [None] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
646
  ref_text = speech_types[current_style].get("ref_text", "")
647
 
 
 
 
648
  # Generate speech for this segment
649
  audio_out, _, ref_text_out = infer(
650
- ref_audio, ref_text, None, text, None, tts_model_choice, remove_silence, seed, 0, show_info=print
651
  ) # show_info=print no pull to top when generating
652
  sr, audio_data = audio_out
653
 
@@ -657,29 +591,29 @@ with gr.Blocks() as app_multistyle:
         # Concatenate all audio segments
         if generated_audio_segments:
             final_audio_data = np.concatenate(generated_audio_segments)
-            return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
+            return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
         else:
             gr.Warning("No audio generated.")
-            return [None] + [speech_types[style]["ref_text"] for style in speech_types] + [str(seed)]
+            return [None] + [speech_types[style]["ref_text"] for style in speech_types]

     generate_multistyle_btn.click(
         generate_multistyle_speech,
-        inputs=[gen_text_input_multistyle, gen_text_file_multistyle, randomize_seed_multistyle, seed_input_multistyle]
+        inputs=[
+            gen_text_input_multistyle,
+        ]
         + speech_type_names
         + speech_type_audios
         + speech_type_ref_texts
-        + speech_type_ref_text_files
-        + [remove_silence_multistyle],
-        outputs=[audio_output_multistyle] + speech_type_ref_texts + [seed_input_multistyle],
+        + [
+            remove_silence_multistyle,
+        ],
+        outputs=[audio_output_multistyle] + speech_type_ref_texts,
     )

     # Validation function to disable Generate button if speech types are missing
-    def validate_speech_types(gen_text, gen_text_file, regular_name, *args):
+    def validate_speech_types(gen_text, regular_name, *args):
         speech_type_names_list = args

-        # Use text from file if provided, otherwise use direct text input
-        gen_text = read_text_file(gen_text_file) or gen_text
-
         # Collect the speech types names
         speech_types_available = set()
         if regular_name:
@@ -704,19 +638,10 @@ with gr.Blocks() as app_multistyle:

     gen_text_input_multistyle.change(
         validate_speech_types,
-        inputs=[gen_text_input_multistyle, gen_text_file_multistyle, regular_name] + speech_type_names,
+        inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
         outputs=generate_multistyle_btn,
     )

-    gen_text_file_multistyle.change(
-        fn=lambda file, text, regular, *names: (
-            update_gen_text_from_file(file),
-            validate_speech_types(text, file, regular, *names),
-        ),
-        inputs=[gen_text_file_multistyle, gen_text_input_multistyle, regular_name] + speech_type_names,
-        outputs=[gen_text_input_multistyle, generate_multistyle_btn],
-    )
-

 with gr.Blocks() as app_chat:
     gr.Markdown(
@@ -781,35 +706,33 @@ Have a conversation with an AI using your reference voice!
             ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
         with gr.Column():
             with gr.Accordion("Advanced Settings", open=False):
-                remove_silence_chat = gr.Checkbox(
-                    label="Remove Silences",
-                    value=True,
-                )
-                with gr.Row():
+                with gr.Row(equal_height=True):
                     ref_text_chat = gr.Textbox(
                         label="Reference Text",
                         info="Optional: Leave blank to auto-transcribe",
                         lines=2,
+                        scale=3,
+                    )
+                    ref_text_file_chat = gr.File(
+                        label="Load Reference Text from File (.txt)", file_types=[".txt"], scale=1
                     )
-                with gr.Column(scale=1):
-                    ref_text_file_chat = gr.File(label="Upload Reference Text File (.txt)", file_types=[".txt"])
-                system_prompt_chat = gr.Textbox(
-                    label="System Prompt",
-                    value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
-                    lines=2,
-                )
                 with gr.Row():
                     randomize_seed_chat = gr.Checkbox(
                         label="Randomize Seed",
                         value=True,
-                        info="Check to use a random seed for each generation. Uncheck to use the seed specified below.",
-                    )
-                    seed_input_chat = gr.Textbox(
-                        label="Seed",
-                        value="0",
-                        placeholder="Enter a seed value",
-                        scale=1,
+                        info="Uncheck to use the seed specified.",
+                        scale=3,
                     )
+                    seed_input_chat = gr.Number(show_label=False, value=0, precision=0, scale=1)
+                    remove_silence_chat = gr.Checkbox(
+                        label="Remove Silences",
+                        value=True,
+                    )
+                system_prompt_chat = gr.Textbox(
+                    label="System Prompt",
+                    value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
+                    lines=2,
+                )

     chatbot_interface = gr.Chatbot(label="Conversation", type="messages")

@@ -825,80 +748,59 @@ Have a conversation with an AI using your reference voice!
                 label="Type your message",
                 lines=1,
             )
-            with gr.Column(scale=1):
-                text_file_chat = gr.File(label="Upload Text File (.txt)", file_types=[".txt"])
            send_btn_chat = gr.Button("Send Message")
            clear_btn_chat = gr.Button("Clear Conversation")

-    conversation_state = gr.State(
-        value=[
-            {
-                "role": "system",
-                "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
-            }
-        ]
-    )
-
-    # Modify process_audio_input to use model and tokenizer from state
+    # Modify process_audio_input to generate user input
     @gpu_decorator
-    def process_audio_input(audio_path, text, text_file, history, conv_state):
-        """Handle audio, text, or file input from user"""
-        if not audio_path and not text.strip() and not text_file:
-            return history, conv_state, "", None
-
-        # Use file input if provided, then direct text input, then audio transcription
-        if text_file:
-            text = read_text_file(text_file)
-        elif audio_path:
-            text = preprocess_ref_audio_text(audio_path, text)[1]
+    def process_audio_input(conv_state, audio_path, text):
+        """Handle audio or text input from user"""

+        if not audio_path and not text.strip():
+            return conv_state
+
+        if audio_path:
+            text = preprocess_ref_audio_text(audio_path, text)[1]
         if not text.strip():
-            return history, conv_state, "", None
+            return conv_state

         conv_state.append({"role": "user", "content": text})
-        history.append((text, None))
+        return conv_state

-        response = generate_response(conv_state, chat_model_state, chat_tokenizer_state)
+    # Use model and tokenizer from state to get text response
+    @gpu_decorator
+    def generate_text_response(conv_state, system_prompt):
+        """Generate text response from AI"""

-        conv_state.append({"role": "assistant", "content": response})
-        history[-1] = (text, response)
+        system_prompt_state = [{"role": "system", "content": system_prompt}]
+        response = chat_model_inference(system_prompt_state + conv_state, chat_model_state, chat_tokenizer_state)

-        return history, conv_state, "", None
+        conv_state.append({"role": "assistant", "content": response})
+        return conv_state

     @gpu_decorator
-    def generate_audio_response(
-        history, ref_audio, ref_text, ref_text_file, remove_silence, randomize_seed, seed_input
-    ):
+    def generate_audio_response(history, ref_audio, ref_text, remove_silence, randomize_seed, seed_input):
         """Generate TTS audio for AI response"""
         if not history or not ref_audio:
             return None, ref_text, seed_input

-        last_user_message, last_ai_response = history[-1]
-        if not last_ai_response:
+        last_ai_response = history[-1]["content"]
+        if not last_ai_response or history[-1]["role"] != "assistant":
             return None, ref_text, seed_input

         # Determine the seed to use
         if randomize_seed:
-            seed = np.random.randint(0, 2**31)
+            seed = np.random.randint(0, 2**31 - 1)
         else:
-            try:
-                seed = int(seed_input)
-                if seed < 0:
-                    gr.Warning("Seed must be a non-negative integer. Using random seed instead.")
-                    seed = np.random.randint(0, 2**31)
-            except ValueError:
-                gr.Warning("Invalid seed value. Using random seed instead.")
-                seed = np.random.randint(0, 2**31)
-
-        # Use text from file if provided, otherwise use direct text input
-        ref_text = read_text_file(ref_text_file) or ref_text
+            seed = seed_input
+            if seed < 0 or seed > 2**31 - 1:
+                gr.Warning("Seed must in range 0 ~ 2147483647. Using random seed instead.")
+                seed = np.random.randint(0, 2**31 - 1)

         audio_result, _, ref_text_out = infer(
             ref_audio,
             ref_text,
-            None,
             last_ai_response,
-            None,
             tts_model_choice,
             remove_silence,
             seed=seed,
@@ -906,113 +808,50 @@ Have a conversation with an AI using your reference voice!
             speed=1.0,
             show_info=print,  # show_info=print no pull to top when generating
         )
-        return audio_result, ref_text_out, str(seed)
+        return audio_result, ref_text_out, seed

     def clear_conversation():
         """Reset the conversation"""
-        return [], [
-            {
-                "role": "system",
-                "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
-            }
-        ]
-
-    def update_system_prompt(new_prompt):
-        """Update the system prompt and reset the conversation"""
-        new_conv_state = [{"role": "system", "content": new_prompt}]
-        return [], new_conv_state
-
-    @gpu_decorator
-    def update_text_from_file(file):
-        """Update the text input when a .txt file is uploaded"""
-        text = read_text_file(file)
-        return gr.update(value=text), None
+        return [], None

-    ref_text_file_chat.change(
-        update_ref_text_from_file,
+    ref_text_file_chat.upload(
+        load_text_from_file,
         inputs=[ref_text_file_chat],
         outputs=[ref_text_chat],
     )

-    text_file_chat.change(
-        update_text_from_file,
-        inputs=[text_file_chat],
-        outputs=[text_input_chat, text_file_chat],
-    )
-
-    # Handle audio input
-    audio_input_chat.stop_recording(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
-    ).then(
-        generate_audio_response,
-        inputs=[
-            chatbot_interface,
-            ref_audio_chat,
-            ref_text_chat,
-            ref_text_file_chat,
-            remove_silence_chat,
-            randomize_seed_chat,
-            seed_input_chat,
-        ],
-        outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
-    ).then(
-        lambda: None,
-        None,
-        audio_input_chat,
-    )
-
-    # Handle text input
-    text_input_chat.submit(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
-    ).then(
-        generate_audio_response,
-        inputs=[
-            chatbot_interface,
-            ref_audio_chat,
-            ref_text_chat,
-            ref_text_file_chat,
-            remove_silence_chat,
-            randomize_seed_chat,
-            seed_input_chat,
-        ],
-        outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
-    )
-
-    # Handle send button
-    send_btn_chat.click(
-        process_audio_input,
-        inputs=[audio_input_chat, text_input_chat, text_file_chat, chatbot_interface, conversation_state],
-        outputs=[chatbot_interface, conversation_state, text_input_chat, text_file_chat],
-    ).then(
-        generate_audio_response,
-        inputs=[
-            chatbot_interface,
-            ref_audio_chat,
-            ref_text_chat,
-            ref_text_file_chat,
-            remove_silence_chat,
-            randomize_seed_chat,
-            seed_input_chat,
-        ],
-        outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
-    )
-
-    # Handle clear button
-    clear_btn_chat.click(
-        clear_conversation,
-        outputs=[chatbot_interface, conversation_state],
-    )
+    for user_operation in [audio_input_chat.stop_recording, text_input_chat.submit, send_btn_chat.click]:
+        user_operation(
+            process_audio_input,
+            inputs=[chatbot_interface, audio_input_chat, text_input_chat],
+            outputs=[chatbot_interface],
+        ).then(
+            generate_text_response,
+            inputs=[chatbot_interface, system_prompt_chat],
+            outputs=[chatbot_interface],
+        ).then(
+            generate_audio_response,
+            inputs=[
+                chatbot_interface,
+                ref_audio_chat,
+                ref_text_chat,
+                remove_silence_chat,
+                randomize_seed_chat,
+                seed_input_chat,
+            ],
+            outputs=[audio_output_chat, ref_text_chat, seed_input_chat],
+        ).then(
+            lambda: [None, None],
+            None,
+            [audio_input_chat, text_input_chat],
+        )

-    # Handle system prompt change and reset conversation
-    system_prompt_chat.change(
-        update_system_prompt,
-        inputs=system_prompt_chat,
-        outputs=[chatbot_interface, conversation_state],
-    )
+    # Handle clear button or system prompt change and reset conversation
+    for user_operation in [clear_btn_chat.click, system_prompt_chat.change, chatbot_interface.clear]:
+        user_operation(
+            clear_conversation,
+            outputs=[chatbot_interface, audio_output_chat],
+        )


 with gr.Blocks() as app:
@@ -1027,9 +866,9 @@ This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not

 The checkpoints currently support English and Chinese.

-If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
+If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).

-**NOTE: Reference text will be automatically transcribed with Whisper if not provided via text or .txt file. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
+**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
 """
     )