fffiloni committed
Commit 6836325 · verified · 1 Parent(s): f8c81bd

Set examples

Files changed (1):
  1. app.py +45 -17
app.py CHANGED
@@ -114,19 +114,31 @@ print("Using", USED_VRAM_PARAMS, "for num_persistent_param_in_dit")



- def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: str) -> str:
+ def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path_spk1: str, cond_audio_path_spk2: str) -> str:
      """
      Create a temporary JSON file with the user-provided prompt, image, and audio paths.
      Returns the path to the temporary JSON file.
      """
      # Structure based on your original JSON format
-     data = {
-         "prompt": prompt,
-         "cond_image": cond_image_path,
-         "cond_audio": {
-             "person1": cond_audio_path
+     if cond_audio_path_spk2 is None:
+         data = {
+             "prompt": prompt,
+             "cond_image": cond_image_path,
+             "cond_audio": {
+                 "person1": cond_audio_path_spk1
+             }
+         }
+
+     else:
+         data = {
+             "prompt": prompt,
+             "cond_image": cond_image_path,
+             "audio_type": "para",
+             "cond_audio": {
+                 "person1": cond_audio_path_spk1,
+                 "person2": cond_audio_path_spk2
+             }
          }
-     }

      # Create a temp file
      temp_json = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w', encoding='utf-8')
@@ -138,14 +150,19 @@ def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: s
      return temp_json_path


- def infer(prompt, cond_image_path, cond_audio_path, sample_steps):
+ def infer(prompt, cond_image_path, cond_audio_path_spk1, cond_audio_path_spk2, sample_steps):

      if is_shared_ui:
-         trimmed_audio_path = trim_audio_to_5s_temp(cond_audio_path)
-         cond_audio_path = trimmed_audio_path
+
+         trimmed_audio_path_spk1 = trim_audio_to_5s_temp(cond_audio_path_spk1)
+         cond_audio_path_spk1 = trimmed_audio_path_spk1
+
+         if cond_audio_path_spk2 is not None:
+             trimmed_audio_path_spk2 = trim_audio_to_5s_temp(cond_audio_path_spk2)
+             cond_audio_path_spk2 = trimmed_audio_path_spk2

      # Prepare input JSON
-     input_json_path = create_temp_input_json(prompt, cond_image_path, cond_audio_path)
+     input_json_path = create_temp_input_json(prompt, cond_image_path, cond_audio_path_spk1, cond_audio_path_spk2)

      # Base args
      common_args = [
@@ -229,13 +246,19 @@ with gr.Blocks(title="MultiTalk Inference") as demo:
                  label="Conditioning Image"
              )

-             audio_input = gr.Audio(
+             audio_input_spk1 = gr.Audio(
+                 type="filepath",
+                 label="Conditioning Audio for speaker 1 (.wav)"
+             )
+
+             audio_input_spk2 = gr.Audio(
                  type="filepath",
-                 label="Conditioning Audio (.wav)"
+                 label="Conditioning Audio for speaker 2 (.wav)"
              )

              with gr.Accordion("Advanced settings", open=False):
                  sample_steps = gr.Slider(
+                     label="sample steps",
                      value=6,
                      minimum=2,
                      maximum=25,
@@ -247,9 +270,14 @@ with gr.Blocks(title="MultiTalk Inference") as demo:

              gr.Examples(
                  examples = [
-                     ["A woman sings passionately in a dimly lit studio.", "examples/single/single1.png", "examples/single/1.wav"]
+                     ["A woman sings passionately in a dimly lit studio.", "examples/single/single1.png", "examples/single/1.wav", None, 6],
+                     ["In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.", "examples/multi/3/multi3.png", "examples/multi/3/1-man.WAV", "examples/multi/3/1-woman.WAV", 6],
                  ],
-                 inputs = [prompt_input, image_input, audio_input]
+                 fn=infer,
+                 inputs = [prompt_input, image_input, audio_input_spk1, audio_input_spk2, sample_steps],
+                 outputs=output_video,
+                 cache_examples = True,
+                 cache_mode = "lazy"
              )

          with gr.Column(scale=3):
@@ -257,8 +285,8 @@ with gr.Blocks(title="MultiTalk Inference") as demo:

      submit_btn.click(
          fn=infer,
-         inputs=[prompt_input, image_input, audio_input, sample_steps],
+         inputs=[prompt_input, image_input, audio_input_spk1, audio_input_spk2, sample_steps],
          outputs=output_video
      )

- demo.launch()
+ demo.launch(ssr_mode=False, show_error=True, show_api=False)
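
Not part of the commit, but as a quick sanity check of the new two-speaker path: calling the updated create_temp_input_json with the assets from the second example row should yield a payload with audio_type "para" and one audio entry per speaker. A minimal sketch, assuming the elided lines of the first hunk serialize data with json.dump before returning the temp path; the long example prompt is abbreviated here.

    import json

    # Hypothetical smoke test, not in app.py: exercise the two-speaker ("para")
    # branch added in this commit, using the asset paths from the new gr.Examples row.
    json_path = create_temp_input_json(
        prompt="In a cozy recording studio, a man and a woman are singing together.",  # abbreviated
        cond_image_path="examples/multi/3/multi3.png",
        cond_audio_path_spk1="examples/multi/3/1-man.WAV",
        cond_audio_path_spk2="examples/multi/3/1-woman.WAV",
    )

    with open(json_path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    # Expected shape: "para" audio type plus person1/person2 conditioning audio paths.
    assert payload["audio_type"] == "para"
    assert set(payload["cond_audio"]) == {"person1", "person2"}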
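trim_audio_to_5s_temp, which infer now applies to each provided speaker track on the shared UI, is defined elsewhere in app.py and is not shown in this diff. A minimal sketch of what such a helper could look like, assuming pydub; the actual implementation may differ.

    import tempfile
    from pydub import AudioSegment  # assumption: pydub; the real helper may use another audio library

    def trim_audio_to_5s_temp(audio_path: str) -> str:
        """Trim the input audio to its first 5 seconds and return the path to a temp .wav copy."""
        audio = AudioSegment.from_file(audio_path)
        trimmed = audio[:5000]  # pydub slices in milliseconds
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        trimmed.export(tmp.name, format="wav")
        return tmp.name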