Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,510 +1,41 @@
-import os
-import random
-import uuid
-import json
-import time
-import asyncio
-from threading import Thread
-
 import gradio as gr
 import spaces
-import
-
-from PIL import Image
-import edge_tts
-import cv2
-
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextIteratorStreamer,
-    Qwen2VLForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-    AutoProcessor,
-)
 from transformers.image_utils import load_image
-from
-
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-# Load text-only model and tokenizer
-model_id = "prithivMLmods/FastThink-0.5B-Tiny"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
-
-# Updated TTS voices list (all voices)
-TTS_VOICES = [
-    "af-ZA-AdriNeural",
-    "af-ZA-WillemNeural",
-    "am-ET-AmehaNeural",
-    "am-ET-MekdesNeural",
-    "ar-AE-FatimaNeural",
-    "ar-AE-HamdanNeural",
-    "ar-BH-LailaNeural",
-    "ar-BH-MajedNeural",
-    "ar-DZ-AminaNeural",
-    "ar-DZ-IsmaelNeural",
-    "ar-EG-SalmaNeural",
-    "ar-EG-OmarNeural",
-    "ar-IQ-LanaNeural",
-    "ar-IQ-BassamNeural",
-    "ar-JO-SanaNeural",
-    "ar-JO-TaimNeural",
-    "ar-KW-NouraNeural",
-    "ar-KW-FahedNeural",
-    "ar-LB-LaylaNeural",
-    "ar-LB-RamiNeural",
-    "ar-LY-ImanNeural",
-    "ar-LY-OmarNeural",
-    "ar-MA-MounaNeural",
-    "ar-MA-JamalNeural",
-    "ar-OM-AyshaNeural",
-    "ar-OM-AbdullahNeural",
-    "ar-QA-AmalNeural",
-    "ar-QA-MoazNeural",
-    "ar-SA-ZariyahNeural",
-    "ar-SA-HamedNeural",
-    "ar-SY-AmanyNeural",
-    "ar-SY-LaithNeural",
-    "ar-TN-ReemNeural",
-    "ar-TN-SeifNeural",
-    "ar-YE-MaryamNeural",
-    "ar-YE-SalehNeural",
-    "az-AZ-BabekNeural",
-    "az-AZ-BanuNeural",
-    "bg-BG-BorislavNeural",
-    "bg-BG-KalinaNeural",
-    "bn-BD-NabanitaNeural",
-    "bn-BD-PradeepNeural",
-    "bn-IN-TanishaNeural",
-    "bn-IN-SwapanNeural",
-    "bs-BA-GoranNeural",
-    "bs-BA-VesnaNeural",
-    "ca-ES-JoanaNeural",
-    "ca-ES-AlbaNeural",
-    "ca-ES-EnricNeural",
-    "cs-CZ-AntoninNeural",
-    "cs-CZ-VlastaNeural",
-    "cy-GB-NiaNeural",
-    "cy-GB-AledNeural",
-    "da-DK-ChristelNeural",
-    "da-DK-JeppeNeural",
-    "de-AT-IngridNeural",
-    "de-AT-JonasNeural",
-    "de-CH-LeniNeural",
-    "de-CH-JanNeural",
-    "de-DE-KatjaNeural",
-    "de-DE-ConradNeural",
-    "el-GR-AthinaNeural",
-    "el-GR-NestorasNeural",
-    "en-AU-AnnetteNeural",
-    "en-AU-MichaelNeural",
-    "en-CA-ClaraNeural",
-    "en-CA-LiamNeural",
-    "en-GB-SoniaNeural",
-    "en-GB-RyanNeural",
-    "en-GH-EsiNeural",
-    "en-GH-KwameNeural",
-    "en-HK-YanNeural",
-    "en-HK-TrevorNeural",
-    "en-IE-EmilyNeural",
-    "en-IE-ConnorNeural",
-    "en-IN-NeerjaNeural",
-    "en-IN-PrabhasNeural",
-    "en-KE-ChantelleNeural",
-    "en-KE-ChilembaNeural",
-    "en-NG-EzinneNeural",
-    "en-NG-AbechiNeural",
-    "en-NZ-MollyNeural",
-    "en-NZ-MitchellNeural",
-    "en-PH-RosaNeural",
-    "en-PH-JamesNeural",
-    "en-SG-LunaNeural",
-    "en-SG-WayneNeural",
-    "en-TZ-ImaniNeural",
-    "en-TZ-DaudiNeural",
-    "en-US-JennyNeural",
-    "en-US-GuyNeural",
-    "en-ZA-LeahNeural",
-    "en-ZA-LukeNeural",
-    "es-AR-ElenaNeural",
-    "es-AR-TomasNeural",
-    "es-BO-SofiaNeural",
-    "es-BO-MarceloNeural",
-    "es-CL-CatalinaNeural",
-    "es-CL-LorenzoNeural",
-    "es-CO-SalomeNeural",
-    "es-CO-GonzaloNeural",
-    "es-CR-MariaNeural",
-    "es-CR-JuanNeural",
-    "es-CU-BelkysNeural",
-    "es-CU-ManuelNeural",
-    "es-DO-RamonaNeural",
-    "es-DO-EmilioNeural",
-    "es-EC-AndreaNeural",
-    "es-EC-LuisNeural",
-    "es-ES-ElviraNeural",
-    "es-ES-AlvaroNeural",
-    "es-GQ-TeresaNeural",
-    "es-GQ-JavierNeural",
-    "es-GT-MartaNeural",
-    "es-GT-AndresNeural",
-    "es-HN-KarlaNeural",
-    "es-HN-CarlosNeural",
-    "es-MX-DaliaNeural",
-    "es-MX-JorgeNeural",
-    "es-NI-YolandaNeural",
-    "es-NI-FedericoNeural",
-    "es-PA-MargaritaNeural",
-    "es-PA-RobertoNeural",
-    "es-PE-CamilaNeural",
-    "es-PE-AlexNeural",
-    "es-PR-KarinaNeural",
-    "es-PR-VictorNeural",
-    "es-PY-TaniaNeural",
-    "es-PY-MarioNeural",
-    "es-SV-LorenaNeural",
-    "es-SV-RodrigoNeural",
-    "es-US-SaraNeural",
-    "es-US-AlonsoNeural",
-    "es-UY-ValentinaNeural",
-    "es-UY-MateoNeural",
-    "es-VE-PaolaNeural",
-    "es-VE-SebastianNeural",
-    "et-EE-AnuNeural",
-    "et-EE-KertNeural",
-    "eu-ES-AinhoaNeural",
-    "eu-ES-AnderNeural",
-    "fa-IR-DilaraNeural",
-    "fa-IR-FaridNeural",
-    "fi-FI-NooraNeural",
-    "fi-FI-HarriNeural",
-    "fil-PH-BlessicaNeural",
-    "fil-PH-AngeloNeural",
-    "fr-BE-CharlineNeural",
-    "fr-BE-GerardNeural",
-    "fr-CA-SylvieNeural",
-    "fr-CA-AntoineNeural",
-    "fr-CH-ArianeNeural",
-    "fr-CH-GuillaumeNeural",
-    "fr-FR-DeniseNeural",
-    "fr-FR-HenriNeural",
-    "ga-IE-OrlaNeural",
-    "ga-IE-ColmNeural",
-    "gl-ES-SoniaNeural",
-    "gl-ES-XiaoqiangNeural",
-    "gu-IN-DhwaniNeural",
-    "gu-IN-NiranjanNeural",
-    "ha-NG-AishaNeural",
-    "ha-NG-YusufNeural",
-    "he-IL-HilaNeural",
-    "he-IL-AvriNeural",
-    "hi-IN-SwaraNeural",
-    "hi-IN-MadhurNeural",
-    "hr-HR-GabrijelaNeural",
-    "hr-HR-SreckoNeural",
-    "hu-HU-NoemiNeural",
-    "hu-HU-TamasNeural",
-    "hy-AM-AnushNeural",
-    "hy-AM-HaykNeural",
-    "id-ID-ArdiNeural",
-    "id-ID-GadisNeural",
-    "ig-NG-AdaNeural",
-    "ig-NG-EzeNeural",
-    "is-IS-GudrunNeural",
-    "is-IS-GunnarNeural",
-    "it-IT-ElsaNeural",
-    "it-IT-DiegoNeural",
-    "ja-JP-NanamiNeural",
-    "ja-JP-KeitaNeural",
-    "jv-ID-DianNeural",
-    "jv-ID-GustiNeural",
-    "ka-GE-EkaNeural",
-    # ... (truncated for brevity; include all voices as needed)
-]
-
-MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-
-async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """Convert text to speech using Edge TTS and save as MP3"""
-    communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(output_file)
-    return output_file
-
-def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
-    cleaned = []
-    for msg in chat_history:
-        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
-            cleaned.append(msg)
-    return cleaned
-
-# Environment variables and parameters for Stable Diffusion XL (left in case needed in the future)
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
-
-# Load the SDXL pipeline (not used in the current configuration)
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()
-
-MAX_SEED = np.iinfo(np.int32).max
-
-def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path."""
-    unique_name = str(uuid.uuid4()) + ".png"
-    img.save(unique_name)
-    return unique_name
-
-def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    return seed
-
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a dark red animated bar.
-    """
-    return f'''
-    <div style="display: flex; align-items: center;">
-        <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-        <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
-            <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
-        </div>
-    </div>
-    <style>
-    @keyframes loading {{
-        0% {{ transform: translateX(-100%); }}
-        100% {{ transform: translateX(100%); }}
-    }}
-    </style>
-    '''
-
-def downsample_video(video_path):
-    """
-    Downsamples the video to 10 evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames

-
-
-
-
-    use_negative_prompt: bool = False,
-    seed: int = 1,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
-    randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
-):
-    """(Image generation function is preserved but not called in the current configuration)"""
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
-    options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
-        "width": width,
-        "height": height,
-        "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
-        "output_type": "pil",
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed

 @spaces.GPU
-def
-
-
-
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-    convert_to_speech: bool = False,
-    tts_rate: float = 1.0,
-    tts_voice: str = "en-US-JennyNeural",
-):
-    """
-    Generates chatbot responses with support for multimodal input and TTS conversion.
-    When files (images or videos) are provided, Qwen2VL is used.
-    Otherwise, the FastThink-0.5B text model is used.
-    After generating the response, if convert_to_speech is True the text is passed to the TTS function.
-    """
-    text = input_dict["text"].strip()
-    files = input_dict.get("files", [])

-
-
-
-
-            images = [load_image(image) for image in files]
-        else:
-            images = [load_image(files[0])]
-        messages = [{
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing multimodal input...")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        final_response = buffer
-    else:
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "top_p": top_p,
-            "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
-            "repetition_penalty": repetition_penalty,
-        }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-        outputs = []
-        yield progress_bar_html("Processing text...")
-        for new_text in streamer:
-            outputs.append(new_text)
-            yield "".join(outputs)
-        final_response = "".join(outputs)

-
-
-
-    # If TTS conversion is enabled, log the message and generate speech.
-    if convert_to_speech:
-        print("Generate Response to Generate Speech")
-        # Here tts_rate can be used to adjust parameters if needed.
-        output_file = asyncio.run(text_to_speech(final_response, tts_voice))
-        yield gr.Audio(output_file, autoplay=True)
-
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.Markdown("# TTS Conversion")
-        tts_rate_slider = gr.Slider(label="TTS Rate", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
-        tts_voice_radio = gr.Radio(choices=TTS_VOICES, label="Choose TTS Voice", value="en-US-JennyNeural")
-        convert_to_speech_checkbox = gr.Checkbox(label="Convert to Speech", value=False)

-
-
-
-
-
-
-
-
-
-
-            tts_rate_slider,
-            tts_voice_radio,
-        ],
-        examples=[
-            ["Write the Python Program for Array Rotation"],
-            [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
-            [{"text": "Describe the Ad", "files": ["examples/coca.mp4"]}],
-            [{"text": "Summarize the event in video", "files": ["examples/sky.mp4"]}],
-            [{"text": "Describe the video", "files": ["examples/Missing.mp4"]}],
-            ["Who is Nikola Tesla, and why did he die?"],
-            [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
-            ["What causes rainbows to form?"],
-        ],
-        cache_examples=False,
-        type="messages",
-        description="# **QwQ Edge: Multimodal (image upload uses Qwen2-VL) with TTS conversion**",
-        fill_height=True,
-        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="Enter text or upload files"),
-        stop_btn="Stop Generation",
-        multimodal=True,
-    )

 if __name__ == "__main__":
-
 import gradio as gr
 import spaces
+from transformers import AutoImageProcessor
+from transformers import SiglipForImageClassification
 from transformers.image_utils import load_image
+from PIL import Image
+import torch

+# Load model and processor
+model_name = "prithivMLmods/Gender-Classifier-Mini"
+model = SiglipForImageClassification.from_pretrained(model_name)
+processor = AutoImageProcessor.from_pretrained(model_name)

 @spaces.GPU
+def gender_classification(image):
+    """Predicts gender category for an image."""
+    image = Image.fromarray(image).convert("RGB")
+    inputs = processor(images=image, return_tensors="pt")

+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+        probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()

+    labels = {"0": "Female ♀", "1": "Male ♂"}
+    predictions = {labels[str(i)]: round(probs[i], 3) for i in range(len(probs))}

+    return predictions
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=gender_classification,
+    inputs=gr.Image(type="numpy"),
+    outputs=gr.Label(label="Prediction Scores"),
+    title="Gender Classification",
+    description="Upload an image to classify its gender."
+)

+# Launch the app
 if __name__ == "__main__":
+    iface.launch()
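
For a quick check of the new classifier outside the Gradio UI, a minimal local sketch along the following lines should work. It reuses the same checkpoint as the new app.py; the image path "face.jpg" is a placeholder, and reading label names from model.config.id2label (rather than the hard-coded Female/Male mapping above) is an assumption about how the checkpoint's config is populated.

# Minimal local sketch of the new classifier (assumes torch and transformers are installed).
# "face.jpg" is a placeholder path; model.config.id2label is assumed to carry the label names.
from PIL import Image
import torch
from transformers import AutoImageProcessor, SiglipForImageClassification

model_name = "prithivMLmods/Gender-Classifier-Mini"
processor = AutoImageProcessor.from_pretrained(model_name)
model = SiglipForImageClassification.from_pretrained(model_name).eval()

image = Image.open("face.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
probs = logits.softmax(dim=-1).squeeze().tolist()

# Map class indices to names, falling back to the raw index if id2label is missing.
id2label = getattr(model.config, "id2label", {}) or {}
print({id2label.get(i, str(i)): round(p, 3) for i, p in enumerate(probs)})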