prithivMLmods committed (verified)
Commit dbd1461 · 1 Parent(s): baca90a

Update app.py

Files changed (1):
  1. app.py +30 -499
app.py CHANGED
@@ -1,510 +1,41 @@
-import os
-import random
-import uuid
-import json
-import time
-import asyncio
-from threading import Thread
-
 import gradio as gr
 import spaces
-import torch
-import numpy as np
-from PIL import Image
-import edge_tts
-import cv2
-
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextIteratorStreamer,
-    Qwen2VLForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-    AutoProcessor,
-)
 from transformers.image_utils import load_image
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-# Load text-only model and tokenizer
-model_id = "prithivMLmods/FastThink-0.5B-Tiny"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
-
-# Updated TTS voices list (all voices)
-TTS_VOICES = [
-    "af-ZA-AdriNeural",
-    "af-ZA-WillemNeural",
-    "am-ET-AmehaNeural",
-    "am-ET-MekdesNeural",
-    "ar-AE-FatimaNeural",
-    "ar-AE-HamdanNeural",
-    "ar-BH-LailaNeural",
-    "ar-BH-MajedNeural",
-    "ar-DZ-AminaNeural",
-    "ar-DZ-IsmaelNeural",
-    "ar-EG-SalmaNeural",
-    "ar-EG-OmarNeural",
-    "ar-IQ-LanaNeural",
-    "ar-IQ-BassamNeural",
-    "ar-JO-SanaNeural",
-    "ar-JO-TaimNeural",
-    "ar-KW-NouraNeural",
-    "ar-KW-FahedNeural",
-    "ar-LB-LaylaNeural",
-    "ar-LB-RamiNeural",
-    "ar-LY-ImanNeural",
-    "ar-LY-OmarNeural",
-    "ar-MA-MounaNeural",
-    "ar-MA-JamalNeural",
-    "ar-OM-AyshaNeural",
-    "ar-OM-AbdullahNeural",
-    "ar-QA-AmalNeural",
-    "ar-QA-MoazNeural",
-    "ar-SA-ZariyahNeural",
-    "ar-SA-HamedNeural",
-    "ar-SY-AmanyNeural",
-    "ar-SY-LaithNeural",
-    "ar-TN-ReemNeural",
-    "ar-TN-SeifNeural",
-    "ar-YE-MaryamNeural",
-    "ar-YE-SalehNeural",
-    "az-AZ-BabekNeural",
-    "az-AZ-BanuNeural",
-    "bg-BG-BorislavNeural",
-    "bg-BG-KalinaNeural",
-    "bn-BD-NabanitaNeural",
-    "bn-BD-PradeepNeural",
-    "bn-IN-TanishaNeural",
-    "bn-IN-SwapanNeural",
-    "bs-BA-GoranNeural",
-    "bs-BA-VesnaNeural",
-    "ca-ES-JoanaNeural",
-    "ca-ES-AlbaNeural",
-    "ca-ES-EnricNeural",
-    "cs-CZ-AntoninNeural",
-    "cs-CZ-VlastaNeural",
-    "cy-GB-NiaNeural",
-    "cy-GB-AledNeural",
-    "da-DK-ChristelNeural",
-    "da-DK-JeppeNeural",
-    "de-AT-IngridNeural",
-    "de-AT-JonasNeural",
-    "de-CH-LeniNeural",
-    "de-CH-JanNeural",
-    "de-DE-KatjaNeural",
-    "de-DE-ConradNeural",
-    "el-GR-AthinaNeural",
-    "el-GR-NestorasNeural",
-    "en-AU-AnnetteNeural",
-    "en-AU-MichaelNeural",
-    "en-CA-ClaraNeural",
-    "en-CA-LiamNeural",
-    "en-GB-SoniaNeural",
-    "en-GB-RyanNeural",
-    "en-GH-EsiNeural",
-    "en-GH-KwameNeural",
-    "en-HK-YanNeural",
-    "en-HK-TrevorNeural",
-    "en-IE-EmilyNeural",
-    "en-IE-ConnorNeural",
-    "en-IN-NeerjaNeural",
-    "en-IN-PrabhasNeural",
-    "en-KE-ChantelleNeural",
-    "en-KE-ChilembaNeural",
-    "en-NG-EzinneNeural",
-    "en-NG-AbechiNeural",
-    "en-NZ-MollyNeural",
-    "en-NZ-MitchellNeural",
-    "en-PH-RosaNeural",
-    "en-PH-JamesNeural",
-    "en-SG-LunaNeural",
-    "en-SG-WayneNeural",
-    "en-TZ-ImaniNeural",
-    "en-TZ-DaudiNeural",
-    "en-US-JennyNeural",
-    "en-US-GuyNeural",
-    "en-ZA-LeahNeural",
-    "en-ZA-LukeNeural",
-    "es-AR-ElenaNeural",
-    "es-AR-TomasNeural",
-    "es-BO-SofiaNeural",
-    "es-BO-MarceloNeural",
-    "es-CL-CatalinaNeural",
-    "es-CL-LorenzoNeural",
-    "es-CO-SalomeNeural",
-    "es-CO-GonzaloNeural",
-    "es-CR-MariaNeural",
-    "es-CR-JuanNeural",
-    "es-CU-BelkysNeural",
-    "es-CU-ManuelNeural",
-    "es-DO-RamonaNeural",
-    "es-DO-EmilioNeural",
-    "es-EC-AndreaNeural",
-    "es-EC-LuisNeural",
-    "es-ES-ElviraNeural",
-    "es-ES-AlvaroNeural",
-    "es-GQ-TeresaNeural",
-    "es-GQ-JavierNeural",
-    "es-GT-MartaNeural",
-    "es-GT-AndresNeural",
-    "es-HN-KarlaNeural",
-    "es-HN-CarlosNeural",
-    "es-MX-DaliaNeural",
-    "es-MX-JorgeNeural",
-    "es-NI-YolandaNeural",
-    "es-NI-FedericoNeural",
-    "es-PA-MargaritaNeural",
-    "es-PA-RobertoNeural",
-    "es-PE-CamilaNeural",
-    "es-PE-AlexNeural",
-    "es-PR-KarinaNeural",
-    "es-PR-VictorNeural",
-    "es-PY-TaniaNeural",
-    "es-PY-MarioNeural",
-    "es-SV-LorenaNeural",
-    "es-SV-RodrigoNeural",
-    "es-US-SaraNeural",
-    "es-US-AlonsoNeural",
-    "es-UY-ValentinaNeural",
-    "es-UY-MateoNeural",
-    "es-VE-PaolaNeural",
-    "es-VE-SebastianNeural",
-    "et-EE-AnuNeural",
-    "et-EE-KertNeural",
-    "eu-ES-AinhoaNeural",
-    "eu-ES-AnderNeural",
-    "fa-IR-DilaraNeural",
-    "fa-IR-FaridNeural",
-    "fi-FI-NooraNeural",
-    "fi-FI-HarriNeural",
-    "fil-PH-BlessicaNeural",
-    "fil-PH-AngeloNeural",
-    "fr-BE-CharlineNeural",
-    "fr-BE-GerardNeural",
-    "fr-CA-SylvieNeural",
-    "fr-CA-AntoineNeural",
-    "fr-CH-ArianeNeural",
-    "fr-CH-GuillaumeNeural",
-    "fr-FR-DeniseNeural",
-    "fr-FR-HenriNeural",
-    "ga-IE-OrlaNeural",
-    "ga-IE-ColmNeural",
-    "gl-ES-SoniaNeural",
-    "gl-ES-XiaoqiangNeural",
-    "gu-IN-DhwaniNeural",
-    "gu-IN-NiranjanNeural",
-    "ha-NG-AishaNeural",
-    "ha-NG-YusufNeural",
-    "he-IL-HilaNeural",
-    "he-IL-AvriNeural",
-    "hi-IN-SwaraNeural",
-    "hi-IN-MadhurNeural",
-    "hr-HR-GabrijelaNeural",
-    "hr-HR-SreckoNeural",
-    "hu-HU-NoemiNeural",
-    "hu-HU-TamasNeural",
-    "hy-AM-AnushNeural",
-    "hy-AM-HaykNeural",
-    "id-ID-ArdiNeural",
-    "id-ID-GadisNeural",
-    "ig-NG-AdaNeural",
-    "ig-NG-EzeNeural",
-    "is-IS-GudrunNeural",
-    "is-IS-GunnarNeural",
-    "it-IT-ElsaNeural",
-    "it-IT-DiegoNeural",
-    "ja-JP-NanamiNeural",
-    "ja-JP-KeitaNeural",
-    "jv-ID-DianNeural",
-    "jv-ID-GustiNeural",
-    "ka-GE-EkaNeural",
-    # ... (truncated for brevity; include all voices as needed)
-]
-
-MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-
-async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """Convert text to speech using Edge TTS and save as MP3"""
-    communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(output_file)
-    return output_file
-
-def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
-    cleaned = []
-    for msg in chat_history:
-        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
-            cleaned.append(msg)
-    return cleaned
-
-# Environment variables and parameters for Stable Diffusion XL (left in case needed in the future)
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
-
-# Load the SDXL pipeline (not used in the current configuration)
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()
-
-MAX_SEED = np.iinfo(np.int32).max
-
-def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path."""
-    unique_name = str(uuid.uuid4()) + ".png"
-    img.save(unique_name)
-    return unique_name
-
-def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    return seed
-
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a dark red animated bar.
-    """
-    return f'''
-<div style="display: flex; align-items: center;">
-    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-    <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
-        <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
-    </div>
-</div>
-<style>
-@keyframes loading {{
-    0% {{ transform: translateX(-100%); }}
-    100% {{ transform: translateX(100%); }}
-}}
-</style>
-    '''
-
-def downsample_video(video_path):
-    """
-    Downsamples the video to 10 evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames
 
-@spaces.GPU(duration=60, enable_queue=True)
-def generate_image_fn(
-    prompt: str,
-    negative_prompt: str = "",
-    use_negative_prompt: bool = False,
-    seed: int = 1,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
-    randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
-):
-    """(Image generation function is preserved but not called in the current configuration)"""
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
-    options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
-        "width": width,
-        "height": height,
-        "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
-        "output_type": "pil",
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
 
 @spaces.GPU
-def generate(
-    input_dict: dict,
-    chat_history: list[dict],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-    convert_to_speech: bool = False,
-    tts_rate: float = 1.0,
-    tts_voice: str = "en-US-JennyNeural",
-):
-    """
-    Generates chatbot responses with support for multimodal input and TTS conversion.
-    When files (images or videos) are provided, Qwen2VL is used.
-    Otherwise, the FastThink-0.5B text model is used.
-    After generating the response, if convert_to_speech is True the text is passed to the TTS function.
-    """
-    text = input_dict["text"].strip()
-    files = input_dict.get("files", [])

-    # Determine which branch to use: multimodal (if files provided) or text-only.
-    if files:
-        # Process uploaded files as images (or videos)
-        if len(files) > 1:
-            images = [load_image(image) for image in files]
-        else:
-            images = [load_image(files[0])]
-        messages = [{
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing multimodal input...")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        final_response = buffer
-    else:
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "top_p": top_p,
-            "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
-            "repetition_penalty": repetition_penalty,
-        }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-        outputs = []
-        yield progress_bar_html("Processing text...")
-        for new_text in streamer:
-            outputs.append(new_text)
-            yield "".join(outputs)
-        final_response = "".join(outputs)

-    # Yield the final text response.
-    yield final_response
-
-    # If TTS conversion is enabled, log the message and generate speech.
-    if convert_to_speech:
-        print("Generate Response to Generate Speech")
-        # Here tts_rate can be used to adjust parameters if needed.
-        output_file = asyncio.run(text_to_speech(final_response, tts_voice))
-        yield gr.Audio(output_file, autoplay=True)
-
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.Markdown("# TTS Conversion")
-        tts_rate_slider = gr.Slider(label="TTS Rate", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
-        tts_voice_radio = gr.Radio(choices=TTS_VOICES, label="Choose TTS Voice", value="en-US-JennyNeural")
-        convert_to_speech_checkbox = gr.Checkbox(label="Convert to Speech", value=False)

-    chat_interface = gr.ChatInterface(
-        fn=generate,
-        additional_inputs=[
-            gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
-            gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-            gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
-            gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
-            gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
-            # Pass TTS parameters to the generate function.
-            convert_to_speech_checkbox,
-            tts_rate_slider,
-            tts_voice_radio,
-        ],
-        examples=[
-            ["Write the Python Program for Array Rotation"],
-            [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
-            [{"text": "Describe the Ad", "files": ["examples/coca.mp4"]}],
-            [{"text": "Summarize the event in video", "files": ["examples/sky.mp4"]}],
-            [{"text": "Describe the video", "files": ["examples/Missing.mp4"]}],
-            ["Who is Nikola Tesla, and why did he die?"],
-            [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
-            ["What causes rainbows to form?"],
-        ],
-        cache_examples=False,
-        type="messages",
-        description="# **QwQ Edge: Multimodal (image upload uses Qwen2-VL) with TTS conversion**",
-        fill_height=True,
-        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="Enter text or upload files"),
-        stop_btn="Stop Generation",
-        multimodal=True,
-    )

 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)
 
 import gradio as gr
 import spaces
+from transformers import AutoImageProcessor
+from transformers import SiglipForImageClassification
 from transformers.image_utils import load_image
+from PIL import Image
+import torch

+# Load model and processor
+model_name = "prithivMLmods/Gender-Classifier-Mini"
+model = SiglipForImageClassification.from_pretrained(model_name)
+processor = AutoImageProcessor.from_pretrained(model_name)

 @spaces.GPU
+def gender_classification(image):
+    """Predicts gender category for an image."""
+    image = Image.fromarray(image).convert("RGB")
+    inputs = processor(images=image, return_tensors="pt")

+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+        probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()

+    labels = {"0": "Female ♀", "1": "Male ♂"}
+    predictions = {labels[str(i)]: round(probs[i], 3) for i in range(len(probs))}

+    return predictions
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=gender_classification,
+    inputs=gr.Image(type="numpy"),
+    outputs=gr.Label(label="Prediction Scores"),
+    title="Gender Classification",
+    description="Upload an image to classify its gender."
+)

+# Launch the app
 if __name__ == "__main__":
+    iface.launch()
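
For reference, a minimal sketch of the same inference path the new app.py wires into Gradio, runnable on its own. The image path "face.jpg" is a placeholder, and the label names are read from the model config rather than hard-coded as in the diff above.

import torch
from PIL import Image
from transformers import AutoImageProcessor, SiglipForImageClassification

model_name = "prithivMLmods/Gender-Classifier-Mini"
model = SiglipForImageClassification.from_pretrained(model_name)
processor = AutoImageProcessor.from_pretrained(model_name)

# "face.jpg" is a placeholder for any local test image.
image = Image.open("face.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# Softmax over the classes; id2label comes from the model's config.
probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()
print({model.config.id2label[i]: round(p, 3) for i, p in enumerate(probs)})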