Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,510 +1,41 @@
-import os
-import random
-import uuid
-import json
-import time
-import asyncio
-from threading import Thread
-
 import gradio as gr
 import spaces
-import
-
-from PIL import Image
-import edge_tts
-import cv2
-
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextIteratorStreamer,
-    Qwen2VLForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-    AutoProcessor,
-)
 from transformers.image_utils import load_image
-from
-
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-# Load text-only model and tokenizer
-model_id = "prithivMLmods/FastThink-0.5B-Tiny"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
-
-# Updated TTS voices list (all voices)
-TTS_VOICES = [
-    "af-ZA-AdriNeural",
-    "af-ZA-WillemNeural",
-    "am-ET-AmehaNeural",
-    "am-ET-MekdesNeural",
-    "ar-AE-FatimaNeural",
-    "ar-AE-HamdanNeural",
-    "ar-BH-LailaNeural",
-    "ar-BH-MajedNeural",
-    "ar-DZ-AminaNeural",
-    "ar-DZ-IsmaelNeural",
-    "ar-EG-SalmaNeural",
-    "ar-EG-OmarNeural",
-    "ar-IQ-LanaNeural",
-    "ar-IQ-BassamNeural",
-    "ar-JO-SanaNeural",
-    "ar-JO-TaimNeural",
-    "ar-KW-NouraNeural",
-    "ar-KW-FahedNeural",
-    "ar-LB-LaylaNeural",
-    "ar-LB-RamiNeural",
-    "ar-LY-ImanNeural",
-    "ar-LY-OmarNeural",
-    "ar-MA-MounaNeural",
-    "ar-MA-JamalNeural",
-    "ar-OM-AyshaNeural",
-    "ar-OM-AbdullahNeural",
-    "ar-QA-AmalNeural",
-    "ar-QA-MoazNeural",
-    "ar-SA-ZariyahNeural",
-    "ar-SA-HamedNeural",
-    "ar-SY-AmanyNeural",
-    "ar-SY-LaithNeural",
-    "ar-TN-ReemNeural",
-    "ar-TN-SeifNeural",
-    "ar-YE-MaryamNeural",
-    "ar-YE-SalehNeural",
-    "az-AZ-BabekNeural",
-    "az-AZ-BanuNeural",
-    "bg-BG-BorislavNeural",
-    "bg-BG-KalinaNeural",
-    "bn-BD-NabanitaNeural",
-    "bn-BD-PradeepNeural",
-    "bn-IN-TanishaNeural",
-    "bn-IN-SwapanNeural",
-    "bs-BA-GoranNeural",
-    "bs-BA-VesnaNeural",
-    "ca-ES-JoanaNeural",
-    "ca-ES-AlbaNeural",
-    "ca-ES-EnricNeural",
-    "cs-CZ-AntoninNeural",
-    "cs-CZ-VlastaNeural",
-    "cy-GB-NiaNeural",
-    "cy-GB-AledNeural",
-    "da-DK-ChristelNeural",
-    "da-DK-JeppeNeural",
-    "de-AT-IngridNeural",
-    "de-AT-JonasNeural",
-    "de-CH-LeniNeural",
-    "de-CH-JanNeural",
-    "de-DE-KatjaNeural",
-    "de-DE-ConradNeural",
-    "el-GR-AthinaNeural",
-    "el-GR-NestorasNeural",
-    "en-AU-AnnetteNeural",
-    "en-AU-MichaelNeural",
-    "en-CA-ClaraNeural",
-    "en-CA-LiamNeural",
-    "en-GB-SoniaNeural",
-    "en-GB-RyanNeural",
-    "en-GH-EsiNeural",
-    "en-GH-KwameNeural",
-    "en-HK-YanNeural",
-    "en-HK-TrevorNeural",
-    "en-IE-EmilyNeural",
-    "en-IE-ConnorNeural",
-    "en-IN-NeerjaNeural",
-    "en-IN-PrabhasNeural",
-    "en-KE-ChantelleNeural",
-    "en-KE-ChilembaNeural",
-    "en-NG-EzinneNeural",
-    "en-NG-AbechiNeural",
-    "en-NZ-MollyNeural",
-    "en-NZ-MitchellNeural",
-    "en-PH-RosaNeural",
-    "en-PH-JamesNeural",
-    "en-SG-LunaNeural",
-    "en-SG-WayneNeural",
-    "en-TZ-ImaniNeural",
-    "en-TZ-DaudiNeural",
-    "en-US-JennyNeural",
-    "en-US-GuyNeural",
-    "en-ZA-LeahNeural",
-    "en-ZA-LukeNeural",
-    "es-AR-ElenaNeural",
-    "es-AR-TomasNeural",
-    "es-BO-SofiaNeural",
-    "es-BO-MarceloNeural",
-    "es-CL-CatalinaNeural",
-    "es-CL-LorenzoNeural",
-    "es-CO-SalomeNeural",
-    "es-CO-GonzaloNeural",
-    "es-CR-MariaNeural",
-    "es-CR-JuanNeural",
-    "es-CU-BelkysNeural",
-    "es-CU-ManuelNeural",
-    "es-DO-RamonaNeural",
-    "es-DO-EmilioNeural",
-    "es-EC-AndreaNeural",
-    "es-EC-LuisNeural",
-    "es-ES-ElviraNeural",
-    "es-ES-AlvaroNeural",
-    "es-GQ-TeresaNeural",
-    "es-GQ-JavierNeural",
-    "es-GT-MartaNeural",
-    "es-GT-AndresNeural",
-    "es-HN-KarlaNeural",
-    "es-HN-CarlosNeural",
-    "es-MX-DaliaNeural",
-    "es-MX-JorgeNeural",
-    "es-NI-YolandaNeural",
-    "es-NI-FedericoNeural",
-    "es-PA-MargaritaNeural",
-    "es-PA-RobertoNeural",
-    "es-PE-CamilaNeural",
-    "es-PE-AlexNeural",
-    "es-PR-KarinaNeural",
-    "es-PR-VictorNeural",
-    "es-PY-TaniaNeural",
-    "es-PY-MarioNeural",
-    "es-SV-LorenaNeural",
-    "es-SV-RodrigoNeural",
-    "es-US-SaraNeural",
-    "es-US-AlonsoNeural",
-    "es-UY-ValentinaNeural",
-    "es-UY-MateoNeural",
-    "es-VE-PaolaNeural",
-    "es-VE-SebastianNeural",
-    "et-EE-AnuNeural",
-    "et-EE-KertNeural",
-    "eu-ES-AinhoaNeural",
-    "eu-ES-AnderNeural",
-    "fa-IR-DilaraNeural",
-    "fa-IR-FaridNeural",
-    "fi-FI-NooraNeural",
-    "fi-FI-HarriNeural",
-    "fil-PH-BlessicaNeural",
-    "fil-PH-AngeloNeural",
-    "fr-BE-CharlineNeural",
-    "fr-BE-GerardNeural",
-    "fr-CA-SylvieNeural",
-    "fr-CA-AntoineNeural",
-    "fr-CH-ArianeNeural",
-    "fr-CH-GuillaumeNeural",
-    "fr-FR-DeniseNeural",
-    "fr-FR-HenriNeural",
-    "ga-IE-OrlaNeural",
-    "ga-IE-ColmNeural",
-    "gl-ES-SoniaNeural",
-    "gl-ES-XiaoqiangNeural",
-    "gu-IN-DhwaniNeural",
-    "gu-IN-NiranjanNeural",
-    "ha-NG-AishaNeural",
-    "ha-NG-YusufNeural",
-    "he-IL-HilaNeural",
-    "he-IL-AvriNeural",
-    "hi-IN-SwaraNeural",
-    "hi-IN-MadhurNeural",
-    "hr-HR-GabrijelaNeural",
-    "hr-HR-SreckoNeural",
-    "hu-HU-NoemiNeural",
-    "hu-HU-TamasNeural",
-    "hy-AM-AnushNeural",
-    "hy-AM-HaykNeural",
-    "id-ID-ArdiNeural",
-    "id-ID-GadisNeural",
-    "ig-NG-AdaNeural",
-    "ig-NG-EzeNeural",
-    "is-IS-GudrunNeural",
-    "is-IS-GunnarNeural",
-    "it-IT-ElsaNeural",
-    "it-IT-DiegoNeural",
-    "ja-JP-NanamiNeural",
-    "ja-JP-KeitaNeural",
-    "jv-ID-DianNeural",
-    "jv-ID-GustiNeural",
-    "ka-GE-EkaNeural",
-    # ... (truncated for brevity; include all voices as needed)
-]
-
-MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-
-async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """Convert text to speech using Edge TTS and save as MP3"""
-    communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(output_file)
-    return output_file
-
-def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
-    cleaned = []
-    for msg in chat_history:
-        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
-            cleaned.append(msg)
-    return cleaned
-
-# Environment variables and parameters for Stable Diffusion XL (left in case needed in the future)
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
-
-# Load the SDXL pipeline (not used in the current configuration)
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()
-
-MAX_SEED = np.iinfo(np.int32).max
-
-def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path."""
-    unique_name = str(uuid.uuid4()) + ".png"
-    img.save(unique_name)
-    return unique_name
-
-def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    return seed
-
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a dark red animated bar.
-    """
-    return f'''
-    <div style="display: flex; align-items: center;">
-        <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-        <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
-            <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
-        </div>
-    </div>
-    <style>
-    @keyframes loading {{
-        0% {{ transform: translateX(-100%); }}
-        100% {{ transform: translateX(100%); }}
-    }}
-    </style>
-    '''
-
-def downsample_video(video_path):
-    """
-    Downsamples the video to 10 evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames

-
-
-
-
-    use_negative_prompt: bool = False,
-    seed: int = 1,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
-    randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
-):
-    """(Image generation function is preserved but not called in the current configuration)"""
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
-    options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
-        "width": width,
-        "height": height,
-        "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
-        "output_type": "pil",
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed

 @spaces.GPU
-def
-
-
-
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-    convert_to_speech: bool = False,
-    tts_rate: float = 1.0,
-    tts_voice: str = "en-US-JennyNeural",
-):
-    """
-    Generates chatbot responses with support for multimodal input and TTS conversion.
-    When files (images or videos) are provided, Qwen2VL is used.
-    Otherwise, the FastThink-0.5B text model is used.
-    After generating the response, if convert_to_speech is True the text is passed to the TTS function.
-    """
-    text = input_dict["text"].strip()
-    files = input_dict.get("files", [])

-
-
-
-
-            images = [load_image(image) for image in files]
-        else:
-            images = [load_image(files[0])]
-        messages = [{
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing multimodal input...")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-        final_response = buffer
-    else:
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "top_p": top_p,
-            "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
-            "repetition_penalty": repetition_penalty,
-        }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-        outputs = []
-        yield progress_bar_html("Processing text...")
-        for new_text in streamer:
-            outputs.append(new_text)
-            yield "".join(outputs)
-        final_response = "".join(outputs)

-
-
-
-    # If TTS conversion is enabled, log the message and generate speech.
-    if convert_to_speech:
-        print("Generate Response to Generate Speech")
-        # Here tts_rate can be used to adjust parameters if needed.
-        output_file = asyncio.run(text_to_speech(final_response, tts_voice))
-        yield gr.Audio(output_file, autoplay=True)
-
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.Markdown("# TTS Conversion")
-        tts_rate_slider = gr.Slider(label="TTS Rate", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
-        tts_voice_radio = gr.Radio(choices=TTS_VOICES, label="Choose TTS Voice", value="en-US-JennyNeural")
-        convert_to_speech_checkbox = gr.Checkbox(label="Convert to Speech", value=False)

-
-
-
-
-
-
-
-
-
-
-            tts_rate_slider,
-            tts_voice_radio,
-        ],
-        examples=[
-            ["Write the Python Program for Array Rotation"],
-            [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
-            [{"text": "Describe the Ad", "files": ["examples/coca.mp4"]}],
-            [{"text": "Summarize the event in video", "files": ["examples/sky.mp4"]}],
-            [{"text": "Describe the video", "files": ["examples/Missing.mp4"]}],
-            ["Who is Nikola Tesla, and why did he die?"],
-            [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
-            ["What causes rainbows to form?"],
-        ],
-        cache_examples=False,
-        type="messages",
-        description="# **QwQ Edge: Multimodal (image upload uses Qwen2-VL) with TTS conversion**",
-        fill_height=True,
-        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="Enter text or upload files"),
-        stop_btn="Stop Generation",
-        multimodal=True,
-    )

 if __name__ == "__main__":
-
 import gradio as gr
 import spaces
+from transformers import AutoImageProcessor
+from transformers import SiglipForImageClassification
 from transformers.image_utils import load_image
+from PIL import Image
+import torch

+# Load model and processor
+model_name = "prithivMLmods/Gender-Classifier-Mini"
+model = SiglipForImageClassification.from_pretrained(model_name)
+processor = AutoImageProcessor.from_pretrained(model_name)

 @spaces.GPU
+def gender_classification(image):
+    """Predicts gender category for an image."""
+    image = Image.fromarray(image).convert("RGB")
+    inputs = processor(images=image, return_tensors="pt")

+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+        probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()

+    labels = {"0": "Female ♀", "1": "Male ♂"}
+    predictions = {labels[str(i)]: round(probs[i], 3) for i in range(len(probs))}

+    return predictions
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=gender_classification,
+    inputs=gr.Image(type="numpy"),
+    outputs=gr.Label(label="Prediction Scores"),
+    title="Gender Classification",
+    description="Upload an image to classify its gender."
+)

+# Launch the app
 if __name__ == "__main__":
+    iface.launch()
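
For a quick check of the new classifier outside the Gradio UI, a minimal local sketch along the following lines should work. It reuses the same checkpoint as the new app.py; the image path "face.jpg" is a placeholder, and reading label names from model.config.id2label (rather than the hard-coded Female/Male mapping above) is an assumption about how the checkpoint's config is populated.

# Minimal local sketch of the new classifier (assumes torch and transformers are installed).
# "face.jpg" is a placeholder path; model.config.id2label is assumed to carry the label names.
from PIL import Image
import torch
from transformers import AutoImageProcessor, SiglipForImageClassification

model_name = "prithivMLmods/Gender-Classifier-Mini"
processor = AutoImageProcessor.from_pretrained(model_name)
model = SiglipForImageClassification.from_pretrained(model_name).eval()

image = Image.open("face.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
probs = logits.softmax(dim=-1).squeeze().tolist()

# Map class indices to names, falling back to the raw index if id2label is missing.
id2label = getattr(model.config, "id2label", {}) or {}
print({id2label.get(i, str(i)): round(p, 3) for i, p in enumerate(probs)})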