prithivMLmods committed on
Commit e354e80 · verified · 1 Parent(s): 35bb999

Update app.py

Files changed (1)
  1. app.py +492 -285
app.py CHANGED
@@ -1,302 +1,509 @@
- import subprocess
- subprocess.run(
-     'pip install flash-attn==2.7.0.post2 --no-build-isolation',
-     env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
-     shell=True
- )
- subprocess.run(
-     'pip install transformers',
-     shell=True
- )
-
-
- import spaces
  import os
- import re
- import logging
- from typing import List
  from threading import Thread
- import base64

- import torch
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

- # ----------------------------------------------------------------------
- # 1. Setup Model & Tokenizer
- # ----------------------------------------------------------------------
- model_name = 'prithivMLmods/Raptor-X6' # Change as needed
- use_thread = True # Generation happens in a background thread

  model = AutoModelForCausalLM.from_pretrained(
-     model_name,
      torch_dtype=torch.bfloat16,
-     trust_remote_code=True
- ).to("cuda")
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
- logging.getLogger("httpx").setLevel(logging.WARNING)
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # ----------------------------------------------------------------------
- # 2. Two-Phase Prompt Templates
- # ----------------------------------------------------------------------
- s1_inference_prompt_think_only = """<|im_start|>user
- {question}<|im_end|>
- <|im_start|>assistant
- <|im_start|>think
- """
-
- # ----------------------------------------------------------------------
- # 3. Generation Parameter Setup
- # ----------------------------------------------------------------------
- THINK_MAX_NEW_TOKENS = 12000
- ANSWER_MAX_NEW_TOKENS = 12000
-
- def initialize_gen_kwargs():
-     return {
-         "max_new_tokens": 1024, # default; will be overwritten per phase
-         "do_sample": True,
-         "temperature": 0.7,
-         "top_p": 0.9,
-         "repetition_penalty": 1.05,
-         # "eos_token_id": model.generation_config.eos_token_id, # Removed to avoid premature stopping
-         "pad_token_id": tokenizer.pad_token_id,
-         "use_cache": True,
-         "streamer": None # dynamically added
-     }
-
- # ----------------------------------------------------------------------
- # 4. Helper to submit chat
- # ----------------------------------------------------------------------
- def submit_chat(chatbot, text_input):
-     if not text_input.strip():
-         return chatbot, ""
-     response = ""
-     chatbot.append((text_input, response))
-     return chatbot, ""
-
- # ----------------------------------------------------------------------
- # 5. Artifacts Handling
- # We parse code from the final answer and display it in an iframe
- # ----------------------------------------------------------------------
- def extract_html_code_block(text: str) -> str:
      """
-     Look for a ```html ... ``` block in the text.
-     If found, return only that block content.
-     Otherwise, return the entire text.
      """
-     pattern = r'```html\s*(.*?)\s*```'
-     match = re.search(pattern, text, re.DOTALL)
-     if match:
-         return match.group(1).strip()
-     else:
-         return text.strip()
-
- def send_to_sandbox(html_code: str) -> str:
      """
-     Convert the code to a data URI iframe so it can be rendered
-     inside Gradio HTML component.
      """
-     encoded_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
-     data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
-     return f'<iframe src="{data_uri}" width="100%" height="920px"></iframe>'
-
- # ----------------------------------------------------------------------
- # 6. The Two-Phase Streaming Inference
- # - Phase 1: "think" (chain-of-thought)
- # - Phase 2: "answer"
- # ----------------------------------------------------------------------
- @spaces.GPU
- def ovis_chat(chatbot: List[List[str]]):
-     # Phase 1: chain-of-thought
-     last_query = chatbot[-1][0]
-     formatted_think_prompt = s1_inference_prompt_think_only.format(question=last_query)
-     input_ids_think = tokenizer.encode(formatted_think_prompt, return_tensors="pt").to(model.device)
-     attention_mask_think = torch.ne(input_ids_think, tokenizer.pad_token_id).to(model.device)
-
-     think_inputs = {
-         "input_ids": input_ids_think,
-         "attention_mask": attention_mask_think
-     }
-     gen_kwargs_think = initialize_gen_kwargs()
-     gen_kwargs_think["max_new_tokens"] = THINK_MAX_NEW_TOKENS
-     think_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-     gen_kwargs_think["streamer"] = think_streamer
-
-     full_think = ""
-     with torch.inference_mode():
-         thread_think = Thread(target=lambda: model.generate(**think_inputs, **gen_kwargs_think))
-         thread_think.start()
-         for new_text in think_streamer:
-             full_think += new_text
-             display_text = f"<|im_start|>think\n{full_think.strip()}"
-             chatbot[-1][1] = display_text
-             yield chatbot, "" # second return is artifact placeholder
-     thread_think.join()
-
-     # Phase 2: answer
-     new_prompt = formatted_think_prompt + full_think.strip() + "\n<|im_start|>answer\n"
-     input_ids_answer = tokenizer.encode(new_prompt, return_tensors="pt").to(model.device)
-     attention_mask_answer = torch.ne(input_ids_answer, tokenizer.pad_token_id).to(model.device)
-
-     answer_inputs = {
-         "input_ids": input_ids_answer,
-         "attention_mask": attention_mask_answer
      }
-     gen_kwargs_answer = initialize_gen_kwargs()
-     gen_kwargs_answer["max_new_tokens"] = ANSWER_MAX_NEW_TOKENS
-     answer_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-     gen_kwargs_answer["streamer"] = answer_streamer
-
-     full_answer = ""
-     with torch.inference_mode():
-         thread_answer = Thread(target=lambda: model.generate(**answer_inputs, **gen_kwargs_answer))
-         thread_answer.start()
-         for new_text in answer_streamer:
-             full_answer += new_text
-             display_text = (
-                 f"<|im_start|>think\n{full_think.strip()}\n\n"
-                 f"<|im_start|>answer\n{full_answer.strip()}"
-             )
-             chatbot[-1][1] = display_text
-             yield chatbot, ""
-     thread_answer.join()
-
-     log_conversation(chatbot)
-
-     # Once final answer is complete, parse out HTML code block and
-     # return it as an artifact (iframe).
-     html_code = extract_html_code_block(full_answer)
-     sandbox_iframe = send_to_sandbox(html_code)
-     yield chatbot, sandbox_iframe
-
- # ----------------------------------------------------------------------
- # 7. Logging and Clearing
- # ----------------------------------------------------------------------
- def log_conversation(chatbot: List[List[str]]):
-     logger.info("[CONVERSATION]")
-     for i, (query, response) in enumerate(chatbot, 1):
-         logger.info(f"Q{i}: {query}\nA{i}: {response}")
-
- def clear_chat():
-     return [], "", ""
-
- # ----------------------------------------------------------------------
- # 8. Gradio UI Setup
- # ----------------------------------------------------------------------
- css_code = """
- .left_header {
-     display: flex;
-     flex-direction: column;
-     justify-content: center;
-     align-items: center;
- }
-
- .right_panel {
-     margin-top: 16px;
-     border: 1px solid #BFBFC4;
-     border-radius: 8px;
-     overflow: hidden;
- }
-
- .render_header {
-     height: 30px;
-     width: 100%;
-     padding: 5px 16px;
-     background-color: #f5f5f5;
- }
-
- .header_btn {
-     display: inline-block;
-     height: 10px;
-     width: 10px;
-     border-radius: 50%;
-     margin-right: 4px;
- }
-
- .render_header > .header_btn:nth-child(1) {
-     background-color: #f5222d;
- }
-
- .render_header > .header_btn:nth-child(2) {
-     background-color: #faad14;
- }
- .render_header > .header_btn:nth-child(3) {
-     background-color: #52c41a;
- }
-
- .right_content {
-     height: 920px;
-     display: flex;
-     flex-direction: column;
-     justify-content: center;
-     align-items: center;
- }

- .html_content {
-     width: 100%;
-     height: 920px;
- }
- """
-
- svg_content = """
- <svg width="40" height="40" viewBox="0 0 45 45" fill="none" xmlns="http://www.w3.org/2000/svg">
-     <circle cx="22.5" cy="22.5" r="22.5" fill="#5572F9"/>
-     <path d="M22.5 11.25L26.25 16.875H18.75L22.5 11.25Z" fill="white"/>
-     <path d="M22.5 33.75L26.25 28.125H18.75L22.5 33.75Z" fill="white"/>
-     <path d="M28.125 22.5L22.5 28.125L16.875 22.5L22.5 16.875L28.125 22.5Z" fill="white"/>
- </svg>
- """
-
- with gr.Blocks(title=model_name.split('/')[-1], css=css_code) as demo:
-     gr.HTML(f"""
-     <div class="left_header" style="margin-bottom: 20px;">
-         {svg_content}
-         <h1>{model_name.split('/')[-1]} - Chat + Artifacts</h1>
-         <p>(Two-phase chain-of-thought with artifact extraction)</p>
-     </div>
-     """)

-     with gr.Row():
-         with gr.Column(scale=4):
-             chatbot = gr.Chatbot(
-                 label="Chat",
-                 height=520,
-                 show_copy_button=True
-             )
-             with gr.Row():
-                 text_input = gr.Textbox(
-                     label="Prompt",
-                     placeholder="Enter your query...",
-                     lines=1
-                 )
-             with gr.Row():
-                 submit_btn = gr.Button("Send", variant="primary")
-                 clear_btn = gr.Button("Clear", variant="secondary")
-         with gr.Column(scale=6):
-             gr.HTML('<div class="render_header"><span class="header_btn"></span><span class="header_btn"></span><span class="header_btn"></span></div>')
-             artifact_html = gr.HTML(
-                 value="",
-                 elem_classes="html_content"
-             )
-
-     submit_btn.click(
-         submit_chat, [chatbot, text_input], [chatbot, text_input]
-     ).then(
-         ovis_chat, [chatbot], [chatbot, artifact_html]
-     )
-
-     text_input.submit(
-         submit_chat, [chatbot, text_input], [chatbot, text_input]
-     ).then(
-         ovis_chat, [chatbot], [chatbot, artifact_html]
-     )
-
-     clear_btn.click(
-         clear_chat,
-         outputs=[chatbot, text_input, artifact_html]
      )

- demo.queue(default_concurrency_limit=1).launch(server_name="0.0.0.0", share=True)
  import os
+ import random
+ import uuid
+ import json
+ import time
+ import asyncio
  from threading import Thread

  import gradio as gr
+ import spaces
+ import torch
+ import numpy as np
+ from PIL import Image
+ import edge_tts
+ import cv2
+
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     Qwen2VLForConditionalGeneration,
+     AutoProcessor,
+ )
+ from transformers.image_utils import load_image
+ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler

+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # Load text-only model and tokenizer
+ model_id = "prithivMLmods/FastThink-0.5B-Tiny"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
  model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
      torch_dtype=torch.bfloat16,
+ )
+ model.eval()
+
+ # Updated TTS voices list (all voices)
+ TTS_VOICES = [
+     "af-ZA-AdriNeural",
+     "af-ZA-WillemNeural",
+     "am-ET-AmehaNeural",
+     "am-ET-MekdesNeural",
+     "ar-AE-FatimaNeural",
+     "ar-AE-HamdanNeural",
+     "ar-BH-LailaNeural",
+     "ar-BH-MajedNeural",
+     "ar-DZ-AminaNeural",
+     "ar-DZ-IsmaelNeural",
+     "ar-EG-SalmaNeural",
+     "ar-EG-OmarNeural",
+     "ar-IQ-LanaNeural",
+     "ar-IQ-BassamNeural",
+     "ar-JO-SanaNeural",
+     "ar-JO-TaimNeural",
+     "ar-KW-NouraNeural",
+     "ar-KW-FahedNeural",
+     "ar-LB-LaylaNeural",
+     "ar-LB-RamiNeural",
+     "ar-LY-ImanNeural",
+     "ar-LY-OmarNeural",
+     "ar-MA-MounaNeural",
+     "ar-MA-JamalNeural",
+     "ar-OM-AyshaNeural",
+     "ar-OM-AbdullahNeural",
+     "ar-QA-AmalNeural",
+     "ar-QA-MoazNeural",
+     "ar-SA-ZariyahNeural",
+     "ar-SA-HamedNeural",
+     "ar-SY-AmanyNeural",
+     "ar-SY-LaithNeural",
+     "ar-TN-ReemNeural",
+     "ar-TN-SeifNeural",
+     "ar-YE-MaryamNeural",
+     "ar-YE-SalehNeural",
+     "az-AZ-BabekNeural",
+     "az-AZ-BanuNeural",
+     "bg-BG-BorislavNeural",
+     "bg-BG-KalinaNeural",
+     "bn-BD-NabanitaNeural",
+     "bn-BD-PradeepNeural",
+     "bn-IN-TanishaNeural",
+     "bn-IN-SwapanNeural",
+     "bs-BA-GoranNeural",
+     "bs-BA-VesnaNeural",
+     "ca-ES-JoanaNeural",
+     "ca-ES-AlbaNeural",
+     "ca-ES-EnricNeural",
+     "cs-CZ-AntoninNeural",
+     "cs-CZ-VlastaNeural",
+     "cy-GB-NiaNeural",
+     "cy-GB-AledNeural",
+     "da-DK-ChristelNeural",
+     "da-DK-JeppeNeural",
+     "de-AT-IngridNeural",
+     "de-AT-JonasNeural",
+     "de-CH-LeniNeural",
+     "de-CH-JanNeural",
+     "de-DE-KatjaNeural",
+     "de-DE-ConradNeural",
+     "el-GR-AthinaNeural",
+     "el-GR-NestorasNeural",
+     "en-AU-AnnetteNeural",
+     "en-AU-MichaelNeural",
+     "en-CA-ClaraNeural",
+     "en-CA-LiamNeural",
+     "en-GB-SoniaNeural",
+     "en-GB-RyanNeural",
+     "en-GH-EsiNeural",
+     "en-GH-KwameNeural",
+     "en-HK-YanNeural",
+     "en-HK-TrevorNeural",
+     "en-IE-EmilyNeural",
+     "en-IE-ConnorNeural",
+     "en-IN-NeerjaNeural",
+     "en-IN-PrabhasNeural",
+     "en-KE-ChantelleNeural",
+     "en-KE-ChilembaNeural",
+     "en-NG-EzinneNeural",
+     "en-NG-AbechiNeural",
+     "en-NZ-MollyNeural",
+     "en-NZ-MitchellNeural",
+     "en-PH-RosaNeural",
+     "en-PH-JamesNeural",
+     "en-SG-LunaNeural",
+     "en-SG-WayneNeural",
+     "en-TZ-ImaniNeural",
+     "en-TZ-DaudiNeural",
+     "en-US-JennyNeural",
+     "en-US-GuyNeural",
+     "en-ZA-LeahNeural",
+     "en-ZA-LukeNeural",
+     "es-AR-ElenaNeural",
+     "es-AR-TomasNeural",
+     "es-BO-SofiaNeural",
+     "es-BO-MarceloNeural",
+     "es-CL-CatalinaNeural",
+     "es-CL-LorenzoNeural",
+     "es-CO-SalomeNeural",
+     "es-CO-GonzaloNeural",
+     "es-CR-MariaNeural",
+     "es-CR-JuanNeural",
+     "es-CU-BelkysNeural",
+     "es-CU-ManuelNeural",
+     "es-DO-RamonaNeural",
+     "es-DO-EmilioNeural",
+     "es-EC-AndreaNeural",
+     "es-EC-LuisNeural",
+     "es-ES-ElviraNeural",
+     "es-ES-AlvaroNeural",
+     "es-GQ-TeresaNeural",
+     "es-GQ-JavierNeural",
+     "es-GT-MartaNeural",
+     "es-GT-AndresNeural",
+     "es-HN-KarlaNeural",
+     "es-HN-CarlosNeural",
+     "es-MX-DaliaNeural",
+     "es-MX-JorgeNeural",
+     "es-NI-YolandaNeural",
+     "es-NI-FedericoNeural",
+     "es-PA-MargaritaNeural",
+     "es-PA-RobertoNeural",
+     "es-PE-CamilaNeural",
+     "es-PE-AlexNeural",
+     "es-PR-KarinaNeural",
+     "es-PR-VictorNeural",
+     "es-PY-TaniaNeural",
+     "es-PY-MarioNeural",
+     "es-SV-LorenaNeural",
+     "es-SV-RodrigoNeural",
+     "es-US-SaraNeural",
+     "es-US-AlonsoNeural",
+     "es-UY-ValentinaNeural",
+     "es-UY-MateoNeural",
+     "es-VE-PaolaNeural",
+     "es-VE-SebastianNeural",
+     "et-EE-AnuNeural",
+     "et-EE-KertNeural",
+     "eu-ES-AinhoaNeural",
+     "eu-ES-AnderNeural",
+     "fa-IR-DilaraNeural",
+     "fa-IR-FaridNeural",
+     "fi-FI-NooraNeural",
+     "fi-FI-HarriNeural",
+     "fil-PH-BlessicaNeural",
+     "fil-PH-AngeloNeural",
+     "fr-BE-CharlineNeural",
+     "fr-BE-GerardNeural",
+     "fr-CA-SylvieNeural",
+     "fr-CA-AntoineNeural",
+     "fr-CH-ArianeNeural",
+     "fr-CH-GuillaumeNeural",
+     "fr-FR-DeniseNeural",
+     "fr-FR-HenriNeural",
+     "ga-IE-OrlaNeural",
+     "ga-IE-ColmNeural",
+     "gl-ES-SoniaNeural",
+     "gl-ES-XiaoqiangNeural",
+     "gu-IN-DhwaniNeural",
+     "gu-IN-NiranjanNeural",
+     "ha-NG-AishaNeural",
+     "ha-NG-YusufNeural",
+     "he-IL-HilaNeural",
+     "he-IL-AvriNeural",
+     "hi-IN-SwaraNeural",
+     "hi-IN-MadhurNeural",
+     "hr-HR-GabrijelaNeural",
+     "hr-HR-SreckoNeural",
+     "hu-HU-NoemiNeural",
+     "hu-HU-TamasNeural",
+     "hy-AM-AnushNeural",
+     "hy-AM-HaykNeural",
+     "id-ID-ArdiNeural",
+     "id-ID-GadisNeural",
+     "ig-NG-AdaNeural",
+     "ig-NG-EzeNeural",
+     "is-IS-GudrunNeural",
+     "is-IS-GunnarNeural",
+     "it-IT-ElsaNeural",
+     "it-IT-DiegoNeural",
+     "ja-JP-NanamiNeural",
+     "ja-JP-KeitaNeural",
+     "jv-ID-DianNeural",
+     "jv-ID-GustiNeural",
+     "ka-GE-EkaNeural",
+     # ... (truncated for brevity; include all voices as needed)
+ ]
+
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to("cuda").eval()
+
+ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+     """Convert text to speech using Edge TTS and save as MP3"""
+     communicate = edge_tts.Communicate(text, voice)
+     await communicate.save(output_file)
+     return output_file
+
+ def clean_chat_history(chat_history):
      """
+     Filter out any chat entries whose "content" is not a string.
+     This helps prevent errors when concatenating previous messages.
      """
+     cleaned = []
+     for msg in chat_history:
+         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
+             cleaned.append(msg)
+     return cleaned
+
+ # Environment variables and parameters for Stable Diffusion XL (left in case needed in the future)
+ MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # SDXL Model repository path via env variable
+ MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
+ USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
+ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+ BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1")) # For batched image generation
+
+ # Load the SDXL pipeline (not used in the current configuration)
+ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
+     MODEL_ID_SD,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     use_safetensors=True,
+     add_watermarker=False,
+ ).to(device)
+ sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+ if torch.cuda.is_available():
+     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
+ if USE_TORCH_COMPILE:
+     sd_pipe.compile()
+ if ENABLE_CPU_OFFLOAD:
+     sd_pipe.enable_model_cpu_offload()
+
+ MAX_SEED = np.iinfo(np.int32).max
+
+ def save_image(img: Image.Image) -> str:
+     """Save a PIL image with a unique filename and return the path."""
+     unique_name = str(uuid.uuid4()) + ".png"
+     img.save(unique_name)
+     return unique_name
+
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     return seed
+
+ def progress_bar_html(label: str) -> str:
      """
+     Returns an HTML snippet for a thin progress bar with a label.
+     The progress bar is styled as a dark red animated bar.
      """
+     return f'''
+ <div style="display: flex; align-items: center;">
+     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+     <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
+         <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
+     </div>
+ </div>
+ <style>
+ @keyframes loading {{
+     0% {{ transform: translateX(-100%); }}
+     100% {{ transform: translateX(100%); }}
+ }}
+ </style>
+ '''
+
+ def downsample_video(video_path):
+     """
+     Downsamples the video to 10 evenly spaced frames.
+     Each frame is returned as a PIL image along with its timestamp.
+     """
+     vidcap = cv2.VideoCapture(video_path)
+     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
+     frames = []
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     for i in frame_indices:
+         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         success, image = vidcap.read()
+         if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
+     vidcap.release()
+     return frames
+
+ @spaces.GPU(duration=60, enable_queue=True)
+ def generate_image_fn(
+     prompt: str,
+     negative_prompt: str = "",
+     use_negative_prompt: bool = False,
+     seed: int = 1,
+     width: int = 1024,
+     height: int = 1024,
+     guidance_scale: float = 3,
+     num_inference_steps: int = 25,
+     randomize_seed: bool = False,
+     use_resolution_binning: bool = True,
+     num_images: int = 1,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     """(Image generation function is preserved but not called in the current configuration)"""
+     seed = int(randomize_seed_fn(seed, randomize_seed))
+     generator = torch.Generator(device=device).manual_seed(seed)
+     options = {
+         "prompt": [prompt] * num_images,
+         "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
+         "width": width,
+         "height": height,
+         "guidance_scale": guidance_scale,
+         "num_inference_steps": num_inference_steps,
+         "generator": generator,
+         "output_type": "pil",
      }
+     if use_resolution_binning:
+         options["use_resolution_binning"] = True
+     images = []
+     for i in range(0, num_images, BATCH_SIZE):
+         batch_options = options.copy()
+         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
+         if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
+             batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
+         if device.type == "cuda":
+             with torch.autocast("cuda", dtype=torch.float16):
+                 outputs = sd_pipe(**batch_options)
+         else:
+             outputs = sd_pipe(**batch_options)
+         images.extend(outputs.images)
+     image_paths = [save_image(img) for img in images]
+     return image_paths, seed

+ @spaces.GPU
+ def generate(
+     input_dict: dict,
+     chat_history: list[dict],
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+     convert_to_speech: bool = False,
+     tts_rate: float = 1.0,
+     tts_voice: str = "en-US-JennyNeural",
+ ):
+     """
+     Generates chatbot responses with support for multimodal input and TTS conversion.
+     When files (images or videos) are provided, Qwen2VL is used.
+     Otherwise, the FastThink-0.5B text model is used.
+     After generating the response, if convert_to_speech is True the text is passed to the TTS function.
+     """
+     text = input_dict["text"].strip()
+     files = input_dict.get("files", [])

+     # Determine which branch to use: multimodal (if files provided) or text-only.
+     if files:
+         # Process uploaded files as images (or videos)
+         if len(files) > 1:
+             images = [load_image(image) for image in files]
+         else:
+             images = [load_image(files[0])]
+         messages = [{
+             "role": "user",
+             "content": [
+                 *[{"type": "image", "image": image} for image in images],
+                 {"type": "text", "text": text},
+             ]
+         }]
+         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
+         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+         thread.start()
+         buffer = ""
+         yield progress_bar_html("Processing multimodal input...")
+         for new_text in streamer:
+             buffer += new_text
+             buffer = buffer.replace("<|im_end|>", "")
+             time.sleep(0.01)
+             yield buffer
+         final_response = buffer
+     else:
+         conversation = clean_chat_history(chat_history)
+         conversation.append({"role": "user", "content": text})
+         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+         input_ids = input_ids.to(model.device)
+         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {
+             "input_ids": input_ids,
+             "streamer": streamer,
+             "max_new_tokens": max_new_tokens,
+             "do_sample": True,
+             "top_p": top_p,
+             "top_k": top_k,
+             "temperature": temperature,
+             "num_beams": 1,
+             "repetition_penalty": repetition_penalty,
+         }
+         t = Thread(target=model.generate, kwargs=generation_kwargs)
+         t.start()
+         outputs = []
+         yield progress_bar_html("Processing text...")
+         for new_text in streamer:
+             outputs.append(new_text)
+             yield "".join(outputs)
+         final_response = "".join(outputs)
+
+     # Yield the final text response.
+     yield final_response
+
+     # If TTS conversion is enabled, log the message and generate speech.
+     if convert_to_speech:
+         print("Generate Response to Generate Speech")
+         # Here tts_rate can be used to adjust parameters if needed.
+         output_file = asyncio.run(text_to_speech(final_response, tts_voice))
+         yield gr.Audio(output_file, autoplay=True)
+
+ with gr.Blocks() as demo:
+     with gr.Sidebar():
+         gr.Markdown("# TTS Conversion")
+         tts_rate_slider = gr.Slider(label="TTS Rate", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
+         tts_voice_radio = gr.Radio(choices=TTS_VOICES, label="Choose TTS Voice", value="en-US-JennyNeural")
+         convert_to_speech_checkbox = gr.Checkbox(label="Convert to Speech", value=False)
+
+     chat_interface = gr.ChatInterface(
+         fn=generate,
+         additional_inputs=[
+             gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
+             gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+             gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+             gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
+             gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
+             # Pass TTS parameters to the generate function.
+             convert_to_speech_checkbox,
+             tts_rate_slider,
+             tts_voice_radio,
+         ],
+         examples=[
+             ["Write the Python Program for Array Rotation"],
+             [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
+             [{"text": "Describe the Ad", "files": ["examples/coca.mp4"]}],
+             [{"text": "Summarize the event in video", "files": ["examples/sky.mp4"]}],
+             [{"text": "Describe the video", "files": ["examples/Missing.mp4"]}],
+             ["Who is Nikola Tesla, and why did he die?"],
+             [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
+             ["What causes rainbows to form?"],
+         ],
+         cache_examples=False,
+         type="messages",
+         description="# **QwQ Edge: Multimodal (image upload uses Qwen2-VL) with TTS conversion**",
+         fill_height=True,
+         textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="Enter text or upload files"),
+         stop_btn="Stop Generation",
+         multimodal=True,
      )

+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch(share=True)