Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 14

Commit

8716c2f

verified ·

1 Parent(s): c0da9a5

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -134

app.py CHANGED Viewed

@@ -1,138 +1,164 @@
-import os
-import time
-import spaces
-import json
-import subprocess
-from llama_cpp import Llama
-from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
-from llama_cpp_agent.providers import LlamaCppPythonProvider
-from llama_cpp_agent.chat_history import BasicChatHistory
-from llama_cpp_agent.chat_history.messages import Roles
 import gradio as gr
-from huggingface_hub import hf_hub_download
-# Define model details
-MODEL_REPO = "prithivMLmods/Sombrero-QwQ-32B-Elite10-Fixed-Q2_K-GGUF"
-MODEL_FILENAME = "sombrero-qwq-32b-elite10-fixed-q2_k.gguf"
-MODEL_DIR = "./models"
-MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILENAME)
-# Ensure the model directory exists
-os.makedirs(MODEL_DIR, exist_ok=True)
-# Download the model if not already present
-if not os.path.exists(MODEL_PATH):
-    print("Downloading the model... This may take some time.")
-    try:
-        hf_hub_download(
-            repo_id=MODEL_REPO,
-            filename=MODEL_FILENAME,
-            local_dir=MODEL_DIR
         )
-        print("Model downloaded successfully!")
-    except Exception as e:
-        print(f"Error downloading model: {e}")
-        exit(1)
-# Ensure model is fully downloaded before using
-while not os.path.exists(MODEL_PATH):
-    print("Waiting for model to be available...")
-    time.sleep(5)
-# Function to handle AI responses
-@spaces.GPU
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    model,
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    top_k,
-    repeat_penalty,
-):
-    chat_template = MessagesFormatterType.GEMMA_2
-    llm = Llama(
-        model_path=MODEL_PATH,
-        flash_attn=True,
-        n_gpu_layers=81,
-        n_batch=1024,
-        n_ctx=8192,
-    )
-    provider = LlamaCppPythonProvider(llm)
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        predefined_messages_formatter_type=chat_template,
-        debug_output=True
-    )
-    settings = provider.get_provider_default_settings()
-    settings.temperature = temperature
-    settings.top_k = top_k
-    settings.top_p = top_p
-    settings.max_tokens = max_tokens
-    settings.repeat_penalty = repeat_penalty
-    settings.stream = True
-    messages = BasicChatHistory()
-    for msn in history:
-        user = {'role': Roles.user, 'content': msn[0]}
-        assistant = {'role': Roles.assistant, 'content': msn[1]}
-        messages.add_message(user)
-        messages.add_message(assistant)
-    stream = agent.get_chat_response(
-        message,
-        llm_sampling_settings=settings,
-        chat_history=messages,
-        returns_streaming_generator=True,
-        print_output=False
-    )
-    outputs = ""
-    for output in stream:
-        outputs += output
-        yield outputs
-# Function to create Gradio interface
-def create_interface(model_name, description):
-    return gr.ChatInterface(
-        respond,
-        additional_inputs=[
-            gr.Textbox(value=model_name, label="Model", interactive=False),
-            gr.Textbox(
-                value="You are a helpful assistant. You should think step-by-step. You should answer all questions by prioritizing the principle of openness",
-                label="System message"
-            ),
-            gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
-            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-            gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
-            gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
-            gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty"),
-        ],
-        retry_btn="Retry",
-        undo_btn="Undo",
-        clear_btn="Clear",
-        submit_btn="Send",
-        title=f"{model_name}",
-        description=description,
-        chatbot=gr.Chatbot(scale=1, likeable=False, show_copy_button=True)
-    )
-# Set interface description
-description = """<p align="center">Viper-Coder-32B-Elite13-GGUF</p>"""
-interface = create_interface(MODEL_REPO, description)
-# Create Gradio Blocks app
-demo = gr.Blocks()
-with demo:
-    interface.render()
 if __name__ == "__main__":
-    demo.launch(share=True)

 import gradio as gr
+import torch
+import numpy as np
+import cv2
+import matplotlib.pyplot as plt
+import random
+import time
+from PIL import Image
+from threading import Thread
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
+from transformers.image_utils import load_image
+#####################################
+# 1. Load Qwen2.5-VL Model & Processor
+#####################################
+# Checkpoint to load; "Qwen/Qwen2.5-VL-3B-Instruct" is an alternative.
+MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+# Load the weights in bfloat16, move them to the GPU and switch to eval mode.
+# `eval()` returns the module itself, so the whole setup chains into one expression.
+model = (
+    Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+    )
+    .to("cuda")
+    .eval()
+)
+#####################################
+# 2. Helper Function: Downsample Video
+#####################################
+def downsample_video(video_path, num_frames=10):
+    """
+    Sample up to `num_frames` evenly spaced frames from a video file.
+
+    Args:
+        video_path: Path of the video file handed to cv2.VideoCapture.
+        num_frames: Maximum number of frames to sample (default 10).
+
+    Returns:
+        A list of (PIL image, timestamp) tuples in playback order. The
+        timestamp is the frame index divided by the reported FPS, rounded
+        to two decimals. The list is empty when `num_frames` is not
+        positive or when the video reports no frames or no FPS; frames
+        that fail to decode are skipped.
+    """
+    frames = []
+    if num_frames <= 0:
+        return frames
+    vidcap = cv2.VideoCapture(video_path)
+    try:
+        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = vidcap.get(cv2.CAP_PROP_FPS)
+        if total_frames <= 0 or fps <= 0:
+            return frames
+        # Never request more samples than the clip has frames: with
+        # dtype=int, linspace would repeat indices and return duplicates.
+        sample_count = min(num_frames, total_frames)
+        frame_indices = np.linspace(0, total_frames - 1, sample_count, dtype=int)
+        for i in frame_indices:
+            # Convert the NumPy scalar to a plain int before handing it to OpenCV.
+            frame_index = int(i)
+            vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
+            success, image = vidcap.read()
+            if success:
+                # OpenCV decodes to BGR; PIL expects RGB.
+                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                pil_image = Image.fromarray(image)
+                timestamp = round(frame_index / fps, 2)
+                frames.append((pil_image, timestamp))
+    finally:
+        # Release the capture handle even if seeking or decoding raises.
+        vidcap.release()
+    return frames
+#####################################
+# 3. The Inference Function
+#####################################
+def video_inference(video_file, duration):
+    """
+    Describe a recorded video with Qwen2.5-VL and build a placeholder chart.
+
+    Args:
+        video_file: Path of the recorded video, or None when nothing was
+            recorded.
+        duration: Recording duration chosen in the UI (string). It is
+            accepted so the signature matches the UI inputs, but it is not
+            used by the inference.
+
+    Returns:
+        A (text, figure) tuple: the generated description and a bar chart
+        of random demo values. The figure is None when no video was given
+        or no frame could be read.
+    """
+    if video_file is None:
+        return "No video provided.", None
+    # 3.1: Downsample the recorded video
+    frames = downsample_video(video_file)
+    if not frames:
+        return "Could not read frames from video.", None
+    # 3.2: Construct Qwen2.5-VL prompt: the instruction first, then every
+    # frame preceded by a caption carrying its timestamp.
+    content = [{"type": "text", "text": "Please describe what's happening in this video."}]
+    for image, ts in frames:
+        content.append({"type": "text", "text": f"Frame at {ts} seconds:"})
+        content.append({"type": "image", "image": image})
+    messages = [{"role": "user", "content": content}]
+    # Prepare final prompt for the model
+    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # The images are passed separately, in the same order as in the messages.
+    frame_images = [img for img, _ in frames]
+    inputs = processor(
+        text=[prompt],
+        images=frame_images,
+        return_tensors="pt",
+        padding=True
+    ).to("cuda")
+    # 3.3: Generate text output
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
+    # generate() blocks until it finishes, so it runs in a worker thread
+    # while this thread drains the streamer.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Iterating the streamer already waits for each chunk; no sleep is needed.
+    generated_text = "".join(streamer)
+    # Wait for the worker so it is not left running after this call returns.
+    thread.join()
+    # 3.4: Dummy bar chart for demonstration.
+    # The figure is built without pyplot so it is not registered in pyplot's
+    # global figure list, which would keep every request's figure alive.
+    from matplotlib.figure import Figure
+    fig = Figure()
+    ax = fig.subplots()
+    categories = ["Category A", "Category B", "Category C"]
+    values = [random.randint(1, 10) for _ in categories]
+    ax.bar(categories, values, color=["#4B0082", "#9370DB", "#4B0082"])
+    ax.set_title("Example Analytics Chart")
+    ax.set_ylabel("Value")
+    ax.set_xlabel("Category")
+    # Return text + figure
+    return generated_text, fig
+#####################################
+# 4. Build a Professional Gradio UI
+#####################################
+def build_app():
+    """
+    Build the Gradio Blocks UI for the video analysis demo.
+
+    The left column holds the duration selector, the webcam recorder and
+    the Analyze button; the right column shows the model text and the
+    chart. Clicking Analyze calls `video_inference` with the recorded
+    video and the selected duration.
+
+    Returns:
+        The assembled `gr.Blocks` app (not yet launched).
+    """
+    with gr.Blocks() as demo:
+        gr.Markdown("""
+        # **Qwen2.5-VL-7B-Instruct Live Video Analysis**
+        Record your webcam for a chosen duration, then click **Stop** to finalize.
+        After that, click **Analyze** to run Qwen2.5-VL and see textual + chart outputs.
+        """)
+        with gr.Row():
+            with gr.Column():
+                duration = gr.Radio(
+                    choices=["5", "10", "20", "30"],
+                    value="5",
+                    label="Suggested Recording Duration (seconds)",
+                    info="Select how long you plan to record before pressing Stop."
+                )
+                # Gradio 4+ takes a list via `sources`; the singular `source`
+                # keyword from Gradio 3 is rejected there.
+                video = gr.Video(
+                    sources=["webcam"],
+                    format="mp4",
+                    label="Webcam Recording (press the Record button, then Stop)"
+                )
+                analyze_btn = gr.Button("Analyze", variant="primary")
+            with gr.Column():
+                output_text = gr.Textbox(label="Model Output")
+                output_plot = gr.Plot(label="Analytics Chart")
+        analyze_btn.click(
+            fn=video_inference,
+            inputs=[video, duration],
+            outputs=[output_text, output_plot]
         )
+    return demo
 if __name__ == "__main__":
+    # Build the UI and serve it with debug output enabled.
+    build_app().launch(debug=True)