Spaces:

prithivMLmods
/

Callisto-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 14

Commit

8e6677c

verified ·

1 Parent(s): e553bfe

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -14

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ import cv2
 import matplotlib.pyplot as plt
 import random
 import time
-import spaces
 from PIL import Image
 from threading import Thread
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
@@ -14,7 +13,7 @@ from transformers.image_utils import load_image
 #####################################
 # 1. Load Qwen2.5-VL Model & Processor
 #####################################
-MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"  # or "Qwen/Qwen2.5-VL-7B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -55,7 +54,6 @@ def downsample_video(video_path, num_frames=10):
 #####################################
 # 3. The Inference Function
 #####################################
-@spaces.GPU
 def video_inference(video_file, duration):
     """
     - Takes a recorded video file and a chosen duration (string).
@@ -71,7 +69,6 @@ def video_inference(video_file, duration):
         return "Could not read frames from video.", None
     # 3.2: Construct Qwen2.5-VL prompt
-    # We'll do a simple prompt: "Please describe what's happening in this video."
     messages = [
         {
             "role": "user",
@@ -97,19 +94,16 @@ def video_inference(video_file, duration):
         padding=True
     ).to("cuda")
-    # 3.3: Generate text output
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
-    # We'll run generation in a thread to simulate streaming.
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    # Collect the streamed text
     generated_text = ""
     for new_text in streamer:
         generated_text += new_text
-        # Sleep briefly to yield control
         time.sleep(0.01)
     # 3.4: Dummy bar chart for demonstration
@@ -121,7 +115,6 @@ def video_inference(video_file, duration):
     ax.set_ylabel("Value")
     ax.set_xlabel("Category")
-    # Return text + figure
     return generated_text, fig
 #####################################
@@ -131,8 +124,8 @@ def build_app():
     with gr.Blocks() as demo:
         gr.Markdown("""
         # **Qwen2.5-VL-7B-Instruct Live Video Analysis**
-        Record your webcam for a chosen duration, then click **Stop** to finalize.
-        After that, click **Analyze** to run Qwen2.5-VL and see textual + chart outputs.
         """)
         with gr.Row():
@@ -143,10 +136,10 @@ def build_app():
                     label="Suggested Recording Duration (seconds)",
                     info="Select how long you plan to record before pressing Stop."
                 )
                 video = gr.Video(
-                    source="webcam",
-                    format="mp4",
-                    label="Webcam Recording (press the Record button, then Stop)"
                 )
                 analyze_btn = gr.Button("Analyze", variant="primary")
             with gr.Column():

 import matplotlib.pyplot as plt
 import random
 import time
 from PIL import Image
 from threading import Thread
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 #####################################
 # 1. Load Qwen2.5-VL Model & Processor
 #####################################
+MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"  # or "Qwen/Qwen2.5-VL-3B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 #####################################
 # 3. The Inference Function
 #####################################
 def video_inference(video_file, duration):
     """
     - Takes a recorded video file and a chosen duration (string).
         return "Could not read frames from video.", None
     # 3.2: Construct Qwen2.5-VL prompt
     messages = [
         {
             "role": "user",
         padding=True
     ).to("cuda")
+    # 3.3: Generate text output (streaming)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     generated_text = ""
     for new_text in streamer:
         generated_text += new_text
         time.sleep(0.01)
     # 3.4: Dummy bar chart for demonstration
     ax.set_ylabel("Value")
     ax.set_xlabel("Category")
     return generated_text, fig
 #####################################
     with gr.Blocks() as demo:
         gr.Markdown("""
         # **Qwen2.5-VL-7B-Instruct Live Video Analysis**
+        Record a video (from webcam or file), then click **Stop**.
+        Next, click **Analyze** to run Qwen2.5-VL and see textual + chart outputs.
         """)
         with gr.Row():
                     label="Suggested Recording Duration (seconds)",
                     info="Select how long you plan to record before pressing Stop."
                 )
+                # 'source="webcam"' removed: Gradio 4+ replaced `source` with `sources`, so passing it raises a TypeError
                 video = gr.Video(
+                    label="Webcam Recording (press the Record button, then Stop)",
+                    format="mp4"
                 )
                 analyze_btn = gr.Button("Analyze", variant="primary")
             with gr.Column():