Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 14

Commit

554ae5a

verified ·

1 Parent(s): 20121ea

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -19

app.py CHANGED Viewed

@@ -12,9 +12,9 @@ from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIter
 from transformers.image_utils import load_image
 #####################################
-# 1. Load Qwen2.5-VL Model & Processor
 #####################################
-MODEL_ID = "google/gemma-3-12b-it"  # or "Qwen/Qwen2.5-VL-3B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Gemma3ForConditionalGeneration.from_pretrained(
@@ -27,7 +27,6 @@ model.eval()
 #####################################
 # 2. Helper Function: Downsample Video
 #####################################
 def downsample_video(video_path, num_frames=10):
     """
     Downsamples the video file to `num_frames` evenly spaced frames.
@@ -53,6 +52,29 @@ def downsample_video(video_path, num_frames=10):
     vidcap.release()
     return frames
 #####################################
 # 3. The Inference Function
 #####################################
@@ -60,8 +82,8 @@ def downsample_video(video_path, num_frames=10):
 def video_inference(video_file, duration):
     """
     - Takes a recorded video file and a chosen duration (string).
-    - Downsamples the video, passes frames to Qwen2.5-VL for inference.
-    - Returns model-generated text + a dummy bar chart as example analytics.
     """
     if video_file is None:
         return "No video provided.", None
@@ -71,23 +93,22 @@ def video_inference(video_file, duration):
     if not frames:
         return "Could not read frames from video.", None
-    # 3.2: Construct Qwen2.5-VL prompt
     messages = [
         {
             "role": "user",
             "content": [{"type": "text", "text": "Please describe what's happening in this video."}]
         }
     ]
     # Add frames (with timestamp) to the messages
     for (image, ts) in frames:
         messages[0]["content"].append({"type": "text", "text": f"Frame at {ts} seconds:"})
         messages[0]["content"].append({"type": "image", "image": image})
-    # Prepare final prompt for the model
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    # Qwen requires images in the same order. We'll just collect them:
     frame_images = [img for (img, _) in frames]
     inputs = processor(
@@ -109,14 +130,22 @@ def video_inference(video_file, duration):
         generated_text += new_text
         time.sleep(0.01)
-    # 3.4: Dummy bar chart for demonstration
-    fig, ax = plt.subplots()
-    categories = ["Category A", "Category B", "Category C"]
     values = [random.randint(1, 10) for _ in categories]
-    ax.bar(categories, values, color=["#4B0082", "#9370DB", "#4B0082"])
-    ax.set_title("Example Analytics Chart")
     ax.set_ylabel("Value")
-    ax.set_xlabel("Category")
     return generated_text, fig
@@ -126,9 +155,9 @@ def video_inference(video_file, duration):
 def build_app():
     with gr.Blocks() as demo:
         gr.Markdown("""
-        # **Qwen2.5-VL-7B-Instruct Live Video Analysis**
         Record a video (from webcam or file), then click **Stop**.
-        Next, click **Analyze** to run Qwen2.5-VL and see textual + chart outputs.
         """)
         with gr.Row():
@@ -139,9 +168,8 @@ def build_app():
                     label="Suggested Recording Duration (seconds)",
                     info="Select how long you plan to record before pressing Stop."
                 )
-                # Remove 'source="webcam"' to avoid the TypeError on older Gradio versions
                 video = gr.Video(
-                    label="Webcam Recording (press the Record button, then Stop)",
                     format="mp4"
                 )
                 analyze_btn = gr.Button("Analyze", variant="primary")

 from transformers.image_utils import load_image
 #####################################
+# 1. Load Gemma3 Model & Processor
 #####################################
+MODEL_ID = "google/gemma-3-12b-it"  # Example placeholder
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Gemma3ForConditionalGeneration.from_pretrained(
 #####################################
 # 2. Helper Function: Downsample Video
 #####################################
 def downsample_video(video_path, num_frames=10):
     """
     Downsamples the video file to `num_frames` evenly spaced frames.
     vidcap.release()
     return frames
+#####################################
+# 2.5: Parse Categories from Model Output
+#####################################
+def parse_inferred_categories(generated_text):
+    """
+    Extract category names from model output using a naive line-based parser.
+
+    The text is split on newlines and each line is stripped of surrounding
+    whitespace. A line counts as a category line when it begins with
+    'Category:' (matched case-insensitively); the category name is whatever
+    follows the first colon on that line, stripped of whitespace.
+
+    Example lines in model output:
+        Category: Nutrition
+        Category: Outdoor Scenes
+
+    Args:
+        generated_text: The text to scan (must support `.split("\n")`).
+
+    Returns:
+        A list of category strings in the order they appear in the text.
+        Duplicates are kept, lines with nothing after the colon are skipped,
+        and the list is empty when no line matches.
+    """
+    categories = []
+    for line in generated_text.split("\n"):
+        # Strip first so indented lines (and any trailing '\r') still match.
+        line = line.strip()
+        # Check if the line starts with 'Category:' (case-insensitive)
+        if line.lower().startswith("category:"):
+            # Split only on the first colon so colons inside the name survive.
+            cat = line.split(":", 1)[1].strip()
+            # Ignore a bare 'Category:' with no name after it.
+            if cat:
+                categories.append(cat)
+    return categories
 #####################################
 # 3. The Inference Function
 #####################################
 def video_inference(video_file, duration):
     """
     - Takes a recorded video file and a chosen duration (string).
+    - Downsamples the video, passes frames to the Gemma3 model for inference.
+    - Returns model-generated text + a bar chart with categories derived from that text.
     """
     if video_file is None:
         return "No video provided.", None
     if not frames:
         return "Could not read frames from video.", None
+    # 3.2: Construct prompt
     messages = [
         {
             "role": "user",
             "content": [{"type": "text", "text": "Please describe what's happening in this video."}]
         }
     ]
     # Add frames (with timestamp) to the messages
     for (image, ts) in frames:
         messages[0]["content"].append({"type": "text", "text": f"Frame at {ts} seconds:"})
         messages[0]["content"].append({"type": "image", "image": image})
+    # Prepare final prompt
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Collect images for model
     frame_images = [img for (img, _) in frames]
     inputs = processor(
         generated_text += new_text
         time.sleep(0.01)
+    # 3.4: Parse categories from model output
+    categories = parse_inferred_categories(generated_text)
+    # If no categories were found, use fallback
+    if not categories:
+        categories = ["Category A", "Category B", "Category C"]
+    # Create dummy values for each category
     values = [random.randint(1, 10) for _ in categories]
+    # 3.5: Create bar chart
+    fig, ax = plt.subplots()
+    ax.bar(categories, values, color=["#4B0082", "#9370DB", "#4B0082"]*(len(categories)//3+1))
+    ax.set_title("Inferred Categories from Model Output")
     ax.set_ylabel("Value")
+    ax.set_xlabel("Categories")
+    plt.xticks(rotation=30, ha="right")
     return generated_text, fig
 def build_app():
     with gr.Blocks() as demo:
         gr.Markdown("""
+        # **Gemma3 (or Qwen2.5-VL) Live Video Analysis**
         Record a video (from webcam or file), then click **Stop**.
+        Next, click **Analyze** to run the model and see textual + chart outputs.
         """)
         with gr.Row():
                     label="Suggested Recording Duration (seconds)",
                     info="Select how long you plan to record before pressing Stop."
                 )
                 video = gr.Video(
+                    label="Webcam Recording (press Record, then Stop)",
                     format="mp4"
                 )
                 analyze_btn = gr.Button("Analyze", variant="primary")