prithivMLmods committed
Commit 8e6677c · verified · 1 Parent(s): e553bfe

Update app.py

Files changed (1): app.py (+7, -14)
app.py CHANGED
@@ -5,7 +5,6 @@ import cv2
  import matplotlib.pyplot as plt
  import random
  import time
- import spaces
  from PIL import Image
  from threading import Thread
  from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
@@ -14,7 +13,7 @@ from transformers.image_utils import load_image
  #####################################
  # 1. Load Qwen2.5-VL Model & Processor
  #####################################
- MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct" # or "Qwen/Qwen2.5-VL-7B-Instruct"
+ MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct" # or "Qwen/Qwen2.5-VL-3B-Instruct"
 
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
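
Note: switching MODEL_ID from the 3B to the 7B checkpoint roughly doubles GPU memory requirements. The load options sit outside this hunk, but if memory is tight a reduced-precision load is the usual mitigation; a minimal sketch using standard `from_pretrained` kwargs (the dtype and device map are assumptions, not part of this commit):

```python
# Sketch only: memory-conscious load for the 7B checkpoint.
# torch_dtype and device_map are standard from_pretrained kwargs;
# the bfloat16 choice is an assumption, not something this commit sets.
import torch
from transformers import Qwen2_5_VLForConditionalGeneration

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,  # half the memory of float32 weights
    device_map="auto",           # let accelerate place layers automatically
    trust_remote_code=True,
)
```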
@@ -55,7 +54,6 @@ def downsample_video(video_path, num_frames=10):
  #####################################
  # 3. The Inference Function
  #####################################
- @spaces.GPU
  def video_inference(video_file, duration):
      """
      - Takes a recorded video file and a chosen duration (string).
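
Note: removing `import spaces` (first hunk) and the `@spaces.GPU` decorator means the function no longer requests a GPU allocation on ZeroGPU Spaces, so this version assumes dedicated GPU hardware. If both runtimes must be supported, a guarded fallback is one option; a sketch, assuming the Hugging Face `spaces` helper package:

```python
# Sketch: make the ZeroGPU decorator optional instead of deleting it.
# On ZeroGPU Spaces, spaces.GPU allocates a GPU for the decorated call;
# elsewhere the package may be missing, so fall back to a no-op decorator.
try:
    import spaces
    gpu = spaces.GPU
except ImportError:
    def gpu(fn):
        return fn

@gpu
def video_inference(video_file, duration):
    ...
```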
@@ -71,7 +69,6 @@ def video_inference(video_file, duration):
          return "Could not read frames from video.", None
 
      # 3.2: Construct Qwen2.5-VL prompt
-     # We'll do a simple prompt: "Please describe what's happening in this video."
      messages = [
          {
              "role": "user",
@@ -97,19 +94,16 @@ def video_inference(video_file, duration):
          padding=True
      ).to("cuda")
 
-     # 3.3: Generate text output
+     # 3.3: Generate text output (streaming)
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
      generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
 
-     # We'll run generation in a thread to simulate streaming.
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
 
-     # Collect the streamed text
      generated_text = ""
      for new_text in streamer:
          generated_text += new_text
-         # Sleep briefly to yield control
          time.sleep(0.01)
 
      # 3.4: Dummy bar chart for demonstration
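
The comments deleted in this hunk described the standard `TextIteratorStreamer` pattern: `generate()` blocks until completion, so it runs in a worker thread while the main thread drains decoded chunks from the streamer. A self-contained sketch of that pattern with a small text-only placeholder model (the model name is illustrative, not from this app):

```python
# Sketch of the thread + TextIteratorStreamer pattern used above,
# shown with a small text-only model for brevity.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("Describe what is happening in this video:", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs in a worker thread while
# the main thread consumes decoded chunks as they become available.
thread = Thread(target=lm.generate,
                kwargs=dict(inputs, streamer=streamer, max_new_tokens=64))
thread.start()

text = ""
for chunk in streamer:
    text += chunk  # each chunk is a decoded string fragment
thread.join()
```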
@@ -121,7 +115,6 @@ def video_inference(video_file, duration):
      ax.set_ylabel("Value")
      ax.set_xlabel("Category")
 
-     # Return text + figure
      return generated_text, fig
 
  #####################################
@@ -131,8 +124,8 @@ def build_app():
      with gr.Blocks() as demo:
          gr.Markdown("""
          # **Qwen2.5-VL-7B-Instruct Live Video Analysis**
-         Record your webcam for a chosen duration, then click **Stop** to finalize.
-         After that, click **Analyze** to run Qwen2.5-VL and see textual + chart outputs.
+         Record a video (from webcam or file), then click **Stop**.
+         Next, click **Analyze** to run Qwen2.5-VL and see textual + chart outputs.
          """)
 
          with gr.Row():
@@ -143,10 +136,10 @@ def build_app():
                      label="Suggested Recording Duration (seconds)",
                      info="Select how long you plan to record before pressing Stop."
                  )
+                 # Remove 'source="webcam"' to avoid the TypeError on older Gradio versions
                  video = gr.Video(
-                     source="webcam",
-                     format="mp4",
-                     label="Webcam Recording (press the Record button, then Stop)"
+                     label="Webcam Recording (press the Record button, then Stop)",
+                     format="mp4"
                  )
                  analyze_btn = gr.Button("Analyze", variant="primary")
              with gr.Column():
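
One caveat on the comment added in this hunk: `source` was the Gradio 3.x keyword, and Gradio 4.x replaced it with a `sources` list, so the TypeError being worked around typically comes from newer, not older, Gradio versions. Dropping the keyword also drops the explicit webcam request; if the Space runs Gradio 4.x, something like this restores it (a sketch, assuming gradio>=4.0):

```python
# Sketch: explicit webcam capture under Gradio 4.x, where gr.Video
# takes `sources` (a list) instead of the removed `source` keyword.
import gradio as gr

video = gr.Video(
    sources=["webcam"],  # request webcam recording explicitly
    format="mp4",
    label="Webcam Recording (press the Record button, then Stop)",
)
```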
 