Luigi committed on
Commit
34cd1e5
·
1 Parent(s): 45c2159

Add rich debug messages and a dedicated debug-log display UI

Browse files
Files changed (1) hide show
  1. app.py +52 -31
app.py CHANGED
@@ -8,6 +8,7 @@ from huggingface_hub import hf_hub_download
8
  from llama_cpp import Llama
9
  from llama_cpp.llama_chat_format import Llava15ChatHandler
10
  import base64
 
11
 
12
  # ----------------------------------------
13
  # Model configurations: per-size prefixes and repos
@@ -91,7 +92,8 @@ def update_llm(size, model_file, clip_file):
91
  if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
92
  mf, cf = ensure_weights(size, model_file, clip_file)
93
  handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=False)
94
- llm = Llama(model_path=mf, chat_handler=handler, n_ctx=1024, verbose=False, n_threads=min(2, os.cpu_count()), )
 
95
  model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'llm': llm})
96
  return None # no UI output
97
 
@@ -103,14 +105,29 @@ def get_weight_files(size):
103
  clip_files = [f"{cfg['clip_prefix']}-{v}.gguf" for v in cfg['clip_variants']]
104
  return model_files, clip_files
105
 
106
- # Caption using cached llm
107
 
108
  def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
109
- # Use pre-loaded model
110
- llm = model_cache['llm']
111
- time.sleep(interval_ms / 1000)
 
 
112
  img = cv2.resize(frame.copy(), (384, 384))
 
 
 
 
 
 
 
 
 
113
  success, jpeg = cv2.imencode('.jpg', img)
 
 
 
 
114
  uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
115
  messages = [
116
  {"role": "system", "content": sys_prompt},
@@ -119,19 +136,34 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
119
  {"type": "text", "text": usr_prompt}
120
  ]}
121
  ]
122
- # re-init handler
123
- llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
124
- resp = llm.create_chat_completion(
 
 
 
 
 
 
 
125
  messages=messages,
126
  max_tokens=128,
127
  temperature=0.1,
128
  stop=["<end_of_utterance>"]
129
  )
 
 
 
 
 
 
 
130
 
131
- import gc
132
  gc.collect()
 
 
133
 
134
- return resp.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
135
 
136
  # Gradio UI
137
 
@@ -141,7 +173,7 @@ def main():
141
  mf, cf = get_weight_files(default)
142
 
143
  with gr.Blocks() as demo:
144
- gr.Markdown("## 🎥 Real-Time Camera Captioning")
145
  with gr.Row():
146
  size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
147
  model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
@@ -163,33 +195,22 @@ def main():
163
  inputs=[size_dd],
164
  outputs=[model_dd, clip_dd]
165
  )
166
-
167
- # When model weight changes: preload llm
168
- model_dd.change(
169
- fn=lambda sz, mf, cf: update_llm(sz, mf, cf),
170
- inputs=[size_dd, model_dd, clip_dd],
171
- outputs=[]
172
- )
173
- # When clip weight changes: preload llm
174
- clip_dd.change(
175
- fn=lambda sz, mf, cf: update_llm(sz, mf, cf),
176
- inputs=[size_dd, model_dd, clip_dd],
177
- outputs=[]
178
- )
179
-
180
- # Initial preload with defaults
181
  update_llm(default, mf[0], cf[0])
182
 
183
  interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
184
- sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
185
- usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
186
- cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
187
- cap = gr.Textbox(interactive=False, label='Caption')
 
188
 
189
  cam.stream(
190
  fn=caption_frame,
191
  inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
192
- outputs=[cap], time_limit=600
 
193
  )
194
 
195
  demo.launch()
 
8
  from llama_cpp import Llama
9
  from llama_cpp.llama_chat_format import Llava15ChatHandler
10
  import base64
11
+ import gc
12
 
13
  # ----------------------------------------
14
  # Model configurations: per-size prefixes and repos
 
92
  if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
93
  mf, cf = ensure_weights(size, model_file, clip_file)
94
  handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=False)
95
+ llm = Llama(model_path=mf, chat_handler=handler, n_ctx=1024,
96
+ verbose=False, n_threads=min(2, os.cpu_count()))
97
  model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'llm': llm})
98
  return None # no UI output
99
 
 
105
  clip_files = [f"{cfg['clip_prefix']}-{v}.gguf" for v in cfg['clip_variants']]
106
  return model_files, clip_files
107
 
108
+ # Caption using cached llm with real-time debug logs
109
 
110
  def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
111
+ debug_msgs = []
112
+ timestamp = time.strftime('%H:%M:%S')
113
+ debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
114
+
115
+ t_resize = time.time()
116
  img = cv2.resize(frame.copy(), (384, 384))
117
+ elapsed = (time.time() - t_resize) * 1000
118
+ timestamp = time.strftime('%H:%M:%S')
119
+ debug_msgs.append(f"[{timestamp}] Resized to 384x384 in {elapsed:.1f} ms")
120
+
121
+ timestamp = time.strftime('%H:%M:%S')
122
+ debug_msgs.append(f"[{timestamp}] Sleeping for {interval_ms} ms")
123
+ time.sleep(interval_ms / 1000)
124
+
125
+ t_enc = time.time()
126
  success, jpeg = cv2.imencode('.jpg', img)
127
+ elapsed = (time.time() - t_enc) * 1000
128
+ timestamp = time.strftime('%H:%M:%S')
129
+ debug_msgs.append(f"[{timestamp}] JPEG encode: success={success}, bytes={len(jpeg)} in {elapsed:.1f} ms")
130
+
131
  uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
132
  messages = [
133
  {"role": "system", "content": sys_prompt},
 
136
  {"type": "text", "text": usr_prompt}
137
  ]}
138
  ]
139
+
140
+ timestamp = time.strftime('%H:%M:%S')
141
+ debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
142
+ # re-init handler for image
143
+ model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
144
+ timestamp = time.strftime('%H:%M:%S')
145
+ debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
146
+
147
+ t_start = time.time()
148
+ resp = model_cache['llm'].create_chat_completion(
149
  messages=messages,
150
  max_tokens=128,
151
  temperature=0.1,
152
  stop=["<end_of_utterance>"]
153
  )
154
+ elapsed = (time.time() - t_start) * 1000
155
+ timestamp = time.strftime('%H:%M:%S')
156
+ debug_msgs.append(f"[{timestamp}] LLM response in {elapsed:.1f} ms")
157
+
158
+ content = resp.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
159
+ timestamp = time.strftime('%H:%M:%S')
160
+ debug_msgs.append(f"[{timestamp}] Caption length: {len(content)} chars")
161
 
 
162
  gc.collect()
163
+ timestamp = time.strftime('%H:%M:%S')
164
+ debug_msgs.append(f"[{timestamp}] Garbage collected")
165
 
166
+ return content, "\n".join(debug_msgs)
167
 
168
  # Gradio UI
169
 
 
173
  mf, cf = get_weight_files(default)
174
 
175
  with gr.Blocks() as demo:
176
+ gr.Markdown("## 🎥 Real-Time Camera Captioning with Debug Logs")
177
  with gr.Row():
178
  size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
179
  model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
 
195
  inputs=[size_dd],
196
  outputs=[model_dd, clip_dd]
197
  )
198
+ model_dd.change(lambda sz, mf, cf: update_llm(sz, mf, cf), inputs=[size_dd, model_dd, clip_dd], outputs=[])
199
+ clip_dd.change(lambda sz, mf, cf: update_llm(sz, mf, cf), inputs=[size_dd, model_dd, clip_dd], outputs=[])
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  update_llm(default, mf[0], cf[0])
201
 
202
  interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
203
+ sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
204
+ usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
205
+ cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
206
+ cap = gr.Textbox(interactive=False, label='Caption')
207
+ log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')
208
 
209
  cam.stream(
210
  fn=caption_frame,
211
  inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
212
+ outputs=[cap, log_box],
213
+ time_limit=600
214
  )
215
 
216
  demo.launch()