Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

App Files Files Community

Luigi committed on Jun 16

Commit

34cd1e5

1 Parent(s): 45c2159

add rich debug message and dedicated display ui

Browse files

Files changed (1) hide show

app.py +52 -31

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from llama_cpp.llama_chat_format import Llava15ChatHandler
 import base64
 # ----------------------------------------
 # Model configurations: per-size prefixes and repos
@@ -91,7 +92,8 @@ def update_llm(size, model_file, clip_file):
     if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
         mf, cf = ensure_weights(size, model_file, clip_file)
         handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=False)
-        llm = Llama(model_path=mf, chat_handler=handler, n_ctx=1024, verbose=False, n_threads=min(2, os.cpu_count()), )
         model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'llm': llm})
     return None  # no UI output
@@ -103,14 +105,29 @@ def get_weight_files(size):
     clip_files  = [f"{cfg['clip_prefix']}-{v}.gguf"  for v in cfg['clip_variants']]
     return model_files, clip_files
-# Caption using cached llm
 def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
-    # Use pre-loaded model
-    llm = model_cache['llm']
-    time.sleep(interval_ms / 1000)
     img = cv2.resize(frame.copy(), (384, 384))
     success, jpeg = cv2.imencode('.jpg', img)
     uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
     messages = [
         {"role": "system", "content": sys_prompt},
@@ -119,19 +136,34 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
             {"type": "text",      "text": usr_prompt}
         ]}
     ]
-    # re-init handler
-    llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
-    resp = llm.create_chat_completion(
         messages=messages,
         max_tokens=128,
         temperature=0.1,
         stop=["<end_of_utterance>"]
     )
-    import gc
     gc.collect()
-    return resp.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
 # Gradio UI
@@ -141,7 +173,7 @@ def main():
     mf, cf = get_weight_files(default)
     with gr.Blocks() as demo:
-        gr.Markdown("## 🎥 Real-Time Camera Captioning")
         with gr.Row():
             size_dd   = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
             model_dd  = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
@@ -163,33 +195,22 @@ def main():
             inputs=[size_dd],
             outputs=[model_dd, clip_dd]
         )
-        # When model weight changes: preload llm
-        model_dd.change(
-            fn=lambda sz, mf, cf: update_llm(sz, mf, cf),
-            inputs=[size_dd, model_dd, clip_dd],
-            outputs=[]
-        )
-        # When clip weight changes: preload llm
-        clip_dd.change(
-            fn=lambda sz, mf, cf: update_llm(sz, mf, cf),
-            inputs=[size_dd, model_dd, clip_dd],
-            outputs=[]
-        )
-        # Initial preload with defaults
         update_llm(default, mf[0], cf[0])
         interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
-        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
-        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
-        cam   = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
-        cap   = gr.Textbox(interactive=False, label='Caption')
         cam.stream(
             fn=caption_frame,
             inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
-            outputs=[cap], time_limit=600
         )
     demo.launch()

 from llama_cpp import Llama
 from llama_cpp.llama_chat_format import Llava15ChatHandler
 import base64
+import gc
 # ----------------------------------------
 # Model configurations: per-size prefixes and repos
     if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
         mf, cf = ensure_weights(size, model_file, clip_file)
         handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=False)
+        llm = Llama(model_path=mf, chat_handler=handler, n_ctx=1024,
+                    verbose=False, n_threads=min(2, os.cpu_count()))
         model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'llm': llm})
     return None  # no UI output
     clip_files  = [f"{cfg['clip_prefix']}-{v}.gguf"  for v in cfg['clip_variants']]
     return model_files, clip_files
+# Caption using cached llm with real-time debug logs
 def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
+    debug_msgs = []
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
+    t_resize = time.time()
     img = cv2.resize(frame.copy(), (384, 384))
+    elapsed = (time.time() - t_resize) * 1000
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] Resized to 384x384 in {elapsed:.1f} ms")
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] Sleeping for {interval_ms} ms")
+    time.sleep(interval_ms / 1000)
+    t_enc = time.time()
     success, jpeg = cv2.imencode('.jpg', img)
+    elapsed = (time.time() - t_enc) * 1000
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] JPEG encode: success={success}, bytes={len(jpeg)} in {elapsed:.1f} ms")
     uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
     messages = [
         {"role": "system", "content": sys_prompt},
             {"type": "text",      "text": usr_prompt}
         ]}
     ]
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
+    # re-init handler for image
+    model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
+    t_start = time.time()
+    resp = model_cache['llm'].create_chat_completion(
         messages=messages,
         max_tokens=128,
         temperature=0.1,
         stop=["<end_of_utterance>"]
     )
+    elapsed = (time.time() - t_start) * 1000
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] LLM response in {elapsed:.1f} ms")
+    content = resp.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] Caption length: {len(content)} chars")
     gc.collect()
+    timestamp = time.strftime('%H:%M:%S')
+    debug_msgs.append(f"[{timestamp}] Garbage collected")
+    return content, "\n".join(debug_msgs)
 # Gradio UI
     mf, cf = get_weight_files(default)
     with gr.Blocks() as demo:
+        gr.Markdown("## 🎥 Real-Time Camera Captioning with Debug Logs")
         with gr.Row():
             size_dd   = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
             model_dd  = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
             inputs=[size_dd],
             outputs=[model_dd, clip_dd]
         )
+        model_dd.change(lambda sz, mf, cf: update_llm(sz, mf, cf), inputs=[size_dd, model_dd, clip_dd], outputs=[])
+        clip_dd.change(lambda sz, mf, cf: update_llm(sz, mf, cf), inputs=[size_dd, model_dd, clip_dd], outputs=[])
         update_llm(default, mf[0], cf[0])
         interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
+        sys_p    = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
+        usr_p    = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
+        cam      = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
+        cap      = gr.Textbox(interactive=False, label='Caption')
+        log_box  = gr.Textbox(lines=8, interactive=False, label='Debug Log')
         cam.stream(
             fn=caption_frame,
             inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
+            outputs=[cap, log_box],
+            time_limit=600
         )
     demo.launch()