Spaces:

atalaydenknalbant
/

DINOv3

Running on Zero

App Files Community

atalaydenknalbant committed 7 days ago

Commit

cc78067

verified ·

1 Parent(s): 3142e56

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -86

app.py CHANGED Viewed

@@ -36,35 +36,29 @@ except Exception:
 def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     """Return a booking duration for a gallery job based on file count.
     Args:
         files: List of file paths. Used to scale the reservation window.
     Returns:
         Number of seconds to reserve capped at 600.
     """
     n = max(1, len(files) if files else 1)
-    return min(600, 35 * n + 30)  # 35s per image plus 30s buffer capped at 10 minutes
 def _gpu_duration_classify(*_args, **_kwargs) -> int:
     """Return a small booking duration for classification runs.
     Returns:
         Number of seconds to reserve for classification.
     """
-    return 90  # small buffer for 1 query plus a handful of centroids
 # ---------------------------
-# Model loading and embedding extraction (fp32 only)
 # ---------------------------
 def _load(model_id: str) -> Tuple[AutoImageProcessor, AutoModel]:
     """Load processor and model then move to CUDA eval in float32.
     Args:
         model_id: Hugging Face model id to load.
     Returns:
         Tuple of processor and model on CUDA in eval mode.
     """
@@ -77,13 +71,10 @@ def _load(model_id: str) -> Tuple[AutoImageProcessor, AutoModel]:
     model.to("cuda").to(torch.float32).eval()
     return processor, model
 def _to_cuda_batchfeature(bf):
     """Move a BatchFeature or dict of tensors to CUDA.
     Args:
         bf: Transformers BatchFeature or a dict of tensors.
     Returns:
         BatchFeature or dict on CUDA.
     """
@@ -91,15 +82,11 @@ def _to_cuda_batchfeature(bf):
         return bf.to("cuda")
     return {k: v.to("cuda") for k, v in bf.items()}
-def _embed(image: Image.Image, model_id: str, pooling: str) -> np.ndarray:
     """Extract a single-image DINOv3 embedding.
     Args:
         image: Input PIL image.
         model_id: Backbone id from MODELS.
-        pooling: Either "CLS" or "Mean of patch tokens".
     Returns:
         1D NumPy vector in float32.
     """
@@ -110,19 +97,10 @@ def _embed(image: Image.Image, model_id: str, pooling: str) -> np.ndarray:
     with torch.inference_mode():
         out = model(**bf)
-    if pooling == "CLS":
-        if getattr(out, "pooler_output", None) is not None:
-            emb = out.pooler_output[0]
-        else:
-            emb = out.last_hidden_state[0, 0]
     else:
-        if out.last_hidden_state.ndim == 3:  # ViT tokens
-            num_regs = getattr(model.config, "num_register_tokens", 0)
-            patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
-            emb = patch_tokens.mean(dim=0)
-        else:  # Conv/backbone feature map [C,H,W]
-            feat = out.last_hidden_state[0]
-            emb = feat.mean(dim=(1, 2))
     return emb.float().cpu().numpy().astype(np.float32)
@@ -132,10 +110,8 @@ def _embed(image: Image.Image, model_id: str, pooling: str) -> np.ndarray:
 def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
     """Open many image files as RGB PIL images.
     Args:
         paths: List of file paths.
     Returns:
         List of PIL images. Files that fail to open are skipped.
     """
@@ -148,14 +124,11 @@ def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
             pass
     return imgs
 def _to_html_table(S: np.ndarray, names: List[str]) -> str:
     """Render a cosine similarity matrix as an HTML table.
     Args:
         S: Square matrix of cosine similarities.
         names: File names for header and row labels.
     Returns:
         HTML string with a scrollable table.
     """
@@ -177,13 +150,10 @@ def _to_html_table(S: np.ndarray, names: List[str]) -> str:
              """
     return table
 def _normalize_rows(X: np.ndarray) -> np.ndarray:
     """Normalize rows to unit norm with safe clipping.
     Args:
         X: Matrix of shape N by D.
     Returns:
         Matrix with each row divided by its L2 norm.
     """
@@ -195,14 +165,11 @@ def _normalize_rows(X: np.ndarray) -> np.ndarray:
 # ---------------------------
 @spaces.GPU(duration=_gpu_duration_gallery)
-def batch_similarity(files: List[str], model_name: str, pooling: str):
     """Compute pairwise cosine similarities for many images.
     Args:
         files: List of image file paths.
         model_name: Key from MODELS.
-        pooling: Either CLS or Mean of patch tokens.
     Returns:
         html_table: HTML table with cosine similarities.
         csv_path: Path to a CSV file of the matrix.
@@ -217,7 +184,7 @@ def batch_similarity(files: List[str], model_name: str, pooling: str):
     imgs = _open_images_from_paths(paths)
     embs = []
     for img in imgs:
-        e = _embed(img, model_id, pooling)
         embs.append(e)
     if len(embs) < 2:
@@ -233,47 +200,38 @@ def batch_similarity(files: List[str], model_name: str, pooling: str):
     return html, csv_path
 # ---------------------------
-# Image Classification using DINOv3 embeddings
 # ---------------------------
 def _init_state() -> Dict:
     """Create an empty classifier state.
     Returns:
-        Dict with model_id pooling and classes.
     """
-    return {"model_id": "", "pooling": "", "classes": {}}
 def _summarize_state(state: Dict) -> Dict:
     """Summarize counts in the classifier state.
     Args:
         state: Current classifier state.
     Returns:
         Dict with counts for display.
     """
     return {
         "model_id": state.get("model_id", ""),
-        "pooling": state.get("pooling", ""),
         "class_counts": {k: v.get("count", 0) for k, v in state.get("classes", {}).items()},
         "num_classes": len(state.get("classes", {})),
         "total_examples": int(sum(v.get("count", 0) for v in state.get("classes", {}).values())),
     }
 @spaces.GPU(duration=_gpu_duration_gallery)
-def add_class(class_name: str, files: List[str], model_name: str, pooling: str, state: Dict):
     """Add images to a labeled class and update embeddings.
     Args:
         class_name: Target class label.
         files: Image file paths to embed and store.
         model_name: Key from MODELS.
-        pooling: Either CLS or Mean of patch tokens.
         state: Current classifier state.
     Returns:
         Summary dict and the updated state.
     """
@@ -285,10 +243,9 @@ def add_class(class_name: str, files: List[str], model_name: str, pooling: str,
         return {"error": "No images uploaded for this class"}, state
     model_id = MODELS[model_name]
-    if state.get("model_id") and (state["model_id"] != model_id or state.get("pooling") != pooling):
         state = _init_state()
     state["model_id"] = model_id
-    state["pooling"] = pooling
     imgs = _open_images_from_paths(files)
     if not imgs:
@@ -296,7 +253,7 @@ def add_class(class_name: str, files: List[str], model_name: str, pooling: str,
     embs = []
     for im in imgs:
-        e = _embed(im, model_id, pooling).astype(np.float32)
         embs.append(e)
     X = np.vstack(embs)
@@ -309,18 +266,14 @@ def add_class(class_name: str, files: List[str], model_name: str, pooling: str,
         state["classes"][class_name]["count"] = new.shape[0]
     return _summarize_state(state), state
 @spaces.GPU(duration=_gpu_duration_classify)
-def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict, top_k: int):
     """Predict a class using cosine to class centroids.
     Args:
         image: Query PIL image.
         model_name: Key from MODELS.
-        pooling: Either CLS or Mean of patch tokens.
         state: Classifier state holding embeddings per class.
         top_k: Number of classes to report.
     Returns:
         Info dict with prediction, a label dict for display, and HTML with ranks.
     """
@@ -332,10 +285,10 @@ def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict
         return {"error": "No classes have been added yet"}, {}, None
     model_id = MODELS[model_name]
-    if state.get("model_id") != model_id or state.get("pooling") != pooling:
-        return {"error": "Model or pooling changed after building classes. Clear and rebuild."}, {}, None
-    q = _embed(image, model_id, pooling).astype(np.float32)[None, :]
     qn = _normalize_rows(q)
     names = []
@@ -363,13 +316,10 @@ def predict_class(image: Image.Image, model_name: str, pooling: str, state: Dict
     ) + "</ol>"
     return {"top_k": top_k, "prediction": names[order[0]]}, result_dict, full_table
 def clear_classes(_state: Dict):
     """Reset the classifier state to empty.
     Args:
         _state: Previous state ignored.
     Returns:
         Fresh state and its summary.
     """
@@ -380,7 +330,7 @@ def clear_classes(_state: Dict):
 # ---------------------------
 with gr.Blocks() as app:
-    gr.Markdown("# DINOv3 - Similarity, Classification")
     with gr.Accordion("Paper and Citation", open=False):
         gr.Markdown("""
@@ -391,7 +341,7 @@ with gr.Blocks() as app:
                     ```
                     @misc{simeoni2025dinov3,
                     title={DINOv3},
-                    author={Oriane Siméoni and Huy V. Vo and Maximilian Seitzer and Federico Baldassarre and Maxime Oquab and Cijo Jose and Vasil Khalidov and Marc Szafraniec and Seungeun Yi and Michaël Ramamonjisoa and Francisco Massa and Daniel Haziza and Luca Wehrstedt and Jianyuan Wang and Timothée Darcet and Théo Moutakanni and Leonel Sentana and Claire Roberts and Andrea Vedaldi and Jamie Tolan and John Brandt and Camille Couprie and Julien Mairal and Hervé Jégou and Patrick Labatut and Piotr Bojanowski}},
                     year={2025},
                     eprint={2508.10104},
                     archivePrefix={arXiv},
@@ -401,26 +351,17 @@ with gr.Blocks() as app:
                     ```  """
                    )
-    # ------------- Similarity -------------
     with gr.Tab("Similarity"):
         gr.Markdown("Upload multiple images to compute a cosine similarity matrix and download a CSV.")
         files_in = gr.Files(label="Upload images", file_types=["image"], file_count="multiple", type="filepath")
         gallery_preview = gr.Gallery(label="Preview", columns=4, height=300)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
-        pooling2 = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
         go = gr.Button("Compute cosine")
         table = gr.HTML(label="Cosine similarity")
         csv = gr.File(label="cosine_similarity_matrix.csv")
         def _preview(paths):
-            """Preview images from file paths as a gallery.
-            Args:
-                paths: List of file paths from gr.Files.
-            Returns:
-                List of PIL images for the Gallery.
-            """
             if not paths:
                 return []
             imgs = []
@@ -432,15 +373,14 @@ with gr.Blocks() as app:
             return imgs
         files_in.change(_preview, inputs=files_in, outputs=gallery_preview)
-        go.click(batch_similarity, [files_in, model_dd2, pooling2], [table, csv])
-    # ------------- Image Classification -------------
     with gr.Tab("Image Classification"):
         st = gr.State(_init_state())
         with gr.Row():
             with gr.Column():
                 model_dd3 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
-                pooling3 = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
                 gr.Markdown("Build your labeled set by adding a few images per class.")
                 class_name = gr.Textbox(label="Class name")
                 class_files = gr.Files(label="Upload images for this class", file_types=["image"], type="filepath", file_count="multiple")
@@ -456,7 +396,7 @@ with gr.Blocks() as app:
         add_btn.click(
             add_class,
-            [class_name, class_files, model_dd3, pooling3, st],
             [state_view, st],
         )
         clear_btn.click(
@@ -466,7 +406,7 @@ with gr.Blocks() as app:
         )
         predict_btn.click(
             predict_class,
-            [query_img, model_dd3, pooling3, st, topk],
             [gr.JSON(label="Info"), predicted, scores_html],
         )

 def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     """Return a booking duration for a gallery job based on file count.
     Args:
         files: List of file paths. Used to scale the reservation window.
     Returns:
         Number of seconds to reserve capped at 600.
     """
     n = max(1, len(files) if files else 1)
+    return min(600, 35 * n + 30)
 def _gpu_duration_classify(*_args, **_kwargs) -> int:
     """Return a small booking duration for classification runs.
     Returns:
         Number of seconds to reserve for classification.
     """
+    return 90
 # ---------------------------
+# Model loading and CLS embedding extraction
 # ---------------------------
 def _load(model_id: str) -> Tuple[AutoImageProcessor, AutoModel]:
     """Load processor and model then move to CUDA eval in float32.
     Args:
         model_id: Hugging Face model id to load.
     Returns:
         Tuple of processor and model on CUDA in eval mode.
     """
     model.to("cuda").to(torch.float32).eval()
     return processor, model
 def _to_cuda_batchfeature(bf):
     """Move a BatchFeature or dict of tensors to CUDA.
     Args:
         bf: Transformers BatchFeature or a dict of tensors.
     Returns:
         BatchFeature or dict on CUDA.
     """
         return bf.to("cuda")
     return {k: v.to("cuda") for k, v in bf.items()}
+def _embed_cls(image: Image.Image, model_id: str) -> np.ndarray:
     """Extract a single-image DINOv3 embedding.
     Args:
         image: Input PIL image.
         model_id: Backbone id from MODELS.
     Returns:
         1D NumPy vector in float32.
     """
     with torch.inference_mode():
         out = model(**bf)
+    if getattr(out, "pooler_output", None) is not None:
+        emb = out.pooler_output[0]
     else:
+        emb = out.last_hidden_state[0, 0]
     return emb.float().cpu().numpy().astype(np.float32)
 def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
     """Open many image files as RGB PIL images.
     Args:
         paths: List of file paths.
     Returns:
         List of PIL images. Files that fail to open are skipped.
     """
             pass
     return imgs
 def _to_html_table(S: np.ndarray, names: List[str]) -> str:
     """Render a cosine similarity matrix as an HTML table.
     Args:
         S: Square matrix of cosine similarities.
         names: File names for header and row labels.
     Returns:
         HTML string with a scrollable table.
     """
              """
     return table
 def _normalize_rows(X: np.ndarray) -> np.ndarray:
     """Normalize rows to unit norm with safe clipping.
     Args:
         X: Matrix of shape N by D.
     Returns:
         Matrix with each row divided by its L2 norm.
     """
 # ---------------------------
 @spaces.GPU(duration=_gpu_duration_gallery)
+def batch_similarity(files: List[str], model_name: str):
     """Compute pairwise cosine similarities for many images.
     Args:
         files: List of image file paths.
         model_name: Key from MODELS.
     Returns:
         html_table: HTML table with cosine similarities.
         csv_path: Path to a CSV file of the matrix.
     imgs = _open_images_from_paths(paths)
     embs = []
     for img in imgs:
+        e = _embed_cls(img, model_id)
         embs.append(e)
     if len(embs) < 2:
     return html, csv_path
 # ---------------------------
+# Image Classification using DINOv3 CLS embeddings
 # ---------------------------
 def _init_state() -> Dict:
     """Create an empty classifier state.
     Returns:
+        Dict with model_id and classes.
     """
+    return {"model_id": "", "classes": {}}
 def _summarize_state(state: Dict) -> Dict:
     """Summarize counts in the classifier state.
     Args:
         state: Current classifier state.
     Returns:
         Dict with counts for display.
     """
     return {
         "model_id": state.get("model_id", ""),
         "class_counts": {k: v.get("count", 0) for k, v in state.get("classes", {}).items()},
         "num_classes": len(state.get("classes", {})),
         "total_examples": int(sum(v.get("count", 0) for v in state.get("classes", {}).values())),
     }
 @spaces.GPU(duration=_gpu_duration_gallery)
+def add_class(class_name: str, files: List[str], model_name: str, state: Dict):
     """Add images to a labeled class and update embeddings.
     Args:
         class_name: Target class label.
         files: Image file paths to embed and store.
         model_name: Key from MODELS.
         state: Current classifier state.
     Returns:
         Summary dict and the updated state.
     """
         return {"error": "No images uploaded for this class"}, state
     model_id = MODELS[model_name]
+    if state.get("model_id") and state["model_id"] != model_id:
         state = _init_state()
     state["model_id"] = model_id
     imgs = _open_images_from_paths(files)
     if not imgs:
     embs = []
     for im in imgs:
+        e = _embed_cls(im, model_id).astype(np.float32)
         embs.append(e)
     X = np.vstack(embs)
         state["classes"][class_name]["count"] = new.shape[0]
     return _summarize_state(state), state
 @spaces.GPU(duration=_gpu_duration_classify)
+def predict_class(image: Image.Image, model_name: str, state: Dict, top_k: int):
     """Predict a class using cosine to class centroids.
     Args:
         image: Query PIL image.
         model_name: Key from MODELS.
         state: Classifier state holding embeddings per class.
         top_k: Number of classes to report.
     Returns:
         Info dict with prediction, a label dict for display, and HTML with ranks.
     """
         return {"error": "No classes have been added yet"}, {}, None
     model_id = MODELS[model_name]
+    if state.get("model_id") != model_id:
+        return {"error": "Model changed after building classes. Clear and rebuild."}, {}, None
+    q = _embed_cls(image, model_id).astype(np.float32)[None, :]
     qn = _normalize_rows(q)
     names = []
     ) + "</ol>"
     return {"top_k": top_k, "prediction": names[order[0]]}, result_dict, full_table
 def clear_classes(_state: Dict):
     """Reset the classifier state to empty.
     Args:
         _state: Previous state ignored.
     Returns:
         Fresh state and its summary.
     """
 # ---------------------------
 with gr.Blocks() as app:
+    gr.Markdown("# DINOv3 Similarity and Classification")
     with gr.Accordion("Paper and Citation", open=False):
         gr.Markdown("""
                     ```
                     @misc{simeoni2025dinov3,
                     title={DINOv3},
+                    author={Oriane Siméoni and Huy V. Vo and Maximilian Seitzer and Federico Baldassarre and Maxime Oquab and Cijo Jose and Vasil Khalidov and Marc Szafraniec and Seungeun Yi and Michaël Ramamonjisoa and Francisco Massa and Daniel Haziza and Luca Wehrstedt and Jianyuan Wang and Timothée Darcet and Théo Moutakanni and Leonel Sentana and Claire Roberts and Andrea Vedaldi and Jamie Tolan and John Brandt and Camille Couprie and Julien Mairal and Hervé Jégou and Patrick Labatut and Piotr Bojanowski},
                     year={2025},
                     eprint={2508.10104},
                     archivePrefix={arXiv},
                     ```  """
                    )
+    # Similarity
     with gr.Tab("Similarity"):
         gr.Markdown("Upload multiple images to compute a cosine similarity matrix and download a CSV.")
         files_in = gr.Files(label="Upload images", file_types=["image"], file_count="multiple", type="filepath")
         gallery_preview = gr.Gallery(label="Preview", columns=4, height=300)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
         go = gr.Button("Compute cosine")
         table = gr.HTML(label="Cosine similarity")
         csv = gr.File(label="cosine_similarity_matrix.csv")
         def _preview(paths):
             if not paths:
                 return []
             imgs = []
             return imgs
         files_in.change(_preview, inputs=files_in, outputs=gallery_preview)
+        go.click(batch_similarity, [files_in, model_dd2], [table, csv])
+    # Image Classification
     with gr.Tab("Image Classification"):
         st = gr.State(_init_state())
         with gr.Row():
             with gr.Column():
                 model_dd3 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
                 gr.Markdown("Build your labeled set by adding a few images per class.")
                 class_name = gr.Textbox(label="Class name")
                 class_files = gr.Files(label="Upload images for this class", file_types=["image"], type="filepath", file_count="multiple")
         add_btn.click(
             add_class,
+            [class_name, class_files, model_dd3, st],
             [state_view, st],
         )
         clear_btn.click(
         )
         predict_btn.click(
             predict_class,
+            [query_img, model_dd3, st, topk],
             [gr.JSON(label="Info"), predicted, scores_html],
         )