ariG23498 HF Staff committed on
Commit
29750ea
·
1 Parent(s): 8c907c3
Files changed (2) hide show
  1. .gitignore +3 -0
  2. app.py +111 -127
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Library
2
+ .venv
3
+ .ruff_cache
app.py CHANGED
@@ -5,9 +5,11 @@ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
5
  from PIL import Image
6
  import time
7
 
 
8
  def extract_model_short_name(model_id):
9
  return model_id.split("/")[-1].replace("-", " ").replace("_", " ")
10
 
 
11
  model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
12
  model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
13
  model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
@@ -18,177 +20,159 @@ model_mm_grounding_name = extract_model_short_name(model_mm_grounding_id)
18
  model_omdet_name = extract_model_short_name(model_omdet_id)
19
  model_owlv2_name = extract_model_short_name(model_owlv2_id)
20
 
21
- @spaces.GPU
22
- def detect_omdet(image: Image.Image, prompts: list, threshold: float):
23
- t0 = time.perf_counter()
24
- model_id = model_omdet_id
25
- device = "cuda" if torch.cuda.is_available() else "cpu"
26
- processor = AutoProcessor.from_pretrained(model_id)
27
- model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
28
- texts = [prompts]
29
- inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
30
- with torch.no_grad():
31
- outputs = model(**inputs)
32
- results = processor.post_process_grounded_object_detection(
33
- outputs,
34
- threshold=threshold,
35
- target_sizes=[image.size[::-1]]
36
- )
37
- result = results[0]
38
- annotations = []
39
- raw_results = []
40
- for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
41
- if score >= threshold:
42
- label_name = prompts[label]
43
- xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
44
- annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
45
- raw_results.append(f"Detected {label_name} with confidence {score:.2f} at location [{xmin}, {ymin}, {xmax}, {ymax}]")
46
- elapsed_ms = (time.perf_counter() - t0) * 1000
47
- time_taken = f"**Inference time ({model_omdet_name}):** {elapsed_ms:.0f} ms"
48
- raw_text = "\n".join(raw_results) if raw_results else "No detections"
49
- return annotations, raw_text, time_taken
50
- @spaces.GPU
51
- def detect_llmdet(image: Image.Image, prompts: list, threshold: float):
52
- t0 = time.perf_counter()
53
- model_id = model_llmdet_id
54
- device = "cuda" if torch.cuda.is_available() else "cpu"
55
- processor = AutoProcessor.from_pretrained(model_id)
56
- model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
57
- texts = [prompts]
58
- inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
59
- with torch.no_grad():
60
- outputs = model(**inputs)
61
- results = processor.post_process_grounded_object_detection(
62
- outputs,
63
- threshold=threshold,
64
- target_sizes=[image.size[::-1]]
65
- )
66
- result = results[0]
67
- annotations = []
68
- raw_results = []
69
- for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
70
- if score >= threshold:
71
- xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
72
- annotations.append(((xmin, ymin, xmax, ymax), f"{label} {score:.2f}"))
73
- raw_results.append(f"Detected {label} with confidence {score:.2f} at location [{xmin}, {ymin}, {xmax}, {ymax}]")
74
- elapsed_ms = (time.perf_counter() - t0) * 1000
75
- time_taken = f"**Inference time ({model_llmdet_name}):** {elapsed_ms:.0f} ms"
76
- raw_text = "\n".join(raw_results) if raw_results else "No detections"
77
- return annotations, raw_text, time_taken
78
- @spaces.GPU
79
- def detect_mm_grounding(image: Image.Image, prompts: list, threshold: float):
80
- t0 = time.perf_counter()
81
- model_id = model_mm_grounding_id
82
- device = "cuda" if torch.cuda.is_available() else "cpu"
83
- processor = AutoProcessor.from_pretrained(model_id)
84
- model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
85
- texts = [prompts]
86
- inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
87
- with torch.no_grad():
88
- outputs = model(**inputs)
89
- results = processor.post_process_grounded_object_detection(
90
- outputs,
91
- threshold=threshold,
92
- target_sizes=[image.size[::-1]]
93
- )
94
- result = results[0]
95
- annotations = []
96
- raw_results = []
97
- for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
98
- if score >= threshold:
99
- xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
100
- annotations.append(((xmin, ymin, xmax, ymax), f"{label} {score:.2f}"))
101
- raw_results.append(f"Detected {label} with confidence {score:.2f} at location [{xmin}, {ymin}, {xmax}, {ymax}]")
102
- elapsed_ms = (time.perf_counter() - t0) * 1000
103
- time_taken = f"**Inference time ({model_mm_grounding_name}):** {elapsed_ms:.0f} ms"
104
- raw_text = "\n".join(raw_results) if raw_results else "No detections"
105
- return annotations, raw_text, time_taken
106
 
107
  @spaces.GPU
108
- def detect_owlv2(image: Image.Image, prompts: list, threshold: float):
109
  t0 = time.perf_counter()
110
- model_id = model_owlv2_id
111
  device = "cuda" if torch.cuda.is_available() else "cpu"
112
  processor = AutoProcessor.from_pretrained(model_id)
113
- model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
 
 
114
  texts = [prompts]
115
  inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
116
- with torch.no_grad():
117
  outputs = model(**inputs)
118
  results = processor.post_process_grounded_object_detection(
119
- outputs,
120
- threshold=threshold,
121
- target_sizes=[image.size[::-1]]
122
  )
123
  result = results[0]
124
  annotations = []
125
- raw_results = []
126
- for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
127
  if score >= threshold:
128
- label_name = prompts[label]
129
  xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
130
  annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
131
- raw_results.append(f"Detected {label_name} with confidence {score:.2f} at location [{xmin}, {ymin}, {xmax}, {ymax}]")
132
  elapsed_ms = (time.perf_counter() - t0) * 1000
133
- time_taken = f"**Inference time ({model_owlv2_name}):** {elapsed_ms:.0f} ms"
134
- raw_text = "\n".join(raw_results) if raw_results else "No detections"
135
- return annotations, raw_text, time_taken
136
 
137
 
138
- def run_detection(image, prompts_str, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet):
139
- if image is None:
140
- return (None, []), "No detections", "", (None, []), "No detections", ""
141
  prompts = [p.strip() for p in prompts_str.split(",")]
142
- ann_llm, raw_llm, time_llm = detect_llmdet(image, prompts, threshold_llm)
143
- ann_mm, raw_mm, time_mm = detect_mm_grounding(image, prompts, threshold_mm)
144
- ann_owlv2, raw_owlv2, time_owlv2 = detect_owlv2(image, prompts, threshold_owlv2)
145
- ann_omdet, raw_omdet, time_omdet = detect_omdet(image, prompts, threshold_omdet)
146
- return (image, ann_llm), raw_llm, time_llm, (image, ann_mm), raw_mm, time_mm, (image, ann_owlv2), raw_owlv2, time_owlv2, (image, ann_omdet), raw_omdet, time_omdet
 
 
 
 
 
 
 
 
 
 
147
 
148
  with gr.Blocks() as app:
149
  gr.Markdown("# Zero-Shot Object Detection Arena")
150
- gr.Markdown("### Compare different zero-shot object detection models on the same image and prompts.")
 
 
151
  with gr.Row():
152
  with gr.Column(scale=1):
153
  image = gr.Image(type="pil", label="Upload an image", height=400)
154
- prompts = gr.Textbox(label="Prompts (comma-separated)", value="a cat, a remote control")
 
 
155
  with gr.Accordion("Per-model confidence thresholds", open=True):
156
- threshold_llm = gr.Slider(label="Threshold for LLMDet", minimum=0.0, maximum=1.0, value=0.3)
157
- threshold_mm = gr.Slider(label="Threshold for MM GroundingDINO Tiny", minimum=0.0, maximum=1.0, value=0.3)
158
- threshold_owlv2 = gr.Slider(label="Threshold for OwlV2 Large", minimum=0.0, maximum=1.0, value=0.1)
159
- threshold_omdet = gr.Slider(label="Threshold for OMDet Turbo Swin Tiny", minimum=0.0, maximum=1.0, value=0.2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  generate_btn = gr.Button(value="Detect")
161
  with gr.Row():
162
  with gr.Column(scale=2):
163
- output_image_llm = gr.AnnotatedImage(label=f"Annotated image for {model_llmdet_name}", height=400)
164
- output_text_llm = gr.Textbox(label=f"Model detections for {model_llmdet_name}", lines=5)
 
165
  output_time_llm = gr.Markdown()
166
  with gr.Column(scale=2):
167
- output_image_mm = gr.AnnotatedImage(label=f"Annotated image for {model_mm_grounding_name}", height=400)
168
- output_text_mm = gr.Textbox(label=f"Model detections for {model_mm_grounding_name}", lines=5)
 
169
  output_time_mm = gr.Markdown()
170
  with gr.Row():
171
  with gr.Column(scale=2):
172
- output_image_owlv2 = gr.AnnotatedImage(label=f"Annotated image for {model_owlv2_name}", height=400)
173
- output_text_owlv2 = gr.Textbox(label=f"Model detections for {model_owlv2_name}", lines=5)
 
174
  output_time_owlv2 = gr.Markdown()
175
  with gr.Column(scale=2):
176
- output_image_omdet = gr.AnnotatedImage(label=f"Annotated image for {model_omdet_name}", height=400)
177
- output_text_omdet = gr.Textbox(label=f"Model detections for {model_omdet_name}", lines=5)
 
178
  output_time_omdet = gr.Markdown()
179
  gr.Markdown("### Examples")
180
  example_data = [
181
- ["http://images.cocodataset.org/val2017/000000039769.jpg", "a cat, a remote control", 0.30, 0.30, 0.10, 0.30],
182
- ["http://images.cocodataset.org/val2017/000000000139.jpg", "a person, a tv, a remote", 0.35, 0.30, 0.12, 0.30],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  ]
184
 
185
  gr.Examples(
186
  examples=example_data,
187
- inputs=[image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet],
 
 
 
 
 
 
 
188
  label="Click an example to populate the inputs",
189
  )
190
- inputs = [image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet]
191
- outputs = [output_image_llm, output_text_llm, output_time_llm, output_image_mm, output_text_mm, output_time_mm, output_image_owlv2, output_text_owlv2, output_time_owlv2, output_image_omdet, output_text_omdet, output_time_omdet]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  generate_btn.click(
193
  fn=run_detection,
194
  inputs=inputs,
@@ -200,4 +184,4 @@ with gr.Blocks() as app:
200
  outputs=outputs,
201
  )
202
 
203
- app.launch()
 
5
  from PIL import Image
6
  import time
7
 
8
+
9
  def extract_model_short_name(model_id):
10
  return model_id.split("/")[-1].replace("-", " ").replace("_", " ")
11
 
12
+
13
  model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
14
  model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
15
  model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
 
20
  model_omdet_name = extract_model_short_name(model_omdet_id)
21
  model_owlv2_name = extract_model_short_name(model_owlv2_id)
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  @spaces.GPU
25
def detect(model_id: str, image: Image.Image, prompts: list, threshold: float):
    """Run one zero-shot object detector on ``image`` and time the whole call.

    The processor and model for ``model_id`` are loaded on every call, so the
    reported time covers loading, preprocessing, inference and post-processing.

    Args:
        model_id: Hub repository id handed to ``from_pretrained`` for both the
            processor and the model.
        image: PIL image to run detection on.
        prompts: Text queries; the whole list is sent as a single batch entry.
        threshold: Minimum score a detection must reach to be kept.

    Returns:
        A ``(annotations, time_taken)`` pair. ``annotations`` is a list of
        ``((xmin, ymin, xmax, ymax), "<label> <score>")`` tuples with integer
        box coordinates; ``time_taken`` is a Markdown string naming the model
        and the elapsed milliseconds.
    """
    t0 = time.perf_counter()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained(model_id)
    model = (
        AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
    )
    texts = [prompts]
    inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
    with torch.inference_mode():
        outputs = model(**inputs)
    # PIL reports size as (width, height); the post-processor is given the
    # reversed pair.
    results = processor.post_process_grounded_object_detection(
        outputs, threshold=threshold, target_sizes=[image.size[::-1]]
    )
    result = results[0]
    annotations = []
    # Fixed key: the original read result["text_abels"], which is a typo and
    # raised KeyError on every call.
    # NOTE(review): confirm every processor used here fills "text_labels"
    # without an explicit `text_labels=` argument to the post-processor.
    for box, score, label_name in zip(
        result["boxes"], result["scores"], result["text_labels"]
    ):
        if score >= threshold:
            xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
            annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
    elapsed_ms = (time.perf_counter() - t0) * 1000
    # Label the timing with the model that actually ran; the original always
    # printed the OMDet name, whichever model_id was passed in.
    model_name = extract_model_short_name(model_id)
    time_taken = f"**Inference time ({model_name}):** {elapsed_ms:.0f} ms"
    return annotations, time_taken
 
48
 
49
 
50
def run_detection(
    image: "Image.Image | None",
    prompts_str: str,
    threshold_llm,
    threshold_mm,
    threshold_owlv2,
    threshold_omdet,
):
    """Run all four detectors on one image and collect their UI outputs.

    Args:
        image: Uploaded PIL image, or ``None`` when nothing has been uploaded.
        prompts_str: Comma-separated text prompts; surrounding whitespace is
            stripped and blank entries are dropped.
        threshold_llm: Score threshold for the LLMDet model.
        threshold_mm: Score threshold for the MM Grounding DINO model.
        threshold_owlv2: Score threshold for the OwlV2 model.
        threshold_omdet: Score threshold for the OMDet Turbo model.

    Returns:
        An 8-tuple alternating ``(image, annotations)`` and a timing string,
        in the order LLMDet, MM Grounding DINO, OwlV2, OMDet. When there is no
        image or no usable prompt, every image slot is ``None`` and every
        timing string is empty.
    """
    prompts = [p for p in (raw.strip() for raw in prompts_str.split(",")) if p]
    if image is None or not prompts:
        # Nothing to detect: return blanks for all eight outputs instead of
        # failing inside the processor.
        return (None, "") * 4

    # Each call passes the Hub *id* (not the display name) and pairs every
    # model with its own threshold. The original passed display names for two
    # models and swapped the OwlV2 and OMDet models.
    ann_llm, time_llm = detect(model_llmdet_id, image, prompts, threshold_llm)
    ann_mm, time_mm = detect(model_mm_grounding_id, image, prompts, threshold_mm)
    ann_owlv2, time_owlv2 = detect(model_owlv2_id, image, prompts, threshold_owlv2)
    ann_omdet, time_omdet = detect(model_omdet_id, image, prompts, threshold_omdet)
    return (
        (image, ann_llm),
        time_llm,
        (image, ann_mm),
        time_mm,
        (image, ann_owlv2),
        time_owlv2,
        (image, ann_omdet),
        time_omdet,
    )
68
+
69
 
70
  with gr.Blocks() as app:
71
  gr.Markdown("# Zero-Shot Object Detection Arena")
72
+ gr.Markdown(
73
+ "### Compare different zero-shot object detection models on the same image and prompts."
74
+ )
75
  with gr.Row():
76
  with gr.Column(scale=1):
77
  image = gr.Image(type="pil", label="Upload an image", height=400)
78
+ prompts = gr.Textbox(
79
+ label="Prompts (comma-separated)", value="a cat, a remote control"
80
+ )
81
  with gr.Accordion("Per-model confidence thresholds", open=True):
82
+ threshold_llm = gr.Slider(
83
+ label="Threshold for LLMDet", minimum=0.0, maximum=1.0, value=0.3
84
+ )
85
+ threshold_mm = gr.Slider(
86
+ label="Threshold for MM GroundingDINO Tiny",
87
+ minimum=0.0,
88
+ maximum=1.0,
89
+ value=0.3,
90
+ )
91
+ threshold_owlv2 = gr.Slider(
92
+ label="Threshold for OwlV2 Large",
93
+ minimum=0.0,
94
+ maximum=1.0,
95
+ value=0.1,
96
+ )
97
+ threshold_omdet = gr.Slider(
98
+ label="Threshold for OMDet Turbo Swin Tiny",
99
+ minimum=0.0,
100
+ maximum=1.0,
101
+ value=0.2,
102
+ )
103
  generate_btn = gr.Button(value="Detect")
104
  with gr.Row():
105
  with gr.Column(scale=2):
106
+ output_image_llm = gr.AnnotatedImage(
107
+ label=f"Annotated image for {model_llmdet_name}", height=400
108
+ )
109
  output_time_llm = gr.Markdown()
110
  with gr.Column(scale=2):
111
+ output_image_mm = gr.AnnotatedImage(
112
+ label=f"Annotated image for {model_mm_grounding_name}", height=400
113
+ )
114
  output_time_mm = gr.Markdown()
115
  with gr.Row():
116
  with gr.Column(scale=2):
117
+ output_image_owlv2 = gr.AnnotatedImage(
118
+ label=f"Annotated image for {model_owlv2_name}", height=400
119
+ )
120
  output_time_owlv2 = gr.Markdown()
121
  with gr.Column(scale=2):
122
+ output_image_omdet = gr.AnnotatedImage(
123
+ label=f"Annotated image for {model_omdet_name}", height=400
124
+ )
125
  output_time_omdet = gr.Markdown()
126
  gr.Markdown("### Examples")
127
  example_data = [
128
+ [
129
+ "http://images.cocodataset.org/val2017/000000039769.jpg",
130
+ "a cat, a remote control",
131
+ 0.30,
132
+ 0.30,
133
+ 0.10,
134
+ 0.30,
135
+ ],
136
+ [
137
+ "http://images.cocodataset.org/val2017/000000000139.jpg",
138
+ "a person, a tv, a remote",
139
+ 0.35,
140
+ 0.30,
141
+ 0.12,
142
+ 0.30,
143
+ ],
144
  ]
145
 
146
  gr.Examples(
147
  examples=example_data,
148
+ inputs=[
149
+ image,
150
+ prompts,
151
+ threshold_llm,
152
+ threshold_mm,
153
+ threshold_owlv2,
154
+ threshold_omdet,
155
+ ],
156
  label="Click an example to populate the inputs",
157
  )
158
+ inputs = [
159
+ image,
160
+ prompts,
161
+ threshold_llm,
162
+ threshold_mm,
163
+ threshold_owlv2,
164
+ threshold_omdet,
165
+ ]
166
+ outputs = [
167
+ output_image_llm,
168
+ output_time_llm,
169
+ output_image_mm,
170
+ output_time_mm,
171
+ output_image_owlv2,
172
+ output_time_owlv2,
173
+ output_image_omdet,
174
+ output_time_omdet,
175
+ ]
176
  generate_btn.click(
177
  fn=run_detection,
178
  inputs=inputs,
 
184
  outputs=outputs,
185
  )
186
 
187
+ app.launch()