Spaces: Running on Zero

Upload 6 files

- .gitattributes +1 -0
- app.py +89 -411
- functional_zone_identifier.py +68 -25
- room_04.jpg +3 -0
- scene_analysis_coordinator.py +11 -15
- spatial_analyzer.py +6 -103
- ui_manager.py +683 -0
.gitattributes
CHANGED
@@ -41,3 +41,4 @@ room_02.jpg filter=lfs diff=lfs merge=lfs -text
 street_04.jpg filter=lfs diff=lfs merge=lfs -text
 landmark_Louvre_01.jpg filter=lfs diff=lfs merge=lfs -text
 street_05.jpg filter=lfs diff=lfs merge=lfs -text
+room_04.jpg filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -17,12 +17,20 @@ from style import Style
 from image_processor import ImageProcessor
 from video_processor import VideoProcessor
 from llm_enhancer import LLMEnhancer
+from ui_manager import UIManager
 
 # Initialize Processors with LLM support
 image_processor = None
 video_processor = None
+ui_manager = None
 
 def initialize_processors():
+    """
+    Initialize the image and video processors with LLM support.
+
+    Returns:
+        bool: True if initialization was successful, False otherwise
+    """
     global image_processor, video_processor
 
     try:
@@ -30,7 +38,7 @@ def initialize_processors():
         image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
         print("ImageProcessor initialized successfully with LLM")
 
-        #
+        # Check initialization status
         if hasattr(image_processor, 'scene_analyzer'):
             if image_processor.scene_analyzer is not None:
                 print(f"scene_analyzer initialized: {type(image_processor.scene_analyzer)}")
@@ -66,49 +74,41 @@ def initialize_processors():
         video_processor = None
         return False
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    except Exception as e:
-        print(f"Error getting class names from model: {e}")
-
-    # Fallback to standard COCO (ensure keys are ints)
-    default_classes = {
-        0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
-        6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
-        11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
-        16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
-        22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
-        27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
-        32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
-        36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
-        40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
-        46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
-        51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
-        57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
-        62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
-        67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
-        72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
-        77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
-    }
-    return sorted(default_classes.items())
+def initialize_ui_manager():
+    """
+    Initialize the UI manager and set up references to processors.
+
+    Returns:
+        UIManager: Initialized UI manager instance
+    """
+    global ui_manager, image_processor
+
+    ui_manager = UIManager()
+
+    # Set image processor reference for dynamic class retrieval
+    if image_processor:
+        ui_manager.set_image_processor(image_processor)
+
+    return ui_manager
 
 @spaces.GPU(duration=180)
 def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
-    """
+    """
+    Processes a single uploaded image.
+
+    Args:
+        image: PIL Image object
+        model_name: Name of the YOLO model to use
+        confidence_threshold: Confidence threshold for detections
+        filter_classes: List of class names/IDs to filter
+        use_llm: Whether to use LLM for enhanced descriptions
+        enable_landmark: Whether to enable landmark detection
+
+    Returns:
+        Tuple: (result_image, result_text, formatted_stats, plot_figure,
+                scene_description_html, original_desc_html, activities_list_data,
+                safety_data, zones, lighting)
+    """
     # Enhanced safety check for image_processor
     if image_processor is None:
         error_msg = "Image processor is not initialized. Please restart the application or check system dependencies."
@@ -140,6 +140,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
 
     print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
     print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
+
     try:
         image_processor.use_llm = use_llm
 
@@ -155,19 +156,19 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         image_processor.scene_analyzer.use_landmark_detection = enable_landmark
         image_processor.scene_analyzer.enable_landmark = enable_landmark
 
-        # Make sure the processor also has this option set
+        # Make sure the processor also has this option set (used for landmark detection)
         image_processor.enable_landmark = enable_landmark
 
         # Check and configure deeper-level components
         if hasattr(image_processor.scene_analyzer, 'scene_describer') and image_processor.scene_analyzer.scene_describer is not None:
             image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
 
-        # Check and configure CLIP
+        # Check and configure the CLIP analyzer
         if hasattr(image_processor.scene_analyzer, 'clip_analyzer') and image_processor.scene_analyzer.clip_analyzer is not None:
             if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
                 image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
 
-        # Check and configure the LLM
+        # Check and configure the LLM side
         if hasattr(image_processor.scene_analyzer, 'llm_enhancer') and image_processor.scene_analyzer.llm_enhancer is not None:
             if hasattr(image_processor.scene_analyzer.llm_enhancer, 'enable_landmark'):
                 image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
@@ -198,7 +199,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         class_ids_to_filter = None
         if filter_classes:
            class_ids_to_filter = []
-            available_classes_dict = dict(get_all_classes())
+            available_classes_dict = dict(ui_manager.get_all_classes())
            name_to_id = {name: id for id, name in available_classes_dict.items()}
            for class_str in filter_classes:
                class_name_or_id = class_str.split(":")[0].strip()
@@ -235,7 +236,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         # Prepare visualization data for the plot
         plot_figure = None
         if stats and "class_statistics" in stats and stats["class_statistics"]:
-            available_classes_dict = dict(get_all_classes())
+            available_classes_dict = dict(ui_manager.get_all_classes())
            viz_data = image_processor.prepare_visualization_data(stats, available_classes_dict)
            if "error" not in viz_data:
                plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
@@ -485,8 +486,20 @@ def download_video_from_url(video_url, max_duration_minutes=10):
 
 @spaces.GPU
 def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
-    """
-
+    """
+    Handles video upload or URL input and calls the VideoProcessor.
+
+    Args:
+        video_input: Uploaded video file
+        video_url: Video URL (if using URL input)
+        input_type: Type of input ("upload" or "url")
+        model_name: Name of the YOLO model to use
+        confidence_threshold: Confidence threshold for detections
+        process_interval: Frame processing interval
+
+    Returns:
+        Tuple: (output_video_path, summary_html, formatted_stats)
+    """
     print(f"Received video request: input_type={input_type}")
     video_path = None
 
@@ -534,369 +547,34 @@ def handle_video_upload(video_input, video_url, input_type, model_name, confiden
         return None, error_html, {"error": str(e)}
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            </div>
-        """)
-
-        # Main Content with Tabs
-        with gr.Tabs(elem_classes="tabs"):
-
-            # Tab 1: Image Processing
-            with gr.Tab("Image Processing"):
-                current_image_model = gr.State("yolov8m.pt") # State for image model selection
-                with gr.Row(equal_height=False): # Allow columns to have different heights
-                    # Left Column: Image Input & Controls
-                    with gr.Column(scale=4, elem_classes="input-panel"):
-                        with gr.Group():
-                            gr.HTML('<div class="section-heading">Upload Image</div>')
-                            image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")
-
-                            with gr.Accordion("Image Analysis Settings", open=False):
-                                image_model_dropdown = gr.Dropdown(
-                                    choices=model_choices,
-                                    value="yolov8m.pt", # Default for images
-                                    label="Select Model",
-                                    info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
-                                )
-                                # Display model info
-                                image_model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))
-
-                                image_confidence = gr.Slider(
-                                    minimum=0.1, maximum=0.9, value=0.25, step=0.05,
-                                    label="Confidence Threshold",
-                                    info="Minimum confidence for displaying a detected object"
-                                )
-
-                                use_llm = gr.Checkbox(
-                                    label="Use LLM for enhanced scene descriptions",
-                                    value=True,
-                                    info="Provides more detailed and natural language descriptions (may increase processing time)"
-                                )
-
-                                use_landmark_detection = gr.Checkbox(
-                                    label="Use CLIP for Landmark Detection",
-                                    value=False,
-                                    info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
-                                )
-
-                            with gr.Accordion("Filter Classes", open=False):
-                                gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
-                                with gr.Row():
-                                    people_btn = gr.Button("People", size="sm")
-                                    vehicles_btn = gr.Button("Vehicles", size="sm")
-                                    animals_btn = gr.Button("Animals", size="sm")
-                                    objects_btn = gr.Button("Common Objects", size="sm")
-                                image_class_filter = gr.Dropdown(
-                                    choices=class_choices_formatted, # Use formatted choices
-                                    multiselect=True,
-                                    label="Select Classes to Display",
-                                    info="Leave empty to show all detected objects"
-                                )
-
-                        image_detect_btn = gr.Button("Analyze Image", variant="primary", elem_classes="detect-btn")
-
-                        with gr.Group(elem_classes="how-to-use"):
-                            gr.HTML('<div class="section-heading">How to Use (Image)</div>')
-                            gr.Markdown("""
-                            1. Upload an image or use the camera
-                            2. *(Optional)* Adjust settings like confidence threshold or model size (n, m = balanced, x = accurate)
-                            3. In **Analysis Settings**, you can:
-                               * Uncheck **Use LLM** to skip enhanced descriptions (faster)
-                               * Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
-                               * Filter object classes to focus on specific types of objects *(optional)*
-                            4. Click **Analyze Image** button
-
-                            **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
-                            """)
-
-
-                        # Image Examples
-                        gr.Examples(
-                            examples=[
-                                "room_01.jpg",
-                                "street_04.jpg",
-                                "street_05.jpg",
-                                "landmark_Louvre_01.jpg"
-                            ],
-                            inputs=image_input,
-                            label="Example Images"
-                        )
-
-                        gr.HTML("""
-                        <div style="text-align: center; margin-top: 8px; padding: 6px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #e2e8f0;">
-                            <p style="font-size: 12px; color: #718096; margin: 0;">
-                                📷 Sample images sourced from <a href="https://unsplash.com" target="_blank" style="color: #3182ce; text-decoration: underline;">Unsplash</a>
-                            </p>
-                        </div>
-                        """)
-
-                    # Right Column: Image Results
-                    with gr.Column(scale=6, elem_classes="output-panel"):
-                        with gr.Tabs(elem_classes="tabs"):
-                            with gr.Tab("Detection Result"):
-                                image_result_image = gr.Image(type="pil", label="Detection Result")
-                                gr.HTML('<div class="section-heading">Detection Details</div>')
-                                image_result_text = gr.Textbox(label=None, lines=10, elem_id="detection-details", container=False)
-
-                            with gr.Tab("Scene Understanding"):
-                                gr.HTML('<div class="section-heading">Scene Analysis</div>')
-                                gr.HTML("""
-                                <details class="info-details" style="margin: 5px 0 15px 0;">
-                                    <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
-                                        🔍 The AI Vision Scout Report: Click for important notes about this analysis
-                                    </summary>
-                                    <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
-                                        <p style="font-size: 13px; color: #718096; margin: 0;">
-                                            <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
-                                            Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
-                                            Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
-                                        </p>
-                                    </div>
-                                </details>
-                                """)
-
-                                gr.HTML('''
-                                <div style="margin-top: 5px; padding: 6px 10px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #63b3ed; font-size: 12px; margin-bottom: 10px;">
-                                    <p style="margin: 0; color: #4a5568;">
-                                        <b>Note:</b> AI descriptions may vary slightly with each generation, reflecting the creative nature of AI. This is similar to how a person might use different words each time they describe the same image. Processing time may be longer during first use or when analyzing complex scenes, as the LLM enhancement requires additional computational resources.
-                                    </p>
-                                </div>
-                                ''')
-                                image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
-
-                                # The original description is still shown when the LLM-enhanced narrative is used
-                                with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
-                                    image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")
-
-                                with gr.Row():
-                                    with gr.Column(scale=1):
-                                        gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
-                                        image_activities_list = gr.Dataframe(headers=["Activity"], datatype=["str"], row_count=5, col_count=1, wrap=True)
-
-                                    with gr.Column(scale=1):
-                                        gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
-                                        image_safety_list = gr.Dataframe(headers=["Concern"], datatype=["str"], row_count=5, col_count=1, wrap=True)
-
-                                gr.HTML('<div class="section-heading">Functional Zones</div>')
-                                image_zones_json = gr.JSON(label=None, elem_classes="json-box")
-
-                                gr.HTML('<div class="section-heading">Lighting Conditions</div>')
-                                image_lighting_info = gr.JSON(label=None, elem_classes="json-box")
-
-                            with gr.Tab("Statistics"):
-                                with gr.Row():
-                                    with gr.Column(scale=3, elem_classes="plot-column"):
-                                        gr.HTML('<div class="section-heading">Object Distribution</div>')
-                                        image_plot_output = gr.Plot(label=None, elem_classes="large-plot-container")
-                                    with gr.Column(scale=2, elem_classes="stats-column"):
-                                        gr.HTML('<div class="section-heading">Detection Statistics</div>')
-                                        image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")
-
-            # Tab 2: Video Processing
-            with gr.Tab("Video Processing"):
-                with gr.Row(equal_height=False):
-                    # Left Column: Video Input & Controls
-                    with gr.Column(scale=4, elem_classes="input-panel"):
-                        with gr.Group():
-                            gr.HTML('<div class="section-heading">Video Input</div>')
-
-                            # Add input type selection
-                            video_input_type = gr.Radio(
-                                ["upload", "url"],
-                                label="Input Method",
-                                value="upload",
-                                info="Choose how to provide the video"
-                            )
-
-                            # File upload (will be shown/hidden based on selection)
-                            with gr.Group(elem_id="upload-video-group"):
-                                video_input = gr.Video(
-                                    label="Upload a video file (MP4, AVI, MOV)",
-                                    sources=["upload"],
-                                    visible=True
-                                )
-
-                            # URL input (will be shown/hidden based on selection)
-                            with gr.Group(elem_id="url-video-group"):
-                                video_url_input = gr.Textbox(
-                                    label="Enter video URL (YouTube or direct video link)",
-                                    placeholder="https://www.youtube.com/watch?v=...",
-                                    visible=False,
-                                    elem_classes="custom-video-url-input"
-                                )
-                                gr.HTML("""
-                                <div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
-                                    <p style="margin: 0; color: #4b5563;">
-                                        Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes. Due to YouTube's anti-bot protection, some videos may not be downloadable. For protected videos, please upload a local video file instead.
-                                    </p>
-                                </div>
-                                """)
-
-                            with gr.Accordion("Video Analysis Settings", open=True):
-                                video_model_dropdown = gr.Dropdown(
-                                    choices=model_choices,
-                                    value="yolov8n.pt", # Default 'n' for video
-                                    label="Select Model (Video)",
-                                    info="Faster models (like 'n') are recommended"
-                                )
-                                video_confidence = gr.Slider(
-                                    minimum=0.1, maximum=0.9, value=0.4, step=0.05,
-                                    label="Confidence Threshold (Video)"
-                                )
-                                video_process_interval = gr.Slider(
-                                    minimum=1, maximum=60, value=10, step=1, # Allow up to 60 frame interval
-                                    label="Processing Interval (Frames)",
-                                    info="Analyze every Nth frame (higher value = faster)"
-                                )
-                            video_process_btn = gr.Button("Process Video", variant="primary", elem_classes="detect-btn")
-
-                        with gr.Group(elem_classes="how-to-use"):
-                            gr.HTML('<div class="section-heading">How to Use (Video)</div>')
-                            gr.Markdown("""
-                            1. Choose your input method: Upload a file or enter a URL.
-                            2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
-                            3. Click "Process Video". **Processing can take a significant amount of time.**
-                            4. The annotated video and summary will appear on the right when finished.
-                            """)
-
-                        # Add video examples
-                        gr.HTML('<div class="section-heading">Example Videos</div>')
-                        gr.HTML("""
-                        <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
-                            <p style="font-size: 14px; color: #4A5568; margin: 0;">
-                                Upload any video containing objects that YOLO can detect. For testing, find sample videos
-                                <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
-                            </p>
-                        </div>
-                        """)
-
-                    # Right Column: Video Results
-                    with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
-                        gr.HTML("""
-                        <div class="section-heading">Video Result</div>
-                        <details class="info-details" style="margin: 5px 0 15px 0;">
-                            <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
-                                🎬 Video Processing Notes
-                            </summary>
-                            <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
-                                <p style="font-size: 13px; color: #718096; margin: 0;">
-                                    The processed video includes bounding boxes around detected objects. For longer videos,
-                                    consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
-                                </p>
-                            </div>
-                        </details>
-                        """)
-                        video_output = gr.Video(label="Processed Video", elem_classes="video-output-container") # Output for the processed video file
-
-                        gr.HTML('<div class="section-heading">Processing Summary</div>')
-                        # Use HTML to display the video summary
-                        video_summary_text = gr.HTML(
-                            label=None,
-                            elem_id="video-summary-html-output"
-                        )
-
-                        gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
-                        video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics
-
-        # Event Listeners
-        # Image Model Change Handler
-        image_model_dropdown.change(
-            fn=lambda model: (model, DetectionModel.get_model_description(model)),
-            inputs=[image_model_dropdown],
-            outputs=[current_image_model, image_model_info] # Update state and description
-        )
-
-        # Image Filter Buttons
-        available_classes_list = get_all_classes() # Get list of (id, name)
-        people_classes_ids = [0]
-        vehicles_classes_ids = [1, 2, 3, 4, 5, 6, 7, 8]
-        animals_classes_ids = list(range(14, 24))
-        common_objects_ids = [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73] # Bottle, cup, fork, knife, spoon, bowl, chair, couch, table, tv, laptop, phone, book
-
-        people_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids], outputs=image_class_filter)
-        vehicles_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids], outputs=image_class_filter)
-        animals_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids], outputs=image_class_filter)
-        objects_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids], outputs=image_class_filter)
-
-        video_input_type.change(
-            fn=lambda input_type: [
-                # Show/hide file upload
-                gr.update(visible=(input_type == "upload")),
-                # Show/hide URL input
-                gr.update(visible=(input_type == "url"))
-            ],
-            inputs=[video_input_type],
-            outputs=[video_input, video_url_input]
-        )
-
-        image_detect_btn.click(
-            fn=handle_image_upload,
-            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm, use_landmark_detection],
-            outputs=[
-                image_result_image, image_result_text, image_stats_json, image_plot_output,
-                image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
-                image_lighting_info
-            ]
-        )
-
-        video_process_btn.click(
-            fn=handle_video_upload,
-            inputs=[
-                video_input,
-                video_url_input,
-                video_input_type,
-                video_model_dropdown,
-                video_confidence,
-                video_process_interval
-            ],
-            outputs=[video_output, video_summary_text, video_stats_json]
-        )
-
-        # Footer
-        gr.HTML("""
-        <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
-            <div style="margin-bottom: 15px;">
-                <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
-            </div>
-            <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
-                <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
-                <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
-                    <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
-                </a>
-            </div>
-        </div>
-        """)
-
-    return demo
+def main():
+    """
+    Main function to initialize processors and launch the Gradio interface.
+    """
+    global ui_manager
+
+    # Initialize processors
+    print("Initializing processors...")
+    initialization_success = initialize_processors()
+    if not initialization_success:
+        print("WARNING: Failed to initialize processors. Application may not function correctly.")
+        return
+
+    # Initialize UI manager
+    print("Initializing UI manager...")
+    ui_manager = initialize_ui_manager()
+
+    # Create and launch the Gradio interface
+    print("Creating Gradio interface...")
+    demo_interface = ui_manager.create_interface(
+        handle_image_upload_fn=handle_image_upload,
+        handle_video_upload_fn=handle_video_upload,
+        download_video_from_url_fn=download_video_from_url
+    )
+
+    print("Launching application...")
+    demo_interface.launch(debug=True)
 
 
 if __name__ == "__main__":
-
-
-    demo_interface.launch(debug=True)
+    main()
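After this change, app.py only keeps the processing callbacks and hands interface construction to UIManager through set_image_processor() and create_interface(...). Those methods are called in main() above but their bodies are not visible in the truncated ui_manager.py diff further down, so the following is only a minimal sketch of what they plausibly do, inferred from the call sites; the class name and bodies here are assumptions, not the actual implementation.

```python
import gradio as gr

class UIManagerSketch:
    """Sketch only: mirrors the calls app.py makes; the real class lives in ui_manager.py."""

    def set_image_processor(self, image_processor):
        # Assumed behavior: keep a reference so get_all_classes() can read live model class names.
        self._image_processor = image_processor

    def create_interface(self, handle_image_upload_fn, handle_video_upload_fn, download_video_from_url_fn):
        # Assumed behavior: build the gr.Blocks layout that previously lived in app.py,
        # wire the injected callbacks to the buttons, and return the Blocks object.
        with gr.Blocks() as demo:
            image_input = gr.Image(type="pil", label="Upload an image")
            result_image = gr.Image(type="pil", label="Detection Result")
            analyze_btn = gr.Button("Analyze Image")
            analyze_btn.click(fn=handle_image_upload_fn, inputs=[image_input], outputs=[result_image])
        return demo
```

The design point is dependency injection: app.py owns the GPU-decorated handlers, while UIManager only receives them as plain callables, so the UI layer never imports the processors directly.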
functional_zone_identifier.py
CHANGED
@@ -11,7 +11,7 @@ class FunctionalZoneIdentifier:
     Integrates zone evaluation and scene-specific zone identification logic to provide a unified functional zone identification interface
     """
 
-    def __init__(self, zone_evaluator=None, scene_zone_identifier=None, scene_viewpoint_analyzer=None):
+    def __init__(self, zone_evaluator=None, scene_zone_identifier=None, scene_viewpoint_analyzer=None, object_categories=None):
         """
         Initialize the functional zone identifier
 
@@ -26,6 +26,7 @@ class FunctionalZoneIdentifier:
 
         self.scene_viewpoint_analyzer = scene_viewpoint_analyzer
         self.viewpoint_detector = scene_viewpoint_analyzer
+        self.OBJECT_CATEGORIES = object_categories or {}
 
         logger.info("FunctionalZoneIdentifier initialized successfully with SceneViewpointAnalyzer")
 
@@ -68,7 +69,7 @@ class FunctionalZoneIdentifier:
             logger.info("Insufficient objects for zone identification")
             return {}
 
-        # 5. Build category_regions
+        # 5. Build category_regions
         category_regions = self._build_category_regions_mapping(detected_objects)
         zones = {}
 
@@ -247,7 +248,7 @@ class FunctionalZoneIdentifier:
         objects = zone_data.get("objects", [])
         region = zone_data.get("region", "")
 
-        # Check first whether a traffic light is present
+        # Check first whether a traffic light is present
         if any(obj == "traffic light" or "traffic light" in obj for obj in objects):
             return "traffic control zone"
 
@@ -438,36 +439,42 @@ class FunctionalZoneIdentifier:
     def _categorize_object(self, obj: Dict) -> str:
         """
         Categorize a detected object into a functional category for zone identification
-
-        Args:
-            obj: object dictionary
-
-        Returns:
-            functional category string for the object
+        Ensure all return values use natural-language format, avoiding underscores or technical identifiers
         """
         try:
             class_id = obj.get("class_id", -1)
-            class_name = obj.get("class_name", "").lower()
+            class_name = obj.get("class_name", "").lower().strip()
+
+            # Handle traffic light first
+            # If class_id == 9 or class_name contains "traffic light", categorize it as "traffic light"
+            if class_id == 9 or "traffic light" in class_name:
+                return "traffic light"
 
-            #
+            # If a custom OBJECT_CATEGORIES mapping is available, use it first
             if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
                 for category, ids in self.OBJECT_CATEGORIES.items():
                     if class_id in ids:
-
+                        # Ensure the returned category name uses natural-language format
+                        return self._clean_category_name(category)
 
-            #
+            # COCO class default name
             furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
             plant_items = ["potted plant"]
             electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
             vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
             person_items = ["person"]
-            kitchen_items = [
-
-
-
-
+            kitchen_items = [
+                "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
+                "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"
+            ]
+            sports_items = [
+                "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                "baseball glove", "skateboard", "surfboard", "tennis racket"
+            ]
             personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
 
+            # fallback natural language
             if any(item in class_name for item in furniture_items):
                 return "furniture"
             elif any(item in class_name for item in plant_items):
@@ -479,11 +486,11 @@ class FunctionalZoneIdentifier:
             elif any(item in class_name for item in person_items):
                 return "person"
             elif any(item in class_name for item in kitchen_items):
-                return "
+                return "kitchen items"  # underscore removed
             elif any(item in class_name for item in sports_items):
                 return "sports"
             elif any(item in class_name for item in personal_items):
-                return "
+                return "personal items"  # underscore removed
             else:
                 return "misc"
 
@@ -492,6 +499,42 @@ class FunctionalZoneIdentifier:
             logger.error(traceback.format_exc())
             return "misc"
 
+    def _clean_category_name(self, category: str) -> str:
+        """
+        Clean the category name, removing underscores and converting it to a more natural format
+
+        Args:
+            category: original category name
+
+        Returns:
+            str: cleaned category name
+        """
+        try:
+            if not category:
+                return "misc"
+
+            # Replace underscores with spaces
+            cleaned = category.replace('_', ' ')
+
+            # Handle common technical naming patterns
+            replacements = {
+                'kitchen items': 'kitchen items',
+                'personal items': 'personal items',
+                'traffic light': 'traffic light',
+                'misc items': 'misc'
+            }
+
+            # Apply the specific replacement rules
+            for old_term, new_term in replacements.items():
+                if cleaned == old_term:
+                    return new_term
+
+            return cleaned.strip()
+
+        except Exception as e:
+            logger.warning(f"Error cleaning category name '{category}': {str(e)}")
+            return "misc"
+
     def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
         """
         General functional zone identification when no specific scene type is matched
@@ -791,7 +834,7 @@ class FunctionalZoneIdentifier:
 
         }
 
-        # 1. Count the (class_name, region) pairs already used in current_zones
+        # 1. Count the (class_name, region) pairs already used in current_zones
         used_count = {}
         for zone_info in current_zones.values():
             rg = zone_info.get("region", "")
@@ -799,7 +842,7 @@ class FunctionalZoneIdentifier:
             key = (obj_name, rg)
             used_count[key] = used_count.get(key, 0) + 1
 
-        # 2. Count the total (class_name, region) occurrences in all_detected_objects
+        # 2. Count the total (class_name, region) occurrences in all_detected_objects
         total_count = {}
         for obj in all_detected_objects:
             cname = obj.get("class_name", "")
@@ -807,7 +850,7 @@ class FunctionalZoneIdentifier:
             key = (cname, rg)
             total_count[key] = total_count.get(key, 0) + 1
 
-        # 3. Convert default_classes into a "class_name → fallback zone type" lookup table
+        # 3. Convert default_classes into a "class_name → fallback zone type" lookup table
         category_to_fallback = {
             # Pedestrians and vehicles
             "person": "pedestrian area",
@@ -906,12 +949,12 @@ class FunctionalZoneIdentifier:
             "potted plant": "decorative area",
         }
 
-        # 4. Compute the missing (class_name, region) pairs and create fallback zones
+        # 4. Compute the missing (class_name, region) pairs and create fallback zones
         for (cname, rg), total in total_count.items():
             used = used_count.get((cname, rg), 0)
             missing = total - used
             if missing <= 0:
-                continue
+                continue
 
             # (A) Decide which major class (zone_type) this cname maps to in the fallback
             zone_type = category_to_fallback.get(cname, "miscellaneous area")
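The updated categorization path checks COCO class id 9 ("traffic light") first, then any injected OBJECT_CATEGORIES mapping, and finally the keyword lists, with _clean_category_name turning technical labels such as "kitchen_items" into natural-language ones. A small self-contained sketch of that cleaning step, written outside the class purely for illustration and mirroring the logic added above:

```python
def clean_category_name(category: str) -> str:
    # Mirrors _clean_category_name above: underscores become spaces,
    # and the known technical label 'misc items' collapses to 'misc'.
    if not category:
        return "misc"
    cleaned = category.replace('_', ' ')
    if cleaned == 'misc items':
        return 'misc'
    return cleaned.strip()

# Example: category keys from an OBJECT_CATEGORIES mapping become display-friendly labels.
print(clean_category_name("kitchen_items"))   # -> "kitchen items"
print(clean_category_name("personal_items"))  # -> "personal items"
print(clean_category_name(""))                # -> "misc"
```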
room_04.jpg
ADDED
(image preview omitted; file stored via Git LFS — see Git LFS Details)
scene_analysis_coordinator.py
CHANGED
@@ -333,6 +333,9 @@ class SceneAnalysisCoordinator:
             scene_confidence, lighting_info, functional_zones, landmark_results, image_dims_val
         )
         possible_activities = self._extract_possible_activities(detected_objects_from_landmarks_list, landmark_results)
+        safety_concerns = []
+        if self.descriptor and hasattr(self.descriptor, '_identify_safety_concerns'):
+            safety_concerns = self.descriptor._identify_safety_concerns(detected_objects_from_landmarks_list, best_scene_val)
 
         # Prepare the final result
         return {
@@ -345,6 +348,7 @@ class SceneAnalysisCoordinator:
             "object_count": len(detected_objects_from_landmarks_list),
             "regions": region_analysis,
             "possible_activities": possible_activities,
+            "safety_concerns": safety_concerns,
             "functional_zones": functional_zones,
             "detected_landmarks": [lm for lm in detected_objects_from_landmarks_list if lm.get("is_landmark", False)],
             "primary_landmark": primary_landmark,
@@ -463,26 +467,18 @@ class SceneAnalysisCoordinator:
         # Spatial analysis
         region_analysis_val = self.spatial_analyzer._analyze_regions(detected_objects_main)
 
+        if current_run_enable_landmark:
+            self.logger.info("Using landmark detection logic for YOLO scene")
+            return self._handle_no_yolo_detections(
+                original_image_pil, image_dims_val, current_run_enable_landmark,
+                lighting_info, places365_info
+            )
+
         # Landmark processing and integration
         landmark_objects_identified = []
         landmark_specific_activities = []
         final_landmark_info = {}
 
-        if self.use_clip and current_run_enable_landmark:
-            detected_objects_main, landmark_objects_identified = self.landmark_processing_manager.process_unknown_objects(
-                detection_result, detected_objects_main, self.clip_analyzer
-            )
-
-            if landmark_objects_identified:
-                landmark_specific_activities = self.landmark_processing_manager.extract_landmark_specific_activities(
-                    landmark_objects_identified
-                )
-                final_landmark_info = {
-                    "detected_landmarks": landmark_objects_identified,
-                    "primary_landmark": max(landmark_objects_identified, key=lambda x: x.get("confidence", 0.0), default=None),
-                    "detailed_landmarks": landmark_objects_identified
-                }
-
         # If landmark detection is disabled for this run, remove landmark objects
         if not current_run_enable_landmark:
             detected_objects_main = [obj for obj in detected_objects_main if not obj.get("is_landmark", False)]
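With this change the coordinator's result dictionary carries a "safety_concerns" list next to "possible_activities", filled from the descriptor's _identify_safety_concerns helper when that helper exists. A hedged sketch of how downstream code might consume the new key; the keys match the return dict built above, but the values shown are made up for illustration:

```python
# Illustrative result only; real values come from SceneAnalysisCoordinator.
analysis_result = {
    "possible_activities": ["walking", "commuting"],
    "safety_concerns": ["vehicles moving near pedestrians"],
    "functional_zones": {},
}

# Consumers can read the new key directly, defaulting to an empty list for
# results produced before this change.
for concern in analysis_result.get("safety_concerns", []):
    print(f"Safety concern: {concern}")
```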
spatial_analyzer.py
CHANGED
@@ -5,6 +5,7 @@ import logging
 import traceback
 from typing import Dict, List, Tuple, Any, Optional
 
+from object_categories import OBJECT_CATEGORIES
 from region_analyzer import RegionAnalyzer
 from object_extractor import ObjectExtractor
 from scene_viewpoint_analyzer import SceneViewpointAnalyzer
@@ -31,6 +32,9 @@ class SpatialAnalyzer:
         """
         try:
             # Initialize all sub-components
+            self.class_names = class_names
+            self.OBJECT_CATEGORIES = object_categories or {}
+
             self.region_analyzer = RegionAnalyzer()
             self.object_extractor = ObjectExtractor(class_names, object_categories)
 
@@ -41,12 +45,10 @@ class SpatialAnalyzer:
             self.functional_zone_identifier = FunctionalZoneIdentifier(
                 zone_evaluator=self.zone_evaluator,
                 scene_zone_identifier=self.scene_zone_identifier,
-                scene_viewpoint_analyzer=self.scene_viewpoint_analyzer
+                scene_viewpoint_analyzer=self.scene_viewpoint_analyzer,
+                object_categories=self.OBJECT_CATEGORIES
             )
 
-            self.class_names = class_names
-            self.OBJECT_CATEGORIES = object_categories or {}
-
             self.enhance_descriptor = None
 
             # Distance threshold for proximity analysis (normalized)
@@ -171,105 +173,6 @@ class SpatialAnalyzer:
             logger.error(traceback.format_exc())
             return {}
 
-    def _categorize_object(self, obj: Dict) -> str:
-        """
-        Categorize a detected object into a functional category for zone identification
-        Ensure all return values use natural-language format, avoiding underscores or technical identifiers
-        """
-        try:
-            class_id = obj.get("class_id", -1)
-            class_name = obj.get("class_name", "").lower().strip()
-
-            # Handle traffic light first
-            # If class_id == 9 or class_name contains "traffic light", categorize it as "traffic light"
-            if class_id == 9 or "traffic light" in class_name:
-                return "traffic light"
-
-            # If a custom OBJECT_CATEGORIES mapping is available, use it first
-            if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
-                for category, ids in self.OBJECT_CATEGORIES.items():
-                    if class_id in ids:
-                        # Ensure the returned category name uses natural-language format
-                        return self._clean_category_name(category)
-
-            # COCO class default name
-            furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
-            plant_items = ["potted plant"]
-            electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
-            vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
-            person_items = ["person"]
-            kitchen_items = [
-                "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
-                "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
-                "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"
-            ]
-            sports_items = [
-                "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
-                "baseball glove", "skateboard", "surfboard", "tennis racket"
-            ]
-            personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
-
-            # fallback natural language
-            if any(item in class_name for item in furniture_items):
-                return "furniture"
-            elif any(item in class_name for item in plant_items):
-                return "plant"
-            elif any(item in class_name for item in electronic_items):
-                return "electronics"
-            elif any(item in class_name for item in vehicle_items):
-                return "vehicle"
-            elif any(item in class_name for item in person_items):
-                return "person"
-            elif any(item in class_name for item in kitchen_items):
-                return "kitchen items"  # underscore removed
-            elif any(item in class_name for item in sports_items):
-                return "sports"
-            elif any(item in class_name for item in personal_items):
-                return "personal items"  # underscore removed
-            else:
-                return "misc"
-
-        except Exception as e:
-            logger.error(f"Error categorizing object: {str(e)}")
-            logger.error(traceback.format_exc())
-            return "misc"
-
-    def _clean_category_name(self, category: str) -> str:
-        """
-        Clean the category name, removing underscores and converting it to a more natural format
-
-        Args:
-            category: original category name
-
-        Returns:
-            str: cleaned category name
-        """
-        try:
-            if not category:
-                return "misc"
-
-            # Replace underscores with spaces
-            cleaned = category.replace('_', ' ')
-
-            # Handle common technical naming patterns
-            replacements = {
-                'kitchen items': 'kitchen items',
-                'personal items': 'personal items',
-                'traffic light': 'traffic light',
-                'misc items': 'misc'
-            }
-
-            # Apply the specific replacement rules
-            for old_term, new_term in replacements.items():
-                if cleaned == old_term:
-                    return new_term
-
-            return cleaned.strip()
-
-        except Exception as e:
-            logger.warning(f"Error cleaning category name '{category}': {str(e)}")
-            return "misc"
-
     def _get_object_categories(self, detected_objects: List[Dict]) -> set:
         """
         Get the unique object categories from the detected objects
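SpatialAnalyzer no longer carries its own copy of _categorize_object; it stores a single OBJECT_CATEGORIES mapping and passes it into FunctionalZoneIdentifier at construction time. Judging from the lookup loop in FunctionalZoneIdentifier (`for category, ids in self.OBJECT_CATEGORIES.items(): if class_id in ids`), the mapping goes from a category name to a collection of COCO class ids; a minimal sketch of that shape and lookup, where the example category names and id sets are illustrative assumptions rather than the real contents of object_categories.OBJECT_CATEGORIES:

```python
# Illustrative shape of the mapping handed down to FunctionalZoneIdentifier.
OBJECT_CATEGORIES_EXAMPLE = {
    "furniture": {56, 57, 59, 60, 61},           # chair, couch, bed, dining table, toilet
    "vehicles": {1, 2, 3, 4, 5, 6, 7, 8},        # bicycle ... boat
    "kitchen_items": {39, 41, 42, 43, 44, 45},   # bottle, cup, fork, knife, spoon, bowl
}

def categorize(class_id: int, categories: dict) -> str:
    # Same lookup pattern the injected mapping is used for in _categorize_object.
    for category, ids in categories.items():
        if class_id in ids:
            return category.replace('_', ' ')
    return "misc"

print(categorize(56, OBJECT_CATEGORIES_EXAMPLE))  # -> "furniture"
```

Keeping one mapping and injecting it avoids the duplicated categorization logic that this commit deletes from spatial_analyzer.py.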
ui_manager.py
ADDED
@@ -0,0 +1,683 @@
import gradio as gr
from typing import Dict, List, Any, Optional, Tuple
import matplotlib.pyplot as plt

from detection_model import DetectionModel
from style import Style

class UIManager:
    """
    Manages all UI-related functionality for the VisionScout application.
    Handles Gradio interface creation, component definitions, and event binding.
    """

    def __init__(self):
        """Initialize the UI Manager."""
        self.available_models = None
        self.model_choices = []
        self.class_choices_formatted = []
        self._setup_model_choices()

    def _setup_model_choices(self):
        """Setup model choices for dropdowns."""
        try:
            self.available_models = DetectionModel.get_available_models()
            self.model_choices = [model["model_file"] for model in self.available_models]
        except ImportError:
            # Fallback model choices if DetectionModel is not available
            self.model_choices = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]

        # Setup class choices
        self.class_choices_formatted = [f"{id}: {name}" for id, name in self.get_all_classes()]

    def get_all_classes(self):
        """
        Gets all available COCO classes.

        Returns:
            List[Tuple[int, str]]: List of (class_id, class_name) tuples
        """
        # Try to get from a loaded model first
        try:
            # This will be injected by the main app when processors are available
            if hasattr(self, '_image_processor') and self._image_processor and self._image_processor.model_instances:
                for model_instance in self._image_processor.model_instances.values():
                    if model_instance and model_instance.is_model_loaded:
                        try:
                            # Ensure class_names is a dict {id: name}
                            if isinstance(model_instance.class_names, dict):
                                return sorted([(int(idx), name) for idx, name in model_instance.class_names.items()])
                        except Exception as e:
                            print(f"Error getting class names from model: {e}")
        except Exception:
            pass

        # Fallback to standard COCO (ensure keys are ints)
        default_classes = {
            0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
            6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
            11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
            16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
            22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
            27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
            32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
            36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
            40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
            46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
            51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
            57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
            62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
            67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
            72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
            77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
        }
        return sorted(default_classes.items())

    def set_image_processor(self, image_processor):
        """
        Set the image processor reference for dynamic class retrieval.

        Args:
            image_processor: The ImageProcessor instance
        """
        self._image_processor = image_processor

    def get_css_styles(self):
        """
        Get CSS styles for the interface.

        Returns:
            str: CSS styles
        """
        try:
            return Style.get_css()
        except ImportError:
            # Fallback CSS if Style module is not available
            return """
            .app-header {
                text-align: center;
                padding: 2rem 0 3rem 0;
                background: linear-gradient(135deg, #f0f9ff, #e1f5fe);
            }
            .section-heading {
                font-size: 1.2rem;
                font-weight: bold;
                color: #2D3748;
                margin: 1rem 0 0.5rem 0;
            }
            .detect-btn {
                background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
                color: white !important;
                border: none !important;
                border-radius: 8px !important;
            }
            """

    def get_model_description(self, model_name):
        """
        Get model description for the given model name.

        Args:
            model_name: Name of the model

        Returns:
            str: Model description
        """
        try:
            return DetectionModel.get_model_description(model_name)
        except ImportError:
            return f"Model: {model_name}"

    def create_header(self):
        """
        Create the application header.

        Returns:
            gr.HTML: Header HTML component
        """
        return gr.HTML("""
            <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
                <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
                <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Object Detection and Scene Understanding</h2>
                <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
                <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
                    <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
                    <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
                </div>
                <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
                    <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
                        <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images may not be supported.
                        <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG</a> before uploading if needed.
                    </p>
                </div>
            </div>
        """)

    def create_footer(self):
        """
        Create the application footer.

        Returns:
            gr.HTML: Footer HTML component
        """
        return gr.HTML("""
            <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
                <div style="margin-bottom: 15px;">
                    <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
                </div>
                <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
                    <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
                    <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
                        <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
                    </a>
                </div>
            </div>
        """)

    def create_image_tab(self):
        """
        Create the image processing tab with all components.

        Returns:
            Dict: Dictionary containing all image tab components
        """
        components = {}

        with gr.Tab("Image Processing"):
            components['current_image_model'] = gr.State("yolov8m.pt")

            with gr.Row(equal_height=False):
                # Left Column: Image Input & Controls
                with gr.Column(scale=4, elem_classes="input-panel"):
                    with gr.Group():
                        gr.HTML('<div class="section-heading">Upload Image</div>')
                        components['image_input'] = gr.Image(
                            type="pil",
                            label="Upload an image",
                            elem_classes="upload-box"
                        )

                        with gr.Accordion("Image Analysis Settings", open=False):
                            components['image_model_dropdown'] = gr.Dropdown(
                                choices=self.model_choices,
                                value="yolov8m.pt",
                                label="Select Model",
                                info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
                            )

                            components['image_model_info'] = gr.Markdown(
                                self.get_model_description("yolov8m.pt")
                            )

                            components['image_confidence'] = gr.Slider(
                                minimum=0.1, maximum=0.9, value=0.25, step=0.05,
                                label="Confidence Threshold",
                                info="Minimum confidence for displaying a detected object"
                            )

                            components['use_llm'] = gr.Checkbox(
                                label="Use LLM for enhanced scene descriptions",
                                value=True,
                                info="Provides more detailed and natural language descriptions (may increase processing time)"
                            )

                            components['use_landmark_detection'] = gr.Checkbox(
                                label="Use CLIP for Landmark Detection",
                                value=False,
                                info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
                            )

                        with gr.Accordion("Filter Classes", open=False):
                            gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
                            with gr.Row():
                                components['people_btn'] = gr.Button("People", size="sm")
                                components['vehicles_btn'] = gr.Button("Vehicles", size="sm")
                                components['animals_btn'] = gr.Button("Animals", size="sm")
                                components['objects_btn'] = gr.Button("Common Objects", size="sm")

                            components['image_class_filter'] = gr.Dropdown(
                                choices=self.class_choices_formatted,
                                multiselect=True,
                                label="Select Classes to Display",
                                info="Leave empty to show all detected objects"
                            )

                        components['image_detect_btn'] = gr.Button(
                            "Analyze Image",
                            variant="primary",
                            elem_classes="detect-btn"
                        )

                    # How to use section
                    with gr.Group(elem_classes="how-to-use"):
                        gr.HTML('<div class="section-heading">How to Use (Image)</div>')
                        gr.Markdown("""
                        1. Upload an image or use the camera
                        2. *(Optional)* Adjust settings like confidence threshold or model size (n, m = balanced, x = accurate)
                        3. In **Analysis Settings**, you can:
                            * Uncheck **Use LLM** to skip enhanced descriptions (faster)
                            * Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
                            * Filter object classes to focus on specific types of objects *(optional)*
                        4. Click **Analyze Image** button

                        **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
                        """)

                    # Image Examples
                    gr.Examples(
                        examples=[
                            "room_04.jpg",
                            "street_04.jpg",
                            "street_05.jpg",
                            "landmark_Louvre_01.jpg"
                        ],
                        inputs=components['image_input'],
                        label="Example Images"
                    )

                    gr.HTML("""
                        <div style="text-align: center; margin-top: 8px; padding: 6px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #e2e8f0;">
                            <p style="font-size: 12px; color: #718096; margin: 0;">
                                📷 Sample images sourced from <a href="https://unsplash.com" target="_blank" style="color: #3182ce; text-decoration: underline;">Unsplash</a>
                            </p>
                        </div>
                    """)

                # Right Column: Image Results
                with gr.Column(scale=6, elem_classes="output-panel"):
                    with gr.Tabs(elem_classes="tabs"):
                        # Detection Result Tab
                        with gr.Tab("Detection Result"):
                            components['image_result_image'] = gr.Image(
                                type="pil",
                                label="Detection Result"
                            )
                            gr.HTML('<div class="section-heading">Detection Details</div>')
                            components['image_result_text'] = gr.Textbox(
                                label=None,
                                lines=10,
                                elem_id="detection-details",
                                container=False
                            )

                        # Scene Understanding Tab
                        with gr.Tab("Scene Understanding"):
                            gr.HTML('<div class="section-heading">Scene Analysis</div>')

                            # Info details
                            gr.HTML("""
                                <details class="info-details" style="margin: 5px 0 15px 0;">
                                    <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
                                        🔍 The AI Vision Scout Report: Click for important notes about this analysis
                                    </summary>
                                    <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
                                        <p style="font-size: 13px; color: #718096; margin: 0;">
                                            <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
                                            Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
                                            Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
                                        </p>
                                    </div>
                                </details>
                            """)

                            gr.HTML('''
                                <div style="margin-top: 5px; padding: 6px 10px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #63b3ed; font-size: 12px; margin-bottom: 10px;">
                                    <p style="margin: 0; color: #4a5568;">
                                        <b>Note:</b> AI descriptions may vary slightly with each generation, reflecting the creative nature of AI. This is similar to how a person might use different words each time they describe the same image. Processing time may be longer during first use or when analyzing complex scenes, as the LLM enhancement requires additional computational resources.
                                    </p>
                                </div>
                            ''')

                            components['image_scene_description_html'] = gr.HTML(
                                label=None,
                                elem_id="scene_analysis_description_text"
                            )

                            # Original Scene Analysis accordion
                            with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
                                components['image_llm_description'] = gr.HTML(
                                    label=None,
                                    elem_id="original_scene_description_text"
                                )

                            with gr.Row():
                                with gr.Column(scale=1):
                                    gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
                                    components['image_activities_list'] = gr.Dataframe(
                                        headers=["Activity"],
                                        datatype=["str"],
                                        row_count=5,
                                        col_count=1,
                                        wrap=True
                                    )

                                with gr.Column(scale=1):
                                    gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
                                    components['image_safety_list'] = gr.Dataframe(
                                        headers=["Concern"],
                                        datatype=["str"],
                                        row_count=5,
                                        col_count=1,
                                        wrap=True
                                    )

                            gr.HTML('<div class="section-heading">Functional Zones</div>')
                            components['image_zones_json'] = gr.JSON(
                                label=None,
                                elem_classes="json-box"
                            )

                            gr.HTML('<div class="section-heading">Lighting Conditions</div>')
                            components['image_lighting_info'] = gr.JSON(
                                label=None,
                                elem_classes="json-box"
                            )

                        # Statistics Tab
                        with gr.Tab("Statistics"):
                            with gr.Row():
                                with gr.Column(scale=3, elem_classes="plot-column"):
                                    gr.HTML('<div class="section-heading">Object Distribution</div>')
                                    components['image_plot_output'] = gr.Plot(
                                        label=None,
                                        elem_classes="large-plot-container"
                                    )
                                with gr.Column(scale=2, elem_classes="stats-column"):
                                    gr.HTML('<div class="section-heading">Detection Statistics</div>')
                                    components['image_stats_json'] = gr.JSON(
                                        label=None,
                                        elem_classes="enhanced-json-display"
                                    )

        return components

    def create_video_tab(self):
        """
        Create the video processing tab with all components.

        Returns:
            Dict: Dictionary containing all video tab components
        """
        components = {}

        with gr.Tab("Video Processing"):
            with gr.Row(equal_height=False):
                # Left Column: Video Input & Controls
                with gr.Column(scale=4, elem_classes="input-panel"):
                    with gr.Group():
                        gr.HTML('<div class="section-heading">Video Input</div>')

                        # Input type selection
                        components['video_input_type'] = gr.Radio(
                            ["upload", "url"],
                            label="Input Method",
                            value="upload",
                            info="Choose how to provide the video"
                        )

                        # File upload
                        with gr.Group(elem_id="upload-video-group"):
                            components['video_input'] = gr.Video(
                                label="Upload a video file (MP4, AVI, MOV)",
                                sources=["upload"],
                                visible=True
                            )

                        # URL input
                        with gr.Group(elem_id="url-video-group"):
                            components['video_url_input'] = gr.Textbox(
                                label="Enter video URL (YouTube or direct video link)",
                                placeholder="https://www.youtube.com/watch?v=...",
                                visible=False,
                                elem_classes="custom-video-url-input"
                            )
                            gr.HTML("""
                                <div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
                                    <p style="margin: 0; color: #4b5563;">
                                        Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes. Due to YouTube's anti-bot protection, some videos may not be downloadable. For protected videos, please upload a local video file instead.
                                    </p>
                                </div>
                            """)

                    with gr.Accordion("Video Analysis Settings", open=True):
                        components['video_model_dropdown'] = gr.Dropdown(
                            choices=self.model_choices,
                            value="yolov8n.pt",
                            label="Select Model (Video)",
                            info="Faster models (like 'n') are recommended"
                        )
                        components['video_confidence'] = gr.Slider(
                            minimum=0.1, maximum=0.9, value=0.4, step=0.05,
                            label="Confidence Threshold (Video)"
                        )
                        components['video_process_interval'] = gr.Slider(
                            minimum=1, maximum=60, value=10, step=1,
                            label="Processing Interval (Frames)",
                            info="Analyze every Nth frame (higher value = faster)"
                        )

                    components['video_process_btn'] = gr.Button(
                        "Process Video",
                        variant="primary",
                        elem_classes="detect-btn"
                    )

                    # How to use section
                    with gr.Group(elem_classes="how-to-use"):
                        gr.HTML('<div class="section-heading">How to Use (Video)</div>')
                        gr.Markdown("""
                        1. Choose your input method: Upload a file or enter a URL.
                        2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
                        3. Click "Process Video". **Processing can take a significant amount of time.**
                        4. The annotated video and summary will appear on the right when finished.
                        """)

                    # Video examples
                    gr.HTML('<div class="section-heading">Example Videos</div>')
                    gr.HTML("""
                        <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
                            <p style="font-size: 14px; color: #4A5568; margin: 0;">
                                Upload any video containing objects that YOLO can detect. For testing, find sample videos
                                <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
                            </p>
                        </div>
                    """)

                # Right Column: Video Results
                with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
                    gr.HTML("""
                        <div class="section-heading">Video Result</div>
                        <details class="info-details" style="margin: 5px 0 15px 0;">
                            <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
                                🎬 Video Processing Notes
                            </summary>
                            <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
                                <p style="font-size: 13px; color: #718096; margin: 0;">
                                    The processed video includes bounding boxes around detected objects. For longer videos,
                                    consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
                                </p>
                            </div>
                        </details>
                    """)

                    components['video_output'] = gr.Video(
                        label="Processed Video",
                        elem_classes="video-output-container"
                    )

                    gr.HTML('<div class="section-heading">Processing Summary</div>')
                    components['video_summary_text'] = gr.HTML(
                        label=None,
                        elem_id="video-summary-html-output"
                    )

                    gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
                    components['video_stats_json'] = gr.JSON(
                        label=None,
                        elem_classes="video-stats-display"
                    )

        return components

    def get_filter_button_mappings(self):
        """
        Get the class ID mappings for filter buttons.

        Returns:
            Dict: Dictionary containing class ID lists for different categories
        """
        available_classes_list = self.get_all_classes()

        return {
            'people_classes_ids': [0],
            'vehicles_classes_ids': [1, 2, 3, 4, 5, 6, 7, 8],
            'animals_classes_ids': list(range(14, 24)),
            'common_objects_ids': [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73],
            'available_classes_list': available_classes_list
        }

    def create_interface(self,
                         handle_image_upload_fn,
                         handle_video_upload_fn,
                         download_video_from_url_fn):
        """
        Create the complete Gradio interface.

        Args:
            handle_image_upload_fn: Function to handle image upload
            handle_video_upload_fn: Function to handle video upload
            download_video_from_url_fn: Function to download video from URL

        Returns:
            gr.Blocks: Complete Gradio interface
        """
        css = self.get_css_styles()

        with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:

            # Header
            with gr.Group(elem_classes="app-header"):
                self.create_header()

            # Main Content with Tabs
            with gr.Tabs(elem_classes="tabs"):

                # Image Processing Tab
                image_components = self.create_image_tab()

                # Video Processing Tab
                video_components = self.create_video_tab()

            # Footer
            self.create_footer()

            # Setup Event Listeners
            self._setup_event_listeners(
                image_components,
                video_components,
                handle_image_upload_fn,
                handle_video_upload_fn
            )

        return demo

    def _setup_event_listeners(self,
                               image_components,
                               video_components,
                               handle_image_upload_fn,
                               handle_video_upload_fn):
        """
        Setup all event listeners for the interface.

        Args:
            image_components: Dictionary of image tab components
            video_components: Dictionary of video tab components
            handle_image_upload_fn: Function to handle image upload
            handle_video_upload_fn: Function to handle video upload
        """
        # Image Model Change Handler
        image_components['image_model_dropdown'].change(
            fn=lambda model: (model, self.get_model_description(model)),
            inputs=[image_components['image_model_dropdown']],
            outputs=[image_components['current_image_model'], image_components['image_model_info']]
        )

        # Image Filter Buttons
        filter_mappings = self.get_filter_button_mappings()
        available_classes_list = filter_mappings['available_classes_list']
        people_classes_ids = filter_mappings['people_classes_ids']
        vehicles_classes_ids = filter_mappings['vehicles_classes_ids']
        animals_classes_ids = filter_mappings['animals_classes_ids']
        common_objects_ids = filter_mappings['common_objects_ids']

        image_components['people_btn'].click(
            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids],
            outputs=image_components['image_class_filter']
        )
        image_components['vehicles_btn'].click(
            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids],
            outputs=image_components['image_class_filter']
        )
        image_components['animals_btn'].click(
            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids],
            outputs=image_components['image_class_filter']
        )
        image_components['objects_btn'].click(
            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids],
            outputs=image_components['image_class_filter']
        )

        # Video Input Type Change Handler
        video_components['video_input_type'].change(
            fn=lambda input_type: [
                # Show/hide file upload
                gr.update(visible=(input_type == "upload")),
                # Show/hide URL input
                gr.update(visible=(input_type == "url"))
            ],
            inputs=[video_components['video_input_type']],
            outputs=[video_components['video_input'], video_components['video_url_input']]
        )

        # Image Detect Button Click Handler
        image_components['image_detect_btn'].click(
            fn=handle_image_upload_fn,
            inputs=[
                image_components['image_input'],
                image_components['image_model_dropdown'],
                image_components['image_confidence'],
                image_components['image_class_filter'],
                image_components['use_llm'],
                image_components['use_landmark_detection']
            ],
            outputs=[
                image_components['image_result_image'],
                image_components['image_result_text'],
                image_components['image_stats_json'],
                image_components['image_plot_output'],
                image_components['image_scene_description_html'],
                image_components['image_llm_description'],
                image_components['image_activities_list'],
                image_components['image_safety_list'],
                image_components['image_zones_json'],
                image_components['image_lighting_info']
            ]
        )

        # Video Process Button Click Handler
        video_components['video_process_btn'].click(
            fn=handle_video_upload_fn,
            inputs=[
                video_components['video_input'],
                video_components['video_url_input'],
                video_components['video_input_type'],
                video_components['video_model_dropdown'],
                video_components['video_confidence'],
                video_components['video_process_interval']
            ],
            outputs=[
                video_components['video_output'],
                video_components['video_summary_text'],
                video_components['video_stats_json']
            ]
        )
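With this file, UIManager owns the entire Gradio layout and event binding, while the main application keeps only the processing callbacks it passes into create_interface. A hedged sketch of how the two sides could be connected; the callback names below simply mirror the parameter names of create_interface and are assumptions about app.py, whose code is not shown in this excerpt:

    # Sketch only: handler and processor names are assumed, not confirmed by this diff.
    from ui_manager import UIManager

    ui_manager = UIManager()
    ui_manager.set_image_processor(image_processor)  # lets the class-filter dropdown read names from the loaded model

    demo = ui_manager.create_interface(
        handle_image_upload_fn=handle_image_upload,          # assumed app.py callback
        handle_video_upload_fn=handle_video_upload,          # assumed app.py callback
        download_video_from_url_fn=download_video_from_url   # assumed app.py helper
    )

    if __name__ == "__main__":
        demo.launch()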