DawnC committed (verified)
Commit ba55edb · 1 Parent(s): 95b3ba7

Upload 6 files
.gitattributes CHANGED
@@ -41,3 +41,4 @@ room_02.jpg filter=lfs diff=lfs merge=lfs -text
41
  street_04.jpg filter=lfs diff=lfs merge=lfs -text
42
  landmark_Louvre_01.jpg filter=lfs diff=lfs merge=lfs -text
43
  street_05.jpg filter=lfs diff=lfs merge=lfs -text
 
 
44
+ room_04.jpg filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -17,12 +17,20 @@ from style import Style
17
  from image_processor import ImageProcessor
18
  from video_processor import VideoProcessor
19
  from llm_enhancer import LLMEnhancer
 
20
 
21
  # Initialize Processors with LLM support
22
  image_processor = None
23
  video_processor = None
 
24
 
25
  def initialize_processors():
26
  global image_processor, video_processor
27
 
28
  try:
@@ -30,7 +38,7 @@ def initialize_processors():
30
  image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
31
  print("ImageProcessor initialized successfully with LLM")
32
 
33
- # Add a diagnostic check
34
  if hasattr(image_processor, 'scene_analyzer'):
35
  if image_processor.scene_analyzer is not None:
36
  print(f"scene_analyzer initialized: {type(image_processor.scene_analyzer)}")
@@ -66,49 +74,41 @@ def initialize_processors():
66
  video_processor = None
67
  return False
68
 
69
- # Initialize processors
70
- initialization_success = initialize_processors()
71
- if not initialization_success:
72
- print("WARNING: Failed to initialize processors. Application may not function correctly.")
73
-
74
- # Helper Function
75
- def get_all_classes():
76
- """Gets all available COCO classes."""
77
- # Try to get from a loaded model first
78
- if image_processor and image_processor.model_instances:
79
- for model_instance in image_processor.model_instances.values():
80
- if model_instance and model_instance.is_model_loaded:
81
- try:
82
- # Ensure class_names is a dict {id: name}
83
- if isinstance(model_instance.class_names, dict):
84
- return sorted([(int(idx), name) for idx, name in model_instance.class_names.items()])
85
- except Exception as e:
86
- print(f"Error getting class names from model: {e}")
87
-
88
- # Fallback to standard COCO (ensure keys are ints)
89
- default_classes = {
90
- 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
91
- 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
92
- 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
93
- 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
94
- 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
95
- 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
96
- 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
97
- 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
98
- 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
99
- 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
100
- 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
101
- 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
102
- 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
103
- 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
104
- 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
105
- 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
106
- }
107
- return sorted(default_classes.items())
108
 
109
  @spaces.GPU(duration=180)
110
  def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
111
- """Processes a single uploaded image."""
112
  # Enhanced safety check for image_processor
113
  if image_processor is None:
114
  error_msg = "Image processor is not initialized. Please restart the application or check system dependencies."
@@ -140,6 +140,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
140
 
141
  print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
142
  print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
 
143
  try:
144
  image_processor.use_llm = use_llm
145
 
@@ -155,19 +156,19 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
155
  image_processor.scene_analyzer.use_landmark_detection = enable_landmark
156
  image_processor.scene_analyzer.enable_landmark = enable_landmark
157
 
158
- # Make sure the processor has this option set as well
159
  image_processor.enable_landmark = enable_landmark
160
 
161
  # Check and configure deeper-level components
162
  if hasattr(image_processor.scene_analyzer, 'scene_describer') and image_processor.scene_analyzer.scene_describer is not None:
163
  image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
164
 
165
- # Check and set the flag on the CLIP analyzer
166
  if hasattr(image_processor.scene_analyzer, 'clip_analyzer') and image_processor.scene_analyzer.clip_analyzer is not None:
167
  if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
168
  image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
169
 
170
- # Check and configure the LLM enhancer
171
  if hasattr(image_processor.scene_analyzer, 'llm_enhancer') and image_processor.scene_analyzer.llm_enhancer is not None:
172
  if hasattr(image_processor.scene_analyzer.llm_enhancer, 'enable_landmark'):
173
  image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
@@ -198,7 +199,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
198
  class_ids_to_filter = None
199
  if filter_classes:
200
  class_ids_to_filter = []
201
- available_classes_dict = dict(get_all_classes())
202
  name_to_id = {name: id for id, name in available_classes_dict.items()}
203
  for class_str in filter_classes:
204
  class_name_or_id = class_str.split(":")[0].strip()
@@ -235,7 +236,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
235
  # Prepare visualization data for the plot
236
  plot_figure = None
237
  if stats and "class_statistics" in stats and stats["class_statistics"]:
238
- available_classes_dict = dict(get_all_classes())
239
  viz_data = image_processor.prepare_visualization_data(stats, available_classes_dict)
240
  if "error" not in viz_data:
241
  plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
@@ -485,8 +486,20 @@ def download_video_from_url(video_url, max_duration_minutes=10):
485
 
486
  @spaces.GPU
487
  def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
488
- """Handles video upload or URL input and calls the VideoProcessor."""
489
-
490
  print(f"Received video request: input_type={input_type}")
491
  video_path = None
492
 
@@ -534,369 +547,34 @@ def handle_video_upload(video_input, video_url, input_type, model_name, confiden
534
  return None, error_html, {"error": str(e)}
535
 
536
 
537
- # Create Gradio Interface
538
- def create_interface():
539
- """Creates the Gradio interface with Tabs."""
540
- css = Style.get_css()
541
- available_models = DetectionModel.get_available_models()
542
- model_choices = [model["model_file"] for model in available_models]
543
- class_choices_formatted = [f"{id}: {name}" for id, name in get_all_classes()] # Use formatted choices
544
-
545
- with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
546
-
547
- # Header
548
- with gr.Group(elem_classes="app-header"):
549
- gr.HTML("""
550
- <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
551
- <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
552
- <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Object Detection and Scene Understanding</h2>
553
- <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
554
- <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
555
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
556
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
557
- </div>
558
- <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
559
- <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
560
- <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images may not be supported.
561
- <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG</a> before uploading if needed.
562
- </p>
563
- </div>
564
- </div>
565
- """)
566
-
567
- # Main Content with Tabs
568
- with gr.Tabs(elem_classes="tabs"):
569
-
570
- # Tab 1: Image Processing
571
- with gr.Tab("Image Processing"):
572
- current_image_model = gr.State("yolov8m.pt") # State for image model selection
573
- with gr.Row(equal_height=False): # Allow columns to have different heights
574
- # Left Column: Image Input & Controls
575
- with gr.Column(scale=4, elem_classes="input-panel"):
576
- with gr.Group():
577
- gr.HTML('<div class="section-heading">Upload Image</div>')
578
- image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")
579
-
580
- with gr.Accordion("Image Analysis Settings", open=False):
581
- image_model_dropdown = gr.Dropdown(
582
- choices=model_choices,
583
- value="yolov8m.pt", # Default for images
584
- label="Select Model",
585
- info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
586
- )
587
- # Display model info
588
- image_model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))
589
-
590
- image_confidence = gr.Slider(
591
- minimum=0.1, maximum=0.9, value=0.25, step=0.05,
592
- label="Confidence Threshold",
593
- info="Minimum confidence for displaying a detected object"
594
- )
595
-
596
- use_llm = gr.Checkbox(
597
- label="Use LLM for enhanced scene descriptions",
598
- value=True,
599
- info="Provides more detailed and natural language descriptions (may increase processing time)"
600
- )
601
-
602
- use_landmark_detection = gr.Checkbox(
603
- label="Use CLIP for Landmark Detection",
604
- value=False,
605
- info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
606
- )
607
-
608
- with gr.Accordion("Filter Classes", open=False):
609
- gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
610
- with gr.Row():
611
- people_btn = gr.Button("People", size="sm")
612
- vehicles_btn = gr.Button("Vehicles", size="sm")
613
- animals_btn = gr.Button("Animals", size="sm")
614
- objects_btn = gr.Button("Common Objects", size="sm")
615
- image_class_filter = gr.Dropdown(
616
- choices=class_choices_formatted, # Use formatted choices
617
- multiselect=True,
618
- label="Select Classes to Display",
619
- info="Leave empty to show all detected objects"
620
- )
621
-
622
- image_detect_btn = gr.Button("Analyze Image", variant="primary", elem_classes="detect-btn")
623
-
624
- with gr.Group(elem_classes="how-to-use"):
625
- gr.HTML('<div class="section-heading">How to Use (Image)</div>')
626
- gr.Markdown("""
627
- 1. Upload an image or use the camera
628
- 2. *(Optional)* Adjust settings like confidence threshold or model size (n = fast, m = balanced, x = accurate)
629
- 3. In **Analysis Settings**, you can:
630
- * Uncheck **Use LLM** to skip enhanced descriptions (faster)
631
- * Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
632
- * Filter object classes to focus on specific types of objects *(optional)*
633
- 4. Click **Analyze Image** button
634
-
635
- **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
636
- """)
637
-
638
-
639
- # Image Examples
640
- gr.Examples(
641
- examples=[
642
- "room_01.jpg",
643
- "street_04.jpg",
644
- "street_05.jpg",
645
- "landmark_Louvre_01.jpg"
646
- ],
647
- inputs=image_input,
648
- label="Example Images"
649
- )
650
-
651
- gr.HTML("""
652
- <div style="text-align: center; margin-top: 8px; padding: 6px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #e2e8f0;">
653
- <p style="font-size: 12px; color: #718096; margin: 0;">
654
- 📷 Sample images sourced from <a href="https://unsplash.com" target="_blank" style="color: #3182ce; text-decoration: underline;">Unsplash</a>
655
- </p>
656
- </div>
657
- """)
658
-
659
- # Right Column: Image Results
660
- with gr.Column(scale=6, elem_classes="output-panel"):
661
- with gr.Tabs(elem_classes="tabs"):
662
- with gr.Tab("Detection Result"):
663
- image_result_image = gr.Image(type="pil", label="Detection Result")
664
- gr.HTML('<div class="section-heading">Detection Details</div>')
665
- image_result_text = gr.Textbox(label=None, lines=10, elem_id="detection-details", container=False)
666
-
667
- with gr.Tab("Scene Understanding"):
668
- gr.HTML('<div class="section-heading">Scene Analysis</div>')
669
- gr.HTML("""
670
- <details class="info-details" style="margin: 5px 0 15px 0;">
671
- <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
672
- 🔍 The AI Vision Scout Report: Click for important notes about this analysis
673
- </summary>
674
- <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
675
- <p style="font-size: 13px; color: #718096; margin: 0;">
676
- <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
677
- Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
678
- Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
679
- </p>
680
- </div>
681
- </details>
682
- """)
683
-
684
- gr.HTML('''
685
- <div style="margin-top: 5px; padding: 6px 10px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #63b3ed; font-size: 12px; margin-bottom: 10px;">
686
- <p style="margin: 0; color: #4a5568;">
687
- <b>Note:</b> AI descriptions may vary slightly with each generation, reflecting the creative nature of AI. This is similar to how a person might use different words each time they describe the same image. Processing time may be longer during first use or when analyzing complex scenes, as the LLM enhancement requires additional computational resources.
688
- </p>
689
- </div>
690
- ''')
691
- image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
692
-
693
- # The original description is still shown when the LLM-enhanced description is used
694
- with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
695
- image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")
696
-
697
- with gr.Row():
698
- with gr.Column(scale=1):
699
- gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
700
- image_activities_list = gr.Dataframe(headers=["Activity"], datatype=["str"], row_count=5, col_count=1, wrap=True)
701
-
702
- with gr.Column(scale=1):
703
- gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
704
- image_safety_list = gr.Dataframe(headers=["Concern"], datatype=["str"], row_count=5, col_count=1, wrap=True)
705
-
706
- gr.HTML('<div class="section-heading">Functional Zones</div>')
707
- image_zones_json = gr.JSON(label=None, elem_classes="json-box")
708
-
709
- gr.HTML('<div class="section-heading">Lighting Conditions</div>')
710
- image_lighting_info = gr.JSON(label=None, elem_classes="json-box")
711
-
712
- with gr.Tab("Statistics"):
713
- with gr.Row():
714
- with gr.Column(scale=3, elem_classes="plot-column"):
715
- gr.HTML('<div class="section-heading">Object Distribution</div>')
716
- image_plot_output = gr.Plot(label=None, elem_classes="large-plot-container")
717
- with gr.Column(scale=2, elem_classes="stats-column"):
718
- gr.HTML('<div class="section-heading">Detection Statistics</div>')
719
- image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")
720
-
721
- # Tab 2: Video Processing
722
- with gr.Tab("Video Processing"):
723
- with gr.Row(equal_height=False):
724
- # Left Column: Video Input & Controls
725
- with gr.Column(scale=4, elem_classes="input-panel"):
726
- with gr.Group():
727
- gr.HTML('<div class="section-heading">Video Input</div>')
728
-
729
- # Add input type selection
730
- video_input_type = gr.Radio(
731
- ["upload", "url"],
732
- label="Input Method",
733
- value="upload",
734
- info="Choose how to provide the video"
735
- )
736
-
737
- # File upload (will be shown/hidden based on selection)
738
- with gr.Group(elem_id="upload-video-group"):
739
- video_input = gr.Video(
740
- label="Upload a video file (MP4, AVI, MOV)",
741
- sources=["upload"],
742
- visible=True
743
- )
744
-
745
- # URL input (will be shown/hidden based on selection)
746
- with gr.Group(elem_id="url-video-group"):
747
- video_url_input = gr.Textbox(
748
- label="Enter video URL (YouTube or direct video link)",
749
- placeholder="https://www.youtube.com/watch?v=...",
750
- visible=False,
751
- elem_classes="custom-video-url-input"
752
- )
753
- gr.HTML("""
754
- <div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
755
- <p style="margin: 0; color: #4b5563;">
756
- Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes. Due to YouTube's anti-bot protection, some videos may not be downloadable. For protected videos, please upload a local video file instead.
757
- </p>
758
- </div>
759
- """)
760
-
761
- with gr.Accordion("Video Analysis Settings", open=True):
762
- video_model_dropdown = gr.Dropdown(
763
- choices=model_choices,
764
- value="yolov8n.pt", # Default 'n' for video
765
- label="Select Model (Video)",
766
- info="Faster models (like 'n') are recommended"
767
- )
768
- video_confidence = gr.Slider(
769
- minimum=0.1, maximum=0.9, value=0.4, step=0.05,
770
- label="Confidence Threshold (Video)"
771
- )
772
- video_process_interval = gr.Slider(
773
- minimum=1, maximum=60, value=10, step=1, # Allow up to 60 frame interval
774
- label="Processing Interval (Frames)",
775
- info="Analyze every Nth frame (higher value = faster)"
776
- )
777
- video_process_btn = gr.Button("Process Video", variant="primary", elem_classes="detect-btn")
778
-
779
- with gr.Group(elem_classes="how-to-use"):
780
- gr.HTML('<div class="section-heading">How to Use (Video)</div>')
781
- gr.Markdown("""
782
- 1. Choose your input method: Upload a file or enter a URL.
783
- 2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
784
- 3. Click "Process Video". **Processing can take a significant amount of time.**
785
- 4. The annotated video and summary will appear on the right when finished.
786
- """)
787
-
788
- # Add video examples
789
- gr.HTML('<div class="section-heading">Example Videos</div>')
790
- gr.HTML("""
791
- <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
792
- <p style="font-size: 14px; color: #4A5568; margin: 0;">
793
- Upload any video containing objects that YOLO can detect. For testing, find sample videos
794
- <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
795
- </p>
796
- </div>
797
- """)
798
-
799
- # Right Column: Video Results
800
- with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
801
- gr.HTML("""
802
- <div class="section-heading">Video Result</div>
803
- <details class="info-details" style="margin: 5px 0 15px 0;">
804
- <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
805
- 🎬 Video Processing Notes
806
- </summary>
807
- <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
808
- <p style="font-size: 13px; color: #718096; margin: 0;">
809
- The processed video includes bounding boxes around detected objects. For longer videos,
810
- consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
811
- </p>
812
- </div>
813
- </details>
814
- """)
815
- video_output = gr.Video(label="Processed Video", elem_classes="video-output-container") # Output for the processed video file
816
-
817
- gr.HTML('<div class="section-heading">Processing Summary</div>')
818
- # Display the video summary as HTML
819
- video_summary_text = gr.HTML(
820
- label=None,
821
- elem_id="video-summary-html-output"
822
- )
823
-
824
- gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
825
- video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics
826
-
827
- # Event Listeners
828
- # Image Model Change Handler
829
- image_model_dropdown.change(
830
- fn=lambda model: (model, DetectionModel.get_model_description(model)),
831
- inputs=[image_model_dropdown],
832
- outputs=[current_image_model, image_model_info] # Update state and description
833
- )
834
-
835
- # Image Filter Buttons
836
- available_classes_list = get_all_classes() # Get list of (id, name)
837
- people_classes_ids = [0]
838
- vehicles_classes_ids = [1, 2, 3, 4, 5, 6, 7, 8]
839
- animals_classes_ids = list(range(14, 24))
840
- common_objects_ids = [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73] # Bottle, cup, fork, knife, spoon, bowl, chair, couch, table, tv, laptop, phone, book
841
-
842
- people_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids], outputs=image_class_filter)
843
- vehicles_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids], outputs=image_class_filter)
844
- animals_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids], outputs=image_class_filter)
845
- objects_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids], outputs=image_class_filter)
846
-
847
- video_input_type.change(
848
- fn=lambda input_type: [
849
- # Show/hide file upload
850
- gr.update(visible=(input_type == "upload")),
851
- # Show/hide URL input
852
- gr.update(visible=(input_type == "url"))
853
- ],
854
- inputs=[video_input_type],
855
- outputs=[video_input, video_url_input]
856
- )
857
-
858
- image_detect_btn.click(
859
- fn=handle_image_upload,
860
- inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm, use_landmark_detection ],
861
- outputs=[
862
- image_result_image, image_result_text, image_stats_json, image_plot_output,
863
- image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
864
- image_lighting_info
865
- ]
866
- )
867
-
868
- video_process_btn.click(
869
- fn=handle_video_upload,
870
- inputs=[
871
- video_input,
872
- video_url_input,
873
- video_input_type,
874
- video_model_dropdown,
875
- video_confidence,
876
- video_process_interval
877
- ],
878
- outputs=[video_output, video_summary_text, video_stats_json]
879
- )
880
-
881
- # Footer
882
- gr.HTML("""
883
- <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
884
- <div style="margin-bottom: 15px;">
885
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
886
- </div>
887
- <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
888
- <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
889
- <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
890
- <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
891
- </a>
892
- </div>
893
- </div>
894
- """)
895
-
896
- return demo
897
 
898
 
899
  if __name__ == "__main__":
900
- demo_interface = create_interface()
901
-
902
- demo_interface.launch(debug=True)
 
17
  from image_processor import ImageProcessor
18
  from video_processor import VideoProcessor
19
  from llm_enhancer import LLMEnhancer
20
+ from ui_manager import UIManager
21
 
22
  # Initialize Processors with LLM support
23
  image_processor = None
24
  video_processor = None
25
+ ui_manager = None
26
 
27
  def initialize_processors():
28
+ """
29
+ Initialize the image and video processors with LLM support.
30
+
31
+ Returns:
32
+ bool: True if initialization was successful, False otherwise
33
+ """
34
  global image_processor, video_processor
35
 
36
  try:
 
38
  image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
39
  print("ImageProcessor initialized successfully with LLM")
40
 
41
+ # Check status
42
  if hasattr(image_processor, 'scene_analyzer'):
43
  if image_processor.scene_analyzer is not None:
44
  print(f"scene_analyzer initialized: {type(image_processor.scene_analyzer)}")
 
74
  video_processor = None
75
  return False
76
 
77
+ def initialize_ui_manager():
78
+ """
79
+ Initialize the UI manager and set up references to processors.
80
+
81
+ Returns:
82
+ UIManager: Initialized UI manager instance
83
+ """
84
+ global ui_manager, image_processor
85
+
86
+ ui_manager = UIManager()
87
+
88
+ # Set image processor reference for dynamic class retrieval
89
+ if image_processor:
90
+ ui_manager.set_image_processor(image_processor)
91
+
92
+ return ui_manager
93
 
94
  @spaces.GPU(duration=180)
95
  def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
96
+ """
97
+ Processes a single uploaded image.
98
+
99
+ Args:
100
+ image: PIL Image object
101
+ model_name: Name of the YOLO model to use
102
+ confidence_threshold: Confidence threshold for detections
103
+ filter_classes: List of class names/IDs to filter
104
+ use_llm: Whether to use LLM for enhanced descriptions
105
+ enable_landmark: Whether to enable landmark detection
106
+
107
+ Returns:
108
+ Tuple: (result_image, result_text, formatted_stats, plot_figure,
109
+ scene_description_html, original_desc_html, activities_list_data,
110
+ safety_data, zones, lighting)
111
+ """
112
  # Enhanced safety check for image_processor
113
  if image_processor is None:
114
  error_msg = "Image processor is not initialized. Please restart the application or check system dependencies."
 
140
 
141
  print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
142
  print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
143
+
144
  try:
145
  image_processor.use_llm = use_llm
146
 
 
156
  image_processor.scene_analyzer.use_landmark_detection = enable_landmark
157
  image_processor.scene_analyzer.enable_landmark = enable_landmark
158
 
159
+ # Make sure the processor has this option set as well (used for landmark detection)
160
  image_processor.enable_landmark = enable_landmark
161
 
162
  # Check and configure deeper-level components
163
  if hasattr(image_processor.scene_analyzer, 'scene_describer') and image_processor.scene_analyzer.scene_describer is not None:
164
  image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
165
 
166
+ # Check and configure the CLIP Analyzer
167
  if hasattr(image_processor.scene_analyzer, 'clip_analyzer') and image_processor.scene_analyzer.clip_analyzer is not None:
168
  if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
169
  image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
170
 
171
+ # Check and configure the LLM enhancer
172
  if hasattr(image_processor.scene_analyzer, 'llm_enhancer') and image_processor.scene_analyzer.llm_enhancer is not None:
173
  if hasattr(image_processor.scene_analyzer.llm_enhancer, 'enable_landmark'):
174
  image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
 
199
  class_ids_to_filter = None
200
  if filter_classes:
201
  class_ids_to_filter = []
202
+ available_classes_dict = dict(ui_manager.get_all_classes())
203
  name_to_id = {name: id for id, name in available_classes_dict.items()}
204
  for class_str in filter_classes:
205
  class_name_or_id = class_str.split(":")[0].strip()
 
236
  # Prepare visualization data for the plot
237
  plot_figure = None
238
  if stats and "class_statistics" in stats and stats["class_statistics"]:
239
+ available_classes_dict = dict(ui_manager.get_all_classes())
240
  viz_data = image_processor.prepare_visualization_data(stats, available_classes_dict)
241
  if "error" not in viz_data:
242
  plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
 
486
 
487
  @spaces.GPU
488
  def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
489
+ """
490
+ Handles video upload or URL input and calls the VideoProcessor.
491
+
492
+ Args:
493
+ video_input: Uploaded video file
494
+ video_url: Video URL (if using URL input)
495
+ input_type: Type of input ("upload" or "url")
496
+ model_name: Name of the YOLO model to use
497
+ confidence_threshold: Confidence threshold for detections
498
+ process_interval: Frame processing interval
499
+
500
+ Returns:
501
+ Tuple: (output_video_path, summary_html, formatted_stats)
502
+ """
503
  print(f"Received video request: input_type={input_type}")
504
  video_path = None
505
 
 
547
  return None, error_html, {"error": str(e)}
548
 
549
 
550
+ def main():
551
+ """
552
+ Main function to initialize processors and launch the Gradio interface.
553
+ """
554
+ global ui_manager
555
+
556
+ # Initialize processors
557
+ print("Initializing processors...")
558
+ initialization_success = initialize_processors()
559
+ if not initialization_success:
560
+ print("WARNING: Failed to initialize processors. Application may not function correctly.")
561
+ return
562
+
563
+ # Initialize UI manager
564
+ print("Initializing UI manager...")
565
+ ui_manager = initialize_ui_manager()
566
+
567
+ # Create and launch the Gradio interface
568
+ print("Creating Gradio interface...")
569
+ demo_interface = ui_manager.create_interface(
570
+ handle_image_upload_fn=handle_image_upload,
571
+ handle_video_upload_fn=handle_video_upload,
572
+ download_video_from_url_fn=download_video_from_url
573
+ )
574
+
575
+ print("Launching application...")
576
+ demo_interface.launch(debug=True)
577
 
578
 
579
  if __name__ == "__main__":
580
+ main()
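Taken together, the app.py changes move interface construction into UIManager and inject the processing callbacks at startup. A minimal wiring sketch of that pattern follows; the stub handlers are placeholders for the real GPU-backed functions and are not part of this commit.

# Minimal sketch of the post-refactor wiring, with stub callbacks standing in
# for handle_image_upload / handle_video_upload / download_video_from_url.
from ui_manager import UIManager

def stub_image_handler(*args, **kwargs):
    return None  # placeholder for handle_image_upload

def stub_video_handler(*args, **kwargs):
    return None  # placeholder for handle_video_upload

def stub_downloader(video_url, max_duration_minutes=10):
    return None  # placeholder for download_video_from_url

ui = UIManager()
demo = ui.create_interface(
    handle_image_upload_fn=stub_image_handler,
    handle_video_upload_fn=stub_video_handler,
    download_video_from_url_fn=stub_downloader,
)
demo.launch(debug=True)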
 
 
functional_zone_identifier.py CHANGED
@@ -11,7 +11,7 @@ class FunctionalZoneIdentifier:
11
  Integrates zone evaluation and scene-specific zone identification logic to provide a unified functional zone identification interface
12
  """
13
 
14
- def __init__(self, zone_evaluator=None, scene_zone_identifier=None, scene_viewpoint_analyzer=None):
15
  """
16
  Initialize the functional zone identifier
17
 
@@ -26,6 +26,7 @@ class FunctionalZoneIdentifier:
26
 
27
  self.scene_viewpoint_analyzer = scene_viewpoint_analyzer
28
  self.viewpoint_detector = scene_viewpoint_analyzer
 
29
 
30
  logger.info("FunctionalZoneIdentifier initialized successfully with SceneViewpointAnalyzer")
31
 
@@ -68,7 +69,7 @@ class FunctionalZoneIdentifier:
68
  logger.info("Insufficient objects for zone identification")
69
  return {}
70
 
71
- # 5. Build category_regions
72
  category_regions = self._build_category_regions_mapping(detected_objects)
73
  zones = {}
74
 
@@ -247,7 +248,7 @@ class FunctionalZoneIdentifier:
247
  objects = zone_data.get("objects", [])
248
  region = zone_data.get("region", "")
249
 
250
- # Check first whether a traffic light is present
251
  if any(obj == "traffic light" or "traffic light" in obj for obj in objects):
252
  return "traffic control zone"
253
 
@@ -438,36 +439,42 @@ class FunctionalZoneIdentifier:
438
  def _categorize_object(self, obj: Dict) -> str:
439
  """
440
  Categorize a detected object into a functional category for zone identification
441
-
442
- Args:
443
- obj: object dictionary
444
-
445
- Returns:
446
- Functional category string for the object
447
  """
448
  try:
449
  class_id = obj.get("class_id", -1)
450
- class_name = obj.get("class_name", "").lower()
451
 
452
- # Use the existing category mapping (if available)
453
  if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
454
  for category, ids in self.OBJECT_CATEGORIES.items():
455
  if class_id in ids:
456
- return category
 
457
 
458
- # Fallback categorization based on COCO class names
459
  furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
460
  plant_items = ["potted plant"]
461
  electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
462
  vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
463
  person_items = ["person"]
464
- kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
465
- "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
466
- "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
467
- sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
468
- "baseball glove", "skateboard", "surfboard", "tennis racket"]
469
  personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
470
 
 
471
  if any(item in class_name for item in furniture_items):
472
  return "furniture"
473
  elif any(item in class_name for item in plant_items):
@@ -479,11 +486,11 @@ class FunctionalZoneIdentifier:
479
  elif any(item in class_name for item in person_items):
480
  return "person"
481
  elif any(item in class_name for item in kitchen_items):
482
- return "kitchen_items"
483
  elif any(item in class_name for item in sports_items):
484
  return "sports"
485
  elif any(item in class_name for item in personal_items):
486
- return "personal_items"
487
  else:
488
  return "misc"
489
 
@@ -492,6 +499,42 @@ class FunctionalZoneIdentifier:
492
  logger.error(traceback.format_exc())
493
  return "misc"
494
495
  def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
496
  """
497
  General functional zone identification when no specific scene type is matched
@@ -791,7 +834,7 @@ class FunctionalZoneIdentifier:
791
 
792
  }
793
 
794
- # 1. Count the (class_name, region) pairs already used in current_zones
795
  used_count = {}
796
  for zone_info in current_zones.values():
797
  rg = zone_info.get("region", "")
@@ -799,7 +842,7 @@ class FunctionalZoneIdentifier:
799
  key = (obj_name, rg)
800
  used_count[key] = used_count.get(key, 0) + 1
801
 
802
- # 2. Count the total (class_name, region) occurrences in all_detected_objects
803
  total_count = {}
804
  for obj in all_detected_objects:
805
  cname = obj.get("class_name", "")
@@ -807,7 +850,7 @@ class FunctionalZoneIdentifier:
807
  key = (cname, rg)
808
  total_count[key] = total_count.get(key, 0) + 1
809
 
810
- # 3. Convert default_classes into a class_name → fallback zone type lookup table
811
  category_to_fallback = {
812
  # Pedestrians and vehicles
813
  "person": "pedestrian area",
@@ -906,12 +949,12 @@ class FunctionalZoneIdentifier:
906
  "potted plant": "decorative area",
907
  }
908
 
909
- # 4. Compute the missing (class_name, region) pairs and create fallback zones
910
  for (cname, rg), total in total_count.items():
911
  used = used_count.get((cname, rg), 0)
912
  missing = total - used
913
  if missing <= 0:
914
- continue
915
 
916
  # (A) Decide which broad fallback class (zone_type) this cname belongs to
917
  zone_type = category_to_fallback.get(cname, "miscellaneous area")
 
11
  Integrates zone evaluation and scene-specific zone identification logic to provide a unified functional zone identification interface
12
  """
13
 
14
+ def __init__(self, zone_evaluator=None, scene_zone_identifier=None, scene_viewpoint_analyzer=None, object_categories=None):
15
  """
16
  Initialize the functional zone identifier
17
 
 
26
 
27
  self.scene_viewpoint_analyzer = scene_viewpoint_analyzer
28
  self.viewpoint_detector = scene_viewpoint_analyzer
29
+ self.OBJECT_CATEGORIES = object_categories or {}
30
 
31
  logger.info("FunctionalZoneIdentifier initialized successfully with SceneViewpointAnalyzer")
32
 
 
69
  logger.info("Insufficient objects for zone identification")
70
  return {}
71
 
72
+ # 5. Build category_regions
73
  category_regions = self._build_category_regions_mapping(detected_objects)
74
  zones = {}
75
 
 
248
  objects = zone_data.get("objects", [])
249
  region = zone_data.get("region", "")
250
 
251
+ # Check first whether a traffic light is present
252
  if any(obj == "traffic light" or "traffic light" in obj for obj in objects):
253
  return "traffic control zone"
254
 
 
439
  def _categorize_object(self, obj: Dict) -> str:
440
  """
441
  Categorize a detected object into a functional category for zone identification
442
+ Make sure every returned value uses natural-language wording, avoiding underscores and technical identifiers
443
  """
444
  try:
445
  class_id = obj.get("class_id", -1)
446
+ class_name = obj.get("class_name", "").lower().strip()
447
+
448
+ # Handle traffic light first
449
+ # Classify as "traffic light" whenever class_id == 9 or class_name contains "traffic light"
450
+ if class_id == 9 or "traffic light" in class_name:
451
+ return "traffic light"
452
 
453
+ # Prefer the custom OBJECT_CATEGORIES mapping when one is provided
454
  if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
455
  for category, ids in self.OBJECT_CATEGORIES.items():
456
  if class_id in ids:
457
+ # Make sure the returned category name uses natural-language wording
458
+ return self._clean_category_name(category)
459
 
460
+ # COCO class default name
461
  furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
462
  plant_items = ["potted plant"]
463
  electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
464
  vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
465
  person_items = ["person"]
466
+ kitchen_items = [
467
+ "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
468
+ "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
469
+ "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"
470
+ ]
471
+ sports_items = [
472
+ "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
473
+ "baseball glove", "skateboard", "surfboard", "tennis racket"
474
+ ]
475
  personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
476
 
477
+ # fallback natural language
478
  if any(item in class_name for item in furniture_items):
479
  return "furniture"
480
  elif any(item in class_name for item in plant_items):
 
486
  elif any(item in class_name for item in person_items):
487
  return "person"
488
  elif any(item in class_name for item in kitchen_items):
489
+ return "kitchen items" # underscore removed
490
  elif any(item in class_name for item in sports_items):
491
  return "sports"
492
  elif any(item in class_name for item in personal_items):
493
+ return "personal items" # underscore removed
494
  else:
495
  return "misc"
496
 
 
499
  logger.error(traceback.format_exc())
500
  return "misc"
501
 
502
+ def _clean_category_name(self, category: str) -> str:
503
+ """
504
+ Clean a category name by removing underscores and converting it to a more natural format
505
+
506
+ Args:
507
+ category: original category name
508
+
509
+ Returns:
510
+ str: cleaned category name
511
+ """
512
+ try:
513
+ if not category:
514
+ return "misc"
515
+
516
+ # Replace underscores with spaces
517
+ cleaned = category.replace('_', ' ')
518
+
519
+ # Handle common technical naming patterns
520
+ replacements = {
521
+ 'kitchen items': 'kitchen items',
522
+ 'personal items': 'personal items',
523
+ 'traffic light': 'traffic light',
524
+ 'misc items': 'misc'
525
+ }
526
+
527
+ # Apply the specific replacement rules
528
+ for old_term, new_term in replacements.items():
529
+ if cleaned == old_term:
530
+ return new_term
531
+
532
+ return cleaned.strip()
533
+
534
+ except Exception as e:
535
+ logger.warning(f"Error cleaning category name '{category}': {str(e)}")
536
+ return "misc"
537
+
538
  def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
539
  """
540
  General functional zone identification when no specific scene type is matched
 
834
 
835
  }
836
 
837
+ # 1. Count the (class_name, region) pairs already used in current_zones
838
  used_count = {}
839
  for zone_info in current_zones.values():
840
  rg = zone_info.get("region", "")
 
842
  key = (obj_name, rg)
843
  used_count[key] = used_count.get(key, 0) + 1
844
 
845
+ # 2. Count the total (class_name, region) occurrences in all_detected_objects
846
  total_count = {}
847
  for obj in all_detected_objects:
848
  cname = obj.get("class_name", "")
 
850
  key = (cname, rg)
851
  total_count[key] = total_count.get(key, 0) + 1
852
 
853
+ # 3. Convert default_classes into a class_name → fallback zone type lookup table
854
  category_to_fallback = {
855
  # 行人與交通工具
856
  "person": "pedestrian area",
 
949
  "potted plant": "decorative area",
950
  }
951
 
952
+ # 4. Compute the missing (class_name, region) pairs and create fallback zones
953
  for (cname, rg), total in total_count.items():
954
  used = used_count.get((cname, rg), 0)
955
  missing = total - used
956
  if missing <= 0:
957
+ continue
958
 
959
  # (A) Decide which broad fallback class (zone_type) this cname belongs to
960
  zone_type = category_to_fallback.get(cname, "miscellaneous area")
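The point of the new _clean_category_name helper is that custom OBJECT_CATEGORIES keys such as kitchen_items surface as natural-language labels in zone output. A standalone sketch of that normalization follows; the sample mapping below is illustrative, not the project's actual OBJECT_CATEGORIES.

# Standalone sketch of the underscore-to-space normalization performed by
# FunctionalZoneIdentifier._clean_category_name (the mapping here is made up).
sample_categories = {"kitchen_items": [39, 41, 45], "personal_items": [24, 26, 28]}

def clean_category_name(category: str) -> str:
    if not category:
        return "misc"
    return category.replace('_', ' ').strip()

for raw_name in sample_categories:
    print(f"{raw_name!r} -> {clean_category_name(raw_name)!r}")
# 'kitchen_items' -> 'kitchen items'
# 'personal_items' -> 'personal items'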
room_04.jpg ADDED

Git LFS Details

  • SHA256: 5c76115d8b32539119350b52b7fb8aebf862f15d3c5416b84e1cac447ff22d4f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.18 MB
scene_analysis_coordinator.py CHANGED
@@ -333,6 +333,9 @@ class SceneAnalysisCoordinator:
333
  scene_confidence, lighting_info, functional_zones, landmark_results, image_dims_val
334
  )
335
  possible_activities = self._extract_possible_activities(detected_objects_from_landmarks_list, landmark_results)
336
 
337
  # Prepare the final result
338
  return {
@@ -345,6 +348,7 @@ class SceneAnalysisCoordinator:
345
  "object_count": len(detected_objects_from_landmarks_list),
346
  "regions": region_analysis,
347
  "possible_activities": possible_activities,
 
348
  "functional_zones": functional_zones,
349
  "detected_landmarks": [lm for lm in detected_objects_from_landmarks_list if lm.get("is_landmark", False)],
350
  "primary_landmark": primary_landmark,
@@ -463,26 +467,18 @@ class SceneAnalysisCoordinator:
463
  # Spatial analysis
464
  region_analysis_val = self.spatial_analyzer._analyze_regions(detected_objects_main)
465
466
  # Landmark processing and integration
467
  landmark_objects_identified = []
468
  landmark_specific_activities = []
469
  final_landmark_info = {}
470
 
471
- if self.use_clip and current_run_enable_landmark:
472
- detected_objects_main, landmark_objects_identified = self.landmark_processing_manager.process_unknown_objects(
473
- detection_result, detected_objects_main, self.clip_analyzer
474
- )
475
-
476
- if landmark_objects_identified:
477
- landmark_specific_activities = self.landmark_processing_manager.extract_landmark_specific_activities(
478
- landmark_objects_identified
479
- )
480
- final_landmark_info = {
481
- "detected_landmarks": landmark_objects_identified,
482
- "primary_landmark": max(landmark_objects_identified, key=lambda x: x.get("confidence", 0.0), default=None),
483
- "detailed_landmarks": landmark_objects_identified
484
- }
485
-
486
  # If landmark detection is disabled for this run, remove landmark objects
487
  if not current_run_enable_landmark:
488
  detected_objects_main = [obj for obj in detected_objects_main if not obj.get("is_landmark", False)]
 
333
  scene_confidence, lighting_info, functional_zones, landmark_results, image_dims_val
334
  )
335
  possible_activities = self._extract_possible_activities(detected_objects_from_landmarks_list, landmark_results)
336
+ safety_concerns = []
337
+ if self.descriptor and hasattr(self.descriptor, '_identify_safety_concerns'):
338
+ safety_concerns = self.descriptor._identify_safety_concerns(detected_objects_from_landmarks_list, best_scene_val)
339
 
340
  # Prepare the final result
341
  return {
 
348
  "object_count": len(detected_objects_from_landmarks_list),
349
  "regions": region_analysis,
350
  "possible_activities": possible_activities,
351
+ "safety_concerns": safety_concerns,
352
  "functional_zones": functional_zones,
353
  "detected_landmarks": [lm for lm in detected_objects_from_landmarks_list if lm.get("is_landmark", False)],
354
  "primary_landmark": primary_landmark,
 
467
  # Spatial analysis
468
  region_analysis_val = self.spatial_analyzer._analyze_regions(detected_objects_main)
469
 
470
+ if current_run_enable_landmark:
471
+ self.logger.info("Using landmark detection logic for YOLO scene")
472
+ return self._handle_no_yolo_detections(
473
+ original_image_pil, image_dims_val, current_run_enable_landmark,
474
+ lighting_info, places365_info
475
+ )
476
+
477
  # Landmark processing and integration
478
  landmark_objects_identified = []
479
  landmark_specific_activities = []
480
  final_landmark_info = {}
481
482
  # If landmark detection is disabled for this run, remove landmark objects
483
  if not current_run_enable_landmark:
484
  detected_objects_main = [obj for obj in detected_objects_main if not obj.get("is_landmark", False)]
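With this change the coordinator's result dict carries a safety_concerns list next to possible_activities, which the app surfaces in its Safety Concerns table. A small sketch of reading the new key defensively follows; the result dict is assumed sample data rather than real pipeline output.

# Sketch of consuming the new "safety_concerns" key from an analysis result.
# The dict below is assumed sample data, not actual coordinator output.
result = {
    "possible_activities": ["crossing the street", "waiting at a signal"],
    "safety_concerns": ["vehicles moving near pedestrians"],
}

# .get() keeps older results without the key working unchanged.
safety_rows = [[concern] for concern in result.get("safety_concerns", [])]
print(safety_rows)  # [['vehicles moving near pedestrians']]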
spatial_analyzer.py CHANGED
@@ -5,6 +5,7 @@ import logging
5
  import traceback
6
  from typing import Dict, List, Tuple, Any, Optional
7
 
 
8
  from region_analyzer import RegionAnalyzer
9
  from object_extractor import ObjectExtractor
10
  from scene_viewpoint_analyzer import SceneViewpointAnalyzer
@@ -31,6 +32,9 @@ class SpatialAnalyzer:
31
  """
32
  try:
33
  # Initialize all subcomponents
34
  self.region_analyzer = RegionAnalyzer()
35
  self.object_extractor = ObjectExtractor(class_names, object_categories)
36
 
@@ -41,12 +45,10 @@ class SpatialAnalyzer:
41
  self.functional_zone_identifier = FunctionalZoneIdentifier(
42
  zone_evaluator=self.zone_evaluator,
43
  scene_zone_identifier=self.scene_zone_identifier,
44
- scene_viewpoint_analyzer=self.scene_viewpoint_analyzer
 
45
  )
46
 
47
- self.class_names = class_names
48
- self.OBJECT_CATEGORIES = object_categories or {}
49
-
50
  self.enhance_descriptor = None
51
 
52
  # Distance threshold for proximity analysis (normalized)
@@ -171,105 +173,6 @@ class SpatialAnalyzer:
171
  logger.error(traceback.format_exc())
172
  return {}
173
 
174
- def _categorize_object(self, obj: Dict) -> str:
175
- """
176
- Categorize a detected object into a functional category for zone identification
177
- Make sure every returned value uses natural-language wording, avoiding underscores and technical identifiers
178
- """
179
- try:
180
- class_id = obj.get("class_id", -1)
181
- class_name = obj.get("class_name", "").lower().strip()
182
-
183
- # Handle traffic light first
184
- # Classify as "traffic light" whenever class_id == 9 or class_name contains "traffic light"
185
- if class_id == 9 or "traffic light" in class_name:
186
- return "traffic light"
187
-
188
- # Prefer the custom OBJECT_CATEGORIES mapping when one is provided
189
- if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
190
- for category, ids in self.OBJECT_CATEGORIES.items():
191
- if class_id in ids:
192
- # Make sure the returned category name uses natural-language wording
193
- return self._clean_category_name(category)
194
-
195
- # COCO class default name
196
- furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
197
- plant_items = ["potted plant"]
198
- electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
199
- vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
200
- person_items = ["person"]
201
- kitchen_items = [
202
- "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
203
- "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
204
- "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"
205
- ]
206
- sports_items = [
207
- "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
208
- "baseball glove", "skateboard", "surfboard", "tennis racket"
209
- ]
210
- personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
211
-
212
- # fallback natural language
213
- if any(item in class_name for item in furniture_items):
214
- return "furniture"
215
- elif any(item in class_name for item in plant_items):
216
- return "plant"
217
- elif any(item in class_name for item in electronic_items):
218
- return "electronics"
219
- elif any(item in class_name for item in vehicle_items):
220
- return "vehicle"
221
- elif any(item in class_name for item in person_items):
222
- return "person"
223
- elif any(item in class_name for item in kitchen_items):
224
- return "kitchen items" # underscore removed
225
- elif any(item in class_name for item in sports_items):
226
- return "sports"
227
- elif any(item in class_name for item in personal_items):
228
- return "personal items" # underscore removed
229
- else:
230
- return "misc"
231
-
232
- except Exception as e:
233
- logger.error(f"Error categorizing object: {str(e)}")
234
- logger.error(traceback.format_exc())
235
- return "misc"
236
-
237
- def _clean_category_name(self, category: str) -> str:
238
- """
239
- Clean a category name by removing underscores and converting it to a more natural format
240
-
241
- Args:
242
- category: original category name
243
-
244
- Returns:
245
- str: cleaned category name
246
- """
247
- try:
248
- if not category:
249
- return "misc"
250
-
251
- # Replace underscores with spaces
252
- cleaned = category.replace('_', ' ')
253
-
254
- # Handle common technical naming patterns
255
- replacements = {
256
- 'kitchen items': 'kitchen items',
257
- 'personal items': 'personal items',
258
- 'traffic light': 'traffic light',
259
- 'misc items': 'misc'
260
- }
261
-
262
- # Apply the specific replacement rules
263
- for old_term, new_term in replacements.items():
264
- if cleaned == old_term:
265
- return new_term
266
-
267
- return cleaned.strip()
268
-
269
- except Exception as e:
270
- logger.warning(f"Error cleaning category name '{category}': {str(e)}")
271
- return "misc"
272
-
273
  def _get_object_categories(self, detected_objects: List[Dict]) -> set:
274
  """
275
  Get the unique object categories from the detected objects
 
5
  import traceback
6
  from typing import Dict, List, Tuple, Any, Optional
7
 
8
+ from object_categories import OBJECT_CATEGORIES
9
  from region_analyzer import RegionAnalyzer
10
  from object_extractor import ObjectExtractor
11
  from scene_viewpoint_analyzer import SceneViewpointAnalyzer
 
32
  """
33
  try:
34
  # Initialize all subcomponents
35
+ self.class_names = class_names
36
+ self.OBJECT_CATEGORIES = object_categories or {}
37
+
38
  self.region_analyzer = RegionAnalyzer()
39
  self.object_extractor = ObjectExtractor(class_names, object_categories)
40
 
 
45
  self.functional_zone_identifier = FunctionalZoneIdentifier(
46
  zone_evaluator=self.zone_evaluator,
47
  scene_zone_identifier=self.scene_zone_identifier,
48
+ scene_viewpoint_analyzer=self.scene_viewpoint_analyzer,
49
+ object_categories=self.OBJECT_CATEGORIES
50
  )
51
52
  self.enhance_descriptor = None
53
 
54
  # Distance threshold for proximity analysis (normalized)
 
173
  logger.error(traceback.format_exc())
174
  return {}
175
176
  def _get_object_categories(self, detected_objects: List[Dict]) -> set:
177
  """
178
  Get the unique object categories from the detected objects
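The practical effect of the spatial_analyzer change is that object categorization now lives in FunctionalZoneIdentifier, which receives the category mapping at construction time instead of redefining it. A reduced construction sketch follows; the None collaborators rely on the constructor's defaults and stand in for the real analyzers.

# Reduced sketch of the new injection: OBJECT_CATEGORIES is passed in rather
# than redefined inside SpatialAnalyzer. Collaborators are left as None here.
from object_categories import OBJECT_CATEGORIES
from functional_zone_identifier import FunctionalZoneIdentifier

zone_identifier = FunctionalZoneIdentifier(
    zone_evaluator=None,
    scene_zone_identifier=None,
    scene_viewpoint_analyzer=None,
    object_categories=OBJECT_CATEGORIES,
)
print(bool(zone_identifier.OBJECT_CATEGORIES))  # True when the mapping is non-empty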
ui_manager.py ADDED
@@ -0,0 +1,683 @@
1
+ import gradio as gr
2
+ from typing import Dict, List, Any, Optional, Tuple
3
+ import matplotlib.pyplot as plt
4
+
5
+ from detection_model import DetectionModel
6
+ from style import Style
7
+
8
+ class UIManager:
9
+ """
10
+ Manages all UI-related functionality for the VisionScout application.
11
+ Handles Gradio interface creation, component definitions, and event binding.
12
+ """
13
+
14
+ def __init__(self):
15
+ """Initialize the UI Manager."""
16
+ self.available_models = None
17
+ self.model_choices = []
18
+ self.class_choices_formatted = []
19
+ self._setup_model_choices()
20
+
21
+ def _setup_model_choices(self):
22
+ """Setup model choices for dropdowns."""
23
+ try:
24
+ self.available_models = DetectionModel.get_available_models()
25
+ self.model_choices = [model["model_file"] for model in self.available_models]
26
+ except ImportError:
27
+ # Fallback model choices if DetectionModel is not available
28
+ self.model_choices = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]
29
+
30
+ # Setup class choices
31
+ self.class_choices_formatted = [f"{id}: {name}" for id, name in self.get_all_classes()]
32
+
+ def get_all_classes(self):
+ """
+ Gets all available COCO classes.
+
+ Returns:
+ List[Tuple[int, str]]: List of (class_id, class_name) tuples
+ """
+ # Try to get from a loaded model first
+ try:
+ # This will be injected by the main app when processors are available
+ if hasattr(self, '_image_processor') and self._image_processor and self._image_processor.model_instances:
+ for model_instance in self._image_processor.model_instances.values():
+ if model_instance and model_instance.is_model_loaded:
+ try:
+ # Ensure class_names is a dict {id: name}
+ if isinstance(model_instance.class_names, dict):
+ return sorted([(int(idx), name) for idx, name in model_instance.class_names.items()])
+ except Exception as e:
+ print(f"Error getting class names from model: {e}")
+ except Exception:
+ pass
+
+ # Fallback to standard COCO (ensure keys are ints)
+ default_classes = {
+ 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
+ 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
+ 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
+ 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
+ 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
+ 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
+ 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
+ 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
+ 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
+ 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
+ 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
+ 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
+ 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
+ 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
+ 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
+ }
+ return sorted(default_classes.items())
+
+ def set_image_processor(self, image_processor):
+ """
+ Set the image processor reference for dynamic class retrieval.
+
+ Args:
+ image_processor: The ImageProcessor instance
+ """
+ self._image_processor = image_processor
+
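Editor's note: a minimal, self-contained sketch (not part of the committed file) of the duck-typed contract that get_all_classes() relies on once set_image_processor() has been called. The Dummy* classes and the ui_manager module name are assumptions for illustration; the sketch also assumes the project's detection_model and style modules are importable, since UIManager() touches DetectionModel during construction.

# Editor's sketch, not part of the commit.
from ui_manager import UIManager

class DummyModel:
    # the attributes get_all_classes() actually inspects
    is_model_loaded = True
    class_names = {0: "person", 2: "car"}   # {id: name}

class DummyProcessor:
    model_instances = {"yolov8m.pt": DummyModel()}

ui = UIManager()
print(len(ui.get_all_classes()))        # 80 entries: COCO fallback before injection
ui.set_image_processor(DummyProcessor())
print(ui.get_all_classes())             # [(0, 'person'), (2, 'car')]: read from the "model"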
+ def get_css_styles(self):
+ """
+ Get CSS styles for the interface.
+
+ Returns:
+ str: CSS styles
+ """
+ try:
+ return Style.get_css()
+ except ImportError:
+ # Fallback CSS if Style module is not available
+ return """
+ .app-header {
+ text-align: center;
+ padding: 2rem 0 3rem 0;
+ background: linear-gradient(135deg, #f0f9ff, #e1f5fe);
+ }
+ .section-heading {
+ font-size: 1.2rem;
+ font-weight: bold;
+ color: #2D3748;
+ margin: 1rem 0 0.5rem 0;
+ }
+ .detect-btn {
+ background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
+ color: white !important;
+ border: none !important;
+ border-radius: 8px !important;
+ }
+ """
+
+ def get_model_description(self, model_name):
+ """
+ Get model description for the given model name.
+
+ Args:
+ model_name: Name of the model
+
+ Returns:
+ str: Model description
+ """
+ try:
+ return DetectionModel.get_model_description(model_name)
+ except ImportError:
+ return f"Model: {model_name}"
+
+ def create_header(self):
+ """
+ Create the application header.
+
+ Returns:
+ gr.HTML: Header HTML component
+ """
+ return gr.HTML("""
+ <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
+ <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
+ <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Object Detection and Scene Understanding</h2>
+ <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
+ <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
+ </div>
+ <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
+ <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
+ <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images may not be supported.
+ <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG</a> before uploading if needed.
+ </p>
+ </div>
+ </div>
+ """)
+
+ def create_footer(self):
+ """
+ Create the application footer.
+
+ Returns:
+ gr.HTML: Footer HTML component
+ """
+ return gr.HTML("""
+ <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
+ <div style="margin-bottom: 15px;">
+ <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
+ </div>
+ <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
+ <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
+ <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
+ <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
+ </a>
+ </div>
+ </div>
+ """)
+
+ def create_image_tab(self):
+ """
+ Create the image processing tab with all components.
+
+ Returns:
+ Dict: Dictionary containing all image tab components
+ """
+ components = {}
+
+ with gr.Tab("Image Processing"):
+ components['current_image_model'] = gr.State("yolov8m.pt")
+
+ with gr.Row(equal_height=False):
+ # Left Column: Image Input & Controls
+ with gr.Column(scale=4, elem_classes="input-panel"):
+ with gr.Group():
+ gr.HTML('<div class="section-heading">Upload Image</div>')
+ components['image_input'] = gr.Image(
+ type="pil",
+ label="Upload an image",
+ elem_classes="upload-box"
+ )
+
+ with gr.Accordion("Image Analysis Settings", open=False):
+ components['image_model_dropdown'] = gr.Dropdown(
+ choices=self.model_choices,
+ value="yolov8m.pt",
+ label="Select Model",
+ info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
+ )
+
+ components['image_model_info'] = gr.Markdown(
+ self.get_model_description("yolov8m.pt")
+ )
+
+ components['image_confidence'] = gr.Slider(
+ minimum=0.1, maximum=0.9, value=0.25, step=0.05,
+ label="Confidence Threshold",
+ info="Minimum confidence for displaying a detected object"
+ )
+
+ components['use_llm'] = gr.Checkbox(
+ label="Use LLM for enhanced scene descriptions",
+ value=True,
+ info="Provides more detailed and natural language descriptions (may increase processing time)"
+ )
+
+ components['use_landmark_detection'] = gr.Checkbox(
+ label="Use CLIP for Landmark Detection",
+ value=False,
+ info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
+ )
+
+ with gr.Accordion("Filter Classes", open=False):
+ gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
+ with gr.Row():
+ components['people_btn'] = gr.Button("People", size="sm")
+ components['vehicles_btn'] = gr.Button("Vehicles", size="sm")
+ components['animals_btn'] = gr.Button("Animals", size="sm")
+ components['objects_btn'] = gr.Button("Common Objects", size="sm")
+
+ components['image_class_filter'] = gr.Dropdown(
+ choices=self.class_choices_formatted,
+ multiselect=True,
+ label="Select Classes to Display",
+ info="Leave empty to show all detected objects"
+ )
+
+ components['image_detect_btn'] = gr.Button(
+ "Analyze Image",
+ variant="primary",
+ elem_classes="detect-btn"
+ )
+
+ # How to use section
+ with gr.Group(elem_classes="how-to-use"):
+ gr.HTML('<div class="section-heading">How to Use (Image)</div>')
+ gr.Markdown("""
+ 1. Upload an image or use the camera
+ 2. *(Optional)* Adjust settings like confidence threshold or model size (n = fast, m = balanced, x = accurate)
+ 3. In **Analysis Settings**, you can:
+ * Uncheck **Use LLM** to skip enhanced descriptions (faster)
+ * Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
+ * Filter object classes to focus on specific types of objects *(optional)*
+ 4. Click the **Analyze Image** button
+
+ **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
+ """)
+
+ # Image Examples
+ gr.Examples(
+ examples=[
+ "room_04.jpg",
+ "street_04.jpg",
+ "street_05.jpg",
+ "landmark_Louvre_01.jpg"
+ ],
+ inputs=components['image_input'],
+ label="Example Images"
+ )
+
+ gr.HTML("""
+ <div style="text-align: center; margin-top: 8px; padding: 6px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #e2e8f0;">
+ <p style="font-size: 12px; color: #718096; margin: 0;">
+ 📷 Sample images sourced from <a href="https://unsplash.com" target="_blank" style="color: #3182ce; text-decoration: underline;">Unsplash</a>
+ </p>
+ </div>
+ """)
+
+ # Right Column: Image Results
+ with gr.Column(scale=6, elem_classes="output-panel"):
+ with gr.Tabs(elem_classes="tabs"):
+ # Detection Result Tab
+ with gr.Tab("Detection Result"):
+ components['image_result_image'] = gr.Image(
+ type="pil",
+ label="Detection Result"
+ )
+ gr.HTML('<div class="section-heading">Detection Details</div>')
+ components['image_result_text'] = gr.Textbox(
+ label=None,
+ lines=10,
+ elem_id="detection-details",
+ container=False
+ )
+
+ # Scene Understanding Tab
+ with gr.Tab("Scene Understanding"):
+ gr.HTML('<div class="section-heading">Scene Analysis</div>')
+
+ # Info details
+ gr.HTML("""
+ <details class="info-details" style="margin: 5px 0 15px 0;">
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
+ 🔍 The AI Vision Scout Report: Click for important notes about this analysis
+ </summary>
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
+ <p style="font-size: 13px; color: #718096; margin: 0;">
+ <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
+ Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
+ Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
+ </p>
+ </div>
+ </details>
+ """)
+
+ gr.HTML('''
+ <div style="margin-top: 5px; padding: 6px 10px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #63b3ed; font-size: 12px; margin-bottom: 10px;">
+ <p style="margin: 0; color: #4a5568;">
+ <b>Note:</b> AI descriptions may vary slightly with each generation, reflecting the creative nature of AI. This is similar to how a person might use different words each time they describe the same image. Processing time may be longer during first use or when analyzing complex scenes, as the LLM enhancement requires additional computational resources.
+ </p>
+ </div>
+ ''')
+
+ components['image_scene_description_html'] = gr.HTML(
+ label=None,
+ elem_id="scene_analysis_description_text"
+ )
+
+ # Original Scene Analysis accordion
+ with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
+ components['image_llm_description'] = gr.HTML(
+ label=None,
+ elem_id="original_scene_description_text"
+ )
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
+ components['image_activities_list'] = gr.Dataframe(
+ headers=["Activity"],
+ datatype=["str"],
+ row_count=5,
+ col_count=1,
+ wrap=True
+ )
+
+ with gr.Column(scale=1):
+ gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
+ components['image_safety_list'] = gr.Dataframe(
+ headers=["Concern"],
+ datatype=["str"],
+ row_count=5,
+ col_count=1,
+ wrap=True
+ )
+
+ gr.HTML('<div class="section-heading">Functional Zones</div>')
+ components['image_zones_json'] = gr.JSON(
+ label=None,
+ elem_classes="json-box"
+ )
+
+ gr.HTML('<div class="section-heading">Lighting Conditions</div>')
+ components['image_lighting_info'] = gr.JSON(
+ label=None,
+ elem_classes="json-box"
+ )
+
+ # Statistics Tab
+ with gr.Tab("Statistics"):
+ with gr.Row():
+ with gr.Column(scale=3, elem_classes="plot-column"):
+ gr.HTML('<div class="section-heading">Object Distribution</div>')
+ components['image_plot_output'] = gr.Plot(
+ label=None,
+ elem_classes="large-plot-container"
+ )
+ with gr.Column(scale=2, elem_classes="stats-column"):
+ gr.HTML('<div class="section-heading">Detection Statistics</div>')
+ components['image_stats_json'] = gr.JSON(
+ label=None,
+ elem_classes="enhanced-json-display"
+ )
+
+ return components
+
+ def create_video_tab(self):
+ """
+ Create the video processing tab with all components.
+
+ Returns:
+ Dict: Dictionary containing all video tab components
+ """
+ components = {}
+
+ with gr.Tab("Video Processing"):
+ with gr.Row(equal_height=False):
+ # Left Column: Video Input & Controls
+ with gr.Column(scale=4, elem_classes="input-panel"):
+ with gr.Group():
+ gr.HTML('<div class="section-heading">Video Input</div>')
+
+ # Input type selection
+ components['video_input_type'] = gr.Radio(
+ ["upload", "url"],
+ label="Input Method",
+ value="upload",
+ info="Choose how to provide the video"
+ )
+
+ # File upload
+ with gr.Group(elem_id="upload-video-group"):
+ components['video_input'] = gr.Video(
+ label="Upload a video file (MP4, AVI, MOV)",
+ sources=["upload"],
+ visible=True
+ )
+
+ # URL input
+ with gr.Group(elem_id="url-video-group"):
+ components['video_url_input'] = gr.Textbox(
+ label="Enter video URL (YouTube or direct video link)",
+ placeholder="https://www.youtube.com/watch?v=...",
+ visible=False,
+ elem_classes="custom-video-url-input"
+ )
+ gr.HTML("""
+ <div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
+ <p style="margin: 0; color: #4b5563;">
+ Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes. Due to YouTube's anti-bot protection, some videos may not be downloadable. For protected videos, please upload a local video file instead.
+ </p>
+ </div>
+ """)
+
+ with gr.Accordion("Video Analysis Settings", open=True):
+ components['video_model_dropdown'] = gr.Dropdown(
+ choices=self.model_choices,
+ value="yolov8n.pt",
+ label="Select Model (Video)",
+ info="Faster models (like 'n') are recommended"
+ )
+ components['video_confidence'] = gr.Slider(
+ minimum=0.1, maximum=0.9, value=0.4, step=0.05,
+ label="Confidence Threshold (Video)"
+ )
+ components['video_process_interval'] = gr.Slider(
+ minimum=1, maximum=60, value=10, step=1,
+ label="Processing Interval (Frames)",
+ info="Analyze every Nth frame (higher value = faster)"
+ )
+
+ components['video_process_btn'] = gr.Button(
+ "Process Video",
+ variant="primary",
+ elem_classes="detect-btn"
+ )
+
+ # How to use section
+ with gr.Group(elem_classes="how-to-use"):
+ gr.HTML('<div class="section-heading">How to Use (Video)</div>')
+ gr.Markdown("""
+ 1. Choose your input method: Upload a file or enter a URL.
+ 2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
+ 3. Click "Process Video". **Processing can take a significant amount of time.**
+ 4. The annotated video and summary will appear on the right when finished.
+ """)
+
+ # Video examples
+ gr.HTML('<div class="section-heading">Example Videos</div>')
+ gr.HTML("""
+ <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
+ <p style="font-size: 14px; color: #4A5568; margin: 0;">
+ Upload any video containing objects that YOLO can detect. For testing, find sample videos
+ <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
+ </p>
+ </div>
+ """)
+
+ # Right Column: Video Results
+ with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
+ gr.HTML("""
+ <div class="section-heading">Video Result</div>
+ <details class="info-details" style="margin: 5px 0 15px 0;">
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
+ 🎬 Video Processing Notes
+ </summary>
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
+ <p style="font-size: 13px; color: #718096; margin: 0;">
+ The processed video includes bounding boxes around detected objects. For longer videos,
+ consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
+ </p>
+ </div>
+ </details>
+ """)
+
+ components['video_output'] = gr.Video(
+ label="Processed Video",
+ elem_classes="video-output-container"
+ )
+
+ gr.HTML('<div class="section-heading">Processing Summary</div>')
+ components['video_summary_text'] = gr.HTML(
+ label=None,
+ elem_id="video-summary-html-output"
+ )
+
+ gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
+ components['video_stats_json'] = gr.JSON(
+ label=None,
+ elem_classes="video-stats-display"
+ )
+
+ return components
+
+ def get_filter_button_mappings(self):
+ """
+ Get the class ID mappings for filter buttons.
+
+ Returns:
+ Dict: Dictionary containing class ID lists for different categories
+ """
+ available_classes_list = self.get_all_classes()
+
+ return {
+ 'people_classes_ids': [0],
+ 'vehicles_classes_ids': [1, 2, 3, 4, 5, 6, 7, 8],
+ 'animals_classes_ids': list(range(14, 24)),
+ 'common_objects_ids': [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73],
+ 'available_classes_list': available_classes_list
+ }
+
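Editor's note: a small standalone sketch (not part of the committed file) showing how these ID lists select entries from get_all_classes() and become the "id: name" strings the class-filter dropdown expects; the click handlers in _setup_event_listeners() further down apply the same formatting. The ui_manager module name is an assumption.

# Editor's sketch, not part of the commit.
from ui_manager import UIManager

ui = UIManager()
mappings = ui.get_filter_button_mappings()
classes = mappings['available_classes_list']   # [(0, 'person'), (1, 'bicycle'), ...]
vehicle_labels = [f"{cid}: {name}" for cid, name in classes
                  if cid in mappings['vehicles_classes_ids']]
print(vehicle_labels)   # ['1: bicycle', '2: car', ..., '8: boat'] with the COCO fallback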
+ def create_interface(self,
+ handle_image_upload_fn,
+ handle_video_upload_fn,
+ download_video_from_url_fn):
+ """
+ Create the complete Gradio interface.
+
+ Args:
+ handle_image_upload_fn: Function to handle image upload
+ handle_video_upload_fn: Function to handle video upload
+ download_video_from_url_fn: Function to download video from URL
+
+ Returns:
+ gr.Blocks: Complete Gradio interface
+ """
+ css = self.get_css_styles()
+
+ with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
+
+ # Header
+ with gr.Group(elem_classes="app-header"):
+ self.create_header()
+
+ # Main Content with Tabs
+ with gr.Tabs(elem_classes="tabs"):
+
+ # Image Processing Tab
+ image_components = self.create_image_tab()
+
+ # Video Processing Tab
+ video_components = self.create_video_tab()
+
+ # Footer
+ self.create_footer()
+
+ # Setup Event Listeners
+ self._setup_event_listeners(
+ image_components,
+ video_components,
+ handle_image_upload_fn,
+ handle_video_upload_fn
+ )
+
+ return demo
+
+ def _setup_event_listeners(self,
+ image_components,
+ video_components,
+ handle_image_upload_fn,
+ handle_video_upload_fn):
+ """
+ Setup all event listeners for the interface.
+
+ Args:
+ image_components: Dictionary of image tab components
+ video_components: Dictionary of video tab components
+ handle_image_upload_fn: Function to handle image upload
+ handle_video_upload_fn: Function to handle video upload
+ """
+ # Image Model Change Handler
+ image_components['image_model_dropdown'].change(
+ fn=lambda model: (model, self.get_model_description(model)),
+ inputs=[image_components['image_model_dropdown']],
+ outputs=[image_components['current_image_model'], image_components['image_model_info']]
+ )
+
+ # Image Filter Buttons
+ filter_mappings = self.get_filter_button_mappings()
+ available_classes_list = filter_mappings['available_classes_list']
+ people_classes_ids = filter_mappings['people_classes_ids']
+ vehicles_classes_ids = filter_mappings['vehicles_classes_ids']
+ animals_classes_ids = filter_mappings['animals_classes_ids']
+ common_objects_ids = filter_mappings['common_objects_ids']
+
+ image_components['people_btn'].click(
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids],
+ outputs=image_components['image_class_filter']
+ )
+ image_components['vehicles_btn'].click(
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids],
+ outputs=image_components['image_class_filter']
+ )
+ image_components['animals_btn'].click(
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids],
+ outputs=image_components['image_class_filter']
+ )
+ image_components['objects_btn'].click(
+ lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids],
+ outputs=image_components['image_class_filter']
+ )
+
+ # Video Input Type Change Handler
+ video_components['video_input_type'].change(
+ fn=lambda input_type: [
+ # Show/hide file upload
+ gr.update(visible=(input_type == "upload")),
+ # Show/hide URL input
+ gr.update(visible=(input_type == "url"))
+ ],
+ inputs=[video_components['video_input_type']],
+ outputs=[video_components['video_input'], video_components['video_url_input']]
+ )
+
+ # Image Detect Button Click Handler
+ image_components['image_detect_btn'].click(
+ fn=handle_image_upload_fn,
+ inputs=[
+ image_components['image_input'],
+ image_components['image_model_dropdown'],
+ image_components['image_confidence'],
+ image_components['image_class_filter'],
+ image_components['use_llm'],
+ image_components['use_landmark_detection']
+ ],
+ outputs=[
+ image_components['image_result_image'],
+ image_components['image_result_text'],
+ image_components['image_stats_json'],
+ image_components['image_plot_output'],
+ image_components['image_scene_description_html'],
+ image_components['image_llm_description'],
+ image_components['image_activities_list'],
+ image_components['image_safety_list'],
+ image_components['image_zones_json'],
+ image_components['image_lighting_info']
+ ]
+ )
+
+ # Video Process Button Click Handler
+ video_components['video_process_btn'].click(
+ fn=handle_video_upload_fn,
+ inputs=[
+ video_components['video_input'],
+ video_components['video_url_input'],
+ video_components['video_input_type'],
+ video_components['video_model_dropdown'],
+ video_components['video_confidence'],
+ video_components['video_process_interval']
+ ],
+ outputs=[
+ video_components['video_output'],
+ video_components['video_summary_text'],
+ video_components['video_stats_json']
+ ]
+ )
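
Editor's note: to round off the listing, a hedged sketch of how an application entry point might wire this class up. It is not part of the commit: the handler functions are placeholders whose only guaranteed property is the argument and return arity implied by the event bindings above, and the ui_manager module name is assumed.

# Editor's sketch, not part of the commit.
from ui_manager import UIManager

def handle_image_upload(image, model_name, confidence, class_filter, use_llm, use_landmark):
    # placeholder: a real handler must return 10 values, one per declared output component
    return [None] * 10

def handle_video_upload(video, video_url, input_type, model_name, confidence, interval):
    # placeholder: a real handler must return 3 values (video path, summary HTML, stats dict)
    return None, "", {}

def download_video_from_url(url):
    # placeholder: accepted by create_interface() even though only the two handlers
    # above are bound to events in _setup_event_listeners()
    return None

ui_manager = UIManager()
demo = ui_manager.create_interface(
    handle_image_upload_fn=handle_image_upload,
    handle_video_upload_fn=handle_video_upload,
    download_video_from_url_fn=download_video_from_url,
)

if __name__ == "__main__":
    demo.launch()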