_base_ = [ # 'mmdet3d::_base_/datasets/nus-3d.py', '../../../configs/_base_/default_runtime.py' ] custom_imports = dict(imports=['projects.DETR3D.detr3d']) # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] voxel_size = [0.2, 0.2, 8] img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], bgr_to_rgb=False) # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) # this means type='DETR3D' will be processed as 'mmdet3d.DETR3D' default_scope = 'mmdet3d' model = dict( type='DETR3D', use_grid_mask=True, data_preprocessor=dict( type='Det3DDataPreprocessor', **img_norm_cfg, pad_size_divisor=32), img_backbone=dict( type='mmdet.ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN2d', requires_grad=False), norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True)), img_neck=dict( type='mmdet.FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs='on_output', num_outs=4, relu_before_extra_convs=True), pts_bbox_head=dict( type='DETR3DHead', num_query=900, num_classes=10, in_channels=256, sync_cls_avg_factor=True, with_box_refine=True, as_two_stage=False, transformer=dict( type='Detr3DTransformer', decoder=dict( type='Detr3DTransformerDecoder', num_layers=6, return_intermediate=True, transformerlayers=dict( type='mmdet.DetrTransformerDecoderLayer', attn_cfgs=[ dict( type='MultiheadAttention', # mmcv. embed_dims=256, num_heads=8, dropout=0.1), dict( type='Detr3DCrossAtten', pc_range=point_cloud_range, num_points=1, embed_dims=256) ], feedforward_channels=512, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')))), bbox_coder=dict( type='NMSFreeCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], pc_range=point_cloud_range, max_num=300, voxel_size=voxel_size, num_classes=10), positional_encoding=dict( type='mmdet.SinePositionalEncoding', num_feats=128, normalize=True, offset=-0.5), loss_cls=dict( type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='mmdet.L1Loss', loss_weight=0.25), loss_iou=dict(type='mmdet.GIoULoss', loss_weight=0.0)), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='mmdet.FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), # ↓ Fake cost. This is just to get compatible with DETR head iou_cost=dict(type='mmdet.IoUCost', weight=0.0), pc_range=point_cloud_range)))) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' test_transforms = [ dict( type='RandomResize3D', scale=(1600, 900), ratio_range=(1., 1.), keep_ratio=True) ] train_transforms = [dict(type='PhotoMetricDistortion3D')] + test_transforms backend_args = None train_pipeline = [ dict( type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6, backend_args=backend_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), dict(type='MultiViewWrapper', transforms=train_transforms), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadMultiViewImageFromFiles', to_float32=True, num_views=6, backend_args=backend_args), dict(type='MultiViewWrapper', transforms=test_transforms), dict(type='Pack3DDetInputs', keys=['img']) ] metainfo = dict(classes=class_names) data_prefix = dict( pts='', CAM_FRONT='samples/CAM_FRONT', CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', CAM_BACK='samples/CAM_BACK', CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', CAM_BACK_LEFT='samples/CAM_BACK_LEFT') train_dataloader = dict( batch_size=1, num_workers=4, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type=dataset_type, data_root=data_root, ann_file='nuscenes_infos_train.pkl', pipeline=train_pipeline, load_type='frame_based', metainfo=metainfo, modality=input_modality, test_mode=False, data_prefix=data_prefix, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', backend_args=backend_args)) val_dataloader = dict( batch_size=1, num_workers=4, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file='nuscenes_infos_val.pkl', load_type='frame_based', pipeline=test_pipeline, metainfo=metainfo, modality=input_modality, test_mode=True, data_prefix=data_prefix, box_type_3d='LiDAR', backend_args=backend_args)) test_dataloader = val_dataloader val_evaluator = dict( type='NuScenesMetric', data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', metric='bbox', backend_args=backend_args) test_evaluator = val_evaluator optim_wrapper = dict( type='OptimWrapper', optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), paramwise_cfg=dict(custom_keys={'img_backbone': dict(lr_mult=0.1)}), clip_grad=dict(max_norm=35, norm_type=2), ) # learning policy param_scheduler = [ dict( type='LinearLR', start_factor=1.0 / 3, by_epoch=False, begin=0, end=500), dict( type='CosineAnnealingLR', by_epoch=True, begin=0, end=24, T_max=24, eta_min_ratio=1e-3) ] total_epochs = 24 train_cfg = dict( type='EpochBasedTrainLoop', max_epochs=total_epochs, val_interval=2) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') default_hooks = dict( checkpoint=dict( type='CheckpointHook', interval=1, max_keep_ckpts=1, save_last=True)) load_from = 'ckpts/fcos3d.pth' # setuptools 65 downgrades to 58. # In mmlab-node we use setuptools 61 but occurs NO errors vis_backends = [dict(type='TensorboardVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')