yonigozlan
/

sam2.1_hiera_large

@@ -2,7 +2,6 @@
   "architectures": [
     "Sam2VideoModel"
   ],
-  "binarize_mask_from_pts_for_mem_enc": true,
   "enable_occlusion_spatial_embedding": true,
   "enable_temporal_pos_encoding_for_object_pointers": true,
   "fill_hole_area": 8,
@@ -13,7 +12,6 @@
     "dynamic_multimask_stability_delta": 0.05,
     "dynamic_multimask_stability_thresh": 0.98,
     "dynamic_multimask_via_stability": true,
-    "feed_forward_hidden_act": "relu",
     "hidden_act": "gelu",
     "hidden_size": 256,
     "iou_head_depth": 3,
@@ -22,8 +20,7 @@
     "model_type": "",
     "num_attention_heads": 8,
     "num_hidden_layers": 2,
-    "num_multimask_outputs": 3,
-    "two_way_transformer_activation": "relu"
   },
   "mask_downsampler_embed_dim": 256,
   "mask_downsampler_hidden_act": "gelu",
@@ -32,9 +29,6 @@
   "mask_downsampler_stride": 2,
   "mask_downsampler_total_stride": 16,
   "max_object_pointers_in_encoder": 16,
-  "memory_attention_apply_pe_at_cross_attn_keys": true,
-  "memory_attention_apply_pe_at_cross_attn_queries": false,
-  "memory_attention_apply_pe_at_self_attn": false,
   "memory_attention_downsample_rate": 1,
   "memory_attention_dropout": 0.1,
   "memory_attention_feed_forward_hidden_act": "relu",
@@ -52,12 +46,12 @@
   "memory_encoder_output_channels": 64,
   "memory_fuser_embed_dim": 256,
   "memory_fuser_hidden_act": "gelu",
   "memory_fuser_kernel_size": 7,
   "memory_fuser_layer_scale_init_value": 1e-06,
   "memory_fuser_num_layers": 2,
   "memory_fuser_padding": 3,
-  "memory_fuser_use_depthwise_conv": true,
-  "model_type": "sam2",
   "multimask_max_pt_num": 1,
   "multimask_min_pt_num": 0,
   "multimask_output_for_tracking": true,
@@ -65,7 +59,6 @@
   "non_overlap_masks": false,
   "non_overlap_masks_for_mem_enc": false,
   "num_maskmem": 7,
-  "preserve_temporal_direction_in_object_pointers": true,
   "project_temporal_pos_encoding_in_object_pointers": true,
   "prompt_encoder_config": {
     "hidden_act": "gelu",
@@ -81,7 +74,7 @@
   "sigmoid_bias_for_mem_enc": -10.0,
   "sigmoid_scale_for_mem_enc": 20.0,
   "torch_dtype": "float32",
-  "transformers_version": "4.54.0.dev0",
   "vision_config": {
     "backbone_channel_list": [
       1152,
@@ -95,15 +88,25 @@
       "architectures": null,
       "bad_words_ids": null,
       "begin_suppress_tokens": null,
       "bos_token_id": null,
       "chunk_size_feed_forward": 0,
       "cross_attention_hidden_size": null,
       "decoder_start_token_id": null,
-      "dim_mul": 2.0,
       "diversity_penalty": 0.0,
       "do_sample": false,
-      "drop_path_rate": 0.0,
       "early_stopping": false,
       "encoder_no_repeat_ngram_size": 0,
       "eos_token_id": null,
       "exponential_decay_length_penalty": null,
@@ -115,14 +118,16 @@
         33,
         43
       ],
-      "head_mul": 2.0,
       "hidden_act": "gelu",
       "hidden_size": 144,
       "id2label": {
         "0": "LABEL_0",
         "1": "LABEL_1"
       },
-      "image_size": 1024,
       "initializer_range": 0.02,
       "is_decoder": false,
       "is_encoder_decoder": false,
@@ -134,9 +139,16 @@
       "length_penalty": 1.0,
       "max_length": 20,
       "min_length": 0,
       "model_type": "sam2_hiera_det_model",
       "no_repeat_ngram_size": 0,
-      "num_attention_heads": 2,
       "num_beam_groups": 1,
       "num_beams": 1,
       "num_channels": 3,
@@ -146,9 +158,18 @@
       "output_hidden_states": false,
       "output_scores": false,
       "pad_token_id": null,
-      "patch_kernel_size": 7,
-      "patch_padding": 3,
-      "patch_stride": 4,
       "prefix": null,
       "problem_type": null,
       "pruned_heads": {},
@@ -161,12 +182,6 @@
       "return_dict": true,
       "return_dict_in_generate": false,
       "sep_token_id": null,
-      "stages": [
-        2,
-        6,
-        36,
-        4
-      ],
       "suppress_tokens": null,
       "task_specific_params": null,
       "temperature": 1.0,
@@ -184,7 +199,7 @@
         7,
         7
       ],
-      "window_spec": [
         8,
         4,
         16,
@@ -206,7 +221,6 @@
       ]
     ],
     "fpn_hidden_size": 256,
-    "fpn_interpolation_mode": "nearest",
     "fpn_kernel_size": 1,
     "fpn_padding": 0,
     "fpn_stride": 1,
@@ -214,7 +228,6 @@
       2,
       3
     ],
-    "fuse_type": "sum",
     "hidden_act": "gelu",
     "initializer_range": 0.02,
     "layer_norm_eps": 1e-06,

   "architectures": [
     "Sam2VideoModel"
   ],
   "enable_occlusion_spatial_embedding": true,
   "enable_temporal_pos_encoding_for_object_pointers": true,
   "fill_hole_area": 8,
     "dynamic_multimask_stability_delta": 0.05,
     "dynamic_multimask_stability_thresh": 0.98,
     "dynamic_multimask_via_stability": true,
     "hidden_act": "gelu",
     "hidden_size": 256,
     "iou_head_depth": 3,
     "model_type": "",
     "num_attention_heads": 8,
     "num_hidden_layers": 2,
+    "num_multimask_outputs": 3
   },
   "mask_downsampler_embed_dim": 256,
   "mask_downsampler_hidden_act": "gelu",
   "mask_downsampler_stride": 2,
   "mask_downsampler_total_stride": 16,
   "max_object_pointers_in_encoder": 16,
   "memory_attention_downsample_rate": 1,
   "memory_attention_dropout": 0.1,
   "memory_attention_feed_forward_hidden_act": "relu",
   "memory_encoder_output_channels": 64,
   "memory_fuser_embed_dim": 256,
   "memory_fuser_hidden_act": "gelu",
+  "memory_fuser_intermediate_dim": 1024,
   "memory_fuser_kernel_size": 7,
   "memory_fuser_layer_scale_init_value": 1e-06,
   "memory_fuser_num_layers": 2,
   "memory_fuser_padding": 3,
+  "model_type": "sam2_video",
   "multimask_max_pt_num": 1,
   "multimask_min_pt_num": 0,
   "multimask_output_for_tracking": true,
   "non_overlap_masks": false,
   "non_overlap_masks_for_mem_enc": false,
   "num_maskmem": 7,
   "project_temporal_pos_encoding_in_object_pointers": true,
   "prompt_encoder_config": {
     "hidden_act": "gelu",
   "sigmoid_bias_for_mem_enc": -10.0,
   "sigmoid_scale_for_mem_enc": 20.0,
   "torch_dtype": "float32",
+  "transformers_version": "4.56.0.dev0",
   "vision_config": {
     "backbone_channel_list": [
       1152,
       "architectures": null,
       "bad_words_ids": null,
       "begin_suppress_tokens": null,
+      "blocks_per_stage": [
+        2,
+        6,
+        36,
+        4
+      ],
       "bos_token_id": null,
       "chunk_size_feed_forward": 0,
       "cross_attention_hidden_size": null,
       "decoder_start_token_id": null,
       "diversity_penalty": 0.0,
       "do_sample": false,
       "early_stopping": false,
+      "embed_dim_per_stage": [
+        144,
+        288,
+        576,
+        1152
+      ],
       "encoder_no_repeat_ngram_size": 0,
       "eos_token_id": null,
       "exponential_decay_length_penalty": null,
         33,
         43
       ],
       "hidden_act": "gelu",
       "hidden_size": 144,
       "id2label": {
         "0": "LABEL_0",
         "1": "LABEL_1"
       },
+      "image_size": [
+        1024,
+        1024
+      ],
       "initializer_range": 0.02,
       "is_decoder": false,
       "is_encoder_decoder": false,
       "length_penalty": 1.0,
       "max_length": 20,
       "min_length": 0,
+      "mlp_ratio": 4.0,
       "model_type": "sam2_hiera_det_model",
       "no_repeat_ngram_size": 0,
+      "num_attention_heads": 1,
+      "num_attention_heads_per_stage": [
+        2,
+        4,
+        8,
+        16
+      ],
       "num_beam_groups": 1,
       "num_beams": 1,
       "num_channels": 3,
       "output_hidden_states": false,
       "output_scores": false,
       "pad_token_id": null,
+      "patch_kernel_size": [
+        7,
+        7
+      ],
+      "patch_padding": [
+        3,
+        3
+      ],
+      "patch_stride": [
+        4,
+        4
+      ],
       "prefix": null,
       "problem_type": null,
       "pruned_heads": {},
       "return_dict": true,
       "return_dict_in_generate": false,
       "sep_token_id": null,
       "suppress_tokens": null,
       "task_specific_params": null,
       "temperature": 1.0,
         7,
         7
       ],
+      "window_size_per_stage": [
         8,
         4,
         16,
       ]
     ],
     "fpn_hidden_size": 256,
     "fpn_kernel_size": 1,
     "fpn_padding": 0,
     "fpn_stride": 1,
       2,
       3
     ],
     "hidden_act": "gelu",
     "initializer_range": 0.02,
     "layer_norm_eps": 1e-06,

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d281251cae94754e9c7a2313d45b5f2420cb97024e379f7da3fbcb217a8aabe6
-size 897897680

 version https://git-lfs.github.com/spec/v1
+oid sha256:dc407dce21301fd94abb395c5099b4f2c455fdc8a8f261ac3d0ea6d4cd197230
+size 897897416