yonigozlan HF Staff committed on
Commit
3d6b069
·
verified ·
1 Parent(s): 643cf0c

Upload model

Browse files
Files changed (2) hide show
  1. config.json +41 -28
  2. model.safetensors +2 -2
config.json CHANGED
@@ -2,7 +2,6 @@
2
  "architectures": [
3
  "Sam2VideoModel"
4
  ],
5
- "binarize_mask_from_pts_for_mem_enc": true,
6
  "enable_occlusion_spatial_embedding": true,
7
  "enable_temporal_pos_encoding_for_object_pointers": true,
8
  "fill_hole_area": 8,
@@ -13,7 +12,6 @@
13
  "dynamic_multimask_stability_delta": 0.05,
14
  "dynamic_multimask_stability_thresh": 0.98,
15
  "dynamic_multimask_via_stability": true,
16
- "feed_forward_hidden_act": "relu",
17
  "hidden_act": "gelu",
18
  "hidden_size": 256,
19
  "iou_head_depth": 3,
@@ -22,8 +20,7 @@
22
  "model_type": "",
23
  "num_attention_heads": 8,
24
  "num_hidden_layers": 2,
25
- "num_multimask_outputs": 3,
26
- "two_way_transformer_activation": "relu"
27
  },
28
  "mask_downsampler_embed_dim": 256,
29
  "mask_downsampler_hidden_act": "gelu",
@@ -32,9 +29,6 @@
32
  "mask_downsampler_stride": 2,
33
  "mask_downsampler_total_stride": 16,
34
  "max_object_pointers_in_encoder": 16,
35
- "memory_attention_apply_pe_at_cross_attn_keys": true,
36
- "memory_attention_apply_pe_at_cross_attn_queries": false,
37
- "memory_attention_apply_pe_at_self_attn": false,
38
  "memory_attention_downsample_rate": 1,
39
  "memory_attention_dropout": 0.1,
40
  "memory_attention_feed_forward_hidden_act": "relu",
@@ -52,12 +46,12 @@
52
  "memory_encoder_output_channels": 64,
53
  "memory_fuser_embed_dim": 256,
54
  "memory_fuser_hidden_act": "gelu",
 
55
  "memory_fuser_kernel_size": 7,
56
  "memory_fuser_layer_scale_init_value": 1e-06,
57
  "memory_fuser_num_layers": 2,
58
  "memory_fuser_padding": 3,
59
- "memory_fuser_use_depthwise_conv": true,
60
- "model_type": "sam2",
61
  "multimask_max_pt_num": 1,
62
  "multimask_min_pt_num": 0,
63
  "multimask_output_for_tracking": true,
@@ -65,7 +59,6 @@
65
  "non_overlap_masks": false,
66
  "non_overlap_masks_for_mem_enc": false,
67
  "num_maskmem": 7,
68
- "preserve_temporal_direction_in_object_pointers": true,
69
  "project_temporal_pos_encoding_in_object_pointers": true,
70
  "prompt_encoder_config": {
71
  "hidden_act": "gelu",
@@ -81,7 +74,7 @@
81
  "sigmoid_bias_for_mem_enc": -10.0,
82
  "sigmoid_scale_for_mem_enc": 20.0,
83
  "torch_dtype": "float32",
84
- "transformers_version": "4.54.0.dev0",
85
  "vision_config": {
86
  "backbone_channel_list": [
87
  1152,
@@ -95,15 +88,25 @@
95
  "architectures": null,
96
  "bad_words_ids": null,
97
  "begin_suppress_tokens": null,
 
 
 
 
 
 
98
  "bos_token_id": null,
99
  "chunk_size_feed_forward": 0,
100
  "cross_attention_hidden_size": null,
101
  "decoder_start_token_id": null,
102
- "dim_mul": 2.0,
103
  "diversity_penalty": 0.0,
104
  "do_sample": false,
105
- "drop_path_rate": 0.0,
106
  "early_stopping": false,
 
 
 
 
 
 
107
  "encoder_no_repeat_ngram_size": 0,
108
  "eos_token_id": null,
109
  "exponential_decay_length_penalty": null,
@@ -115,14 +118,16 @@
115
  33,
116
  43
117
  ],
118
- "head_mul": 2.0,
119
  "hidden_act": "gelu",
120
  "hidden_size": 144,
121
  "id2label": {
122
  "0": "LABEL_0",
123
  "1": "LABEL_1"
124
  },
125
- "image_size": 1024,
 
 
 
126
  "initializer_range": 0.02,
127
  "is_decoder": false,
128
  "is_encoder_decoder": false,
@@ -134,9 +139,16 @@
134
  "length_penalty": 1.0,
135
  "max_length": 20,
136
  "min_length": 0,
 
137
  "model_type": "sam2_hiera_det_model",
138
  "no_repeat_ngram_size": 0,
139
- "num_attention_heads": 2,
 
 
 
 
 
 
140
  "num_beam_groups": 1,
141
  "num_beams": 1,
142
  "num_channels": 3,
@@ -146,9 +158,18 @@
146
  "output_hidden_states": false,
147
  "output_scores": false,
148
  "pad_token_id": null,
149
- "patch_kernel_size": 7,
150
- "patch_padding": 3,
151
- "patch_stride": 4,
 
 
 
 
 
 
 
 
 
152
  "prefix": null,
153
  "problem_type": null,
154
  "pruned_heads": {},
@@ -161,12 +182,6 @@
161
  "return_dict": true,
162
  "return_dict_in_generate": false,
163
  "sep_token_id": null,
164
- "stages": [
165
- 2,
166
- 6,
167
- 36,
168
- 4
169
- ],
170
  "suppress_tokens": null,
171
  "task_specific_params": null,
172
  "temperature": 1.0,
@@ -184,7 +199,7 @@
184
  7,
185
  7
186
  ],
187
- "window_spec": [
188
  8,
189
  4,
190
  16,
@@ -206,7 +221,6 @@
206
  ]
207
  ],
208
  "fpn_hidden_size": 256,
209
- "fpn_interpolation_mode": "nearest",
210
  "fpn_kernel_size": 1,
211
  "fpn_padding": 0,
212
  "fpn_stride": 1,
@@ -214,7 +228,6 @@
214
  2,
215
  3
216
  ],
217
- "fuse_type": "sum",
218
  "hidden_act": "gelu",
219
  "initializer_range": 0.02,
220
  "layer_norm_eps": 1e-06,
 
2
  "architectures": [
3
  "Sam2VideoModel"
4
  ],
 
5
  "enable_occlusion_spatial_embedding": true,
6
  "enable_temporal_pos_encoding_for_object_pointers": true,
7
  "fill_hole_area": 8,
 
12
  "dynamic_multimask_stability_delta": 0.05,
13
  "dynamic_multimask_stability_thresh": 0.98,
14
  "dynamic_multimask_via_stability": true,
 
15
  "hidden_act": "gelu",
16
  "hidden_size": 256,
17
  "iou_head_depth": 3,
 
20
  "model_type": "",
21
  "num_attention_heads": 8,
22
  "num_hidden_layers": 2,
23
+ "num_multimask_outputs": 3
 
24
  },
25
  "mask_downsampler_embed_dim": 256,
26
  "mask_downsampler_hidden_act": "gelu",
 
29
  "mask_downsampler_stride": 2,
30
  "mask_downsampler_total_stride": 16,
31
  "max_object_pointers_in_encoder": 16,
 
 
 
32
  "memory_attention_downsample_rate": 1,
33
  "memory_attention_dropout": 0.1,
34
  "memory_attention_feed_forward_hidden_act": "relu",
 
46
  "memory_encoder_output_channels": 64,
47
  "memory_fuser_embed_dim": 256,
48
  "memory_fuser_hidden_act": "gelu",
49
+ "memory_fuser_intermediate_dim": 1024,
50
  "memory_fuser_kernel_size": 7,
51
  "memory_fuser_layer_scale_init_value": 1e-06,
52
  "memory_fuser_num_layers": 2,
53
  "memory_fuser_padding": 3,
54
+ "model_type": "sam2_video",
 
55
  "multimask_max_pt_num": 1,
56
  "multimask_min_pt_num": 0,
57
  "multimask_output_for_tracking": true,
 
59
  "non_overlap_masks": false,
60
  "non_overlap_masks_for_mem_enc": false,
61
  "num_maskmem": 7,
 
62
  "project_temporal_pos_encoding_in_object_pointers": true,
63
  "prompt_encoder_config": {
64
  "hidden_act": "gelu",
 
74
  "sigmoid_bias_for_mem_enc": -10.0,
75
  "sigmoid_scale_for_mem_enc": 20.0,
76
  "torch_dtype": "float32",
77
+ "transformers_version": "4.56.0.dev0",
78
  "vision_config": {
79
  "backbone_channel_list": [
80
  1152,
 
88
  "architectures": null,
89
  "bad_words_ids": null,
90
  "begin_suppress_tokens": null,
91
+ "blocks_per_stage": [
92
+ 2,
93
+ 6,
94
+ 36,
95
+ 4
96
+ ],
97
  "bos_token_id": null,
98
  "chunk_size_feed_forward": 0,
99
  "cross_attention_hidden_size": null,
100
  "decoder_start_token_id": null,
 
101
  "diversity_penalty": 0.0,
102
  "do_sample": false,
 
103
  "early_stopping": false,
104
+ "embed_dim_per_stage": [
105
+ 144,
106
+ 288,
107
+ 576,
108
+ 1152
109
+ ],
110
  "encoder_no_repeat_ngram_size": 0,
111
  "eos_token_id": null,
112
  "exponential_decay_length_penalty": null,
 
118
  33,
119
  43
120
  ],
 
121
  "hidden_act": "gelu",
122
  "hidden_size": 144,
123
  "id2label": {
124
  "0": "LABEL_0",
125
  "1": "LABEL_1"
126
  },
127
+ "image_size": [
128
+ 1024,
129
+ 1024
130
+ ],
131
  "initializer_range": 0.02,
132
  "is_decoder": false,
133
  "is_encoder_decoder": false,
 
139
  "length_penalty": 1.0,
140
  "max_length": 20,
141
  "min_length": 0,
142
+ "mlp_ratio": 4.0,
143
  "model_type": "sam2_hiera_det_model",
144
  "no_repeat_ngram_size": 0,
145
+ "num_attention_heads": 1,
146
+ "num_attention_heads_per_stage": [
147
+ 2,
148
+ 4,
149
+ 8,
150
+ 16
151
+ ],
152
  "num_beam_groups": 1,
153
  "num_beams": 1,
154
  "num_channels": 3,
 
158
  "output_hidden_states": false,
159
  "output_scores": false,
160
  "pad_token_id": null,
161
+ "patch_kernel_size": [
162
+ 7,
163
+ 7
164
+ ],
165
+ "patch_padding": [
166
+ 3,
167
+ 3
168
+ ],
169
+ "patch_stride": [
170
+ 4,
171
+ 4
172
+ ],
173
  "prefix": null,
174
  "problem_type": null,
175
  "pruned_heads": {},
 
182
  "return_dict": true,
183
  "return_dict_in_generate": false,
184
  "sep_token_id": null,
 
 
 
 
 
 
185
  "suppress_tokens": null,
186
  "task_specific_params": null,
187
  "temperature": 1.0,
 
199
  7,
200
  7
201
  ],
202
+ "window_size_per_stage": [
203
  8,
204
  4,
205
  16,
 
221
  ]
222
  ],
223
  "fpn_hidden_size": 256,
 
224
  "fpn_kernel_size": 1,
225
  "fpn_padding": 0,
226
  "fpn_stride": 1,
 
228
  2,
229
  3
230
  ],
 
231
  "hidden_act": "gelu",
232
  "initializer_range": 0.02,
233
  "layer_norm_eps": 1e-06,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d281251cae94754e9c7a2313d45b5f2420cb97024e379f7da3fbcb217a8aabe6
3
- size 897897680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc407dce21301fd94abb395c5099b4f2c455fdc8a8f261ac3d0ea6d4cd197230
3
+ size 897897416