{
  "resolution_mode": "native",
  "min_tokens": 256,
  "max_tokens": 16384,
  "patch_size": 14,
  "resize_factor": 2,
  "spatial_merge_size": 1,
  "temporal_patch_size": 2,
  "num_hidden_layers": 32,
  "num_attention_heads": 24,
  "hidden_size": 1920,
  "intermediate_size": 7680,
  "pe_type": "rope2d",
  "norm_type": "RMSNorm",
  "hidden_act": "SwiGLU",
  "init_method": "xavier",
  "image_mean": [0.485, 0.456, 0.406],
  "image_std": [0.229, 0.224, 0.225]
}