# Training configuration: image-conditioned flow-matching DiT on ShapeVAE latents.
# Every `target` is a dotted import path and `params` holds its keyword
# arguments (presumably consumed by an instantiate-from-config helper — confirm).
#
# NOTE(review): the label below mentions "DINO Giant" and "1024 token length",
# while the image encoder section is commented "dino large" and num_latents is
# 512. The string is kept unchanged; confirm whether the label is still accurate.
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"

training:
  # Same value as the former `10_0000_0000`; written without underscore
  # separators so YAML 1.2 parsers also read an integer instead of a string.
  steps: 1000000000
  use_amp: true
  amp_type: "bf16"
  # Explicit mantissa dot: a bare `1e-4` is resolved as a *string* by YAML 1.1
  # resolvers (e.g. stock PyYAML). Matches the `1.e-6` style used further down.
  base_lr: 1.0e-4
  gradient_clip_val: 1.0
  gradient_clip_algorithm: "norm"
  every_n_train_steps: 2000  # 5000
  val_check_interval: 50  # 4096
  limit_val_batches: 16

dataset:
  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
  params:
    #! Base setting
    batch_size: 2
    num_workers: 8
    val_num_workers: 4

    # Data (train and validation point at the same directory in this config)
    train_data_list: tools/mini_trainset/preprocessed
    val_data_list: tools/mini_trainset/preprocessed

    #! Image loading
    cond_stage_key: "image"  # image / text / image_text
    image_size: 518
    # Anchored so the callbacks section reuses the same normalization values.
    mean: &mean [0.5, 0.5, 0.5]
    std: &std [0.5, 0.5, 0.5]

    #! Point cloud sampling
    # pc_size / pc_sharpedge_size are anchored and reused by first_stage_config.
    pc_size: &pc_size 81920
    pc_sharpedge_size: &pc_sharpedge_size 0
    sharpedge_label: &sharpedge_label true
    return_normal: true

    #! Augmentation
    padding: true

model:
  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
  params:
    first_stage_key: "surface"
    cond_stage_key: "image"
    scale_by_std: false
    # Anchored; reused as the VAE `scale_factor` below.
    z_scale_factor: &z_scale_factor 1.0039506158752403
    torch_compile: false

    # ema_config:
    #   ema_model: LitEma
    #   ema_decay: 0.999
    #   ema_inference: false

    # Shape VAE (weights referenced via `from_pretrained`).
    first_stage_config:
      target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
      params:
        # Anchored; reused as the denoiser `input_size`.
        num_latents: &num_latents 512
        embed_dim: 64
        num_freqs: 8
        include_pi: false
        heads: 16
        width: 1024
        num_encoder_layers: 8
        num_decoder_layers: 16
        qkv_bias: false
        qk_norm: true
        scale_factor: *z_scale_factor
        geo_decoder_mlp_expand_ratio: 4
        geo_decoder_downsample_ratio: 1
        geo_decoder_ln_post: true
        point_feats: 4
        pc_size: *pc_size
        pc_sharpedge_size: *pc_sharpedge_size

    # Image conditioner.
    cond_stage_config:
      target: hy3dshape.models.conditioner.SingleImageEncoder
      params:
        main_image_encoder:
          type: DinoImageEncoder  # dino large
          kwargs:
            config:
              attention_probs_dropout_prob: 0.0
              drop_path_rate: 0.0
              hidden_act: gelu
              hidden_dropout_prob: 0.0
              hidden_size: 1024
              image_size: 518
              initializer_range: 0.02
              layer_norm_eps: 1.e-6
              layerscale_value: 1.0
              mlp_ratio: 4
              model_type: dinov2
              num_attention_heads: 16
              num_channels: 3
              num_hidden_layers: 24
              patch_size: 14
              qkv_bias: true
              torch_dtype: float32
              use_swiglu_ffn: false
            image_size: 518
            use_cls_token: true

    # Denoiser network.
    denoiser_cfg:
      target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
      params:
        input_size: *num_latents
        in_channels: 64
        hidden_size: 768
        context_dim: 1024
        depth: 6
        num_heads: 12
        qk_norm: true
        # NOTE(review): 1370 equals (518 / 14)^2 + 1, i.e. patch tokens plus a
        # CLS token for the encoder settings above — confirm this is the intent.
        text_len: 1370
        with_decoupled_ca: false
        use_attention_pooling: false
        qk_norm_type: 'rms'
        qkv_bias: false
        use_pos_emb: false
        num_moe_layers: 3
        num_experts: 4
        moe_top_k: 2

    # Flow-matching transport and ODE sampler.
    scheduler_cfg:
      transport:
        target: hy3dshape.models.diffusion.transport.create_transport
        params:
          path_type: Linear
          prediction: velocity
      sampler:
        target: hy3dshape.models.diffusion.transport.Sampler
        params: {}
        ode_params:
          sampling_method: euler  # dopri5 ...
          num_steps: &num_steps 50

    optimizer_cfg:
      optimizer:
        target: torch.optim.AdamW
        params:
          betas: [0.9, 0.99]
          eps: 1.e-6
          weight_decay: 1.e-2
      scheduler:
        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
        params:
          warm_up_steps: 50  # 5000
          f_start: 1.e-6
          f_min: 1.e-3
          f_max: 1.0

    pipeline_cfg:
      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline

    image_processor_cfg:
      target: hy3dshape.preprocessors.ImageProcessorV2
      params: {}

callbacks:
  logger:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
    params:
      step_frequency: 100  # 10000
      num_samples: 1
      sample_times: 1
      # Same normalization values as the dataset section (aliases).
      mean: *mean
      std: *std
      bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
      octree_depth: 8
      num_chunks: 50000
      mc_level: 0.0

  file_loggers:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
    params:
      step_frequency: 50  # 5000
      test_data_path: "tools/mini_testset/images.json"