---
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
# NOTE(review): the label above says "DINO Giant" and "1024 token length", but the encoder
# configured below is DINOv2-Large-shaped (hidden_size 1024, 24 layers) and the VAE uses
# num_latents 512 — confirm the run name is intentional.

training:
  # Effectively "train until stopped"; regrouped from 10_0000_0000 to conventional
  # thousands separators — identical integer value (1e9).
  steps: 1_000_000_000
  use_amp: true
  amp_type: "bf16"
  # Written with an explicit decimal point: YAML 1.1 loaders (PyYAML, hence OmegaConf)
  # resolve the bare form "1e-4" as a *string*, not a float.
  base_lr: 1.0e-4
  gradient_clip_val: 1.0
  gradient_clip_algorithm: "norm"
  every_n_train_steps: 2000 # 5000
  val_check_interval: 50 # 4096
  limit_val_batches: 16

dataset:
  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
  params:
    #! Base setting
    batch_size: 2
    num_workers: 8
    val_num_workers: 4

    # Data: preprocessed sample lists (train and val share the same mini set here)
    train_data_list: tools/mini_trainset/preprocessed
    val_data_list: tools/mini_trainset/preprocessed

    #! Image loading
    cond_stage_key: "image" # image / text / image_text
    image_size: 518
    # Normalization anchors are re-used by the logger callback (*mean / *std).
    mean: &mean [0.5, 0.5, 0.5]
    std: &std [0.5, 0.5, 0.5]

    #! Point cloud sampling
    # pc_size / pc_sharpedge_size anchors are re-used by the VAE first_stage_config.
    pc_size: &pc_size 81920
    pc_sharpedge_size: &pc_sharpedge_size 0
    sharpedge_label: &sharpedge_label true
    return_normal: true

    #! Augmentation
    padding: true

model:
  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
  params:
    first_stage_key: "surface"
    cond_stage_key: "image"
    scale_by_std: false
    # Anchor re-used as the VAE scale_factor below.
    z_scale_factor: &z_scale_factor 1.0039506158752403
    torch_compile: false

    # ema_config:
    #   ema_model: LitEma
    #   ema_decay: 0.999
    #   ema_inference: false

    first_stage_config:
      target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
      params:
        # Anchor re-used as the denoiser input_size below.
        num_latents: &num_latents 512
        embed_dim: 64
        num_freqs: 8
        include_pi: false
        heads: 16
        width: 1024
        num_encoder_layers: 8
        num_decoder_layers: 16
        qkv_bias: false
        qk_norm: true
        scale_factor: *z_scale_factor
        geo_decoder_mlp_expand_ratio: 4
        geo_decoder_downsample_ratio: 1
        geo_decoder_ln_post: true
        point_feats: 4
        # Point-cloud sizes shared with the dataset section via anchors.
        pc_size: *pc_size
        pc_sharpedge_size: *pc_sharpedge_size

    # Image-conditioning encoder (re-indented to the file's 2-space convention;
    # all values unchanged).
    cond_stage_config:
      target: hy3dshape.models.conditioner.SingleImageEncoder
      params:
        main_image_encoder:
          type: DinoImageEncoder # hidden 1024 / 24 layers — DINOv2-Large-shaped config
          kwargs:
            config:
              attention_probs_dropout_prob: 0.0
              drop_path_rate: 0.0
              hidden_act: gelu
              hidden_dropout_prob: 0.0
              hidden_size: 1024
              image_size: 518
              initializer_range: 0.02
              layer_norm_eps: 1.0e-6
              layerscale_value: 1.0
              mlp_ratio: 4
              model_type: dinov2
              num_attention_heads: 16
              num_channels: 3
              num_hidden_layers: 24
              patch_size: 14
              qkv_bias: true
              torch_dtype: float32
              use_swiglu_ffn: false
            image_size: 518
            use_cls_token: true

    denoiser_cfg:
      target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
      params:
        input_size: *num_latents
        in_channels: 64
        hidden_size: 768
        context_dim: 1024
        depth: 6
        num_heads: 12
        qk_norm: true
        # NOTE(review): 1370 = 37*37 + 1, i.e. the 518/14 patch grid plus CLS token
        # from the encoder above — presumably the conditioning sequence length; confirm.
        text_len: 1370
        with_decoupled_ca: false
        use_attention_pooling: false
        qk_norm_type: 'rms'
        qkv_bias: false
        use_pos_emb: false
        num_moe_layers: 3
        num_experts: 4
        moe_top_k: 2

    scheduler_cfg:
      transport:
        target: hy3dshape.models.diffusion.transport.create_transport
        params:
          path_type: Linear
          prediction: velocity
      sampler:
        target: hy3dshape.models.diffusion.transport.Sampler
        params: {}
        ode_params:
          sampling_method: euler # dopri5 ...
          num_steps: &num_steps 50

    optimizer_cfg:
      optimizer:
        target: torch.optim.AdamW
        params:
          betas: [0.9, 0.99]
          eps: 1.0e-6
          weight_decay: 1.0e-2

      scheduler:
        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
        params:
          warm_up_steps: 50 # 5000
          f_start: 1.0e-6
          f_min: 1.0e-3
          f_max: 1.0

    pipeline_cfg:
      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline

    image_processor_cfg:
      target: hy3dshape.preprocessors.ImageProcessorV2
      params: {}

# Logging callbacks (re-indented to the file's 2-space convention; values unchanged).
callbacks:
  logger:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
    params:
      step_frequency: 100 # 10000
      num_samples: 1
      sample_times: 1
      # Un-normalization shared with the dataset section via anchors.
      mean: *mean
      std: *std
      bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
      octree_depth: 8
      num_chunks: 50000
      mc_level: 0.0

  file_loggers:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
    params:
      step_frequency: 50 # 5000
      test_data_path: "tools/mini_testset/images.json"