# hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinog518-bf16-lr1e5-4096.yaml
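# Fine-tuning configuration for the Hunyuan3D-2.1 shape-generation DiT: a flow-matching
# denoiser conditioned on single-image DINOv2 features (518 px inputs), trained in bf16
# at base_lr 1e-5 over 4096 shape-VAE latent tokens.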
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"

training:
  steps: 10_0000_0000
  use_amp: true
  amp_type: "bf16"
  base_lr: 1e-5
  gradient_clip_val: 1.0
  gradient_clip_algorithm: "norm"
  every_n_train_steps: 2000 # 5000
  val_check_interval: 50 # 4096
  limit_val_batches: 16
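
# Data: the AlignedShapeLatentModule loads conditioning images (518 px, normalized with
# mean/std 0.5) together with sampled surface point clouds (81920 points with normals)
# from the preprocessed mini training set.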
dataset:
  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
  params:
    #! Base setting
    batch_size: 4
    num_workers: 8
    val_num_workers: 4
    # Data
    train_data_list: tools/mini_trainset/preprocessed
    val_data_list: tools/mini_trainset/preprocessed
    #! Image loading
    cond_stage_key: "image" # image / text / image_text
    image_size: 518
    mean: &mean [0.5, 0.5, 0.5]
    std: &std [0.5, 0.5, 0.5]
    #! Point cloud sampling
    pc_size: &pc_size 81920
    pc_sharpedge_size: &pc_sharpedge_size 0
    sharpedge_label: &sharpedge_label true
    return_normal: true
    #! Augmentation
    padding: true
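
# Model: a flow-matching Diffuser that combines a pretrained ShapeVAE first stage, a
# single-image DINOv2 conditioner, and a HunYuanDiT denoiser. The commented-out
# ema_config block would enable EMA weight tracking if restored.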
model:
  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
  params:
    first_stage_key: "surface"
    cond_stage_key: "image"
    scale_by_std: false
    z_scale_factor: &z_scale_factor 1.0039506158752403
    torch_compile: false
    # ema_config:
    #   ema_model: LitEma
    #   ema_decay: 0.999
    #   ema_inference: false
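
    # First stage: the ShapeVAE encodes the sampled surface points (plus 4 extra point
    # features) into 4096 latent tokens of dimension 64; weights come from
    # tencent/Hunyuan3D-2.1 and latents are rescaled by z_scale_factor.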
    first_stage_config:
      target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
      params:
        num_latents: &num_latents 4096
        embed_dim: 64
        num_freqs: 8
        include_pi: false
        heads: 16
        width: 1024
        num_encoder_layers: 8
        num_decoder_layers: 16
        qkv_bias: false
        qk_norm: true
        scale_factor: *z_scale_factor
        geo_decoder_mlp_expand_ratio: 4
        geo_decoder_downsample_ratio: 1
        geo_decoder_ln_post: true
        point_feats: 4
        pc_size: *pc_size
        pc_sharpedge_size: *pc_sharpedge_size
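
    # Conditioning stage: a single-image encoder wrapping a DINOv2 backbone (518 px
    # input, patch size 14, 24 layers, hidden size 1024); its patch tokens and CLS
    # token supply the cross-attention context for the denoiser below.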
    cond_stage_config:
      target: hy3dshape.models.conditioner.SingleImageEncoder
      params:
        main_image_encoder:
          type: DinoImageEncoder # dino large
          kwargs:
            config:
              attention_probs_dropout_prob: 0.0
              drop_path_rate: 0.0
              hidden_act: gelu
              hidden_dropout_prob: 0.0
              hidden_size: 1024
              image_size: 518
              initializer_range: 0.02
              layer_norm_eps: 1.e-6
              layerscale_value: 1.0
              mlp_ratio: 4
              model_type: dinov2
              num_attention_heads: 16
              num_channels: 3
              num_hidden_layers: 24
              patch_size: 14
              qkv_bias: true
              torch_dtype: float32
              use_swiglu_ffn: false
            image_size: 518
            use_cls_token: true
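
    # Denoiser: HunYuanDiTPlain over the 4096 latent tokens (in_channels 64), 21 blocks
    # at hidden size 2048 with RMS query/key normalization and 6 MoE layers (8 experts,
    # top-2 routing); text_len 1370 matches the DINOv2 token count at 518 px with
    # patch size 14 (37 x 37 patches + CLS).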
    denoiser_cfg:
      target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
      params:
        input_size: *num_latents
        in_channels: 64
        hidden_size: 2048
        context_dim: 1024
        depth: 21
        num_heads: 16
        qk_norm: true
        text_len: 1370
        with_decoupled_ca: false
        use_attention_pooling: false
        qk_norm_type: 'rms'
        qkv_bias: false
        use_pos_emb: false
        num_moe_layers: 6
        num_experts: 8
        moe_top_k: 2
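
    # Flow matching: a linear interpolation path with velocity prediction; sampling uses
    # an Euler ODE solver with 50 steps (dopri5 is noted as an alternative).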
    scheduler_cfg:
      transport:
        target: hy3dshape.models.diffusion.transport.create_transport
        params:
          path_type: Linear
          prediction: velocity
      sampler:
        target: hy3dshape.models.diffusion.transport.Sampler
        params: {}
      ode_params:
        sampling_method: euler # dopri5 ...
        num_steps: &num_steps 50
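
    # Optimization: AdamW plus a warm-up/cosine LR schedule; the f_* values are factors
    # applied to base_lr (1e-5), ramping up from f_start 1e-6 over the first 50 steps
    # and decaying towards f_min 1e-3.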
    optimizer_cfg:
      optimizer:
        target: torch.optim.AdamW
        params:
          betas: [0.9, 0.99]
          eps: 1.e-6
          weight_decay: 1.e-2
      scheduler:
        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
        params:
          warm_up_steps: 50 # 5000
          f_start: 1.e-6
          f_min: 1.e-3
          f_max: 1.0
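
    # Inference pipeline and image preprocessor, presumably used when sampling meshes
    # for the validation/logging callbacks below.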
    pipeline_cfg:
      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
    image_processor_cfg:
      target: hy3dshape.preprocessors.ImageProcessorV2
      params: {}
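
# Callbacks: both loggers export generated meshes during training. The first logs
# samples every 100 steps; the second ("Fix") samples a fixed image list
# (tools/mini_testset/images.json) every 50 steps. Surfaces are extracted at
# mc_level 0.0 within the [-1.01, 1.01] bounds (octree depth 8).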
callbacks:
  logger:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
    params:
      step_frequency: 100 # 10000
      num_samples: 1
      sample_times: 1
      mean: *mean
      std: *std
      bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
      octree_depth: 8
      num_chunks: 50000
      mc_level: 0.0
  file_loggers:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
    params:
      step_frequency: 50 # 5000
      test_data_path: "tools/mini_testset/images.json"
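
# Usage sketch (assumption: the repo's training entry point resolves each `target:` path
# with an instantiate-from-config helper; the snippet below only demonstrates reading
# this file with OmegaConf, which works for any YAML file):
#
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load(
#       "hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinog518-bf16-lr1e5-4096.yaml")
#   print(cfg.model.params.denoiser_cfg.params.hidden_size)  # -> 2048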