name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
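# Trainer options (values are verbatim from this config; the notes are a
# best-effort reading of the hy3dshape trainer):
# - steps uses YAML 1.1 digit-group underscores, so 10_0000_0000 parses as
#   1,000,000,000 total optimizer steps.
# - mixed precision runs in bf16; gradients are clipped to norm 1.0.
# - every_n_train_steps / val_check_interval / limit_val_batches follow
#   PyTorch Lightning naming (checkpoint and validation cadence); the trailing
#   "# 5000" / "# 4096" comments appear to be the original full-scale values,
#   reduced here for the mini dataset.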
training:
  steps: 10_0000_0000
  use_amp: true
  amp_type: "bf16"
  base_lr: 1e-4
  gradient_clip_val: 1.0
  gradient_clip_algorithm: "norm"
  every_n_train_steps: 2000 # 5000
  val_check_interval: 50 # 4096
  limit_val_batches: 16
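# Data module: AlignedShapeLatentModule appears to pair conditioning images
# with sampled surface point clouds. The &mean / &std / &pc_size / ... entries
# are YAML anchors that are re-referenced (via *mean, *pc_size, ...) further
# down in this file. train_data_list / val_data_list point at the bundled
# mini training set here.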
dataset:
  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
  params:
    #! Base setting
    batch_size: 2
    num_workers: 8
    val_num_workers: 4
    # Data
    train_data_list: tools/mini_trainset/preprocessed
    val_data_list: tools/mini_trainset/preprocessed
    #! Image loading
    cond_stage_key: "image" # image / text / image_text
    image_size: 518
    mean: &mean [0.5, 0.5, 0.5]
    std: &std [0.5, 0.5, 0.5]
    #! Point cloud sampling
    pc_size: &pc_size 81920
    pc_sharpedge_size: &pc_sharpedge_size 0
    sharpedge_label: &sharpedge_label true
    return_normal: true
    #! Augmentation
    padding: true
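# Flow-matching diffusion wrapper. first_stage_key / cond_stage_key select
# which batch entries feed the VAE ("surface") and the conditioner ("image").
# z_scale_factor is an anchor reused below as the ShapeVAE scale_factor; the
# value is presumably a precomputed latent normalization constant.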
model:
  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
  params:
    first_stage_key: "surface"
    cond_stage_key: "image"
    scale_by_std: false
    z_scale_factor: &z_scale_factor 1.0039506158752403
    torch_compile: false

    # ema_config:
    #   ema_model: LitEma
    #   ema_decay: 0.999
    #   ema_inference: false
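    # Shape VAE initialized from the tencent/Hunyuan3D-2.1 checkpoint. It maps
    # sampled surface points to 512 latent tokens of dim 64 (num_latents x
    # embed_dim), which is the sequence the denoiser below operates on.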
    first_stage_config:
      target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
      params:
        num_latents: &num_latents 512
        embed_dim: 64
        num_freqs: 8
        include_pi: false
        heads: 16
        width: 1024
        num_encoder_layers: 8
        num_decoder_layers: 16
        qkv_bias: false
        qk_norm: true
        scale_factor: *z_scale_factor
        geo_decoder_mlp_expand_ratio: 4
        geo_decoder_downsample_ratio: 1
        geo_decoder_ln_post: true
        point_feats: 4
        pc_size: *pc_size
        pc_sharpedge_size: *pc_sharpedge_size
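    # Image conditioner: a DINOv2 ViT (hidden_size 1024, 24 layers, patch 14,
    # i.e. the Large variant; the inline comment says "dino large" even though
    # `name` above mentions DINO Giant). At image_size 518 this yields
    # (518/14)^2 = 1369 patch tokens plus one CLS token, which matches
    # text_len: 1370 in denoiser_cfg below.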
    cond_stage_config:
      target: hy3dshape.models.conditioner.SingleImageEncoder
      params:
        main_image_encoder:
          type: DinoImageEncoder # dino large
          kwargs:
            config:
              attention_probs_dropout_prob: 0.0
              drop_path_rate: 0.0
              hidden_act: gelu
              hidden_dropout_prob: 0.0
              hidden_size: 1024
              image_size: 518
              initializer_range: 0.02
              layer_norm_eps: 1.e-6
              layerscale_value: 1.0
              mlp_ratio: 4
              model_type: dinov2
              num_attention_heads: 16
              num_channels: 3
              num_hidden_layers: 24
              patch_size: 14
              qkv_bias: true
              torch_dtype: float32
              use_swiglu_ffn: false
            image_size: 518
            use_cls_token: true
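    # Denoiser: a plain Hunyuan-DiT transformer over the 512 VAE latent tokens
    # (input_size aliases *num_latents), conditioned on the 1370 image tokens
    # (context_dim 1024). num_moe_layers of the 6 blocks swap the FFN for a
    # 4-expert, top-2 mixture-of-experts layer; the exact placement depends on
    # the HunYuanDiTPlain implementation.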
    denoiser_cfg:
      target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
      params:
        input_size: *num_latents
        in_channels: 64
        hidden_size: 768
        context_dim: 1024
        depth: 6
        num_heads: 12
        qk_norm: true
        text_len: 1370
        with_decoupled_ca: false
        use_attention_pooling: false
        qk_norm_type: 'rms'
        qkv_bias: false
        use_pos_emb: false
        num_moe_layers: 3
        num_experts: 4
        moe_top_k: 2
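    # Flow-matching transport: linear interpolation path with velocity
    # prediction. Sampling integrates the ODE with a fixed-step Euler solver
    # over 50 steps (the "# dopri5 ..." comment hints at adaptive alternatives).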
    scheduler_cfg:
      transport:
        target: hy3dshape.models.diffusion.transport.create_transport
        params:
          path_type: Linear
          prediction: velocity
      sampler:
        target: hy3dshape.models.diffusion.transport.Sampler
        params: {}
        ode_params:
          sampling_method: euler # dopri5 ...
          num_steps: &num_steps 50
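    # Optimization: AdamW at base_lr from the training block. Judging by the
    # class name, the scheduler warms the LR factor from f_start to f_max over
    # warm_up_steps and then decays it cosine-style toward f_min.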
    optimizer_cfg:
      optimizer:
        target: torch.optim.AdamW
        params:
          betas: [0.9, 0.99]
          eps: 1.e-6
          weight_decay: 1.e-2
      scheduler:
        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
        params:
          warm_up_steps: 50 # 5000
          f_start: 1.e-6
          f_min: 1.e-3
          f_max: 1.0
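    # Inference-side helpers, presumably used for validation and logging: the
    # flow-matching generation pipeline and the preprocessor applied to
    # conditioning images.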
    pipeline_cfg:
      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline

    image_processor_cfg:
      target: hy3dshape.preprocessors.ImageProcessorV2
      params: {}
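# Logging callbacks: both periodically decode sampled latents to meshes. The
# bounds / octree_depth / num_chunks / mc_level settings look like marching-
# cubes extraction parameters over a [-1.01, 1.01]^3 volume at iso-level 0.0;
# the fixed-set logger renders the images listed in tools/mini_testset/images.json.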
callbacks:
  logger:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
    params:
      step_frequency: 100 # 10000
      num_samples: 1
      sample_times: 1
      mean: *mean
      std: *std
      bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
      octree_depth: 8
      num_chunks: 50000
      mc_level: 0.0

  file_loggers:
    target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
    params:
      step_frequency: 50 # 5000
      test_data_path: "tools/mini_testset/images.json"