{ "name": "blt_7b", "dump_dir": "/checkpoints/blt_7b", "seed": 42, "debug_dynamo": false, "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": null, "steps": 240000, "max_steps": null, "data": { "s3_profile": "blt", "batch_size": 4, "seq_len": 4096, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "async_persist_type": "approximate", "prefetch_size": 200, "preprocess_dir": "/corpora/entropy_preprocess", "dataset_files": null, "entropy_model_name": "transformer_100m", "arrow_batch_size": 20, "buffer_size": 512, "file_format": "arrow", "pad_to_max_length": true, "max_encoder_seq_length": 24576, "enable_byte_ngrams": false, "add_patches": true, "tokenizer_args": { "name": "blt", "init_kwargs": { "bpe_tokenizer_path": "/tokenizers/tokenizer_final_32k.minus_inf_ws.model" } }, "patcher_args": { "patching_mode": "entropy", "patching_device": "cuda", "entropy_model_checkpoint_dir": null, "realtime_patching": false, "threshold": 1.335442066192627, "threshold_add": null, "max_patch_length": null, "patch_size": 4.5, "patching_batch_size": 1, "device": "cuda", "monotonicity": false, "log_time": false } }, "optim": { "lr": 0.0004, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 2000, "lr_min_ratio": 0.01, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5 }, "model": { "dim": 512, "n_layers": 8, "head_dim": null, "n_heads": 8, "n_kv_heads": null, "ffn_dim_multiplier": 1.0, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 500000.0, "rope_use_fp32_in_outer_product": true, "init_base_std": null, "init_std_factor": "current_depth", "max_seqlen": 4096, "attn_impl": "xformers", "attn_bias_type": "block_causal", "eos_id": 2, "seed": 42, "vocab_size": 260, "weight_tying": false, "patch_in_forward": true, "dim_token": null, "dim_global": 4096, "dim_local_decoder": 1280, "dim_local_encoder": 1280, "n_layers_global": 32, "n_layers_local_decoder": 6, "n_layers_local_encoder": 1, "patch_size": 4.5, "patching_mode": "entropy", "patching_threshold": 1.335442066192627, "patching_threshold_add": null, "monotonicity": false, "patching_batch_size": 1, "patching_device": "cuda", "max_patch_length": null, "tie_local_encoder_decoder_logits": false, "use_local_encoder_transformer": true, "encoder_lm_loss": false, "max_encoder_seq_length": 24576, "pad_to_max_length": true, "encoder_enable_byte_ngrams": false, "encoder_enable_byte_group_hash": false, "ngram_vocab_sizes": null, "cross_attn_encoder": true, "cross_attn_decoder": true, "cross_attn_window_encoder": null, "cross_attn_window_decoder": null, "cross_attn_k": 4, "cross_attn_nheads": 20, "cross_attn_all_layers_decoder": true, "cross_attn_all_layers_encoder": false, "cross_attn_use_flex_attention": true, "cross_attn_init_by_pooling": true, "encoder_hash_byte_group_size": [ 3, 4, 5, 6, 7, 8 ], "encoder_hash_byte_group_vocab": 500002, "encoder_hash_byte_group_nb_functions": 1, "log_patch_lengths": false, "non_linearity": "swiglu", "use_rope": true, "recompute_fc1_out": false, "recompute_fc3_out": false, "recompute_attn": false, "custom_bwd": false, "layer_ckpt": "none", "init_use_gaussian": true, "init_use_depth": "current", "alpha_depth": "disabled", "max_length": 4096, "norm_affine": true, "pre_norm": true, "norm_type": "rmsnorm", "dropout": 0.0, "output_size": -1, "architecture": "vanilla", "share_encoder_decoder_emb": true, "global_local_decoder_residual_layer": null, "tokenize_with_bpe_delimiter": false, "patching_thresholds_str": null, "tie_local_encoder_decoder": false, "encoder_preds_low_entropy_toks": null, "encoder_preds_random_toks": null, "dim_token_emb": null, "dim_patch_emb": null, "encoder_ngram_table_dir": null, "encoder_ngram_to_size_str": null, "entropy_model_checkpoint_dir": null, "entropy_model_is_ngram_model": false, "downsampling_by_pooling": "max", "n_heads_global": 32, "n_heads_local_decoder": 20, "n_heads_local_encoder": 20, "n_kv_heads_global": null, "conv_kernel_size": null, "local_attention_window_len": 512, "sequence_parallel": false, "loss_parallel": false, "fuse_sequence_parallel": false, "use_fsdp": true, "attn_to_keep": "all", "pm_size": 0, "full_logging_n_layers": 4 }, "entropy_model": null, "train_entropy_model": false, "distributed": { "dp_shard": 1, "dp_replicate": 256, "tp_size": 1, "selective_activation_checkpointing": true, "compile": false, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "allow_bf16_reduced_precision_reduction": true, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver" }, "env": { "MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1" }, "checkpoint": { "dump": { "every": 1000, "keep": 1 }, "eval": { "every": 100000, "keep": -1 }, "path": "/checkpoints/blt_7b", "init_ckpt_path": null, "continue_training_from_init": false, "s3_profile": null }, "profiling": { "run": false, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4 }, "logging": { "freq": 10, "acc_freq": null, "wandb": { "job_type": "train", "dir": null, "project": "blt", "entity": "blt", "tags": null, "group": null, "name": "blt_7b", "notes": null, "config_exclude_keys": null, "config_include_keys": null, "anonymous": null, "mode": null, "allow_val_change": null, "resume": null, "force": null, "tensorboard": null, "sync_tensorboard": null, "monitor_gym": null, "save_code": null, "id": null, "fork_from": null, "resume_from": null } }, "async_eval_gpus": null, "eval": null, "eval_on_gpus": 8 }