sparsh35 committed on
Commit
cf1b2dc
·
verified ·
1 Parent(s): c1ca3f4

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ easydel-model.parameters filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # BaseTrainer
3
+
4
+ ## 🚀 Trained With [EasyDeL](https://github.com/erfanzar/EasyDeL)
5
+
6
+ EasyDeL is an open-source framework designed to enhance and streamline the training process of machine learning
7
+ models. With a primary focus on JAX, EasyDeL aims to provide convenient and effective solutions for
8
+ training and serving Flax/JAX models on TPU/GPU.
9
+
10
+ ## 📦 Installation & Usage
11
+
12
+ ```python
13
+ from easydel import AutoEasyDeLModelForCausalLM
14
+ from jax import numpy as jnp, lax
15
+
16
+ model = AutoEasyDeLModelForCausalLM.from_pretrained(
17
+ f"REPO_ID/BaseTrainer",
18
+ dtype=...,
19
+ param_dtype=...,
20
+ precision=lax.Precision("fastest"),
21
+ auto_shard_model=True,
22
+ )
23
+ ```
24
+
25
+ ## 🔧 Training Configuration
26
+
27
+ ### Model Details
28
+ - **Architecture**: qwen2
29
+ - **Platform**: TPU
30
+ - **Number of Devices**: 16
31
+
32
+ ### Training Parameters
33
+ - **Learning Rate**: 5e-05 → 5e-06
34
+ - **Optimizer**: adamw
35
+ - **Scheduler**: cosine
36
+ - **Warmup Steps**: 160
37
+ - **Weight Decay**: 0.02
38
+ - **Loss Config**: LossConfig(
39
+ ignore_index: -100
40
+ label_smoothing: 0.0
41
+ z_loss: 0.0
42
+ loss_normalizing_factor: 'NUM_REAL_TARGET_TOKENS'
43
+ num_labels: None
44
+ problem_type: None
45
+ divide_weight_sum: False
46
+ shift_tokens: True
47
+ break_on_nan: True
48
+ reduction: None
49
+ num_classification_labels: None
50
+ classification_problem_type: None
51
+ )
52
+
53
+ ### Training Setup
54
+ - **Epochs**: 2
55
+ - **Batch Size**: 8
56
+ - **Sequence Length**: 4096
57
+ - **Dtype**: `jax.numpy.bfloat16`
58
+ - **Params Dtype**: `jax.numpy.bfloat16`
59
+
60
+ ### Advanced Configuration
61
+ - **Gradient Checkpointing**: not set (empty value in `config.json`)
62
+ - **Gradient Accumulation Steps**: 1
63
+ - **Max Training Steps**: None
64
+ - **Max Evaluation Steps**: None
65
+ - **Training Duration**: 7 hours
66
+
67
+ ### Sharding Configuration
68
+ ```python
69
+ # Partition Rules
70
+ ( ('model/embed_tokens/embedding', PartitionSpec('tp', ('fsdp', 'sp'))),
71
+ ( 'self_attn/(q_proj|k_proj|v_proj)/kernel',
72
+ PartitionSpec(('fsdp', 'sp'), 'tp')),
73
+ ('self_attn/o_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
74
+ ('mlp/gate_proj/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
75
+ ('mlp/down_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
76
+ ('mlp/up_proj/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
77
+ ('input_layernorm/kernel', PartitionSpec(None,)),
78
+ ('post_attention_layernorm/kernel', PartitionSpec(None,)),
79
+ ('model/norm/kernel', PartitionSpec(None,)),
80
+ ('lm_head/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
81
+ ('.*', PartitionSpec(None,)))
82
+ ```
83
+
84
+ ---
85
+ *Generated with EasyDeL v0.1.2*
config.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "attn_mechanism": "splash",
7
+ "axis_dims": [
8
+ 1,
9
+ 8,
10
+ 1,
11
+ 2
12
+ ],
13
+ "axis_names": [
14
+ "dp",
15
+ "fsdp",
16
+ "tp",
17
+ "sp"
18
+ ],
19
+ "backend": null,
20
+ "bits": null,
21
+ "blocksize_b": 1,
22
+ "blocksize_k": 128,
23
+ "blocksize_q": 128,
24
+ "bos_token_id": 151643,
25
+ "dcn_axis_dims": null,
26
+ "easy_method": "train",
27
+ "embd_pdrop": 0.0,
28
+ "eos_token_id": 151643,
29
+ "fcm_max_ratio": 0.0,
30
+ "fcm_min_ratio": 0.0,
31
+ "flash_attention_backward_pass_impl": "triton",
32
+ "freq_max_position_embeddings": 4096,
33
+ "gradient_checkpointing": "",
34
+ "hardware_abstraction": false,
35
+ "head_dim": 128,
36
+ "hidden_act": "silu",
37
+ "hidden_size": 5120,
38
+ "initializer_range": 0.02,
39
+ "intermediate_size": 13824,
40
+ "kv_cache_quantization_blocksize": 64,
41
+ "kv_cache_quantization_method": "None",
42
+ "kv_cache_sharding_sequence_axis_name": "sp",
43
+ "mask_max_position_embeddings": 4096,
44
+ "max_position_embeddings": 131072,
45
+ "max_window_layers": 48,
46
+ "model_type": "qwen2",
47
+ "num_attention_heads": 40,
48
+ "num_hidden_layers": 48,
49
+ "num_key_value_heads": 8,
50
+ "number_rep_kv": 1,
51
+ "pallas_k_block_size": 128,
52
+ "pallas_m_block_size": 128,
53
+ "pallas_n_block_size": 128,
54
+ "partition_axis": [
55
+ [
56
+ "fsdp",
57
+ "dp"
58
+ ],
59
+ "sp",
60
+ "sp",
61
+ "tp",
62
+ "sp",
63
+ "tp",
64
+ null,
65
+ null,
66
+ null,
67
+ null,
68
+ "tp",
69
+ "sp",
70
+ null
71
+ ],
72
+ "platform": "jax",
73
+ "pretraining_tp": 1,
74
+ "quantization_blocksize": 64,
75
+ "quantization_method": "None",
76
+ "quantization_pattern": ".*",
77
+ "resid_pdrop": 0.0,
78
+ "rms_norm_eps": 1e-05,
79
+ "rope_scaling": null,
80
+ "rope_theta": 10000.0,
81
+ "scan_attention_layers": false,
82
+ "scan_layers": true,
83
+ "scan_mlp_chunk_size": 1024,
84
+ "scan_ring_attention": true,
85
+ "sequence_axis_name": "sp",
86
+ "shard_attention_computation": true,
87
+ "sliding_window": null,
88
+ "tie_word_embeddings": false,
89
+ "torch_dtype": "bfloat16",
90
+ "transformers_version": "4.50.3",
91
+ "use_cache": true,
92
+ "use_scan_mlp": false,
93
+ "use_sharded_kv_caching": false,
94
+ "use_sharding_constraint": false,
95
+ "use_sliding_window": false,
96
+ "vocab_size": 151667
97
+ }
easydel-model.parameters ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bfbcc8a85552c0d0e8cb5f569f43b3d432f7d68bd128d9f965047a41b63ba04
3
+ size 29532004240
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151646,
4
+ "do_sample": true,
5
+ "eos_token_id": 151643,
6
+ "temperature": 0.6,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.50.3"
9
+ }