Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

.gitattributes +1 -0
README.md +88 -0
config.json +112 -0
easydel-model.parameters +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+easydel-model.parameters filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+# BaseTrainer
+## 🚀 Trained With [EasyDeL](https://github.com/erfanzar/EasyDeL)
+EasyDeL is an open-source framework designed to enhance and streamline the training of machine learning
+models. With a primary focus on JAX, EasyDeL aims to provide convenient and effective solutions for
+training and serving Flax/JAX models on TPU/GPU.
+## 📦 Installation & Usage
+```python
+from easydel import AutoEasyDeLModelForCausalLM
+from jax import numpy as jnp, lax
+model = AutoEasyDeLModelForCausalLM.from_pretrained(
+    "REPO_ID/BaseTrainer",  # replace REPO_ID with the Hub namespace hosting this model
+    dtype=...,
+    param_dtype=...,
+    precision=lax.Precision("fastest"),
+    auto_shard_model=True,
+)
+```
+## 🔧 Training Configuration
+### Model Details
+- **Architecture**: gemma3_text
+- **Platform**: TPU
+- **Number of Devices**: 16
+### Training Parameters
+- **Learning Rate**: 4e-05 → 4e-06
+- **Optimizer**: adamw
+- **Scheduler**: cosine
+- **Warmup Steps**: 50
+- **Weight Decay**: 0.02
+- **Loss Config**: LossConfig(
+  ignore_index : -100
+  label_smoothing : 0.0
+  z_loss : 0.0
+  loss_normalizing_factor : NUM_REAL_TARGET_TOKENS
+  num_labels : None
+  problem_type : None
+  divide_weight_sum : False
+  shift_tokens : True
+  break_on_nan : True
+  reduction : None
+  num_classification_labels : None
+  classification_problem_type : None
+)
+### Training Setup
+- **Epochs**: 3
+- **Batch Size**: 8
+- **Sequence Length**: 8192
+- **Dtype**: `<class 'jax.numpy.bfloat16'>`
+- **Params Dtype**: `<class 'jax.numpy.bfloat16'>`
+### Advanced Configuration
+- **Gradient Checkpointing**: `""` (empty — none set)
+- **Gradient Accumulation Steps**: 1
+- **Max Training Steps**: None
+- **Max Evaluation Steps**: None
+- **Training Duration**: 7H
+### Sharding Configuration
+```python
+# Partition Rules
+( ('model/embed_tokens/embedding', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('self_attn/q_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
+  ('self_attn/k_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
+  ('self_attn/v_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
+  ('self_attn/o_proj/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('mlp/gate_proj/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('mlp/up_proj/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('mlp/down_proj/kernel', PartitionSpec('tp', ('fsdp', 'sp'))),
+  ('input_layernorm/kernel', PartitionSpec(None,)),
+  ('post_attention_layernorm/kernel', PartitionSpec(None,)),
+  ('pre_feedforward_layernorm/kernel', PartitionSpec(None,)),
+  ('post_feedforward_layernorm/kernel', PartitionSpec(None,)),
+  ('model/norm/kernel', PartitionSpec(None,)),
+  ('lm_head/kernel', PartitionSpec(('fsdp', 'sp'), 'tp')),
+  ('.*', PartitionSpec(None,)))
+```
+---
+*Generated with EasyDeL v0.1.3*

config.json ADDED Viewed

	@@ -0,0 +1,112 @@

+{
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "attn_mechanism": "splash",
+  "axis_dims": [
+    1,
+    -1,
+    2,
+    1
+  ],
+  "axis_names": [
+    "dp",
+    "fsdp",
+    "tp",
+    "sp"
+  ],
+  "backend": null,
+  "bits": null,
+  "blocksize_b": 1,
+  "blocksize_k": 128,
+  "blocksize_q": 128,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dcn_axis_dims": null,
+  "easy_method": "train",
+  "eos_token_id": 1,
+  "fcm_max_ratio": 0.0,
+  "fcm_min_ratio": 0.0,
+  "final_logit_softcapping": null,
+  "flash_attention_backward_pass_impl": "triton",
+  "freq_max_position_embeddings": 8192,
+  "gradient_checkpointing": "",
+  "hardware_abstraction": false,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "kv_cache_quantization_blocksize": 64,
+  "kv_cache_quantization_method": "None",
+  "kv_cache_sharding_sequence_axis_name": "sp",
+  "mask_max_position_embeddings": 8192,
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "pallas_k_block_size": 128,
+  "pallas_m_block_size": 128,
+  "pallas_n_block_size": 128,
+  "partition_axis": {
+    "attention_dim_axis": null,
+    "batch_axis": [
+      "fsdp",
+      "dp"
+    ],
+    "bias_head_sequence_axis": null,
+    "bias_key_sequence_axis": null,
+    "data_parallel_axis": "dp",
+    "expert_axis": "ep",
+    "expert_gate_axis": null,
+    "expert_parallel_axis": "ep",
+    "fully_sharded_data_parallel_axis": "fsdp",
+    "generation_attention_dim_axis": null,
+    "generation_batch_axis": null,
+    "generation_head_axis": "tp",
+    "generation_key_sequence_axis": "sp",
+    "generation_query_sequence_axis": null,
+    "head_axis": "tp",
+    "hidden_state_axis": "tp",
+    "key_sequence_axis": "sp",
+    "mlp_intermediate_axis": "tp",
+    "query_sequence_axis": "sp",
+    "sequence_axis": "sp",
+    "sequence_parallel_axis": "sp",
+    "tensor_parallel_axis": "tp",
+    "vocab_axis": "tp"
+  },
+  "platform": "jax",
+  "precompute_masks": true,
+  "pretraining_tp": 1,
+  "quantization_blocksize": 64,
+  "quantization_method": "None",
+  "quantization_pattern": ".*",
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "scan_attention_layers": false,
+  "scan_layers": false,
+  "scan_mlp_chunk_size": 1024,
+  "scan_ring_attention": true,
+  "sequence_axis_name": "sp",
+  "shard_attention_computation": true,
+  "sliding_window": 1024,
+  "sliding_window_pattern": 6,
+  "transformers_version": "4.50.3",
+  "use_cache": true,
+  "use_scan_mlp": false,
+  "use_sharded_kv_caching": false,
+  "use_sharding_constraint": false,
+  "vocab_size": 262208
+}

easydel-model.parameters ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a5eca75e5f5eefa1ff263feac16d54061573e335f3add37d6f38996bcacd3ca
+size 9103083144