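"""train.py: fine-tune SmolLM2-1.7B with Unsloth + LoRA (SFT).

Reads a chat-formatted data.jsonl (one JSON object per line with a "text"
field), trains LoRA adapters with SFTTrainer, saves the result to
./output_model_final, and zips it to model.zip.
"""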
import unsloth  # import Unsloth first so its runtime patches load before transformers/trl
from unsloth import FastLanguageModel

import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer


def main():
    # 1) Load 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 768
    dtype = torch.float16
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
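    # load_in_4bit quantizes the base weights (QLoRA-style, typically via
    # bitsandbytes), so the 1.7B model plus LoRA state should fit on a single
    # modest GPU; compute runs in the fp16 dtype requested above.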
# 2) Configure special tokens for chat format
special_tokens = {
"bos_token": "<|im_start|>user\n",
"eos_token": "<|im_end|>",
"pad_token": "<|im_end|>",
"additional_special_tokens": [
"<|im_start|>assistant\n"
]
}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
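    # With these tokens, each "text" sample in data.jsonl is expected to look
    # roughly like the following (illustrative example, not real data):
    #   <|im_start|>user
    #   What does this script do?<|im_end|>
    #   <|im_start|>assistant
    #   It fine-tunes SmolLM2 with LoRA.<|im_end|>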
    # 3) Load chat-formatted dataset
    df = pd.read_json("data.jsonl", lines=True)
    # Confirm each sample ends with the end-of-turn token
    assert df["text"].str.endswith("<|im_end|>").all(), "Some samples missing end-of-turn token"

    # 4) Create Hugging Face Dataset and split
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"βœ… Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")
    # 5) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
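    # Optional sanity check via the standard PEFT API: uncomment to report how
    # many parameters the LoRA adapters actually make trainable.
    # model.print_trainable_parameters()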
    # 6) Tokenization function
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens
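    # Note: tokenize_fn copies input_ids straight into labels, so it applies no
    # prompt or padding masking itself (pad and eos share the same id here);
    # any label adjustment is left to the trainer's collator, which can vary
    # between trl versions.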
    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    # 7) Training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
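    # Effective batch size is 8 per device (batch size 8 x 1 accumulation step),
    # and fp16 training matches the fp16 dtype the model was loaded with.
    # eval_strategy= assumes a recent transformers release; older versions name
    # this argument evaluation_strategy=.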
    # 8) Train with SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
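    # The datasets already carry input_ids / attention_mask / labels, so
    # SFTTrainer should skip its own text-column tokenization here; note that
    # newer trl releases rename the tokenizer= argument to processing_class=.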
    print("πŸš€ Starting training...")
    trainer.train()
    print("βœ… Training complete.")

    # 9) Save and zip model
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
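    # For a PEFT-wrapped model, save_pretrained() writes the LoRA adapter
    # weights and config rather than a merged full checkpoint; Unsloth's
    # model.save_pretrained_merged(...) can be used if a merged model is needed.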
    zip_path = "model.zip"
    print(f"πŸ—œ Zipping model β†’ {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"βœ… Model zipped β†’ {zip_path}")


if __name__ == "__main__":
    main()