import unsloth
from unsloth import FastLanguageModel
import os
import zipfile
import pandas as pd
from datasets import Dataset
import torch
from transformers import TrainingArguments
from trl import SFTTrainer


def main():
    # 1) Load 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 768
    dtype = torch.float16
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
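    # load_in_4bit=True quantises the frozen base weights via bitsandbytes
    # (QLoRA-style); only the LoRA adapters added below are trained.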

    # 2) Ensure pad token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
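    # (SmolLM2 base checkpoints typically ship without a dedicated pad token,
    # hence the EOS fallback above.)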

    # 3) Load instruction-response dataset
    df = pd.read_json("cleaned_instruction_response.jsonl", lines=True)
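    # Expected JSONL schema, one object per line, e.g.:
    #   {"instruction": "...", "response": "..."}
    # Each pair is joined with a blank line and terminated with the EOS token below.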
    # Rebuild clean training text
    df["text"] = df.apply(
        lambda row: row["instruction"].strip() + "\n\n" + row["response"].strip() + tokenizer.eos_token,
        axis=1
    )

    # Convert to Hugging Face Dataset
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"βœ… Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 4) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
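    # With r=8 and lora_alpha=32 the adapter update is scaled by alpha/r = 4;
    # only these low-rank matrices receive gradients while the 4-bit base stays frozen.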

    # 5) Tokenization
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens
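    # Labels are a straight copy of input_ids (standard causal-LM objective).
    # Note that padded positions are not masked to -100 here, so the loss is
    # also computed on padding/EOS tokens.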
    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

    # 6) Training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
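    # Effective batch size is 8 per device (batch 8 x 1 accumulation step).
    # Evaluation and checkpointing happen once per epoch, and the checkpoint with
    # the lowest eval_loss is reloaded at the end (load_best_model_at_end=True).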

    # 7) Train
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
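    # The datasets are already tokenized (input_ids / attention_mask / labels),
    # so SFTTrainer should use them as-is instead of re-tokenizing a text field
    # (behaviour may vary slightly across TRL versions).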
print("πŸš€ Starting training...")
trainer.train()
print("βœ… Training complete.")
# 8) Save and zip
final_dir = "./output_model_final"
os.makedirs(final_dir, exist_ok=True)
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)
zip_path = "model.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
for root, _, files in os.walk(final_dir):
for fname in files:
full = os.path.join(root, fname)
rel = os.path.relpath(full, final_dir)
zf.write(full, rel)
print(f"βœ… Model zipped β†’ {zip_path}")


if __name__ == "__main__":
    main()