import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from trl import SFTTrainer


def main():
    # 1) Load Unsloth 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # 2) Set pad_token = eos_token if not already defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
    # 3) Load and format dataset
    df = pd.read_json("data.jsonl", lines=True)
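    # Append the EOS token so the model learns where a response ends.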
df["text"] = df.apply(
lambda row: (
"### Instruction:\n"
+ row["instruction"].strip()
+ "\n\n### Response:\n"
+ row["response"].strip()
+ tokenizer.eos_token
),
axis=1,
)
full_dataset = Dataset.from_pandas(df[["text"]])
# Split into train and eval
split = full_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]
print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")
    # 4) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
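        # Rank-8 adapters on the attention and MLP projections; effective scaling is lora_alpha / r = 4.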
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
    # 5) Tokenization with labels
    train_seq_length = 512

    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
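        # Labels mirror input_ids; the model shifts them internally for the causal-LM loss.
        # Because pad_token == eos_token, padded positions also contribute to the loss here.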
tokens["labels"] = tokens["input_ids"].copy()
return tokens
tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    # 6) Define training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
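        # load_best_model_at_end requires the evaluation and save strategies to match (both "epoch" here).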
evaluation_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
dataloader_num_workers=2,
)
    # 7) Initialize SFTTrainer
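    # Datasets are pre-tokenized (input_ids/labels), so no dataset_text_field or formatting_func is supplied.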
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
    # 8) Train
    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # 9) Save final adapter and tokenizer
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")
    # 10) Zip the final model
    zip_path = "model.zip"
    print(f"Zipping model directory {final_dir} to {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full_path = os.path.join(root, fname)
                rel_path = os.path.relpath(full_path, final_dir)
                z.write(full_path, rel_path)
    print(f"Successfully zipped model to {zip_path}")
if name == "main": main()