from unsloth import FastLanguageModel  # FastLanguageModel lives in unsloth (imported first so its patches apply), not trl

import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer

def main():
    # 1) Load 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
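
    # Note: load_in_4bit quantizes the base weights to 4-bit (bitsandbytes under
    # Unsloth's loader) while compute runs in the fp16 dtype set above.
    # max_seq_length=2048 is the model-side context cap; the training tokenizer
    # further down truncates to 512 tokens.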

    # 2) Ensure pad_token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
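    # Many causal LM tokenizers ship without a dedicated pad token, so the eos
    # token is reused for padding; the padding="max_length" tokenization below
    # relies on this.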

    # 3) Load and format dataset
    df = pd.read_json("data.jsonl", lines=True)
    df["text"] = df.apply(
        lambda row: (
            "### Instruction:\n"
            + row["instruction"].strip()
            + "\n\n### Response:\n"
            + row["response"].strip()
            + tokenizer.eos_token
        ),
        axis=1,
    )
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 4) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
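    # r is the LoRA rank (size of the low-rank update) and lora_alpha its scaling
    # factor; the target modules cover the attention projections (q/k/v/o) and the
    # MLP projections (gate/up/down), so adapters are trained throughout each block.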

    # 5) Tokenization with labels
    train_seq_length = 512

    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
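
    # Labels are a straight copy of input_ids, which is what a causal LM loss
    # expects. Because the pad token is the eos token, padded positions also count
    # towards the loss here; a common alternative (not used in this script) is to
    # mask them out with -100, which the Hugging Face loss ignores, e.g.:
    #   tokens["labels"] = [
    #       [tok if m == 1 else -100 for tok, m in zip(ids, mask)]
    #       for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    #   ]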

    # 6) Define training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
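    # Effective batch size is 8 (per_device_train_batch_size * gradient_accumulation_steps).
    # Note that recent transformers releases rename evaluation_strategy to
    # eval_strategy; keep whichever spelling your installed version expects.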

    # 7) Initialize and run SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # 8) Save and zip model
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")

    zip_path = "model.zip"
    print(f"Zipping {final_dir} → {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"Successfully zipped model to {zip_path}")


if __name__ == "__main__":
    main()
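

# Usage sketch (commented out, not part of the training run): one way to load the
# saved adapter for inference afterwards, assuming the same base model and that
# peft is installed. Paths and dtypes are illustrative.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   from peft import PeftModel
#
#   base = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B")
#   tokenizer = AutoTokenizer.from_pretrained("./output_model_final")
#   model = PeftModel.from_pretrained(base, "./output_model_final")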