# Final train.py with JSONL input and EOS-as-pad (no stop_sequences)
import unsloth # must be first
import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from trl import SFTTrainer
import os
import zipfile
# 1) Load Unsloth model + tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="HuggingFaceTB/SmolLM2-1.7B",
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)
# 2) Reuse the existing eos_token as pad_token
eos = tokenizer.eos_token # should be "<|endoftext|>"
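# Optional sanity check (assumption: the SmolLM2 base tokenizer defines "<|endoftext|>" as EOS):
assert eos is not None, "Tokenizer has no eos_token; set an explicit pad_token instead."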
tokenizer.pad_token = eos
model.config.pad_token_id = tokenizer.eos_token_id
# 3) Load & format your dataset from JSONL, always ending responses with EOS
# Read the JSONL we generated (one JSON object per line with "instruction" & "response")
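# Illustrative shape of one line in data.jsonl (hypothetical values, not taken from the real file):
# {"instruction": "Explain what a LoRA adapter is.", "response": "A LoRA adapter adds small trainable matrices ..."}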
df = pd.read_json("data.jsonl", lines=True)
df["text"] = df.apply(
    lambda row: (
        "### Instruction:\n"
        + row["instruction"].strip()
        + "\n\n### Response:\n"
        + row["response"].strip()
        + eos
    ),
    axis=1,
)
dataset = Dataset.from_pandas(df[["text"]])
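# Optional spot check: the tail of a formatted example should show the Response section ending in EOS.
print(dataset[0]["text"][-120:])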
# 4) Apply LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=8,               # LoRA rank
    lora_alpha=32,     # LoRA scaling factor
    lora_dropout=0.05,
    bias="none",
)
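# Optional: confirm that only the LoRA adapter weights are trainable (PEFT helper method).
model.print_trainable_parameters()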
# 5) Tokenize
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
tokenized_dataset = dataset.map(tokenize, batched=True)
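# Optional truncation check (assumption: max_length=512 should cover most formatted examples).
lengths = [len(tokenizer(t)["input_ids"]) for t in dataset["text"]]
print(f"Longest example: {max(lengths)} tokens; truncated at 512: {sum(l > 512 for l in lengths)}")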
# 6) Training arguments
training_args = TrainingArguments(
    output_dir="./output_model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    fp16=True,
)
# 7) Initialize SFTTrainer (no stop_sequences here)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
)
# 8) Train!
trainer.train()
# 9) Save the adapter
output_dir = "./output_model"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
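# Optional: also save the tokenizer so the adapter directory can be reloaded on its own.
tokenizer.save_pretrained(output_dir)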
# 10) Zip the adapter directory for download
zip_path = "/home/user/app/model.zip"
try:
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, files in os.walk(output_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, output_dir)
                z.write(full, rel)
    print(f"Zipped model to {zip_path}")
except Exception as e:
    print(f"Failed to zip model: {e}")