import os
import zipfile

# FastLanguageModel comes from unsloth (not trl); Unsloth recommends importing it
# before transformers/trl so its runtime patches are applied first.
from unsloth import FastLanguageModel

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer


def main():
    # 1) Load 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
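    # Note: with load_in_4bit=True the base weights are quantized to 4-bit
    # (bitsandbytes NF4), so only the LoRA adapters added in step 4 are trained
    # in higher precision on top of the frozen quantized base (a QLoRA-style setup).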
    # 2) Ensure pad_token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
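    # SmolLM2's tokenizer does not necessarily ship a dedicated pad token, and the
    # padding="max_length" call in step 5 requires one, so the EOS token is reused.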
    # 3) Load and format dataset
    df = pd.read_json("data.jsonl", lines=True)
    df["text"] = df.apply(
        lambda row: (
            "### Instruction:\n"
            + row["instruction"].strip()
            + "\n\n### Response:\n"
            + row["response"].strip()
            + tokenizer.eos_token
        ),
        axis=1,
    )
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")
    # 4) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
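    # r=8 adds rank-8 adapter matrices to each attention and MLP projection listed
    # above; with lora_alpha=32 the adapter update is scaled by alpha/r = 4.
    # use_gradient_checkpointing trades extra compute for lower activation memory.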
    # 5) Tokenization with labels
    train_seq_length = 512
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens
    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
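    # Note: labels are a verbatim copy of input_ids, so the loss is also computed
    # over the prompt and padding tokens, not just the response. A common refinement
    # (not applied here) is to set label ids at padded positions to -100 so the
    # cross-entropy loss ignores them.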
    # 6) Define training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
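    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    # = 8 per device. Because eval/save run once per epoch and load_best_model_at_end
    # is set, the checkpoint with the lowest eval_loss is restored after training.
    # Caveat: newer transformers releases rename evaluation_strategy to eval_strategy,
    # so this argument may need adjusting depending on the installed version.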
    # 7) Initialize and run SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
    print("Starting training...")
    trainer.train()
    print("Training finished.")
    # 8) Save and zip model
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")
    zip_path = "model.zip"
    print(f"Zipping {final_dir} → {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"Successfully zipped model to {zip_path}")

if __name__ == "__main__":
    main()
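
# Usage sketch (illustrative, not part of this script's run): the adapter saved in
# ./output_model_final can be reloaded for a quick generation test with Unsloth,
# assuming the same environment, e.g.:
#
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name="./output_model_final",
#       max_seq_length=2048,
#       load_in_4bit=True,
#   )
#   FastLanguageModel.for_inference(model)
#   prompt = "### Instruction:\nSummarize LoRA in one sentence.\n\n### Response:\n"
#   inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#   print(tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True))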