import os
import zipfile
import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from trl import SFTTrainer
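
# Requires the unsloth, trl, transformers, datasets, and pandas packages
# (compatible versions assumed). Unsloth's docs generally suggest importing
# `unsloth` before `transformers`/`trl` so its runtime patches take effect.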

def main():
    # 1) Load Unsloth 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # 2) Set pad_token = eos_token if not already defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id

    # 3) Load and format dataset
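    # data.jsonl is expected to hold one JSON object per line with string
    # fields "instruction" and "response", e.g. (illustrative):
    #   {"instruction": "Summarize the text ...", "response": "..."}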
    df = pd.read_json("data.jsonl", lines=True)
    df["text"] = df.apply(
        lambda row: (
            "### Instruction:\n"
            + row["instruction"].strip()
            + "\n\n### Response:\n"
            + row["response"].strip()
            + tokenizer.eos_token
        ),
        axis=1,
    )
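    # After this step each "text" entry reads roughly as follows
    # (placeholders in <...>, shown as an illustrative template):
    #   "### Instruction:\n<instruction>\n\n### Response:\n<response><eos_token>"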
    full_dataset = Dataset.from_pandas(df[["text"]])

    # Split into train and eval
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 4) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
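    # Only the LoRA adapter weights injected above are trained; the 4-bit base
    # weights stay frozen. With r=8 and lora_alpha=32 the effective adapter
    # scaling is lora_alpha / r = 4 (assuming standard, non-rsLoRA scaling).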

    # 5) Tokenization with labels
    train_seq_length = 512

    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
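    # Note: labels mirror input_ids, so the loss covers instruction, response,
    # and padding tokens alike. Masking prompt/padding positions with -100 is a
    # common refinement that is not applied here.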

    # 6) Define training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
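    # Effective per-device batch size is 8 * 1 (no gradient accumulation).
    # Depending on the installed transformers release, `evaluation_strategy`
    # may need to be spelled `eval_strategy` (the argument was renamed in
    # newer versions).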

    # 7) Initialize SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
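    # Note: SFTTrainer receives pre-tokenized datasets here. Many TRL versions
    # detect the existing input_ids column and skip their own formatting, but
    # this varies by version; passing raw text with dataset_text_field (or a
    # formatting_func) is a common alternative.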

    # 8) Train
    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # 9) Save final adapter and tokenizer
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")

    # 10) Zip the final model
    zip_path = "model.zip"
    print(f"Zipping model directory {final_dir} to {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full_path = os.path.join(root, fname)
                rel_path = os.path.relpath(full_path, final_dir)
                z.write(full_path, rel_path)
    print(f"Successfully zipped model to {zip_path}")


if __name__ == "__main__":
    main()