# Unsloth is imported first so its patches are applied before transformers/trl load.
import unsloth

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from trl import SFTTrainer

# Build an instruction/response prompt column and wrap it in a Hugging Face Dataset.
df = pd.read_csv("data.csv")
df["text"] = df.apply(
    lambda row: f"### Instruction:\n{row['instruction']}\n\n### Response:\n{row['response']}\n",
    axis=1,
)
dataset = Dataset.from_pandas(df[["text"]])

# Load the base model and tokenizer in 4-bit with Unsloth's fast loader.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)

# Attach LoRA adapters. target_modules lists the attention and MLP projections
# that Unsloth's examples typically adapt for Llama-style models.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
)

# Pre-tokenize the prompts: examples longer than 512 tokens are truncated,
# shorter ones are padded to a fixed length.
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

training_args = TrainingArguments(
    output_dir = "./lora-finetuned",
    per_device_train_batch_size = 2,
    num_train_epochs = 3,
    learning_rate = 2e-4,
    logging_steps = 10,
    save_steps = 100,
    fp16 = True,  # matches the float16 dtype the model was loaded with
)

# With a pre-tokenized dataset (input_ids already present), recent TRL releases
# skip their own preprocessing; older releases may expect dataset_text_field instead.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = tokenized_dataset,
)

trainer.train()

# Save the LoRA adapter weights and the tokenizer for later reloading.
model.save_pretrained("./lora-finetuned")
tokenizer.save_pretrained("./lora-finetuned")
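
# Optional sanity check (a minimal sketch, not part of the original script):
# reload the saved adapter with Unsloth and generate from one instruction.
# The prompt below is illustrative; adjust paths and decoding settings as needed.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./lora-finetuned",  # directory written by save_pretrained above
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)  # switch Unsloth into inference mode

prompt = "### Instruction:\nSummarize the benefits of LoRA fine-tuning.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))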