import os
import zipfile

import pandas as pd
import torch
from unsloth import FastLanguageModel  # Unsloth recommends importing it before transformers/trl
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer
|
|
def main():
    # Placeholder model config: the checkpoint name below is an assumption,
    # not a requirement; swap in whichever Unsloth-supported base model you use.
    model_name = "unsloth/llama-3-8b-bnb-4bit"
    max_seq_length = 512
    dtype = torch.float16  # matches fp16=True below; use torch.bfloat16 on Ampere+ GPUs
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
|
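    # Llama-style tokenizers often ship without a dedicated pad token;
    # reuse EOS so batched padding works.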
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id
|
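    # Load instruction/response pairs from JSONL and render each one into a
    # single Alpaca-style prompt string, ending with EOS so the model learns
    # where to stop generating.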
    df = pd.read_json("data.jsonl", lines=True)
    df["text"] = df.apply(
        lambda row: (
            "### Instruction:\n"
            + row["instruction"].strip()
            + "\n\n### Response:\n"
            + row["response"].strip()
            + tokenizer.eos_token
        ),
        axis=1,
    )
    full_dataset = Dataset.from_pandas(df[["text"]])
|
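    # Hold out 15% of the examples for evaluation, with a fixed seed for
    # reproducibility.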
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")
|
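    # Attach LoRA adapters to the attention and MLP projections; only these
    # low-rank matrices are trained, while the 4-bit base weights stay frozen.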
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
|
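    # Tokenize with fixed-length padding. Pad positions in the labels are
    # masked to -100 so the loss ignores them; this matters here because
    # pad_token == eos_token.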
    train_seq_length = 512

    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
        # Mask pad positions to -100 instead of copying input_ids verbatim;
        # otherwise the model would also be trained on the padding tokens.
        tokens["labels"] = [
            [tok if mask == 1 else -100 for tok, mask in zip(ids, attn_mask)]
            for ids, attn_mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
        return tokens
|
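    # Apply the tokenizer over both splits, dropping the raw text column.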
    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
|
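    # Training configuration: evaluate and checkpoint once per epoch, keep at
    # most two checkpoints on disk, and reload the best (lowest eval_loss)
    # weights when training finishes.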
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
|
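    # The datasets already carry input_ids/labels, so SFTTrainer should skip
    # its own dataset preparation and train on them as-is.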
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
|
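    # Run fine-tuning; load_best_model_at_end=True restores the best
    # checkpoint after the final epoch.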
print("Starting training...") |
|
trainer.train() |
|
print("Training finished.") |
|
|
|
|
|
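    # With a PEFT-wrapped model, save_pretrained writes only the LoRA adapter
    # weights (plus config), not the full base model, so the output stays small.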
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")
|
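    # Bundle the adapter directory into a single zip for easy download,
    # storing paths relative to the directory root.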
    zip_path = "model.zip"
    print(f"Zipping model directory {final_dir} to {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full_path = os.path.join(root, fname)
                rel_path = os.path.relpath(full_path, final_dir)
                z.write(full_path, rel_path)
    print(f"Successfully zipped model to {zip_path}")
|
if name == "main": main() |
|
|
|
|