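"""Fine-tune Llama-3.2-3B-Instruct with Unsloth (QLoRA-style 4-bit LoRA)
on an instruction/response CSV, then zip the trained adapter for download."""
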
# Unsloth patches transformers/trl at import time, so import it first.
import unsloth
from unsloth import FastLanguageModel

import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer

# Turn each (instruction, response) row from the CSV into a single
# Alpaca-style training string stored in a "text" column.
df = pd.read_csv("data.csv")
df["text"] = df.apply(
    lambda row: f"### Instruction:\n{row['instruction']}\n\n### Response:\n{row['response']}\n",
    axis=1,
)
dataset = Dataset.from_pandas(df[["text"]])

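# Optional sanity check: eyeball one formatted example before training
# to confirm the prompt template looks right.
print(dataset[0]["text"])
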
# Load the base model with 4-bit quantization to keep VRAM usage low.
# float16 matches fp16=True in the training arguments below; dtype=None
# would let Unsloth auto-detect bfloat16 on newer GPUs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)

# Attach LoRA adapters. target_modules follows the standard Unsloth
# recipe of adapting all attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
)

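# Optional: PEFT models expose print_trainable_parameters(), which
# confirms only a small fraction of the weights will be updated.
model.print_trainable_parameters()
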
# Pre-tokenize the dataset. Truncation only: the trainer's collator pads
# each batch dynamically, so fixed max-length padding would just waste
# compute. max_length=512 must cover the longest formatted example and
# stay within the model's max_seq_length of 2048.
def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

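# Optional: confirm the tokenized columns. Recent trl releases detect
# the "input_ids" column and skip their own tokenization pass.
print(tokenized_dataset.column_names)
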
# Effective batch size is per_device_train_batch_size x num GPUs x
# gradient_accumulation_steps (1 here, the default).
training_args = TrainingArguments(
    output_dir = "./output_model",
    per_device_train_batch_size = 2,
    num_train_epochs = 3,
    learning_rate = 2e-4,
    logging_steps = 10,
    save_steps = 100,
    fp16 = True,  # matches the float16 load dtype above
)

# The dataset is already tokenized, so recent trl versions will use it
# as-is rather than running their own text-processing step.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = tokenized_dataset,
)

# Run fine-tuning; checkpoints land in output_dir every save_steps.
trainer.train()

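# Optional: report peak GPU memory so the run's VRAM footprint is known.
if torch.cuda.is_available():
    print(f"Peak reserved GPU memory: {torch.cuda.max_memory_reserved() / 1024**3:.2f} GB")
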
# save_pretrained on a PEFT model writes only the LoRA adapter weights,
# not the full base model. Save the tokenizer alongside so the folder
# is self-contained.
output_dir = "./output_model"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

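# Unsloth can also export merged 16-bit weights if a standalone model is
# needed instead of an adapter:
# model.save_pretrained_merged(output_dir, tokenizer, save_method="merged_16bit")
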
# Package the saved adapter directory into a single zip for download.
zip_path = "/home/user/app/model.zip"
try:
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(output_dir):
            for file in files:
                full_path = os.path.join(root, file)
                # Store paths relative to output_dir so the archive
                # unpacks cleanly without a leading ./output_model.
                rel_path = os.path.relpath(full_path, output_dir)
                zipf.write(full_path, rel_path)
    print(f"✅ Zipped model to {zip_path}")
except Exception as e:
    print(f"❌ Failed to zip model: {e}")