|
|
|
import unsloth |
|
import pandas as pd |
|
import torch |
|
from datasets import Dataset |
|
from transformers import TrainingArguments |
|
from unsloth import FastLanguageModel |
|
from trl import SFTTrainer |
|
import os |
|
import zipfile |
|
|
|
|
|
# Base model: SmolLM2-1.7B, loaded 4-bit-quantized with fp16 compute
# and a 2048-token context window.
load_config = dict(
    model_name="HuggingFaceTB/SmolLM2-1.7B",
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_config)
|
|
|
|
|
# The tokenizer has no dedicated pad token, so reuse EOS for padding
# (a common choice for causal-LM fine-tuning). `eos` is also appended
# to each training example further below.
eos = tokenizer.eos_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
|
|
|
|
|
|
|
# Build the training corpus: one Alpaca-style instruction/response
# string per row of data.jsonl, terminated with EOS so the model
# learns where a response ends.
df = pd.read_json("data.jsonl", lines=True)


def _format_row(row):
    # Concatenate instruction and response under fixed section headers.
    instruction = row["instruction"].strip()
    response = row["response"].strip()
    return (
        "### Instruction:\n"
        + instruction
        + "\n\n### Response:\n"
        + response
        + eos
    )


df["text"] = df.apply(_format_row, axis=1)
dataset = Dataset.from_pandas(df[["text"]])
|
|
|
|
|
# Wrap the base model with LoRA adapters so that only a small set of
# low-rank weights is trained during fine-tuning.
lora_rank = 8
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    lora_alpha=32,      # scaling factor applied to the LoRA updates
    lora_dropout=0.05,  # light regularization on adapter activations
    bias="none",        # leave bias terms frozen
)
|
|
|
|
|
def tokenize(example):
    """Tokenize a batch of prompt strings to fixed-length sequences.

    Truncates anything longer than 512 tokens and pads shorter samples
    with the pad token so every example has a uniform length.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded


# Map over the dataset in batches for faster tokenization.
tokenized_dataset = dataset.map(tokenize, batched=True)
|
|
|
|
|
# Hyperparameters for the Trainer loop.
training_args = TrainingArguments(
    output_dir="./output_model",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=2e-4,  # typical learning rate for LoRA fine-tuning
    fp16=True,           # matches the fp16 dtype the model was loaded with
    logging_steps=10,
    save_steps=100,
)
|
|
|
|
|
# Run supervised fine-tuning over the pre-tokenized dataset.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)
trainer.train()
|
|
|
|
|
# Persist the fine-tuned adapter weights plus the tokenizer so the
# output directory is self-contained and reloadable.
output_dir = "./output_model"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
# Fix: the tokenizer (including its pad-token change made above) was
# never exported, so the saved directory could not be loaded standalone.
tokenizer.save_pretrained(output_dir)
|
|
|
|
|
# Package the exported model directory into a single zip archive.
zip_path = "/home/user/app/model.zip"
try:
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, files in os.walk(output_dir):
            for fname in files:
                full = os.path.join(root, fname)
                # Store paths relative to output_dir so the archive
                # unpacks without a leading ./output_model prefix.
                rel = os.path.relpath(full, output_dir)
                z.write(full, rel)
    # Fix: the original success message split an f-string literal across
    # two physical lines (a SyntaxError) and contained mojibake ("β"
    # residue of a garbled emoji); both messages are now plain ASCII.
    print(f"Zipped model to {zip_path}")
except Exception as e:
    # Deliberate best-effort: zipping is a convenience step, so report
    # the failure without aborting the script.
    print(f"Failed to zip model: {e}")