# unsloth is imported first so its patches are applied before transformers/trl load.
import unsloth
from unsloth import FastLanguageModel

import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer


def main():
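    # Load SmolLM2-1.7B through unsloth in 4-bit, with float16 compute and a
    # 768-token maximum sequence length.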
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 768
    dtype = torch.float16
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

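    # Register ChatML-style turn markers as special tokens, grow the embedding
    # matrix to cover them, and point the model config at the new token ids.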
    special_tokens = {
        "bos_token": "<|im_start|>user\n",
        "eos_token": "<|im_end|>",
        "pad_token": "<|im_end|>",
        "additional_special_tokens": [
            "<|im_start|>assistant\n"
        ],
    }
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

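    # Load the JSONL dataset; each record must carry a "text" field that
    # already ends with the end-of-turn token.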
    df = pd.read_json("data.jsonl", lines=True)
    assert df["text"].str.endswith("<|im_end|>").all(), "Some samples missing end-of-turn token"

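    # Hold out 15% of the samples for evaluation (fixed seed for reproducibility).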
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

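    # Attach LoRA adapters (r=8, alpha=32, dropout 0.05) to the attention and MLP
    # projection layers; gradient checkpointing trades compute for memory.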
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )

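    # Tokenize each sample to a fixed length and copy the input ids as labels for
    # the causal-LM loss. Note that padded positions are copied into the labels
    # too, so they are not excluded from the loss (pad and eos share one token).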
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

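    # Three epochs at lr 2e-4 in fp16; evaluate and checkpoint once per epoch,
    # keep at most two checkpoints, and reload the lowest-eval-loss one at the end.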
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )

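    # The datasets are already tokenized (input_ids/labels), so the trainer is
    # handed them directly rather than a raw text column.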
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )

    print("Starting training...")
    trainer.train()
    print("Training complete.")

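    # Save the fine-tuned weights and tokenizer. With a PEFT-wrapped model this
    # writes the LoRA adapter (plus config), not a merged full-size model.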
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)

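    # Bundle everything under final_dir into a single zip archive, storing paths
    # relative to that directory.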
    zip_path = "model.zip"
    print(f"Zipping model -> {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"Model zipped -> {zip_path}")


if __name__ == "__main__":
    main()