import unsloth
from unsloth import FastLanguageModel

import os
import zipfile

import pandas as pd
from datasets import Dataset
import torch
from transformers import TrainingArguments
from trl import SFTTrainer


def main():
    # 1) Load 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 768
    dtype = torch.float16
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # 2) Configure special tokens for chat format
    special_tokens = {
        "bos_token": "<|im_start|>user\n",
        "eos_token": "<|im_end|>",
        "pad_token": "<|im_end|>",
        "additional_special_tokens": ["<|im_start|>assistant\n"],
    }
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # 3) Load chat-formatted dataset
    df = pd.read_json("data.jsonl", lines=True)

    # Confirm each sample ends with the end-of-turn token
    assert df["text"].str.endswith("<|im_end|>").all(), "Some samples missing end-of-turn token"

    # 4) Create Hugging Face Dataset and split
    full_dataset = Dataset.from_pandas(df[["text"]])
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"✅ Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 5) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )

    # 6) Tokenization function
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

    # 7) Training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )

    # 8) Train with SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )

    print("🚀 Starting training...")
    trainer.train()
    print("✅ Training complete.")

    # 9) Save and zip model
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)

    zip_path = "model.zip"
    print(f"🗜 Zipping model → {zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"✅ Model zipped → {zip_path}")


if __name__ == "__main__":
    main()
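

# ---------------------------------------------------------------------------
# Optional: a minimal inference sketch (not called by main()). It assumes the
# LoRA adapters saved to ./output_model_final can be reloaded with Unsloth's
# FastLanguageModel.from_pretrained, that the same max_seq_length applies, and
# that prompts should mirror the <|im_start|>user / <|im_start|>assistant
# format configured above. generate_example is a hypothetical helper name;
# treat this as an illustration, not part of the training run.
# ---------------------------------------------------------------------------
def generate_example(prompt: str, adapter_dir: str = "./output_model_final") -> str:
    # Reload the base model plus the saved adapters (assumption: Unsloth detects
    # the adapter config in adapter_dir and attaches it to the 4-bit base model).
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=adapter_dir,
        max_seq_length=768,
        dtype=torch.float16,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)  # enable Unsloth's inference mode

    # Build a single-turn prompt in the same chat format used for training data.
    text = f"<|im_start|>user\n{prompt}<|im_end|><|im_start|>assistant\n"
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256)

    # Return only the newly generated tokens (strip the prompt portion).
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)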