import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from trl import SFTTrainer


def main():
    # 1) Load Unsloth 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # 2) Set pad_token = eos_token if not already defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id

    # 3) Load and format dataset
    df = pd.read_json("data.jsonl", lines=True)
    df["text"] = df.apply(
        lambda row: (
            "### Instruction:\n"
            + row["instruction"].strip()
            + "\n\n### Response:\n"
            + row["response"].strip()
            + tokenizer.eos_token
        ),
        axis=1,
    )
    full_dataset = Dataset.from_pandas(df[["text"]])

    # Split into train and eval
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 4) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )

    # 5) Tokenization with labels
    train_seq_length = 512

    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
        # Causal LM objective: labels are a copy of the input ids
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

    # 6) Define training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )

    # 7) Initialize SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )

    # 8) Train
    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # 9) Save final adapter and tokenizer
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")

    # 10) Zip the final model
    zip_path = "model.zip"
    print(f"Zipping model directory {final_dir} to {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full_path = os.path.join(root, fname)
                rel_path = os.path.relpath(full_path, final_dir)
                z.write(full_path, rel_path)
    print(f"Successfully zipped model to {zip_path}")


if __name__ == "__main__":
    main()
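

# ---------------------------------------------------------------------------
# Optional appendix (a minimal sketch, not invoked by the training run above):
# loading the adapter saved in step 9 back for inference. This assumes Unsloth's
# FastLanguageModel.from_pretrained accepts the directory written by
# model.save_pretrained(), and that a CUDA GPU is available for the 4-bit model.
# The prompt text, generation settings, and the function name generate_example
# are illustrative placeholders, not part of the original script.
# ---------------------------------------------------------------------------
def generate_example(adapter_dir: str = "./output_model_final") -> str:
    # Reload base model + LoRA adapter from the saved directory
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=adapter_dir,
        max_seq_length=2048,
        dtype=torch.float16,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)  # switch Unsloth to its inference mode

    # Prompt must follow the same Instruction/Response template used in training
    prompt = "### Instruction:\nSummarize what LoRA fine-tuning does.\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)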