import os
import zipfile

from unsloth import FastLanguageModel  # FastLanguageModel comes from unsloth (imported first so its patches apply), not trl

from datasets import Dataset
import pandas as pd
import torch
from transformers import TrainingArguments
from trl import SFTTrainer

def main():
    # 1) Load 4-bit model + tokenizer
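    # load_in_4bit quantizes the base weights (QLoRA-style) so the 1.7B model
    # fits in far less GPU memory; the LoRA adapters added later train on top.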
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # 2) Ensure pad_token
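    # Reusing EOS as the pad token is a common workaround for checkpoints that
    # ship without a dedicated pad token.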
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

    # 3) Load and format dataset
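    # Each row is rendered into a simple Alpaca-style prompt template
    # (### Instruction / ### Response) with EOS appended so the model learns
    # where a completion ends. Assumes data.jsonl has "instruction" and
    # "response" fields, as used below.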
    df = pd.read_json("data.jsonl", lines=True)
    df["text"] = df.apply(
        lambda row: (
            "### Instruction:\n"
            + row["instruction"].strip()
            + "\n\n### Response:\n"
            + row["response"].strip()
            + tokenizer.eos_token
        ),
        axis=1,
    )
    full_dataset = Dataset.from_pandas(df[["text"]])

    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 4) Apply LoRA adapters
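    # r / lora_alpha / lora_dropout below are illustrative starting values
    # rather than tuned hyperparameters; adapters are attached to all attention
    # and MLP projection layers.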
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )

    # 5) Tokenization with labels
    train_seq_length = 512
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

    # 6) Define training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        evaluation_strategy="epoch",
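        # (recent transformers versions rename this kwarg to eval_strategy;
        # use whichever your installed version accepts)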
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )

    # 7) Initialize and run SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
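    # The datasets are already tokenized (input_ids/labels), so SFTTrainer is
    # expected to skip its own text-field formatting and tokenization here;
    # exact behavior depends on the installed trl version.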

    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # 8) Save and zip model
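    # save_pretrained on a PEFT-wrapped model writes only the LoRA adapter
    # weights and config, not the merged base model.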
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")

    zip_path = "model.zip"
    print(f"Zipping {final_dir}{zip_path}")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full = os.path.join(root, fname)
                rel = os.path.relpath(full, final_dir)
                zf.write(full, rel)
    print(f"Successfully zipped model to {zip_path}")

if __name__ == "__main__":
    main()