import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from trl import SFTTrainer
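# Note: some Unsloth releases recommend importing unsloth before transformers/trl so
# its patches apply first; reorder the imports above if your version warns about it.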

def main():
    # 1) Load Unsloth 4-bit model + tokenizer
    model_name = "HuggingFaceTB/SmolLM2-1.7B"
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
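    # load_in_4bit loads the base weights quantized (via bitsandbytes), which keeps
    # the 1.7B model small enough for modest GPUs; only the LoRA adapters added in
    # step 4 are trained.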

    # 2) Set pad_token = eos_token if not already defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
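    # Most decoder-only checkpoints ship without a dedicated pad token; reusing EOS
    # as padding is the standard workaround and is required here because the
    # tokenizer is later called with padding="max_length".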

    # 3) Load and format dataset
    df = pd.read_json("data.jsonl", lines=True)
    df["text"] = df.apply(
        lambda row: (
            "### Instruction:\n"
            + row["instruction"].strip()
            + "\n\n### Response:\n"
            + row["response"].strip()
            + tokenizer.eos_token
        ),
        axis=1,
    )
    full_dataset = Dataset.from_pandas(df[["text"]])

    # Split into train and eval
    split = full_dataset.train_test_split(test_size=0.15, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

    # 4) Apply LoRA adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        use_gradient_checkpointing=True,
    )
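    # r is the adapter rank and lora_alpha / r is the effective scaling (32 / 8 = 4);
    # targeting all attention and MLP projections mirrors the usual Unsloth examples.
    # Optional sanity check, since PEFT-wrapped models expose this helper
    # (uncomment to confirm only a small fraction of parameters is trainable):
    # model.print_trainable_parameters()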

    # 5) Tokenization with labels
    train_seq_length = 512
    def tokenize_fn(examples):
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=train_seq_length,
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
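    # Copying input_ids into labels trains on every position: prompt, response, and
    # padding. A common refinement is to ignore pad positions by setting their label
    # to -100 (the ignore index of the cross-entropy loss in transformers), e.g.
    # inside tokenize_fn:
    #
    #   tokens["labels"] = [
    #       [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
    #       for seq in tokens["input_ids"]
    #   ]
    #
    # Note that pad_token_id equals eos_token_id here, so this would also mask the
    # final EOS; keep that in mind if you rely on the model learning to emit EOS.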

    # 6) Define training arguments
    training_args = TrainingArguments(
        output_dir="./output_model",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        fp16=True,
        num_train_epochs=3,
        learning_rate=2e-4,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="epoch",
        save_total_limit=2,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        dataloader_num_workers=2,
    )
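    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    # * number of GPUs, i.e. 8 * 1 * 1 = 8 on a single card. If 8 sequences of length
    # 512 do not fit in memory, lower the batch size and raise
    # gradient_accumulation_steps to keep the product the same.
    # Newer transformers releases are renaming evaluation_strategy to eval_strategy;
    # adjust if your version warns about or rejects the keyword.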

    # 7) Initialize SFTTrainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
    )
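    # Passing already-tokenized datasets relies on SFTTrainer detecting the
    # input_ids/labels columns and skipping its own formatting; depending on the trl
    # version you may instead need to hand it the raw text datasets plus
    # dataset_text_field="text", and recent releases rename tokenizer= to
    # processing_class=. Pin trl/transformers versions accordingly.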

    # 8) Train
    print("Starting training...")
    trainer.train()
    print("Training finished.")

    # 9) Save final adapter and tokenizer
    final_dir = "./output_model_final"
    os.makedirs(final_dir, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"Saved final adapter to {final_dir}")

    # 10) Zip the final model
    zip_path = "model.zip"
    print(f"Zipping model directory {final_dir} to {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, files in os.walk(final_dir):
            for fname in files:
                full_path = os.path.join(root, fname)
                rel_path = os.path.relpath(full_path, final_dir)
                z.write(full_path, rel_path)
    print(f"Successfully zipped model to {zip_path}")

if name == "main": main()