Update train.py
train.py CHANGED
@@ -9,6 +9,7 @@ import torch
 from transformers import TrainingArguments
 from trl import SFTTrainer

+
 def main():
     # 1) Load 4-bit model + tokenizer
     model_name = "HuggingFaceTB/SmolLM2-1.7B"
@@ -23,29 +24,34 @@ def main():
         load_in_4bit=load_in_4bit,
     )

-    # 2) …
-    …
-    …
-    …
+    # 2) Configure special tokens for chat format
+    special_tokens = {
+        "bos_token": "<|im_start|>user\n",
+        "eos_token": "<|im_end|>",
+        "pad_token": "<|im_end|>",
+        "additional_special_tokens": [
+            "<|im_start|>assistant\n"
+        ]
+    }
+    tokenizer.add_special_tokens(special_tokens)
+    model.resize_token_embeddings(len(tokenizer))
+    model.config.bos_token_id = tokenizer.bos_token_id
     model.config.eos_token_id = tokenizer.eos_token_id
+    model.config.pad_token_id = tokenizer.pad_token_id

-    # 3) Load …
-    df = pd.read_json("…
-    …
-    …
-    df["text"] = df.apply(
-        lambda row: row["instruction"].strip() + "\n\n" + row["response"].strip() + tokenizer.eos_token,
-        axis=1
-    )
+    # 3) Load chat-formatted dataset
+    df = pd.read_json("chat_formatted_data.jsonl", lines=True)
+    # Confirm each sample ends with the end-of-turn token
+    assert df['text'].str.endswith("<|im_end|>").all(), "Some samples missing end-of-turn token"

-    # …
+    # 4) Create Hugging Face Dataset and split
     full_dataset = Dataset.from_pandas(df[["text"]])
     split = full_dataset.train_test_split(test_size=0.15, seed=42)
     train_dataset = split["train"]
     eval_dataset = split["test"]
     print(f"✅ Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

-    # …
+    # 5) Apply LoRA adapters
     model = FastLanguageModel.get_peft_model(
         model,
         r=8,
@@ -59,7 +65,7 @@ def main():
         use_gradient_checkpointing=True,
     )

-    # …
+    # 6) Tokenization function
     def tokenize_fn(examples):
         tokens = tokenizer(
             examples["text"],
@@ -73,7 +79,7 @@ def main():
     tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
     tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

-    # …
+    # 7) Training arguments
     training_args = TrainingArguments(
         output_dir="./output_model",
         per_device_train_batch_size=8,
@@ -92,7 +98,7 @@ def main():
         dataloader_num_workers=2,
     )

-    # …
+    # 8) Train with SFTTrainer
     trainer = SFTTrainer(
         model=model,
         tokenizer=tokenizer,
@@ -105,13 +111,14 @@ def main():
     trainer.train()
     print("✅ Training complete.")

-    # …
+    # 9) Save and zip model
     final_dir = "./output_model_final"
     os.makedirs(final_dir, exist_ok=True)
     model.save_pretrained(final_dir)
     tokenizer.save_pretrained(final_dir)

     zip_path = "model.zip"
+    print(f"📦 Zipping model → {zip_path}")
     with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
         for root, _, files in os.walk(final_dir):
             for fname in files:
@@ -121,4 +128,4 @@ def main():
     print(f"✅ Model zipped → {zip_path}")

 if __name__ == "__main__":
-    main()
+    main()
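Note: the rewritten loading step assumes chat_formatted_data.jsonl already contains a "text" column wrapped in the ChatML-style markers registered above ("<|im_start|>user\n", "<|im_start|>assistant\n", "<|im_end|>"). The commit does not include the preprocessing that produces this file, so the sketch below is only a plausible reconstruction: the input file name raw_pairs.jsonl and the helper to_chat_text are hypothetical, while the "instruction"/"response" column names come from the removed code.

import pandas as pd

BOS = "<|im_start|>user\n"             # bos_token registered in this commit
ASSISTANT = "<|im_start|>assistant\n"  # additional special token
EOS = "<|im_end|>"                     # eos_token / end-of-turn marker

def to_chat_text(row):
    # Wrap one instruction/response pair in the chat layout the special
    # tokens imply: a user turn, then an assistant turn, each closed by EOS.
    return (
        f"{BOS}{row['instruction'].strip()}{EOS}\n"
        f"{ASSISTANT}{row['response'].strip()}{EOS}"
    )

# Hypothetical raw file with "instruction" and "response" columns.
df = pd.read_json("raw_pairs.jsonl", lines=True)
df["text"] = df.apply(to_chat_text, axis=1)

# Every sample now ends with "<|im_end|>", so the assert in train.py passes.
df[["text"]].to_json("chat_formatted_data.jsonl", orient="records", lines=True, force_ascii=False)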