Update train.py
train.py CHANGED
@@ -3,8 +3,8 @@ from unsloth import FastLanguageModel
 
 import os
 import zipfile
-from datasets import Dataset
 import pandas as pd
+from datasets import Dataset
 import torch
 from transformers import TrainingArguments
 from trl import SFTTrainer
@@ -27,20 +27,17 @@ def main():
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = tokenizer.eos_token_id
+        model.config.eos_token_id = tokenizer.eos_token_id
 
-    # 3) Load
-    df = pd.read_json("
+    # 3) Load instruction-response dataset
+    df = pd.read_json("cleaned_instruction_response.jsonl", lines=True)
 
-    #
-    df["text"] = df
-        lambda
-
-        .strip() + tokenizer.eos_token
+    # Rebuild clean training text
+    df["text"] = df.apply(
+        lambda row: row["instruction"].strip() + "\n\n" + row["response"].strip() + tokenizer.eos_token,
+        axis=1
     )
 
-    # Confirm all rows end properly
-    assert df["text"].str.endswith(tokenizer.eos_token).all(), "Some rows are missing eos_token!"
-
     # Convert to Hugging Face Dataset
     full_dataset = Dataset.from_pandas(df[["text"]])
     split = full_dataset.train_test_split(test_size=0.15, seed=42)
@@ -62,7 +59,7 @@ def main():
         use_gradient_checkpointing=True,
     )
 
-    # 5)
+    # 5) Tokenization
     def tokenize_fn(examples):
         tokens = tokenizer(
             examples["text"],
@@ -76,7 +73,7 @@ def main():
     tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
     tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
 
-    # 6)
+    # 6) Training arguments
     training_args = TrainingArguments(
         output_dir="./output_model",
         per_device_train_batch_size=8,
@@ -88,15 +85,14 @@ def main():
         logging_steps=25,
         save_strategy="epoch",
         save_total_limit=2,
-        eval_strategy="epoch",
+        eval_strategy="epoch",
         load_best_model_at_end=True,
         metric_for_best_model="eval_loss",
         greater_is_better=False,
         dataloader_num_workers=2,
     )
 
-
-    # 7) Train the model
+    # 7) Train
     trainer = SFTTrainer(
         model=model,
         tokenizer=tokenizer,
@@ -109,22 +105,20 @@ def main():
     trainer.train()
     print("✅ Training complete.")
 
-    # 8) Save
+    # 8) Save and zip
     final_dir = "./output_model_final"
     os.makedirs(final_dir, exist_ok=True)
     model.save_pretrained(final_dir)
     tokenizer.save_pretrained(final_dir)
-    print(f"📦 Saved final model to {final_dir}")
 
     zip_path = "model.zip"
-    print(f"Zipping model → {zip_path}")
     with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
         for root, _, files in os.walk(final_dir):
             for fname in files:
                 full = os.path.join(root, fname)
                 rel = os.path.relpath(full, final_dir)
                 zf.write(full, rel)
-    print(f"✅
+    print(f"✅ Model zipped → {zip_path}")
 
 if __name__ == "__main__":
-    main()
+    main()
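
For reference, the new text-building step can be exercised in isolation. A minimal sketch, assuming an invented sample row and "</s>" as a stand-in for the tokenizer's real eos token; the column names, the "\n\n" separator, and the apply call come from the diff above:

import pandas as pd

# Stand-in for tokenizer.eos_token; the real value depends on the model.
eos_token = "</s>"

# Invented sample row with the columns the script expects.
df = pd.DataFrame([{
    "instruction": "  Summarize the plot of Hamlet in one sentence. ",
    "response": " A Danish prince avenges his father's murder.  ",
}])

# Same construction as the commit: strip both fields, join them with a
# blank line, and terminate the example with the eos token.
df["text"] = df.apply(
    lambda row: row["instruction"].strip() + "\n\n" + row["response"].strip() + eos_token,
    axis=1,
)

print(df["text"][0])
# Summarize the plot of Hamlet in one sentence.
#
# A Danish prince avenges his father's murder.</s>

# The endswith check the old version asserted still holds for every row.
assert df["text"].str.endswith(eos_token).all()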
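
The zip step can be sanity-checked after a run with the standard-library zipfile module. A minimal sketch, assuming model.zip already exists in the working directory; which member names appear depends on what save_pretrained wrote into output_model_final:

import zipfile

with zipfile.ZipFile("model.zip") as zf:
    # testzip() returns the name of the first corrupt member, or None.
    bad = zf.testzip()
    assert bad is None, f"Corrupt member: {bad}"
    # Member paths are relative to output_model_final,
    # per the os.path.relpath call in the script.
    for name in zf.namelist():
        print(name)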