Update train.py
train.py
CHANGED
@@ -1,125 +1,127 @@
-import pandas as pd
-import torch
-from datasets import Dataset
-from transformers import TrainingArguments
-from unsloth import FastLanguageModel
-from trl import SFTTrainer
-        "### Instruction:\n"
-        + row["instruction"].strip()
-        + "\n\n### Response:\n"
-        + row["response"].strip()
-        + tokenizer.eos_token
-    ),
-    axis=1,
-)
-full_dataset = Dataset.from_pandas(df[["text"]])
-    lora_alpha=32,
-    lora_dropout=0.05,
-    bias="none",
-    target_modules=[
-        "q_proj", "k_proj", "v_proj", "o_proj",
-        "gate_proj", "up_proj", "down_proj",
-    ],
-    use_gradient_checkpointing=True,
-)
-    )
-    tokens["labels"] = tokens["input_ids"].copy()
-    return tokens
-    output_dir="./output_model",
-    per_device_train_batch_size=8,
-    gradient_accumulation_steps=1,
-    fp16=True,
-    num_train_epochs=3,
-    learning_rate=2e-4,
-    logging_strategy="steps",
-    logging_steps=25,
-    save_strategy="epoch",
-    save_total_limit=2,
-    evaluation_strategy="epoch",
-    load_best_model_at_end=True,
-    metric_for_best_model="eval_loss",
-    greater_is_better=False,
-    dataloader_num_workers=2,
-)
-model.save_pretrained(final_dir)
-tokenizer.save_pretrained(final_dir)
-print(f"Saved final adapter to {final_dir}")
-            full_path = os.path.join(root, fname)
-            rel_path = os.path.relpath(full_path, final_dir)
-            z.write(full_path, rel_path)
-print(f"Successfully zipped model to {zip_path}")
 import os
 import zipfile

+from datasets import Dataset
+import pandas as pd
+import torch
+from transformers import TrainingArguments
+from unsloth import FastLanguageModel
+from trl import SFTTrainer

+def main():
+    # 1) Load 4-bit model + tokenizer
+    model_name = "HuggingFaceTB/SmolLM2-1.7B"
+    max_seq_length = 2048
+    dtype = torch.float16
+    load_in_4bit = True

+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
+    )

+    # 2) Ensure pad_token
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        model.config.pad_token_id = tokenizer.eos_token_id

+    # 3) Load and format dataset
+    df = pd.read_json("data.jsonl", lines=True)
+    df["text"] = df.apply(
+        lambda row: (
+            "### Instruction:\n"
+            + row["instruction"].strip()
+            + "\n\n### Response:\n"
+            + row["response"].strip()
+            + tokenizer.eos_token
+        ),
+        axis=1,
+    )
+    full_dataset = Dataset.from_pandas(df[["text"]])

+    split = full_dataset.train_test_split(test_size=0.15, seed=42)
+    train_dataset = split["train"]
+    eval_dataset = split["test"]
+    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

+    # 4) Apply LoRA adapters
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=8,
+        lora_alpha=32,
+        lora_dropout=0.05,
+        bias="none",
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        use_gradient_checkpointing=True,
+    )

+    # 5) Tokenization with labels
+    train_seq_length = 512
+    def tokenize_fn(examples):
+        tokens = tokenizer(
+            examples["text"],
+            truncation=True,
+            padding="max_length",
+            max_length=train_seq_length,
+        )
+        tokens["labels"] = tokens["input_ids"].copy()
+        return tokens

+    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
+    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

+    # 6) Define training arguments
+    training_args = TrainingArguments(
+        output_dir="./output_model",
+        per_device_train_batch_size=8,
+        gradient_accumulation_steps=1,
+        fp16=True,
+        num_train_epochs=3,
+        learning_rate=2e-4,
+        logging_strategy="steps",
+        logging_steps=25,
+        save_strategy="epoch",
+        save_total_limit=2,
+        evaluation_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="eval_loss",
+        greater_is_better=False,
+        dataloader_num_workers=2,
+    )
+    # 7) Initialize and run SFTTrainer
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_eval,
+    )

+    print("Starting training...")
+    trainer.train()
+    print("Training finished.")

+    # 8) Save and zip model
+    final_dir = "./output_model_final"
+    os.makedirs(final_dir, exist_ok=True)
+    model.save_pretrained(final_dir)
+    tokenizer.save_pretrained(final_dir)
+    print(f"Saved final adapter to {final_dir}")

+    zip_path = "model.zip"
+    print(f"Zipping {final_dir} → {zip_path}")
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
+        for root, _, files in os.walk(final_dir):
+            for fname in files:
+                full = os.path.join(root, fname)
+                rel = os.path.relpath(full, final_dir)
+                zf.write(full, rel)
+    print(f"Successfully zipped model to {zip_path}")

+if __name__ == "__main__":
+    main()
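For reference, a minimal sketch of how the adapter that this script writes to ./output_model_final could be loaded back for generation with unsloth. This is not part of the commit: the example instruction, the max_new_tokens value, and the assumption that FastLanguageModel.from_pretrained accepts the saved adapter directory directly are illustrative only; the prompt string simply mirrors the "### Instruction: / ### Response:" template used during training.

from unsloth import FastLanguageModel

# Load the base model plus the LoRA adapter saved by train.py
# (assumes ./output_model_final is the directory written by model.save_pretrained()).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./output_model_final",
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch unsloth into inference mode

# Build a prompt in the same format the model was fine-tuned on.
prompt = "### Instruction:\nSummarize what train.py does.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))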