Update train.py
train.py
CHANGED
Removed (previous version of train.py, recovered from the diff; several removed lines were cut off mid-line in the rendered view and are left truncated):

-import torch
-from datasets import Dataset
-from transformers import TrainingArguments
-from unsloth import FastLanguageModel
-from trl import SFTTrainer
-import os
-import zipfile
-
-# 1) Load Unsloth model + tokenizer
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=
-    max_seq_length=
-    dtype=
-    load_in_4bit=
-)
-
-# 2)
-tokenizer.pad_token =
-model.config.pad_token_id = tokenizer.eos_token_id
-
-# 3) Load
-# Read the JSONL we generated (one JSON object per line with "instruction" & "response")
-df = pd.read_json("data.jsonl", lines=True)
-df["text"] = df.apply(
-    lambda row: (
-        + row["instruction"].strip()
-        + "\n\n### Response:\n"
-        + row["response"].strip()
-    ),
-    axis=1
-)
-
-# 4) Apply LoRA
-model = FastLanguageModel.get_peft_model(
-    model,
-    r=8,
-    lora_alpha=32,
-    lora_dropout=0.05,
-    bias="none",
-)
-
-# 5)
-    truncation=True,
-    padding="max_length",
-    max_length=
-)
-
-# 6)
-training_args = TrainingArguments(
-    output_dir="./output_model",
-    per_device_train_batch_size=
-    num_train_epochs=3,
-    learning_rate=2e-4,
-)
-
-# 7) Initialize SFTTrainer
-trainer = SFTTrainer(
-    model=model,
-    tokenizer=tokenizer,
-    args=training_args,
-    train_dataset=
-)
-
-# 8) Train
-trainer.train()
-
-output_dir = "./output_model"
-os.makedirs(output_dir, exist_ok=True)
-model.save_pretrained(output_dir)
-
-# Zip for download
-zip_path = "/home/user/app/model.zip"
-try:
-    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
-        for root, _, files in os.walk(output_dir):
-            for fname in files:
-                full = os.path.join(root, fname)
-                rel = os.path.relpath(full, output_dir)
-                z.write(full, rel)
-    print(f"Zipped model to {zip_path}")
-except Exception as e:
-    print(f"Failed to zip model: {e}")
Added (new version of train.py; the committed file had several physical lines run together and `if name == "main"` missing its underscores, which are corrected here so the script runs as intended):

+import os
+import zipfile
+
+import pandas as pd
+import torch
+from datasets import Dataset
+from transformers import TrainingArguments
+from unsloth import FastLanguageModel
+from trl import SFTTrainer
+
+
+def main():
+    # 1) Load Unsloth 4-bit model + tokenizer
+    model_name = "HuggingFaceTB/SmolLM2-1.7B"
+    max_seq_length = 2048
+    dtype = torch.float16
+    load_in_4bit = True
+
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
+    )
+
+    # 2) Set pad_token = eos_token if not already defined
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model.config.pad_token_id = tokenizer.eos_token_id
+
+    # 3) Load and format dataset
+    df = pd.read_json("data.jsonl", lines=True)
+    df["text"] = df.apply(
+        lambda row: (
+            "### Instruction:\n"
+            + row["instruction"].strip()
+            + "\n\n### Response:\n"
+            + row["response"].strip()
+            + tokenizer.eos_token
+        ),
+        axis=1,
+    )
+    full_dataset = Dataset.from_pandas(df[["text"]])
+
+    # Split into train and eval
+    split = full_dataset.train_test_split(test_size=0.15, seed=42)
+    train_dataset = split["train"]
+    eval_dataset = split["test"]
+    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")
+
+    # 4) Apply LoRA adapters
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=8,
+        lora_alpha=32,
+        lora_dropout=0.05,
+        bias="none",
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        use_gradient_checkpointing=True,
+    )
+
+    # 5) Tokenization with labels
+    train_seq_length = 512
+
+    def tokenize_fn(examples):
+        tokens = tokenizer(
+            examples["text"],
+            truncation=True,
+            padding="max_length",
+            max_length=train_seq_length,
+        )
+        tokens["labels"] = tokens["input_ids"].copy()
+        return tokens
+
+    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
+    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
+
+    # 6) Define training arguments
+    training_args = TrainingArguments(
+        output_dir="./output_model",
+        per_device_train_batch_size=8,
+        gradient_accumulation_steps=1,
+        fp16=True,
+        num_train_epochs=3,
+        learning_rate=2e-4,
+        logging_strategy="steps",
+        logging_steps=25,
+        save_strategy="epoch",
+        save_total_limit=2,
+        evaluation_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="eval_loss",
+        greater_is_better=False,
+        dataloader_num_workers=2,
+    )
+
+    # 7) Initialize SFTTrainer
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_eval,
+    )
+
+    # 8) Train
+    print("Starting training...")
+    trainer.train()
+    print("Training finished.")
+
+    # 9) Save final adapter and tokenizer
+    final_dir = "./output_model_final"
+    os.makedirs(final_dir, exist_ok=True)
+    model.save_pretrained(final_dir)
+    tokenizer.save_pretrained(final_dir)
+    print(f"Saved final adapter to {final_dir}")
+
+    # 10) Zip the final model
+    zip_path = "model_final.zip"
+    print(f"Zipping model directory {final_dir} to {zip_path}...")
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
+        for root, _, files in os.walk(final_dir):
+            for fname in files:
+                full_path = os.path.join(root, fname)
+                rel_path = os.path.relpath(full_path, final_dir)
+                z.write(full_path, rel_path)
+    print(f"Successfully zipped model to {zip_path}")
+
+
+if __name__ == "__main__":
+    main()
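For reference, step 3 of the script reads data.jsonl as one JSON object per line with "instruction" and "response" keys. The sketch below only illustrates that record shape; the example rows and the idea of writing the file from Python are invented for illustration and are not part of this commit.

# Hypothetical example: the record shape train.py expects in data.jsonl
# (one JSON object per line with "instruction" and "response" keys).
import json

example_rows = [
    {"instruction": "Summarize this sentence in three words.", "response": "Short summary here."},
    {"instruction": "Translate 'good morning' to French.", "response": "Bonjour."},
]
with open("data.jsonl", "w", encoding="utf-8") as f:
    for row in example_rows:
        f.write(json.dumps(row) + "\n")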
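After training, the LoRA adapter and tokenizer are written to ./output_model_final and zipped as model_final.zip. A minimal sketch of loading that directory back with Unsloth for generation follows; the prompt template, dtype, and max_new_tokens value are assumptions chosen to mirror the training script, not part of this commit.

# Sketch only (not part of this commit): reload the saved adapter for inference.
import torch
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./output_model_final",  # directory written by train.py (assumed unchanged)
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch the Unsloth model into inference mode

# Prompt format assumed to match the "### Instruction:" / "### Response:" template used in training.
prompt = "### Instruction:\nExplain what this model was fine-tuned for.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))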