darrenphodgson76 committed (verified)
Commit 2efa3f3 · 1 Parent(s): 7e573ee

Update train.py

Files changed (1):
  1. train.py  (+26, -19)
train.py CHANGED
@@ -9,6 +9,7 @@ import torch
 from transformers import TrainingArguments
 from trl import SFTTrainer
 
+
 def main():
     # 1) Load 4-bit model + tokenizer
     model_name = "HuggingFaceTB/SmolLM2-1.7B"
@@ -23,29 +24,34 @@ def main():
         load_in_4bit=load_in_4bit,
     )
 
-    # 2) Ensure pad token is set
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        model.config.pad_token_id = tokenizer.eos_token_id
+    # 2) Configure special tokens for chat format
+    special_tokens = {
+        "bos_token": "<|im_start|>user\n",
+        "eos_token": "<|im_end|>",
+        "pad_token": "<|im_end|>",
+        "additional_special_tokens": [
+            "<|im_start|>assistant\n"
+        ]
+    }
+    tokenizer.add_special_tokens(special_tokens)
+    model.resize_token_embeddings(len(tokenizer))
+    model.config.bos_token_id = tokenizer.bos_token_id
     model.config.eos_token_id = tokenizer.eos_token_id
+    model.config.pad_token_id = tokenizer.pad_token_id
 
-    # 3) Load instruction-response dataset
-    df = pd.read_json("cleaned_instruction_response.jsonl", lines=True)
-
-    # Rebuild clean training text
-    df["text"] = df.apply(
-        lambda row: row["instruction"].strip() + "\n\n" + row["response"].strip() + tokenizer.eos_token,
-        axis=1
-    )
+    # 3) Load chat-formatted dataset
+    df = pd.read_json("chat_formatted_data.jsonl", lines=True)
+    # Confirm each sample ends with the end-of-turn token
+    assert df['text'].str.endswith("<|im_end|>").all(), "Some samples missing end-of-turn token"
 
-    # Convert to Hugging Face Dataset
+    # 4) Create Hugging Face Dataset and split
     full_dataset = Dataset.from_pandas(df[["text"]])
     split = full_dataset.train_test_split(test_size=0.15, seed=42)
     train_dataset = split["train"]
     eval_dataset = split["test"]
     print(f"✅ Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")
 
-    # 4) Apply LoRA adapters
+    # 5) Apply LoRA adapters
     model = FastLanguageModel.get_peft_model(
         model,
         r=8,
@@ -59,7 +65,7 @@ def main():
         use_gradient_checkpointing=True,
     )
 
-    # 5) Tokenization
+    # 6) Tokenization function
     def tokenize_fn(examples):
         tokens = tokenizer(
             examples["text"],
@@ -73,7 +79,7 @@ def main():
     tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
     tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
 
-    # 6) Training arguments
+    # 7) Training arguments
     training_args = TrainingArguments(
         output_dir="./output_model",
         per_device_train_batch_size=8,
@@ -92,7 +98,7 @@ def main():
         dataloader_num_workers=2,
     )
 
-    # 7) Train
+    # 8) Train with SFTTrainer
     trainer = SFTTrainer(
         model=model,
         tokenizer=tokenizer,
@@ -105,13 +111,14 @@ def main():
     trainer.train()
     print("✅ Training complete.")
 
-    # 8) Save and zip
+    # 9) Save and zip model
    final_dir = "./output_model_final"
     os.makedirs(final_dir, exist_ok=True)
     model.save_pretrained(final_dir)
     tokenizer.save_pretrained(final_dir)
 
     zip_path = "model.zip"
+    print(f"🗜 Zipping model → {zip_path}")
     with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
         for root, _, files in os.walk(final_dir):
             for fname in files:
@@ -121,4 +128,4 @@ def main():
     print(f"✅ Model zipped → {zip_path}")
 
 if __name__ == "__main__":
-    main()
+    main()
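
Note: the hunks start at line 9, so the file's import block is not part of this diff. From the names used in the visible code (os, zipfile, pd, Dataset, FastLanguageModel, torch), the top of train.py presumably looks roughly like the sketch below; it is inferred, not taken from the commit.

# Presumed import block for train.py, inferred from names used in the visible
# hunks; not shown in this diff and therefore only a sketch.
import os
import zipfile

import pandas as pd
import torch
from datasets import Dataset
from unsloth import FastLanguageModel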
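
The new code reads chat_formatted_data.jsonl and asserts that every "text" field ends with <|im_end|>, but the file's schema itself is not part of the diff. A minimal sketch of what one record is presumably expected to look like, given the special tokens configured above; the question and answer content here is invented purely for illustration.

# Hypothetical single line of chat_formatted_data.jsonl (schema assumed from the
# special tokens and the assert in train.py, not shown in the commit).
import json

example_record = {
    "text": (
        "<|im_start|>user\n"
        "What does LoRA stand for?<|im_end|>\n"
        "<|im_start|>assistant\n"
        "Low-Rank Adaptation.<|im_end|>"
    )
}
print(json.dumps(example_record))

# The same check train.py applies: every sample must end with the end-of-turn token.
assert example_record["text"].endswith("<|im_end|>")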
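
After training, train.py saves the LoRA adapter and tokenizer to ./output_model_final. Below is a hedged sketch of reloading that directory for a quick generation check, assuming Unsloth can resolve the base model from the saved adapter config; the max_seq_length value and the prompt text are placeholders, not taken from the commit.

# Sketch (not part of the commit): reload the saved adapter and run one generation.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./output_model_final",  # directory written by train.py
    max_seq_length=2048,                # assumed; match the value used in training
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch to inference mode

# Prompt mirrors the chat markers configured in the commit's special_tokens.
prompt = (
    "<|im_start|>user\n"
    "Summarise LoRA in one sentence.<|im_end|>\n"
    "<|im_start|>assistant\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    eos_token_id=tokenizer.eos_token_id,  # stop at <|im_end|>
)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))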