darrenphodgson76 committed · verified
Commit f0fba32 · 1 Parent(s): f2f7c22

Update train.py

Files changed (1)
  1. train.py +15 -21
train.py CHANGED
@@ -3,8 +3,8 @@ from unsloth import FastLanguageModel
 
 import os
 import zipfile
-from datasets import Dataset
 import pandas as pd
+from datasets import Dataset
 import torch
 from transformers import TrainingArguments
 from trl import SFTTrainer
@@ -27,20 +27,17 @@ def main():
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         model.config.pad_token_id = tokenizer.eos_token_id
+        model.config.eos_token_id = tokenizer.eos_token_id
 
-    # 3) Load and clean dataset
-    df = pd.read_json("data.jsonl", lines=True)
+    # 3) Load instruction-response dataset
+    df = pd.read_json("cleaned_instruction_response.jsonl", lines=True)
 
-    # Ensure all rows end cleanly with a single eos_token
-    df["text"] = df["text"].apply(
-        lambda t: t.rstrip()
-        .removesuffix(tokenizer.eos_token)
-        .strip() + tokenizer.eos_token
+    # Rebuild clean training text
+    df["text"] = df.apply(
+        lambda row: row["instruction"].strip() + "\n\n" + row["response"].strip() + tokenizer.eos_token,
+        axis=1
     )
 
-    # Confirm all rows end properly
-    assert df["text"].str.endswith(tokenizer.eos_token).all(), "Some rows are missing eos_token!"
-
     # Convert to Hugging Face Dataset
     full_dataset = Dataset.from_pandas(df[["text"]])
     split = full_dataset.train_test_split(test_size=0.15, seed=42)
@@ -62,7 +59,7 @@ def main():
         use_gradient_checkpointing=True,
     )
 
-    # 5) Tokenize and retain eos token
+    # 5) Tokenization
     def tokenize_fn(examples):
         tokens = tokenizer(
             examples["text"],
@@ -76,7 +73,7 @@ def main():
     tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
     tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
 
-    # 6) Define training arguments
+    # 6) Training arguments
     training_args = TrainingArguments(
         output_dir="./output_model",
         per_device_train_batch_size=8,
@@ -88,15 +85,14 @@ def main():
         logging_steps=25,
         save_strategy="epoch",
         save_total_limit=2,
-        eval_strategy="epoch",  # ✅ Corrected
+        eval_strategy="epoch",
         load_best_model_at_end=True,
         metric_for_best_model="eval_loss",
         greater_is_better=False,
         dataloader_num_workers=2,
     )
 
-
-    # 7) Train the model
+    # 7) Train
     trainer = SFTTrainer(
         model=model,
         tokenizer=tokenizer,
@@ -109,22 +105,20 @@ def main():
     trainer.train()
     print("✅ Training complete.")
 
-    # 8) Save model and zip it
+    # 8) Save and zip
     final_dir = "./output_model_final"
     os.makedirs(final_dir, exist_ok=True)
     model.save_pretrained(final_dir)
     tokenizer.save_pretrained(final_dir)
-    print(f"📦 Saved final model to {final_dir}")
 
     zip_path = "model.zip"
-    print(f"🗜 Zipping model → {zip_path}")
     with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
         for root, _, files in os.walk(final_dir):
             for fname in files:
                 full = os.path.join(root, fname)
                 rel = os.path.relpath(full, final_dir)
                 zf.write(full, rel)
-    print(f"✅ Zipped model saved to {zip_path}")
+    print(f"✅ Model zipped → {zip_path}")
 
 if __name__ == "__main__":
-    main()
+    main()
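
For context on the data-formatting change above, here is a minimal, self-contained sketch of what the rebuilt text field looks like for a single record. The sample row and the "</s>" EOS string are hypothetical stand-ins; the actual values depend on cleaned_instruction_response.jsonl and on tokenizer.eos_token in train.py.

import pandas as pd

# Hypothetical single-row frame; the real cleaned_instruction_response.jsonl is
# assumed to provide "instruction" and "response" columns, as train.py expects.
df = pd.DataFrame([{
    "instruction": "Summarise the plot of Hamlet in one sentence.",
    "response": "A Danish prince feigns madness while avenging his father's murder.",
}])

eos_token = "</s>"  # placeholder for tokenizer.eos_token

# Same transformation as the updated script: instruction + blank line + response + EOS.
df["text"] = df.apply(
    lambda row: row["instruction"].strip() + "\n\n" + row["response"].strip() + eos_token,
    axis=1,
)

print(df["text"].iloc[0])
# Summarise the plot of Hamlet in one sentence.
#
# A Danish prince feigns madness while avenging his father's murder.</s>

Because each example now gets exactly one EOS token appended at this stage, the earlier rstrip/removesuffix normalisation and the endswith assertion have no remaining purpose, which appears to be why the commit drops them.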