darrenphodgson76 committed
Commit 7693569 · verified · 1 Parent(s): 6d6b976

Update train.py

Files changed (1):
  train.py  +109 -107

train.py CHANGED
@@ -1,125 +1,127 @@
  import os
  import zipfile
- import pandas as pd
- import torch from datasets
- import Dataset from transformers
- import TrainingArguments from unsloth
- import FastLanguageModel from trl
- import SFTTrainer

- def main(): # 1) Load Unsloth 4-bit model + tokenizer model_name = "HuggingFaceTB/SmolLM2-1.7B" max_seq_length = 2048 dtype = torch.float16 load_in_4bit = True

- model, tokenizer = FastLanguageModel.from_pretrained(
- model_name=model_name,
- max_seq_length=max_seq_length,
- dtype=dtype,
- load_in_4bit=load_in_4bit,
- )

- # 2) Set pad_token = eos_token if not already defined
- if tokenizer.pad_token is None:
- tokenizer.pad_token = tokenizer.eos_token
- model.config.pad_token_id = tokenizer.eos_token_id

- # 3) Load and format dataset
- df = pd.read_json("data.jsonl", lines=True)
- df["text"] = df.apply(
- lambda row: (
- "### Instruction:\n"
- + row["instruction"].strip()
- + "\n\n### Response:\n"
- + row["response"].strip()
- + tokenizer.eos_token
- ),
- axis=1,
- )
- full_dataset = Dataset.from_pandas(df[["text"]])

- # Split into train and eval
- split = full_dataset.train_test_split(test_size=0.15, seed=42)
- train_dataset = split["train"]
- eval_dataset = split["test"]
- print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

- # 4) Apply LoRA adapters
- model = FastLanguageModel.get_peft_model(
- model,
- r=8,
- lora_alpha=32,
- lora_dropout=0.05,
- bias="none",
- target_modules=[
- "q_proj", "k_proj", "v_proj", "o_proj",
- "gate_proj", "up_proj", "down_proj",
- ],
- use_gradient_checkpointing=True,
- )

- # 5) Tokenization with labels
- train_seq_length = 512
- def tokenize_fn(examples):
- tokens = tokenizer(
- examples["text"],
- truncation=True,
- padding="max_length",
- max_length=train_seq_length,
  )
- tokens["labels"] = tokens["input_ids"].copy()
- return tokens

- tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
- tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

- # 6) Define training arguments
- training_args = TrainingArguments(
- output_dir="./output_model",
- per_device_train_batch_size=8,
- gradient_accumulation_steps=1,
- fp16=True,
- num_train_epochs=3,
- learning_rate=2e-4,
- logging_strategy="steps",
- logging_steps=25,
- save_strategy="epoch",
- save_total_limit=2,
- evaluation_strategy="epoch",
- load_best_model_at_end=True,
- metric_for_best_model="eval_loss",
- greater_is_better=False,
- dataloader_num_workers=2,
- )

- # 7) Initialize SFTTrainer
- trainer = SFTTrainer(
- model=model,
- tokenizer=tokenizer,
- args=training_args,
- train_dataset=tokenized_train,
- eval_dataset=tokenized_eval,
- )

- # 8) Train
- print("Starting training...")
- trainer.train()
- print("Training finished.")

- # 9) Save final adapter and tokenizer
- final_dir = "./output_model_final"
- os.makedirs(final_dir, exist_ok=True)
- model.save_pretrained(final_dir)
- tokenizer.save_pretrained(final_dir)
- print(f"Saved final adapter to {final_dir}")

- # 10) Zip the final model
- zip_path = "model.zip"
- print(f"Zipping model directory {final_dir} to {zip_path}...")
- with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
- for root, _, files in os.walk(final_dir):
- for fname in files:
- full_path = os.path.join(root, fname)
- rel_path = os.path.relpath(full_path, final_dir)
- z.write(full_path, rel_path)
- print(f"Successfully zipped model to {zip_path}")

- if name == "main": main()
  import os
  import zipfile

+ from datasets import Dataset
+ import pandas as pd
+ import torch
+ from transformers import TrainingArguments
+ from unsloth import FastLanguageModel
+ from trl import SFTTrainer

+ def main():
+     # 1) Load 4-bit model + tokenizer
+     model_name = "HuggingFaceTB/SmolLM2-1.7B"
+     max_seq_length = 2048
+     dtype = torch.float16
+     load_in_4bit = True

+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=model_name,
+         max_seq_length=max_seq_length,
+         dtype=dtype,
+         load_in_4bit=load_in_4bit,
+     )

+     # 2) Ensure pad_token
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         model.config.pad_token_id = tokenizer.eos_token_id

+     # 3) Load and format dataset
+     df = pd.read_json("data.jsonl", lines=True)
+     df["text"] = df.apply(
+         lambda row: (
+             "### Instruction:\n"
+             + row["instruction"].strip()
+             + "\n\n### Response:\n"
+             + row["response"].strip()
+             + tokenizer.eos_token
+         ),
+         axis=1,
+     )
+     full_dataset = Dataset.from_pandas(df[["text"]])

+     split = full_dataset.train_test_split(test_size=0.15, seed=42)
+     train_dataset = split["train"]
+     eval_dataset = split["test"]
+     print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

+     # 4) Apply LoRA adapters
+     model = FastLanguageModel.get_peft_model(
+         model,
+         r=8,
+         lora_alpha=32,
+         lora_dropout=0.05,
+         bias="none",
+         target_modules=[
+             "q_proj", "k_proj", "v_proj", "o_proj",
+             "gate_proj", "up_proj", "down_proj",
+         ],
+         use_gradient_checkpointing=True,
      )

+     # 5) Tokenization with labels
+     train_seq_length = 512
+     def tokenize_fn(examples):
+         tokens = tokenizer(
+             examples["text"],
+             truncation=True,
+             padding="max_length",
+             max_length=train_seq_length,
+         )
+         tokens["labels"] = tokens["input_ids"].copy()
+         return tokens

+     tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
+     tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

+     # 6) Define training arguments
+     training_args = TrainingArguments(
+         output_dir="./output_model",
+         per_device_train_batch_size=8,
+         gradient_accumulation_steps=1,
+         fp16=True,
+         num_train_epochs=3,
+         learning_rate=2e-4,
+         logging_strategy="steps",
+         logging_steps=25,
+         save_strategy="epoch",
+         save_total_limit=2,
+         evaluation_strategy="epoch",
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         greater_is_better=False,
+         dataloader_num_workers=2,
+     )

+     # 7) Initialize and run SFTTrainer
+     trainer = SFTTrainer(
+         model=model,
+         tokenizer=tokenizer,
+         args=training_args,
+         train_dataset=tokenized_train,
+         eval_dataset=tokenized_eval,
+     )

+     print("Starting training...")
+     trainer.train()
+     print("Training finished.")

+     # 8) Save and zip model
+     final_dir = "./output_model_final"
+     os.makedirs(final_dir, exist_ok=True)
+     model.save_pretrained(final_dir)
+     tokenizer.save_pretrained(final_dir)
+     print(f"Saved final adapter to {final_dir}")

+     zip_path = "model.zip"
+     print(f"Zipping {final_dir} → {zip_path}")
+     with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
+         for root, _, files in os.walk(final_dir):
+             for fname in files:
+                 full = os.path.join(root, fname)
+                 rel = os.path.relpath(full, final_dir)
+                 zf.write(full, rel)
+     print(f"Successfully zipped model to {zip_path}")

+ if __name__ == "__main__":
+     main()
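
For reference, and not part of this commit: a minimal sketch of how the LoRA adapter saved to ./output_model_final by the new train.py could be loaded back for inference with transformers and peft (assumed to be installed alongside unsloth). The prompt text and generation settings below are illustrative assumptions, not values taken from train.py.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model the adapter was trained on, then attach the saved LoRA weights.
base = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B",
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("./output_model_final")
model = PeftModel.from_pretrained(base, "./output_model_final")
model.eval()

# Reuse the "### Instruction: / ### Response:" template the training data was formatted with.
prompt = "### Instruction:\nExplain what LoRA fine-tuning does.\n\n### Response:\n"  # illustrative prompt, not from train.py
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=128, eos_token_id=tokenizer.eos_token_id)
# Print only the generated continuation, skipping the prompt tokens.
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))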