darrenphodgson76 committed
Commit 4d2faa4 · verified · 1 Parent(s): c62069a

Update train.py

Files changed (1): train.py (+74, -54)
train.py CHANGED
@@ -1,29 +1,20 @@
-# ✅ Final train.py with JSONL input and EOS-as-pad (no stop_sequences)
-import unsloth  # must be first
-import pandas as pd
-import torch
-from datasets import Dataset
-from transformers import TrainingArguments
-from unsloth import FastLanguageModel
-from trl import SFTTrainer
-import os
-import zipfile

-# 1) Load Unsloth model + tokenizer
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name="HuggingFaceTB/SmolLM2-1.7B",
-    max_seq_length=2048,
-    dtype=torch.float16,
-    load_in_4bit=True,
 )

-# 2) Reuse the existing eos_token as pad_token
-eos = tokenizer.eos_token  # should be "<|endoftext|>"
-tokenizer.pad_token = eos
 model.config.pad_token_id = tokenizer.eos_token_id

-# 3) Load & format your dataset from JSONL, always ending responses with EOS
-# Read the JSONL we generated (one JSON object per line with "instruction" & "response")
 df = pd.read_json("data.jsonl", lines=True)
 df["text"] = df.apply(
     lambda row: (
@@ -31,68 +22,97 @@ df["text"] = df.apply(
         + row["instruction"].strip()
         + "\n\n### Response:\n"
         + row["response"].strip()
-        + eos
     ),
-    axis=1
 )
-dataset = Dataset.from_pandas(df[["text"]])

-# 4) Apply LoRA
 model = FastLanguageModel.get_peft_model(
     model,
     r=8,
     lora_alpha=32,
     lora_dropout=0.05,
     bias="none",
 )

-# 5) Tokenize
-def tokenize(example):
-    return tokenizer(
-        example["text"],
         truncation=True,
         padding="max_length",
-        max_length=512,
     )

-tokenized_dataset = dataset.map(tokenize, batched=True)

-# 6) Training arguments
 training_args = TrainingArguments(
     output_dir="./output_model",
-    per_device_train_batch_size=2,
     num_train_epochs=3,
     learning_rate=2e-4,
-    logging_steps=10,
-    save_steps=100,
-    fp16=True,
 )

-# 7) Initialize SFTTrainer (no stop_sequences here)
 trainer = SFTTrainer(
     model=model,
     tokenizer=tokenizer,
     args=training_args,
-    train_dataset=tokenized_dataset,
 )

-# 8) Train!
 trainer.train()

-# 9) Save the adapter
-output_dir = "./output_model"
-os.makedirs(output_dir, exist_ok=True)
-model.save_pretrained(output_dir)

-# 🔧 Zip for download
-zip_path = "/home/user/app/model.zip"
-try:
-    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
-        for root, _, files in os.walk(output_dir):
-            for fname in files:
-                full = os.path.join(root, fname)
-                rel = os.path.relpath(full, output_dir)
-                z.write(full, rel)
-    print(f"✅ Zipped model to {zip_path}")
-except Exception as e:
-    print(f"❌ Failed to zip model: {e}")
 
+import os
+import zipfile
+import pandas as pd
+import torch
+from datasets import Dataset
+from transformers import TrainingArguments
+from unsloth import FastLanguageModel
+from trl import SFTTrainer
+
+
+def main():
+    # 1) Load Unsloth 4-bit model + tokenizer
+    model_name = "HuggingFaceTB/SmolLM2-1.7B"
+    max_seq_length = 2048
+    dtype = torch.float16
+    load_in_4bit = True
+
     model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
     )

+    # 2) Set pad_token = eos_token if not already defined
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
     model.config.pad_token_id = tokenizer.eos_token_id

+    # 3) Load and format dataset
     df = pd.read_json("data.jsonl", lines=True)
     df["text"] = df.apply(
         lambda row: (
             + row["instruction"].strip()
             + "\n\n### Response:\n"
             + row["response"].strip()
+            + tokenizer.eos_token
         ),
+        axis=1,
     )
+    full_dataset = Dataset.from_pandas(df[["text"]])
+
+    # Split into train and eval
+    split = full_dataset.train_test_split(test_size=0.15, seed=42)
+    train_dataset = split["train"]
+    eval_dataset = split["test"]
+    print(f"Training samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

+    # 4) Apply LoRA adapters
     model = FastLanguageModel.get_peft_model(
         model,
         r=8,
         lora_alpha=32,
         lora_dropout=0.05,
         bias="none",
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+        ],
+        use_gradient_checkpointing=True,
     )

+    # 5) Tokenization with labels
+    train_seq_length = 512
+    def tokenize_fn(examples):
+        tokens = tokenizer(
+            examples["text"],
             truncation=True,
             padding="max_length",
+            max_length=train_seq_length,
         )
+        tokens["labels"] = tokens["input_ids"].copy()
+        return tokens

+    tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
+    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

+    # 6) Define training arguments
     training_args = TrainingArguments(
         output_dir="./output_model",
+        per_device_train_batch_size=8,
+        gradient_accumulation_steps=1,
+        fp16=True,
         num_train_epochs=3,
         learning_rate=2e-4,
+        logging_strategy="steps",
+        logging_steps=25,
+        save_strategy="epoch",
+        save_total_limit=2,
+        evaluation_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="eval_loss",
+        greater_is_better=False,
+        dataloader_num_workers=2,
     )

+    # 7) Initialize SFTTrainer
     trainer = SFTTrainer(
         model=model,
         tokenizer=tokenizer,
         args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_eval,
     )

+    # 8) Train
+    print("Starting training...")
     trainer.train()
+    print("Training finished.")
+
+    # 9) Save final adapter and tokenizer
+    final_dir = "./output_model_final"
+    os.makedirs(final_dir, exist_ok=True)
+    model.save_pretrained(final_dir)
+    tokenizer.save_pretrained(final_dir)
+    print(f"Saved final adapter to {final_dir}")
+
+    # 10) Zip the final model
+    zip_path = "model_final.zip"
+    print(f"Zipping model directory {final_dir} to {zip_path}...")
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
+        for root, _, files in os.walk(final_dir):
+            for fname in files:
+                full_path = os.path.join(root, fname)
+                rel_path = os.path.relpath(full_path, final_dir)
+                z.write(full_path, rel_path)
+    print(f"Successfully zipped model to {zip_path}")

+
+if __name__ == "__main__":
+    main()
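
For a quick sanity check after training, the adapter saved in ./output_model_final (or the unzipped model_final.zip) can be loaded back for generation. The sketch below is not part of the commit: it assumes Unsloth can load a saved LoRA adapter directory directly (resolving the base model from the adapter config), and it reuses the "### Response" layout from train.py; the instruction prefix and prompt text are placeholders, since that line is collapsed in the diff above.

# Sketch only (not in the commit): smoke-test the adapter saved by train.py.
# Assumes Unsloth resolves the base model from the adapter directory's config.
import torch
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./output_model_final",  # adapter dir written by train.py
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch Unsloth into inference mode

# "### Instruction:" is an assumed prefix; the real one is not shown in the diff.
prompt = "### Instruction:\n<your instruction here>\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))

Because training appends tokenizer.eos_token to every response (and pads with it), passing eos_token_id to generate stops decoding at the same token the responses were terminated with.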