# Import libraries
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel


# Define the dataset
class TextDataset(Dataset):
    """Reads one training example per non-empty line and pads/truncates each to block_size tokens."""

    def __init__(self, file_path, block_size):
        self.block_size = block_size
        with open(file_path, 'r', encoding='utf-8') as f:
            self.examples = [line for line in f.read().splitlines()
                             if len(line) > 0 and not line.isspace()]
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
        # GPT-2 has no padding token by default, so add one
        # (this requires resizing the model's embeddings later)
        self.special_tokens_dict = {'pad_token': '<|pad|>'}
        self.num_added_toks = self.tokenizer.add_special_tokens(self.special_tokens_dict)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        text = self.examples[idx]
        tokenized_text = self.tokenizer.encode(text)
        # Truncate to block_size, then pad up to block_size
        if len(tokenized_text) > self.block_size:
            tokenized_text = tokenized_text[:self.block_size]
        tokenized_text += [self.tokenizer.pad_token_id] * (self.block_size - len(tokenized_text))
        return torch.tensor(tokenized_text)


# Define training
def train():
    train_dataset = TextDataset('path/to/your/text/file.txt', block_size=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    # Account for the added pad token
    model.resize_token_embeddings(len(train_dataset.tokenizer))
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    pad_id = train_dataset.tokenizer.pad_token_id
    # Ignore padding positions when computing the loss
    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    epochs = 5
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            # Mask out padding so the model does not attend to it
            attention_mask = (batch != pad_id).long()
            outputs = model(input_ids=batch, attention_mask=attention_mask)
            # Shift so that the logits at position t are scored against the token at t+1
            shift_logits = outputs.logits[:, :-1, :].contiguous()
            shift_labels = batch[:, 1:].contiguous()
            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')

    # Save the model and the tokenizer (its vocabulary was extended with the pad token)
    model.save_pretrained('finetuned_model')
    train_dataset.tokenizer.save_pretrained('finetuned_model')


# Train the model
train()
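

# Example usage (sketch, not part of the original training script): load the
# fine-tuned model and generate text. It assumes training has finished and that
# the model and tokenizer were saved to 'finetuned_model' as above. The prompt
# string and the sampling settings (do_sample, top_p, max_length) are
# illustrative choices, not requirements.
def generate(prompt, max_length=100):
    tokenizer = GPT2Tokenizer.from_pretrained('finetuned_model')
    model = GPT2LMHeadModel.from_pretrained('finetuned_model')
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


print(generate('Once upon a time'))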