jesse-tong commited on
Commit
37586a2
·
1 Parent(s): b8ddcf4

Add streamlit app

Browse files
Files changed (6) hide show
  1. .gitignore +1 -1
  2. api.py +108 -0
  3. app.py +49 -0
  4. dataset.py +5 -3
  5. example_uses.md +13 -5
  6. requirements.txt +2 -1
.gitignore CHANGED
@@ -8,4 +8,4 @@ __pycache__/
8
  metrics.txt
9
  predictions.txt
10
  *.pth
11
- news-category-dataset/
 
8
  metrics.txt
9
  predictions.txt
10
  *.pth
11
+ vietnamese_hate_speech_detection_phobert/
api.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from model import DocBERT
2
+ from models.lstm_model import DocumentBiLSTM
3
+ from dataset import DataLoader, DocumentDataset
4
+ from utils.word_segmentation_vi import word_segmentation_vi
5
+ import numpy as np
6
+ from transformers import AutoTokenizer
7
+ import torch.nn.functional as F
8
+ import torch
9
+
10
# Runtime configuration read by the model loaders and by `inference`.
args = dict(
    bert_model="vinai/phobert-base-v2",  # Base BERT model name
    # Change this if you have a fine-tuned model somewhere else
    model_path="./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth",
    # Change this if you have a fine-tuned model somewhere else
    lstm_model_path="./vietnamese_hate_speech_detection_phobert/distilled_lstm_model.pth",
    max_seq_length=250,
    num_classes=4,  # As the fine tuned model has 4 classes per category
    num_categories=5,  # As the fine tuned model has 5 categories
)

# Human-readable label for each predicted class id (list index == class id).
class_names = ["NORMAL", "CLEAN", "OFFENSIVE", "HATE"]
20
+
21
def load_model_bert():
    """Build the DocBERT classifier and load its fine-tuned weights.

    The checkpoint at ``args["model_path"]`` is loaded as a bare state dict.

    Returns:
        tuple: ``(model, device)`` where ``model`` has been moved to
        ``device`` and switched to evaluation mode, and ``device`` is CUDA
        when available and CPU otherwise.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = DocBERT(
        bert_model_name=args["bert_model"],
        num_classes=args["num_classes"],
        num_categories=args["num_categories"],
    )
    # NOTE(security): torch.load unpickles the file, so only point
    # args["model_path"] at checkpoints obtained from a trusted source.
    state_dict = torch.load(args["model_path"], map_location=device)
    model.load_state_dict(state_dict)
    model = model.to(device)
    # This loader only serves inference: leave training mode so that any
    # dropout / normalisation layers behave deterministically.
    model.eval()
    return model, device
28
+
29
def load_model_lstm():
    """Build the distilled BiLSTM classifier and load its trained weights.

    The vocabulary size is taken from the tokenizer named by
    ``args["bert_model"]``. The checkpoint at ``args["lstm_model_path"]`` is
    a dict whose ``"model_state_dict"`` entry holds the weights.

    Returns:
        tuple: ``(model, device)`` where ``model`` has been moved to
        ``device`` and switched to evaluation mode, and ``device`` is CUDA
        when available and CPU otherwise.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(args["bert_model"])
    model = DocumentBiLSTM(
        vocab_size=tokenizer.vocab_size,
        embedding_dim=300,
        hidden_dim=256,
        n_layers=2,
        # One logit per (category, class) pair, flattened into a single axis.
        output_dim=args["num_classes"] * args["num_categories"],
    )
    # NOTE(security): torch.load unpickles the file, so only point
    # args["lstm_model_path"] at checkpoints obtained from a trusted source.
    checkpoint = torch.load(args["lstm_model_path"], map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model = model.to(device)
    # This loader only serves inference: leave training mode so that any
    # dropout layers behave deterministically.
    model.eval()
    return model, device
42
+
43
+ def inference(model, device, comments: str | list):
44
+ if isinstance(comments, str):
45
+ comments = [comments]
46
+ elif not isinstance(comments, list):
47
+ raise ValueError("comment must be a string or a list of strings")
48
+
49
+ comments = np.array([word_segmentation_vi(comment) for comment in comments])
50
+ data = DocumentDataset(texts=comments, labels=None, tokenizer_name=args["bert_model"], max_length=args["max_seq_length"])
51
+ inference_loader = DataLoader(data, batch_size=comments.shape[0], shuffle=False)
52
+
53
+ batch = next(iter(inference_loader))
54
+ input_ids = batch['input_ids']
55
+ attention_mask = batch['attention_mask']
56
+ token_type_ids = batch['token_type_ids']
57
+
58
+ input_ids = input_ids.to(device)
59
+ attention_mask = attention_mask.to(device)
60
+ token_type_ids = token_type_ids.to(device)
61
+
62
+ with torch.no_grad():
63
+ outputs = model(input_ids, attention_mask=attention_mask)
64
+ if args["num_categories"] > 1:
65
+ batch_size, total_classes = outputs.shape
66
+ if total_classes % args["num_categories"] != 0:
67
+ raise ValueError(f"Error: Number of total classes in the batch must of divisible by {args["num_categories"]}")
68
+
69
+ classes_per_group = total_classes // args["num_categories"]
70
+ # Group every classes_per_group values along dim=1
71
+ reshaped = outputs.view(outputs.size(0), -1, classes_per_group) # shape: (batch, self., classes_per_group)
72
+ probs = F.softmax(reshaped, dim=1)
73
+ # Argmax over each group of classes_per_group
74
+ predictions = probs.argmax(dim=-1)
75
+ else:
76
+ predictions = torch.argmax(outputs, dim=-1)
77
+
78
+ preds_array = predictions.cpu().numpy()
79
+ result = []
80
+ for i in range(preds_array.shape[0]):
81
+ result.append(
82
+ {
83
+ "Bình luận": comments[i],
84
+ "Cá nhân": class_names[ preds_array[i, 0] ],
85
+ "Nhóm/tổ chức": class_names[ preds_array[i, 1] ],
86
+ "Tôn giáo/tín ngưỡng": class_names[ preds_array[i, 2] ],
87
+ "Chủng tộc/sắc tộc": class_names[ preds_array[i, 3] ],
88
+ "Chính trị": class_names[ preds_array[i, 4] ],
89
+ })
90
+ return result
91
+
92
if __name__ == "__main__":
    # Manual smoke test: score a few sample comments with both models.
    comments = [
        "Để avata bít ngay là ngu hơn chó",
        "Hàn Quốc chửi dân Đông Lào và đây là hậu quả",
        "Nguyễn Thuận =)) tư tưởng rừng rú gì vậy",
        "@công danh nguyen thể chế chính trị khác hẳn tư tưởng xã hội nhé. Con cờ hó china liên quan cmn gì?"
    ]

    runs = (
        ("BERT Predictions:", load_model_bert),
        ("LSTM Predictions:", load_model_lstm),
    )
    for title, loader in runs:
        model, device = loader()
        predictions = inference(model, device, comments)
        print(title)
        print(predictions)
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from api import load_model_bert, load_model_lstm, inference
3
+ import pandas as pd
4
+
5
+ # Set up the Streamlit app
6
def app():
    """Render the Streamlit page that scores comments with PhoBERT and LSTM.

    Both models are loaded once and cached across reruns. When the button is
    pressed, every non-blank line of the text area is treated as one comment
    and the predictions of each model are shown as a table.
    """
    st.title("Phân tích ngôn từ thù địch, phân biệt sử dụng PhoBERT và LSTM")

    # Cache the loaders themselves so the UI calls stay outside cached code
    # and the progress bar can be updated between the two loads.
    cached_load_bert = st.cache_resource(load_model_bert)
    cached_load_lstm = st.cache_resource(load_model_lstm)

    # st.progress creates a new bar on every call; keep one handle and
    # update it instead of stacking several bars on the page.
    loading_bar = st.progress(0, "Nạp các mô hình...")
    bert_model, bert_device = cached_load_bert()
    loading_bar.progress(50, "Mô hình PhoBERT đã được nạp.")
    lstm_model, lstm_device = cached_load_lstm()
    loading_bar.progress(100, "Mô hình LSTM đã được nạp.")
    # Remove the bar once loading is done so it does not linger on reruns.
    loading_bar.empty()

    # User input
    user_input = st.text_area("Nhập các bình luận để phân tích ngôn từ thù địch, phân biệt (xuống dòng cho từng bình luận):")

    if st.button("Phân tích"):
        # One comment per line; blank lines are not comments.
        comments = [line for line in user_input.splitlines() if line.strip()]
        if not comments:
            st.warning("Hãy nhập một vài bình luận.")
            return

        analysis_bar = st.progress(0, "Đang phân tích với PhoBERT...")

        # Inference with BERT
        bert_predictions = inference(bert_model, bert_device, comments)
        st.write("BERT Predictions:")
        st.dataframe(pd.DataFrame(bert_predictions))

        analysis_bar.progress(50, "Đang phân tích với LSTM...")

        # Inference with LSTM
        lstm_predictions = inference(lstm_model, lstm_device, comments)
        st.write("LSTM Predictions:")
        st.dataframe(pd.DataFrame(lstm_predictions))

        analysis_bar.progress(100, "Phân tích hoàn tất!")


if __name__ == "__main__":
    app()
dataset.py CHANGED
@@ -19,7 +19,9 @@ class DocumentDataset(Dataset):
19
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
20
  self.max_length = max_length
21
 
22
- if type(labels) is not np.ndarray and type(labels) is not list:
 
 
23
  # Validate labels
24
  unique_labels = set(labels)
25
  min_label = min(unique_labels) if unique_labels else 0
@@ -67,7 +69,7 @@ class DocumentDataset(Dataset):
67
 
68
  def __getitem__(self, idx):
69
  text = str(self.texts[idx])
70
- label = self.labels[idx]
71
 
72
  # Tokenize the text with attention mask and truncation
73
  encoding = self.tokenizer.encode_plus(
@@ -92,7 +94,7 @@ class DocumentDataset(Dataset):
92
  """Get original text for a given index"""
93
  return {
94
  'text': self.texts[idx],
95
- 'label': self.labels[idx]
96
  }
97
 
98
  def load_data(data_path, text_col='text', label_col: str | list ='label', validation_split=0.1, test_split=0.1, seed=42):
 
19
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
20
  self.max_length = max_length
21
 
22
+ if labels is None:
23
+ self.labels = np.zeros(len(texts), dtype=int)
24
+ elif type(labels) is not np.ndarray and type(labels) is not list:
25
  # Validate labels
26
  unique_labels = set(labels)
27
  min_label = min(unique_labels) if unique_labels else 0
 
69
 
70
  def __getitem__(self, idx):
71
  text = str(self.texts[idx])
72
+ label = self.labels[idx] if self.labels is not None else torch.tensor(0, dtype=torch.long)
73
 
74
  # Tokenize the text with attention mask and truncation
75
  encoding = self.tokenizer.encode_plus(
 
94
  """Get original text for a given index"""
95
  return {
96
  'text': self.texts[idx],
97
+ 'label': self.labels[idx] if self.labels is not None else None
98
  }
99
 
100
  def load_data(data_path, text_col='text', label_col: str | list ='label', validation_split=0.1, test_split=0.1, seed=42):
example_uses.md CHANGED
@@ -3,19 +3,27 @@
3
 
4
  - Train with BERT model (train.csv is ViTHSD dataset with 4 classes each for 5 categories)
5
  ```
6
- python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4
7
  ```
8
  - Inference with BERT model (test_data.csv is test dataset with 4 classes each for 5 categories like ViTHSD)
9
  ```
10
- python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./output/vinai_phobert-base-v2_finetuned.pth" --num_classes 4 --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
11
  ```
12
 
13
  - Train LSTM model from BERT model using distillation (train dataset should be the same as distillation training dataset)
14
  ```
15
- python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vinai_phobert-base-v2_finetuned/best_model.pth" --output_dir "./output" --batch_size 32 --epochs 10 --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
16
  ```
17
 
18
  - Inference with distilled LSTM model (test_data.csv is test dataset with 4 classes like ag_news)
19
  ```
20
- python ./inference_lstm.py --model_path "./output/distilled_lstm_model.pth" --bert_tokenizer "vinai/phobert-base-v2" --num_classes 4 --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
21
- ```
 
 
 
 
 
 
 
 
 
3
 
4
  - Train with BERT model (train.csv is ViTHSD dataset with 4 classes each for 5 categories)
5
  ```
6
+ python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4 --output "./vietnamese_hate_speech_detection_phobert"
7
  ```
8
  - Inference with BERT model (test_data.csv is test dataset with 4 classes each for 5 categories like ViTHSD)
9
  ```
10
+ python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --num_classes 4 --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
11
  ```
12
 
13
  - Train LSTM model from BERT model using distillation (train dataset should be the same as distillation training dataset)
14
  ```
15
+ python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --output_dir "./vietnamese_hate_speech_detection_phobert" --batch_size 32 --epochs 10 --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
16
  ```
17
 
18
  - Inference with distilled LSTM model (test_data.csv is test dataset with 4 classes each for 5 categories like ViTHSD)
19
  ```
20
+ python ./inference_lstm.py --model_path "./vietnamese_hate_speech_detection_phobert/distilled_lstm_model.pth" --bert_tokenizer "vinai/phobert-base-v2" --num_classes 4 --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
21
+ ```
22
+
23
+ ## How to run:
24
+
25
+ - Install the dependencies in requirements.txt: pip install -r requirements.txt
26
+
27
+ - Either follow both the "Train with BERT model" and "Train LSTM model from BERT model using distillation" steps in the Example uses section above (the app loads both models), or git clone the pretrained models from: "https://huggingface.co/jesse-tong/vietnamese_hate_speech_detection_phobert"
28
+
29
+ - Run the Streamlit app: streamlit run app.py, then either go to http://localhost:8501 or wait for the browser tab to open.
requirements.txt CHANGED
@@ -8,4 +8,5 @@ datasets
8
  torchtext
9
  maturin
10
  underthesea --only-binary :all:
11
- accelerate
 
 
8
  torchtext
9
  maturin
10
  underthesea --only-binary :all:
11
+ accelerate
12
+ streamlit