Commit 37586a2
Parent(s): b8ddcf4
Add streamlit app

Files changed:
- .gitignore +1 -1
- api.py +108 -0
- app.py +49 -0
- dataset.py +5 -3
- example_uses.md +13 -5
- requirements.txt +2 -1
.gitignore
CHANGED
@@ -8,4 +8,4 @@ __pycache__/
 metrics.txt
 predictions.txt
 *.pth
-
+vietnamese_hate_speech_detection_phobert/
api.py
ADDED
@@ -0,0 +1,108 @@
+from model import DocBERT
+from models.lstm_model import DocumentBiLSTM
+from dataset import DataLoader, DocumentDataset
+from utils.word_segmentation_vi import word_segmentation_vi
+import numpy as np
+from transformers import AutoTokenizer
+import torch.nn.functional as F
+import torch
+
+args = {
+    "bert_model": "vinai/phobert-base-v2",  # Base BERT model name
+    "model_path": "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth",  # Change this if your fine-tuned model lives somewhere else
+    "lstm_model_path": "./vietnamese_hate_speech_detection_phobert/distilled_lstm_model.pth",  # Change this if your distilled model lives somewhere else
+    "max_seq_length": 250,
+    "num_classes": 4,     # The fine-tuned model has 4 classes per category
+    "num_categories": 5,  # The fine-tuned model has 5 categories
+}
+
+class_names = ["NORMAL", "CLEAN", "OFFENSIVE", "HATE"]
+
+def load_model_bert():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model = DocBERT(bert_model_name=args["bert_model"], num_classes=args["num_classes"], num_categories=args["num_categories"])
+    model.load_state_dict(torch.load(args["model_path"], map_location=device))
+    model = model.to(device)
+    return model, device
+
+def load_model_lstm():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    tokenizer = AutoTokenizer.from_pretrained(args["bert_model"])
+    vocab_size = tokenizer.vocab_size
+    model = DocumentBiLSTM(vocab_size=vocab_size,
+                           embedding_dim=300,
+                           hidden_dim=256,
+                           n_layers=2,
+                           output_dim=args["num_classes"] * args["num_categories"])
+    model.load_state_dict(torch.load(args["lstm_model_path"], map_location=device)["model_state_dict"])
+    model = model.to(device)
+    return model, device
+
+def inference(model, device, comments: str | list):
+    if isinstance(comments, str):
+        comments = [comments]
+    elif not isinstance(comments, list):
+        raise ValueError("comments must be a string or a list of strings")
+
+    comments = np.array([word_segmentation_vi(comment) for comment in comments])
+    data = DocumentDataset(texts=comments, labels=None, tokenizer_name=args["bert_model"], max_length=args["max_seq_length"])
+    inference_loader = DataLoader(data, batch_size=comments.shape[0], shuffle=False)
+
+    batch = next(iter(inference_loader))
+    input_ids = batch['input_ids']
+    attention_mask = batch['attention_mask']
+    token_type_ids = batch['token_type_ids']  # Unused by these models; kept for completeness
+
+    input_ids = input_ids.to(device)
+    attention_mask = attention_mask.to(device)
+    token_type_ids = token_type_ids.to(device)
+
+    with torch.no_grad():
+        outputs = model(input_ids, attention_mask=attention_mask)
+        if args["num_categories"] > 1:
+            batch_size, total_classes = outputs.shape
+            if total_classes % args["num_categories"] != 0:
+                raise ValueError(f"Error: the total number of output classes must be divisible by {args['num_categories']}")
+
+            classes_per_group = total_classes // args["num_categories"]
+            # Group every classes_per_group logits along dim=1
+            reshaped = outputs.view(outputs.size(0), -1, classes_per_group)  # shape: (batch, num_categories, classes_per_group)
+            probs = F.softmax(reshaped, dim=-1)  # normalize over the classes within each category
+            # Argmax over each group of classes_per_group
+            predictions = probs.argmax(dim=-1)
+        else:
+            predictions = torch.argmax(outputs, dim=-1)
+
+    preds_array = predictions.cpu().numpy()
+    result = []
+    for i in range(preds_array.shape[0]):
+        result.append(
+            {
+                "Bình luận": comments[i],                               # Comment
+                "Cá nhân": class_names[preds_array[i, 0]],              # Individual
+                "Nhóm/tổ chức": class_names[preds_array[i, 1]],         # Group/organization
+                "Tôn giáo/tín ngưỡng": class_names[preds_array[i, 2]],  # Religion/creed
+                "Chủng tộc/sắc tộc": class_names[preds_array[i, 3]],    # Race/ethnicity
+                "Chính trị": class_names[preds_array[i, 4]],            # Politics
+            })
+    return result
+
+if __name__ == "__main__":
+    # Vietnamese test comments, kept verbatim as model inputs
+    model, device = load_model_bert()
+    comments = [
+        "Để avata bít ngay là ngu hơn chó",
+        "Hàn Quốc chửi dân Đông Lào và đây là hậu quả",
+        "Nguyễn Thuận =)) tư tưởng rừng rú gì vậy",
+        "@công danh nguyen thể chế chính trị khác hẳn tư tưởng xã hội nhé. Con cờ hó china liên quan cmn gì?"
+    ]
+    predictions = inference(model, device, comments)
+    print("BERT Predictions:")
+    print(predictions)
+
+    lstm_model, device = load_model_lstm()
+    lstm_predictions = inference(lstm_model, device, comments)
+    print("LSTM Predictions:")
+    print(lstm_predictions)
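The decoding step in inference() is the subtle part: the model emits one flat vector of num_categories × num_classes logits per comment, and the code regroups it into five independent 4-way heads before taking a per-category softmax and argmax. A minimal, self-contained sketch of that decoding on made-up logits (the shapes and class names mirror args above; the values are random, not real model output):

```python
import torch
import torch.nn.functional as F

num_categories, num_classes = 5, 4
class_names = ["NORMAL", "CLEAN", "OFFENSIVE", "HATE"]

# Made-up logits for a batch of 2 comments: shape (batch, num_categories * num_classes)
outputs = torch.randn(2, num_categories * num_classes)

# Regroup the 20 flat logits into 5 independent 4-way heads
reshaped = outputs.view(outputs.size(0), num_categories, num_classes)  # (2, 5, 4)
probs = F.softmax(reshaped, dim=-1)  # normalize over the classes within each category
predictions = probs.argmax(dim=-1)   # (2, 5): one class index per category

for row in predictions:
    print([class_names[int(i)] for i in row])
```

Since softmax is monotonic within each group, the argmax here equals the argmax of the raw grouped logits; the softmax only matters if you also want per-category probabilities.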
app.py
ADDED
@@ -0,0 +1,49 @@
+import streamlit as st
+from api import load_model_bert, load_model_lstm, inference
+import pandas as pd
+
+# Set up the Streamlit app
+def app():
+    st.title("Phân tích ngôn từ thù địch, phân biệt sử dụng PhoBERT và LSTM")  # "Hate and discriminatory speech analysis with PhoBERT and LSTM"
+
+    # Show loading progress bar
+    # Load models once and cache them across reruns
+    @st.cache_resource
+    def load_models():
+        st.progress(0, "Nạp các mô hình...")  # "Loading models..."
+        # Load BERT model
+        bert_model, bert_device = load_model_bert()
+        st.progress(50, "Mô hình PhoBERT đã được nạp.")  # "PhoBERT model loaded."
+        # Load LSTM model
+        lstm_model, lstm_device = load_model_lstm()
+        st.progress(100, "Mô hình LSTM đã được nạp.")  # "LSTM model loaded." Complete loading progress
+        return bert_model, bert_device, lstm_model, lstm_device
+
+    bert_model, bert_device, lstm_model, lstm_device = load_models()
+
+    # User input
+    user_input = st.text_area("Nhập các bình luận để phân tích ngôn từ thù địch, phân biệt (xuống dòng cho từng bình luận):")  # "Enter comments to analyze for hate/discriminatory speech (one comment per line):"
+
+    if st.button("Phân tích"):  # "Analyze"
+        if user_input:
+            # Preprocess input: one comment per line
+            comments = user_input.splitlines()
+
+            # Inference with BERT
+            st.progress(0, "Đang phân tích với PhoBERT...")  # "Analyzing with PhoBERT..."
+            bert_predictions = inference(bert_model, bert_device, comments)
+            st.write("BERT Predictions:")
+            st.dataframe(pd.DataFrame(bert_predictions))
+
+            st.progress(50, "Đang phân tích với LSTM...")  # "Analyzing with LSTM..."
+
+            # Inference with LSTM
+            lstm_predictions = inference(lstm_model, lstm_device, comments)
+            st.write("LSTM Predictions:")
+            st.progress(100, "Phân tích hoàn tất!")  # "Analysis complete!"
+            st.dataframe(pd.DataFrame(lstm_predictions))
+        else:
+            st.warning("Hãy nhập một vài bình luận.")  # "Please enter some comments."
+
+if __name__ == "__main__":
+    app()
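app.py leans on the fact that inference() returns a list of dicts with identical keys, which pandas turns directly into a table with one row per comment and one column per category. A small sketch of that final rendering step, with hard-coded and purely illustrative predictions:

```python
import pandas as pd

# Illustrative inference() output; the labels here are made up, not real model output
predictions = [
    {"Bình luận": "comment 1", "Cá nhân": "HATE", "Nhóm/tổ chức": "NORMAL",
     "Tôn giáo/tín ngưỡng": "NORMAL", "Chủng tộc/sắc tộc": "NORMAL", "Chính trị": "NORMAL"},
    {"Bình luận": "comment 2", "Cá nhân": "CLEAN", "Nhóm/tổ chức": "OFFENSIVE",
     "Tôn giáo/tín ngưỡng": "NORMAL", "Chủng tộc/sắc tộc": "NORMAL", "Chính trị": "NORMAL"},
]

df = pd.DataFrame(predictions)  # one row per comment, one column per category
print(df)  # st.dataframe(df) renders the same table inside the app
```

The @st.cache_resource decorator is what keeps the two models from being reloaded on every rerun, since Streamlit re-executes the whole script on each button click.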
dataset.py
CHANGED
@@ -19,7 +19,9 @@ class DocumentDataset(Dataset):
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
         self.max_length = max_length
 
-        if type(labels) is not np.ndarray and type(labels) is not list:
+        if labels is None:
+            self.labels = np.zeros(len(texts), dtype=int)
+        elif type(labels) is not np.ndarray and type(labels) is not list:
             # Validate labels
             unique_labels = set(labels)
             min_label = min(unique_labels) if unique_labels else 0
@@ -67,7 +69,7 @@ class DocumentDataset(Dataset):
 
     def __getitem__(self, idx):
         text = str(self.texts[idx])
-        label = self.labels[idx]
+        label = self.labels[idx] if self.labels is not None else torch.tensor(0, dtype=torch.long)
 
         # Tokenize the text with attention mask and truncation
         encoding = self.tokenizer.encode_plus(
@@ -92,7 +94,7 @@ class DocumentDataset(Dataset):
         """Get original text for a given index"""
         return {
             'text': self.texts[idx],
-            'label': self.labels[idx]
+            'label': self.labels[idx] if self.labels is not None else None
         }
 
 def load_data(data_path, text_col='text', label_col: str | list ='label', validation_split=0.1, test_split=0.1, seed=42):
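The point of this change is that DocumentDataset can now be constructed without labels, which is exactly what api.py relies on at inference time: dummy zero labels are substituted so __getitem__ keeps a uniform return shape. A usage sketch matching the constructor call in api.py (the DocumentDataset/DataLoader imports mirror api.py; the padded output shape assumes the tokenizer pads to max_length):

```python
import numpy as np
from dataset import DocumentDataset, DataLoader

# Pre-segmented Vietnamese comments; no labels, since this is inference
texts = np.array(["bình luận thứ nhất", "bình luận thứ hai"])
data = DocumentDataset(texts=texts, labels=None,  # labels=None -> dummy zero labels internally
                       tokenizer_name="vinai/phobert-base-v2", max_length=250)

batch = next(iter(DataLoader(data, batch_size=len(texts), shuffle=False)))
print(batch["input_ids"].shape)  # e.g. torch.Size([2, 250]) if the tokenizer pads to max_length
```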
example_uses.md
CHANGED
@@ -3,19 +3,27 @@
 
 - Train with BERT model (train.csv is the ViTHSD dataset, with 4 classes each for 5 categories)
 ```
-python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4
+python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4 --output "./vietnamese_hate_speech_detection_phobert"
 ```
 - Inference with BERT model (test_data.csv is a test dataset with 4 classes each for 5 categories, like ViTHSD)
 ```
-python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./
+python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --num_classes 4 --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
 ```
 
 - Train LSTM model from BERT model using distillation (the train dataset should be the same as the distillation training dataset)
 ```
-python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vinai_phobert-base-v2_finetuned
+python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --output_dir "./vietnamese_hate_speech_detection_phobert" --batch_size 32 --epochs 10 --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
 ```
 
 - Inference with distilled LSTM model (test_data.csv is a test dataset with 4 classes, like ag_news)
 ```
-python ./inference_lstm.py --model_path "./
-```
+python ./inference_lstm.py --model_path "./vietnamese_hate_speech_detection_phobert/distilled_lstm_model.pth" --bert_tokenizer "vinai/phobert-base-v2" --num_classes 4 --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
+```
+
+## How to run:
+
+- Install the dependencies in requirements.txt: pip install -r requirements.txt
+
+- Either follow "Train with BERT model" or "Train LSTM model from BERT model using distillation" in the Example uses section above, or git clone the model from https://huggingface.co/jesse-tong/vietnamese_hate_speech_detection_phobert
+
+- Run the Streamlit app with streamlit run app.py, then either go to http://localhost:8501 or wait for the browser tab to open.
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ datasets
 torchtext
 maturin
 underthesea --only-binary :all:
-accelerate
+accelerate
+streamlit