Spaces:
Paused
Paused
Upload 4 files
Browse files- .gitattributes +2 -0
- app.py +471 -0
- faiss_index_bo_tp_cosine_full2.idx +3 -0
- faiss_metadata_bo_tp_full2.json +3 -0
- requirements.txt +4 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
faiss_index_bo_tp_cosine_full2.idx filter=lfs diff=lfs merge=lfs -text
|
37 |
+
faiss_metadata_bo_tp_full2.json filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from transformers import TextStreamer
|
4 |
+
import numpy as np
|
5 |
+
import faiss
|
6 |
+
import json
|
7 |
+
import re
|
8 |
+
from unsloth import FastLanguageModel
|
9 |
+
from serpapi import GoogleSearch
|
10 |
+
from sentence_transformers import CrossEncoder
|
11 |
+
from sentence_transformers import SentenceTransformer
|
12 |
+
# ---------------------------------------------------------------------------
# Start-up: load every model and the retrieval index once, at import time.
# ---------------------------------------------------------------------------

# Fine-tuned QA model, loaded through Unsloth with 4-bit weights.
print("Load QA model")
new_model_name = "thailevann/Qwen3-4B_SFT_dvcqg_v4"
model1, tokenizer1 = FastLanguageModel.from_pretrained(
    model_name=new_model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
# NOTE(review): presumably switches the model into Unsloth's inference mode;
# confirm against the Unsloth documentation.
FastLanguageModel.for_inference(model1)

# Sentence embedding model used to encode queries and candidate passages.
print("Load embedding model")
model_embed = SentenceTransformer("huyydangg/DEk21_hcmute_embedding")

# Cross-encoder used to re-rank the passages returned by the FAISS search.
print("Load rerank model")
rerank_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# FAISS index plus its metadata file; entries are looked up by the row ids
# that the index search returns.
index = faiss.read_index("./faiss_index_bo_tp_cosine_full2.idx")
with open("./faiss_metadata_bo_tp_full2.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)
|
31 |
+
import re
|
32 |
+
import numpy as np
|
33 |
+
from bs4 import BeautifulSoup
|
34 |
+
import requests
|
35 |
+
import json
|
36 |
+
from datasets import load_dataset
|
37 |
+
from google import genai
|
38 |
+
import json
|
39 |
+
import time
|
40 |
+
|
41 |
+
# Gemini API configuration.
# The key is read from the environment so that it is never committed to the
# repository; set GEMINI_API_KEY in the deployment's secret settings.
import os

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY environment variable is not set; "
        "it is required to create the Gemini client."
    )
client_gemini = genai.Client(api_key=GEMINI_API_KEY)
|
44 |
+
|
45 |
+
|
46 |
+
def prompt1(question, relevant) -> str:
    """Build the retrieval-augmented prompt sent to Gemini.

    Args:
        question: The user's question; interpolated with ``str()`` semantics.
        relevant: The supporting passage placed under the
            "Thông tin liên quan" heading.

    Returns:
        The prompt text, starting and ending with a newline.
    """
    prompt_lines = [
        "",
        "Bạn là một trợ lý AI hỗ trợ tra cứu dịch vụ công dựa vào thông tin liên quan",
        "## Câu hỏi:",
        f"{question}",
        "",
        "## Thông tin liên quan:",
        f"{relevant}",
        "",
    ]
    return "\n".join(prompt_lines)
|
56 |
+
|
57 |
+
def generate_mcq(question, relevant):
    """Ask Gemini to answer ``question`` using ``relevant`` as context.

    The prompt is produced by ``prompt1`` and sent to the
    ``gemini-2.0-flash`` model through the module-level ``client_gemini``.

    Args:
        question: The user's question.
        relevant: The supporting passage to ground the answer in.

    Returns:
        The ``text`` attribute of the Gemini response.
    """
    request = {
        "model": "gemini-2.0-flash",
        "contents": prompt1(question, relevant),
    }
    reply = client_gemini.models.generate_content(**request)
    return reply.text
|
67 |
+
def generate(q):
    """Ask Gemini to rewrite ``q`` so that it is easier to read.

    This is a best-effort post-processing step: if the Gemini call fails, or
    yields no text, the original text is returned unchanged so the caller
    always gets something to show.

    Args:
        q: The draft answer to be rewritten.

    Returns:
        The rewritten text, or ``q`` itself when no rewrite is available.
    """
    prompt = f"""
Viết lại cho dễ hiểu. Chỉ trả lời, nếu bạn thấy có thông tin không hợp lí có thể sửa lại và không cần giải thích. Các link đều được quy về https://dichvucong.gov.vn/

{q}
"""
    try:
        response = client_gemini.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt,
        )
        rewritten = response.text
    except Exception as exc:
        # Deliberate fallback, but report the failure instead of hiding it.
        print(f"Gemini rewrite failed, returning the draft unchanged: {exc}")
        return q

    print(q)
    # An empty/None response would otherwise be shown to the user as "None".
    return rewritten or q
|
82 |
+
|
83 |
+
def extract_table_to_kv(table_tag):
    """Convert an HTML table into a list of ``{column: text}`` dicts.

    Column names come from the table's ``<th>`` cells (lower-cased, spaces
    replaced by underscores, colons removed). Cells beyond the last header
    are keyed ``column_<index>``. Rows with fewer than two ``<td>`` cells are
    skipped.

    Args:
        table_tag: A parsed table element exposing ``find_all``.

    Returns:
        One dict per retained row, in document order.
    """
    def _column_name(th):
        return th.get_text(strip=True).lower().replace(" ", "_").replace(":", "")

    def _cell_text(td):
        # Prefer the text of a span.link child when the cell has one.
        link_span = td.find("span", class_="link")
        if link_span:
            return link_span.get_text(strip=True)
        # Keep line breaks so separate text fragments do not run together.
        return "\n".join([piece.strip() for piece in td.stripped_strings])

    column_names = [_column_name(th) for th in table_tag.find_all("th")]
    header_count = len(column_names)

    records = []
    for row in table_tag.find_all("tr"):
        data_cells = row.find_all("td")
        if len(data_cells) < 2:
            continue
        records.append({
            (column_names[pos] if pos < header_count else f"column_{pos}"): _cell_text(td)
            for pos, td in enumerate(data_cells)
        })
    return records
|
113 |
+
|
114 |
+
def extract_section_by_heading(soup, heading_text):
    """Collect the content that follows an ``<h2>`` containing ``heading_text``.

    Siblings after the heading are read until the next ``<h2>``. Tables are
    converted with ``extract_table_to_kv`` (and appended as a list of row
    dicts); any other element contributes its space-joined text. Empty
    pieces are dropped.

    Args:
        soup: Parsed document to search.
        heading_text: Substring that the heading's string must contain.

    Returns:
        A list mixing strings and lists of row dicts, or ``None`` when the
        heading is missing or nothing was collected.
    """
    def _heading_matches(text):
        return text and heading_text in text

    heading = soup.find('h2', string=_heading_matches)
    if not heading:
        return None

    collected = []
    for node in heading.find_next_siblings():
        tag_name = node.name
        if tag_name == 'h2':
            # The next section starts here.
            break
        if tag_name == 'table':
            piece = extract_table_to_kv(node)
        elif tag_name:
            piece = node.get_text(" ", strip=True)
        else:
            continue
        if piece:
            collected.append(piece)

    return collected or None
|
132 |
+
|
133 |
+
|
134 |
+
def call_dichvucong_api(service: str, ma_thu_tuc: str, label: str = None, timeout: float = 30):
    """POST a query to https://dichvucong.gov.vn/jsp/rest.jsp.

    Args:
        service: Service name, e.g.
            ``'procedure_get_requires_by_procedure_id_service_v2'``.
        ma_thu_tuc: Administrative procedure id (e.g. ``'5974'``).
        label: Optional prefix. When the response is a dict whose ``'result'``
            value is a string, that value is rewritten as
            ``"{label}: {result}"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The decoded JSON body on success, or ``{"error": <message>}`` when
        the request fails, returns an HTTP error status, or the body is not
        valid JSON.
    """
    url = "https://dichvucong.gov.vn/jsp/rest.jsp"

    # Browser-like headers copied from a real session against the portal.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "en,vi;q=0.9,ja;q=0.8,de;q=0.7,en-US;q=0.6,fr;q=0.5",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://dichvucong.gov.vn",
        "Referer": f"https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html?ma_thu_tuc={ma_thu_tuc}",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
    }

    # The endpoint expects one form field, "params", holding a JSON document.
    payload = {
        "params": json.dumps({
            "service": service,
            "provider": "dvcquocgia",
            "type": "ref",
            "id": ma_thu_tuc,
            "parent_id": "",
        })
    }

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure, HTTP error status, or a body that is not JSON.
        return {"error": str(exc)}

    # Prefix the 'result' text with the caller-supplied label when possible.
    if label and isinstance(data, dict) and isinstance(data.get('result'), str):
        data['result'] = f"{label}: {data['result']}"

    return data
|
187 |
+
|
188 |
+
|
189 |
+
import numpy as np
|
190 |
+
|
191 |
+
def get_best_doc_from_sections(query_text, model_embed, sections_dict):
    """Pick the section passage most similar to ``query_text``.

    Each section value is turned into candidate passages of the form
    ``"<key>: <text>"``: a string value gives one passage; a list value gives
    one passage per string item and one per dict item (its values joined by
    spaces). Anything else is ignored. Candidates and the query are embedded
    with ``model_embed.encode``, L2-normalised, and compared by dot product.

    Args:
        query_text: The query to match against.
        model_embed: Object with an ``encode(list_of_str)`` method returning
            a 2-D array of embeddings.
        sections_dict: Mapping of section key to str / list / other.

    Returns:
        A dict with ``best_section`` (key or ``None``), ``best_doc`` (the
        winning passage, ``""`` if none) and ``similarity`` (float).
    """
    def _unit_rows(matrix):
        return matrix / np.linalg.norm(matrix, axis=1, keepdims=True)

    # The query is encoded first, even when there turn out to be no passages.
    query_unit = _unit_rows(model_embed.encode([query_text]))

    candidates = []  # (section key, passage text)
    for section, payload in sections_dict.items():
        if isinstance(payload, str):
            candidates.append((section, f"{section}: {payload}"))
        elif isinstance(payload, list):
            for entry in payload:
                if isinstance(entry, dict):
                    joined = " ".join(str(x) for x in entry.values())
                    candidates.append((section, f"{section}: " + joined))
                elif isinstance(entry, str):
                    candidates.append((section, f"{section}: {entry}"))

    if not candidates:
        return {"best_section": None, "best_doc": "", "similarity": 0.0}

    passages = [passage for _, passage in candidates]
    passage_unit = _unit_rows(model_embed.encode(passages))

    # Rows are unit vectors, so the dot product is the cosine similarity.
    scores = np.atleast_1d(np.dot(passage_unit, query_unit.T).flatten())

    winner = int(np.argmax(scores))
    winning_section, winning_passage = candidates[winner]
    return {
        "best_section": winning_section,
        "best_doc": winning_passage,
        "similarity": float(scores[winner]),
    }
|
239 |
+
|
240 |
+
import requests
|
241 |
+
|
242 |
+
def get_thu_tuc_html(ma_thu_tuc: str, timeout: float = 30):
    """Download the procedure detail page from dichvucong.gov.vn.

    Args:
        ma_thu_tuc: Procedure id (e.g. ``"5974"``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        str: The page HTML on success, otherwise an error message starting
        with ``"Lỗi khi tải HTML: "``. Callers that parse the result should
        be aware that the error case is also a plain string.
    """
    url = f"https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html?ma_thu_tuc={ma_thu_tuc}"

    # Browser-like headers copied from a real session against the portal.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en,vi;q=0.9,ja;q=0.8,de;q=0.7,en-US;q=0.6,fr;q=0.5",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "sec-ch-ua": '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as exc:
        # Network failure or HTTP error status.
        return f"Lỗi khi tải HTML: {exc}"
|
278 |
+
|
279 |
+
|
280 |
+
def extract_thu_tuc(doc_text):
    """Return the procedure name found after ``**Tên thủ tục**:`` in ``doc_text``.

    Args:
        doc_text: Document text that may contain a ``**Tên thủ tục**:`` field.

    Returns:
        The rest of the line after the marker, stripped of surrounding
        whitespace, or ``None`` when the marker is absent.
    """
    found = re.search(r"\*\*Tên thủ tục\*\*:\s*(.+)", doc_text)
    return found.group(1).strip() if found else None
|
285 |
+
|
286 |
+
def get_preferred_link(results):
    """Choose which search result link to follow.

    Among the first five results, the first link that starts with the
    dichvucong.gov.vn procedure-detail URL wins. If none does, the link of
    the very first result is used.

    Args:
        results: List of result dicts, each expected to carry a ``'link'``.

    Returns:
        The chosen link, or ``None`` when ``results`` is empty.
    """
    preferred_prefix = "https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html"

    top_links = (entry.get('link', '') for entry in results[:5])
    official = next((url for url in top_links if url.startswith(preferred_prefix)), None)
    if official is not None:
        return official

    if results:
        return results[0]['link']
    return None
|
296 |
+
|
297 |
+
def search_keyword(question):
    """Search dichvucong.gov.vn for ``question`` through SerpAPI.

    The SerpAPI key is read from the ``SERPAPI_API_KEY`` environment variable
    so that it is never committed to the repository.

    Args:
        question: The user's question; used as the search query, restricted
            to ``site:dichvucong.gov.vn``.

    Returns:
        A 4-tuple ``(query_text, link, title, snippet)`` where ``query_text``
        is the question followed by the chosen result's title and snippet.
        When there are no organic results, all four items are ``None`` (the
        tuple always has four items so callers can unpack it safely).

    Raises:
        RuntimeError: If ``SERPAPI_API_KEY`` is not set.
    """
    import os

    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        raise RuntimeError("SERPAPI_API_KEY environment variable is not set.")

    params = {
        "engine": "google",
        "q": f"site:dichvucong.gov.vn {question}",
        "api_key": api_key,
        "location": "Ho Chi Minh, Vietnam",
        "hl": "vi",
        "gl": "vn",
        "num": 5,  # fetch five results so the preferred link can be picked
    }

    results = GoogleSearch(params).get_dict()
    organic = results.get("organic_results", [])
    if not organic:
        return None, None, None, None

    link = get_preferred_link(organic)
    # Recover the full result entry that owns the chosen link.
    result = next((r for r in organic if r.get("link") == link), organic[0])
    title = result.get("title", "")
    snippet = result.get("snippet", "")

    query_text = question + " " + title + " " + snippet
    # Printed for debugging.
    print(query_text, link)

    return query_text, link, title, snippet
|
325 |
+
|
326 |
+
|
327 |
+
|
328 |
+
# Shown when neither the web search nor the local index yields any context.
_NO_RESULT_MESSAGE = (
    "Không tìm thấy thông tin phù hợp trên dichvucong.gov.vn. "
    "Vui lòng thử lại với câu hỏi khác."
)


def _flatten_section(value):
    """Flatten one level of list nesting; non-list values pass through."""
    if not isinstance(value, list):
        return value
    flat = []
    for item in value:
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat


def _context_from_portal(query_text, ma_thu_tuc, snippet):
    """Build the answer context from the live procedure page and REST API."""
    dieu_kien = call_dichvucong_api(
        "procedure_get_requires_by_procedure_id_service_v2",
        ma_thu_tuc,
        "Yêu cầu, điều kiện thực hiện: ",
    )
    trinh_tu_thuc_hien = call_dichvucong_api(
        "procedure_get_impl_orders_by_proc_id_service_v2",
        ma_thu_tuc,
        "Trình tự thực hiện: ",
    )
    soup = BeautifulSoup(get_thu_tuc_html(ma_thu_tuc), 'html.parser')

    # NOTE(review): call_dichvucong_api returns the decoded JSON (or an error
    # dict), while get_best_doc_from_sections only reads str and list values,
    # so the two API sections look like they are skipped when the JSON is a
    # dict. Confirm the response schema before changing this.
    sections = {
        "dieu_kien": dieu_kien,
        "trinh_tu_thuc_hien": trinh_tu_thuc_hien,
        "cach_thuc_thuc_hien": extract_section_by_heading(soup, "Cách thức thực hiện"),
        "thanh_phan_ho_so": extract_section_by_heading(soup, "Thành phần hồ sơ"),
        "can_cu_phap_ly": extract_section_by_heading(soup, "Căn cứ pháp lý"),
        "ket_qua_thuc_hien": extract_section_by_heading(soup, "Kết quả thực hiện"),
    }
    # extract_section_by_heading appends each table as a nested list of row
    # dicts; flatten so those rows are actually ranked instead of ignored.
    sections = {key: _flatten_section(value) for key, value in sections.items()}

    best_doc = get_best_doc_from_sections(query_text, model_embed, sections)['best_doc']
    context = str(best_doc) + " " + snippet
    print(context)
    return context


def _context_from_index(query_text, title):
    """Build the answer context from the local FAISS index.

    Returns the best passage text, or ``None`` if the index yields nothing.
    """
    query_vec = model_embed.encode([query_text])
    query_vec = query_vec / np.linalg.norm(query_vec, axis=1, keepdims=True)
    _, neighbours = index.search(query_vec.astype(np.float32), k=10)

    wanted_title = title.lower().strip()
    candidates = []
    for idx in neighbours[0]:
        if idx < 0:
            # FAISS pads missing neighbours with -1; metadata[-1] would
            # silently pick the last entry.
            continue
        doc_text = metadata[idx]["text"]
        print(idx)
        print(doc_text[:100])

        title_doc = extract_thu_tuc(doc_text)
        if title_doc is not None and title_doc.lower().strip() == wanted_title:
            # An exact title match overrides every other candidate.
            candidates = [doc_text]
            break
        candidates.append(doc_text)

    if not candidates:
        return None

    # Re-rank with the cross-encoder; argmax keeps the earliest of equal scores.
    scores = rerank_model.predict([[query_text, doc_text] for doc_text in candidates])
    best_doc_text = candidates[int(np.argmax(scores))]
    print("After rerank: ", best_doc_text[:100])
    return best_doc_text


def _respond_with_gemini(message):
    """Answer ``message`` with web/index retrieval followed by Gemini."""
    from urllib.parse import parse_qs, urlparse

    found = search_keyword(message)
    # Check the link before unpacking, so an empty search is handled whether
    # it is reported as a 3-tuple or a 4-tuple of None.
    if not found or found[1] is None:
        return _NO_RESULT_MESSAGE
    query_text, link, title, snippet = found

    preferred_prefix = "https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html"
    # Parse the id from the query string so trailing parameters are not
    # swept into it.
    ma_thu_tuc = parse_qs(urlparse(link).query).get("ma_thu_tuc", [""])[0]

    if preferred_prefix in link and ma_thu_tuc:
        doc = _context_from_portal(query_text, ma_thu_tuc, snippet)
    else:
        doc = _context_from_index(query_text, title)
        if doc is None:
            return _NO_RESULT_MESSAGE

    answer = generate_mcq(message, doc)
    print(answer)
    return f"{answer}\n\n**Nguồn tham khảo:**\n1. {link}\n\n"


def _respond_with_local_model(message):
    """Answer ``message`` with the fine-tuned model, then polish via Gemini."""
    final_prompt = f"""Bạn là một trợ lý AI hỗ trợ tra cứu dịch vụ công.
## Câu hỏi:
{message}"""

    chat_text = tokenizer1.apply_chat_template(
        [{"role": "user", "content": final_prompt}],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    inputs = tokenizer1(chat_text, return_tensors="pt").to("cuda")
    outputs = model1.generate(
        **inputs,
        max_new_tokens=2048,
        temperature=0.7,
        top_p=0.8,
        do_sample=True,
    )

    # Decode only the newly generated tokens. Splitting the full transcript
    # on the word "assistant" would truncate answers that contain that word.
    prompt_length = inputs["input_ids"].shape[1]
    draft = tokenizer1.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    draft = draft.replace("<think>", "").replace("</think>", "").strip()

    response = generate(draft)
    print(response)
    return f"{response}"


def respond(message, system_message, model_name):
    """Produce the chatbot reply for ``message``.

    The message is lower-cased and "online"/"offline" are replaced by their
    Vietnamese equivalents before use.

    Args:
        message: The user's question.
        system_message: Unused; kept for interface compatibility.
        model_name: ``"Gemini"`` selects search + retrieval + Gemini; any
            other value selects the local fine-tuned model.

    Returns:
        The reply text. In Gemini mode it ends with a "Nguồn tham khảo"
        section naming the source link, or is a short notice when nothing
        relevant was found.
    """
    message = message.lower().replace("online", "trực tuyến").replace("offline", "dịch vụ bưu chính")
    if model_name == "Gemini":
        print("Gemini")
        return _respond_with_gemini(message)

    print("My Model")
    return _respond_with_local_model(message)
|
441 |
+
|
442 |
+
|
443 |
+
|
444 |
+
# Gradio UI: a chat window, a question box, a model selector and a send button.
with gr.Blocks() as demo:
    with gr.Row():
        chatbot = gr.Chatbot()

    with gr.Row():
        user_input = gr.Textbox(label="Nhập câu hỏi")
        model_choice = gr.Radio(
            choices=["Gemini", "My model"],
            value="Gemini",
            label="Chọn mô hình trả lời",
        )
        send_button = gr.Button("Gửi")

    # Conversation history, a list of [question, answer] pairs.
    state = gr.State([])

    def chat_wrapper(message, model_name, history):
        """Answer ``message`` and return the updated history twice.

        The same list is returned for the chatbot display and for the state.
        Blank input is ignored so that it does not trigger a search or a
        model call.
        """
        if not message or not message.strip():
            return history, history

        system_msg = ""
        response = respond(message, system_msg, model_name)
        # Build a new list instead of mutating the stored state in place.
        updated_history = history + [[message, response]]
        return updated_history, updated_history

    send_button.click(
        chat_wrapper,
        inputs=[user_input, model_choice, state],
        outputs=[chatbot, state],
    )

demo.launch(share=True)
|
faiss_index_bo_tp_cosine_full2.idx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:98114e3e804397389140aba2ff7de88e26a82559374ba5ae7615ac13100b3838
|
3 |
+
size 58070061
|
faiss_metadata_bo_tp_full2.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd7c3907ede6446635fd6893720ccda81787cc2b2c1c7b9978784baa87e420a8
|
3 |
+
size 65140375
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
unsloth
|
2 |
+
faiss-gpu-cu11
|
3 |
+
gradio
|
4 |
+
google-search-results
|