thailevann committed on
Commit
8aca85c
·
verified ·
1 Parent(s): 73abc75

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ faiss_index_bo_tp_cosine_full2.idx filter=lfs diff=lfs merge=lfs -text
37
+ faiss_metadata_bo_tp_full2.json filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import TextStreamer
4
+ import numpy as np
5
+ import faiss
6
+ import json
7
+ import re
8
+ from unsloth import FastLanguageModel
9
+ from serpapi import GoogleSearch
10
+ from sentence_transformers import CrossEncoder
11
+ from sentence_transformers import SentenceTransformer
12
# --- Global resources, loaded once at import time ---------------------------
# Fine-tuned QA model, loaded in 4-bit and switched to inference mode.
print("Load QA model")
new_model_name = "thailevann/Qwen3-4B_SFT_dvcqg_v4"
model1, tokenizer1 = FastLanguageModel.from_pretrained(
    model_name=new_model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model1)

# Sentence encoder used to embed queries and candidate passages.
print("Load embedding model")
model_embed = SentenceTransformer("huyydangg/DEk21_hcmute_embedding")

# Cross-encoder used to rerank the passages returned by the FAISS index.
print("Load rerank model")
rerank_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# FAISS index plus the metadata list that is looked up by FAISS row id.
index = faiss.read_index("./faiss_index_bo_tp_cosine_full2.idx")
with open("./faiss_metadata_bo_tp_full2.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)
31
+ import re
32
+ import numpy as np
33
+ from bs4 import BeautifulSoup
34
+ import requests
35
+ import json
36
+ from datasets import load_dataset
37
+ from google import genai
38
+ import json
39
+ import time
40
+
41
# Configure the Gemini API key.
# NOTE(review): a credential committed to a repository should be treated as
# leaked. Rotate it and provide the new value through the GEMINI_API_KEY
# environment variable; the literal below is kept only so existing deployments
# keep working until that happens.
import os

GEMINI_API_KEY = os.environ.get(
    "GEMINI_API_KEY", "AIzaSyDi-ZXE3qXfDdIjeDFS8Cw-GylXD4P4LsQ"
)
client_gemini = genai.Client(api_key=GEMINI_API_KEY)
44
+
45
+
46
# Build the prompt
def prompt1(question, relevant) -> str:
    """Return the prompt that pairs a user question with its supporting context.

    Args:
        question: The user's question, inserted under the question heading.
        relevant: The retrieved text, inserted under the related-information heading.

    Returns:
        The assembled prompt, starting and ending with a newline.
    """
    prompt_lines = (
        "",
        "Bạn là một trợ lý AI hỗ trợ tra cứu dịch vụ công dựa vào thông tin liên quan",
        "## Câu hỏi:",
        f"{question}",
        "",
        "## Thông tin liên quan:",
        f"{relevant}",
        "",
    )
    return "\n".join(prompt_lines)
56
+
57
# Gemini call helper
def generate_mcq(question, relevant):
    """Ask Gemini to answer ``question`` using ``relevant`` as supporting context.

    The prompt is built by ``prompt1`` and sent to the "gemini-2.0-flash"
    model through the module-level ``client_gemini``.

    Returns:
        The ``text`` attribute of the Gemini response.
    """
    gemini_reply = client_gemini.models.generate_content(
        contents=prompt1(question, relevant),
        model="gemini-2.0-flash",
    )
    return gemini_reply.text
67
def generate(q):
    """Ask Gemini to rewrite ``q`` in plainer language, falling back to ``q`` itself.

    This is a best-effort post-processing step: if the Gemini call raises, or
    the response carries no text, the original draft is returned unchanged so
    the caller always gets a usable string.

    Args:
        q: Draft answer to be rewritten.

    Returns:
        The rewritten text, or ``q`` when no rewrite is available.
    """
    prompt = (
        "\n"
        "Viết lại cho dễ hiểu. Chỉ trả lời, nếu bạn thấy có thông tin không hợp lí có thể sửa lại và không cần giải thích. Các link đều được quy về https://dichvucong.gov.vn/\n"
        "\n"
        f"{q}\n"
    )
    try:
        response = client_gemini.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt,
        )
    except Exception as e:
        # Deliberate best-effort fallback, but report the failure instead of
        # hiding it so quota/auth problems are visible in the logs.
        print(f"Gemini rewrite failed, returning the draft unchanged: {e}")
        return q

    print(q)
    # Guard against a response without text so callers never receive None.
    return response.text or q
82
+
83
def _normalise_header(header_cell):
    """Turn a header cell into a key: lower-case, spaces to '_', colons removed."""
    label = header_cell.get_text(strip=True).lower()
    return label.replace(" ", "_").replace(":", "")


def _cell_value(cell):
    """Return a cell's text, preferring the text of a nested ``span.link``."""
    link_span = cell.find("span", class_="link")
    if link_span:
        return link_span.get_text(strip=True)
    # Join the text fragments with newlines so adjacent pieces do not run together.
    return "\n".join(fragment.strip() for fragment in cell.stripped_strings)


def extract_table_to_kv(table_tag):
    """Convert an HTML table element into a list of ``{column: text}`` dicts.

    Column names come from every ``th`` in the table; cells beyond the known
    headers are keyed ``column_<index>``. Rows with fewer than two ``td``
    cells are skipped.

    Args:
        table_tag: Parsed table element exposing ``find_all``.

    Returns:
        One dict per retained row, in document order.
    """
    column_names = [_normalise_header(th) for th in table_tag.find_all("th")]
    known_columns = len(column_names)

    records = []
    for row in table_tag.find_all("tr"):
        data_cells = row.find_all("td")
        if len(data_cells) < 2:
            continue
        records.append({
            (column_names[pos] if pos < known_columns else f"column_{pos}"): _cell_value(cell)
            for pos, cell in enumerate(data_cells)
        })
    return records
113
+
114
def extract_section_by_heading(soup, heading_text):
    """Collect the content that follows the ``h2`` whose text contains ``heading_text``.

    Siblings after the heading are visited until the next ``h2``. A ``table``
    contributes the row list produced by ``extract_table_to_kv`` (appended as
    one nested list); any other tag contributes its space-joined text. Empty
    pieces are dropped.

    Args:
        soup: Parsed document exposing ``find``.
        heading_text: Substring to look for in the ``h2`` text.

    Returns:
        A list mixing strings and row lists, or ``None`` when the heading is
        missing or nothing was collected.
    """
    def _heading_matches(candidate):
        return candidate and heading_text in candidate

    heading = soup.find('h2', string=_heading_matches)
    if not heading:
        return None

    collected = []
    for node in heading.find_next_siblings():
        tag_name = node.name
        if tag_name == 'h2':
            break
        if tag_name == 'table':
            piece = extract_table_to_kv(node)
        elif tag_name:
            piece = node.get_text(" ", strip=True)
        else:
            continue
        if piece:
            collected.append(piece)
    return collected or None
132
+
133
+
134
def call_dichvucong_api(service: str, ma_thu_tuc: str, label: str = None, timeout: float = 30):
    """POST a query to https://dichvucong.gov.vn/jsp/rest.jsp and return the decoded JSON.

    Args:
        service: Name of the REST service to invoke, e.g.
            'procedure_get_requires_by_procedure_id_service_v2'.
        ma_thu_tuc: Administrative procedure ID (e.g. '5974'). It is sent as
            the ``id`` field and embedded in the Referer header.
        label: Optional prefix. When the decoded payload is a dict whose
            'result' entry is a string, that entry is rewritten to
            "<label>: <result>".
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the caller indefinitely.

    Returns:
        The decoded JSON payload on success, or ``{"error": "<message>"}``
        when the request fails or the body is not valid JSON.
    """
    url = "https://dichvucong.gov.vn/jsp/rest.jsp"

    # Browser-like headers copied from a real XHR request to the portal.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "en,vi;q=0.9,ja;q=0.8,de;q=0.7,en-US;q=0.6,fr;q=0.5",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://dichvucong.gov.vn",
        "Referer": f"https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html?ma_thu_tuc={ma_thu_tuc}",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
    }

    # The endpoint expects a single form field holding a JSON document.
    payload = {
        "params": json.dumps({
            "service": service,
            "provider": "dvcquocgia",
            "type": "ref",
            "id": ma_thu_tuc,
            "parent_id": "",
        })
    }

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        return {"error": str(e)}

    # Attach the label to the 'result' text when one is present.
    if label and isinstance(data, dict) and isinstance(data.get('result'), str):
        data['result'] = f"{label}: {data['result']}"

    return data
187
+
188
+
189
+ import numpy as np
190
+
191
def _section_texts(value):
    """Yield the text snippets held in a section value, descending into nested lists."""
    if isinstance(value, str):
        yield value
    elif isinstance(value, dict):
        # A table row: concatenate its cell values.
        yield " ".join(str(x) for x in value.values())
    elif isinstance(value, list):
        for item in value:
            yield from _section_texts(item)


def _unit_rows(matrix):
    """Scale each row to unit length; all-zero rows are left as zeros instead of NaN."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0, 1.0, norms)


def get_best_doc_from_sections(query_text, model_embed, sections_dict):
    """Pick the section snippet whose embedding is most similar to the query.

    Every string found in ``sections_dict`` becomes a candidate of the form
    "<key>: <text>"; dicts inside lists are rendered from their values. Lists
    are searched recursively, so row lists nested inside a section list are
    considered as well. Values of any other type (including ``None`` and a
    bare dict as a section value) are ignored.

    Args:
        query_text: The query to compare against.
        model_embed: Object whose ``encode(list_of_str)`` returns a 2-D array
            with one embedding per input string.
        sections_dict: Mapping of section key to a string, a (possibly
            nested) list of strings/dicts, or anything else to be skipped.

    Returns:
        A dict with 'best_section' (key or ``None``), 'best_doc' (candidate
        text or "") and 'similarity' (cosine similarity as a float, 0.0 when
        there are no candidates).
    """
    candidates = []
    for key, value in sections_dict.items():
        if isinstance(value, dict):
            # A bare mapping as a section value was never treated as text.
            continue
        candidates.extend((key, f"{key}: {text}") for text in _section_texts(value))

    if not candidates:
        return {
            "best_section": None,
            "best_doc": "",
            "similarity": 0.0
        }

    # Normalised embeddings make the dot product equal to cosine similarity.
    query_vec = _unit_rows(model_embed.encode([query_text]))
    doc_vecs = _unit_rows(model_embed.encode([text for _, text in candidates]))
    similarities = np.dot(doc_vecs, query_vec.T).flatten()

    best_idx = int(np.argmax(similarities))
    best_key, best_text = candidates[best_idx]

    return {
        "best_section": best_key,
        "best_doc": best_text,
        "similarity": float(similarities[best_idx])
    }
239
+
240
+ import requests
241
+
242
def get_thu_tuc_html(ma_thu_tuc: str, timeout: float = 30):
    """Download the administrative-procedure detail page from dichvucong.gov.vn.

    Args:
        ma_thu_tuc: Procedure ID (e.g. "5974"), placed in the ``ma_thu_tuc``
            query parameter of the detail-page URL.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the caller indefinitely.

    Returns:
        The page HTML on success, or a message starting with
        "Lỗi khi tải HTML: " when the request fails.
    """
    url = f"https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html?ma_thu_tuc={ma_thu_tuc}"

    # Browser-like headers copied from a real page navigation.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en,vi;q=0.9,ja;q=0.8,de;q=0.7,en-US;q=0.6,fr;q=0.5",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "sec-ch-ua": '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Kept as a string return so existing callers that feed the result
        # straight into an HTML parser keep working.
        return f"Lỗi khi tải HTML: {e}"
    return response.text
278
+
279
+
280
def extract_thu_tuc(doc_text):
    """Return the procedure name that follows the "**Tên thủ tục**:" marker.

    Args:
        doc_text: Document text to search.

    Returns:
        The rest of the line after the marker with surrounding whitespace
        removed, or ``None`` when the marker is absent.
    """
    found = re.search(r"\*\*Tên thủ tục\*\*:\s*(.+)", doc_text)
    return found.group(1).strip() if found else None
285
+
286
def get_preferred_link(results):
    """Choose the link to use from a list of search results.

    The first of the top five results whose 'link' starts with the
    dichvucong.gov.vn procedure-detail URL wins; otherwise the first result's
    link is used.

    Args:
        results: List of result dicts, each expected to carry a 'link' entry.

    Returns:
        The chosen link, or ``None`` when ``results`` is empty.
    """
    preferred_prefix = "https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html"
    top_links = (entry.get('link', '') for entry in results[:5])
    detail_link = next(
        (url for url in top_links if url.startswith(preferred_prefix)),
        None,
    )
    if detail_link is not None:
        return detail_link
    if not results:
        return None
    return results[0]['link']
296
+
297
def search_keyword(question):
    """Search Google (via SerpAPI) for ``question`` restricted to dichvucong.gov.vn.

    Args:
        question: The user's question.

    Returns:
        A 4-tuple ``(query_text, link, title, snippet)`` where ``query_text``
        is the question joined with the chosen result's title and snippet.
        When the search yields no organic results all four items are ``None``,
        so callers can always unpack four values.
    """
    import os

    # NOTE(review): a credential committed to a repository should be treated
    # as leaked. Rotate it and set SERPAPI_API_KEY instead; the literal is kept
    # only so existing deployments keep working until that happens.
    api_key = os.environ.get(
        "SERPAPI_API_KEY",
        "4d71a752a00145c83690f47006bc6a32c7bf717f08c0d0a997a064c7b381f226",
    )
    params = {
        "engine": "google",
        "q": f"site:dichvucong.gov.vn {question}",
        "api_key": api_key,
        "location": "Ho Chi Minh, Vietnam",
        "hl": "vi",
        "gl": "vn",
        "num": 5,  # fetch 5 results so the preferred link can be picked
    }

    search = GoogleSearch(params)
    results = search.get_dict()
    organic = results.get("organic_results", [])

    if not organic:
        # Same arity as the success path (the original returned only three).
        return None, None, None, None

    link = get_preferred_link(organic)
    result = next((r for r in organic if r.get("link") == link), organic[0])
    title = result.get("title", "")
    snippet = result.get("snippet", "")

    query_text = question + " " + title + " " + snippet
    # Combined text printed for debugging.
    print(query_text, link)

    return query_text, link, title, snippet
325
+
326
+
327
+
328
def _flatten_section(section):
    """Flatten one level of nesting in a scraped section; ``None`` stays ``None``."""
    if section is None:
        return None
    return [item for part in section for item in (part if isinstance(part, list) else [part])]


def _api_result_text(payload):
    """Return the string stored under 'result' in an API payload, else ``None``.

    NOTE(review): assumes the useful text of the REST response lives under
    'result' (the key ``call_dichvucong_api`` labels) - confirm against the API.
    """
    if isinstance(payload, dict) and isinstance(payload.get("result"), str):
        return payload["result"]
    return None


def _best_doc_from_portal(link, query_text, snippet):
    """Fetch the procedure page behind ``link`` and return its best-matching section."""
    # Keep only the ID even if further query parameters follow it.
    ma_thu_tuc = link.split("ma_thu_tuc=")[-1].split("&")[0]
    dieu_kien = call_dichvucong_api("procedure_get_requires_by_procedure_id_service_v2", ma_thu_tuc, "Yêu cầu, điều kiện thực hiện: ")
    trinh_tu_thuc_hien = call_dichvucong_api("procedure_get_impl_orders_by_proc_id_service_v2", ma_thu_tuc, "Trình tự thực hiện: ")
    soup = BeautifulSoup(get_thu_tuc_html(ma_thu_tuc), 'html.parser')

    # The ranker only reads strings and lists, so unwrap the API payloads and
    # flatten the table rows; otherwise both would be silently ignored.
    sections = {
        "dieu_kien": _api_result_text(dieu_kien),
        "trinh_tu_thuc_hien": _api_result_text(trinh_tu_thuc_hien),
    }
    scraped_headings = {
        "cach_thuc_thuc_hien": "Cách thức thực hiện",
        "thanh_phan_ho_so": "Thành phần hồ sơ",
        "can_cu_phap_ly": "Căn cứ pháp lý",
        "ket_qua_thuc_hien": "Kết quả thực hiện",
    }
    for key, heading in scraped_headings.items():
        sections[key] = _flatten_section(extract_section_by_heading(soup, heading))

    best_doc_text = str(get_best_doc_from_sections(query_text, model_embed, sections)['best_doc'])
    best_doc_text += " " + snippet
    print(best_doc_text)
    return best_doc_text


def _best_doc_from_index(query_text, title):
    """Return the best local document for ``query_text`` from the FAISS index.

    A document whose procedure name equals ``title`` (case-insensitive) is
    returned immediately; otherwise the retrieved candidates are reranked with
    the cross-encoder and the top one is returned ("" if nothing was retrieved).
    """
    query_vec = model_embed.encode([query_text])
    query_vec = query_vec / np.linalg.norm(query_vec, axis=1, keepdims=True)
    _, neighbours = index.search(query_vec.astype(np.float32), k=10)

    wanted_title = title.lower().strip()
    candidates = []
    for idx in neighbours[0]:
        if idx < 0:
            # FAISS pads missing neighbours with -1, which would otherwise
            # index the last metadata entry.
            continue
        doc_text = metadata[idx]["text"]
        print(idx)
        print(doc_text[:100])
        title_doc = extract_thu_tuc(doc_text)
        if wanted_title and title_doc is not None and title_doc.lower().strip() == wanted_title:
            # Exact title match: no need to rerank.
            return doc_text
        candidates.append(doc_text)

    if not candidates:
        return ""

    # Rerank with the cross-encoder and keep the highest-scoring document.
    scores = rerank_model.predict([[query_text, doc_text] for doc_text in candidates])
    best_doc_text = candidates[int(np.argmax(scores))]
    print("After rerank: ", best_doc_text[:100])
    return best_doc_text


def respond(message, system_message, model_name):
    """Answer a public-service question with the selected backend.

    Args:
        message: The user's question. It is lower-cased and "online" /
            "offline" are replaced with their Vietnamese equivalents.
        system_message: Accepted for interface compatibility; not used.
        model_name: "Gemini" runs web search + retrieval and asks Gemini to
            answer from the best document; any other value uses the local
            fine-tuned model and passes its draft through ``generate``.

    Returns:
        The answer text. On the Gemini path a reference section with the
        source link is appended when a link is available.
    """
    message = (
        message.lower()
        .replace("online", "trực tuyến")
        .replace("offline", "dịch vụ bưu chính")
    )

    if model_name == "Gemini":
        print("Gemini")
        found = search_keyword(message)
        # Pad to four items so a shorter "no results" tuple cannot break unpacking.
        query_text, link, title, snippet = (tuple(found) + (None,) * 4)[:4]
        query_text = query_text or message
        title = title or ""
        snippet = snippet or ""

        preferred_prefix = "https://dichvucong.gov.vn/p/home/dvc-tthc-thu-tuc-hanh-chinh-chi-tiet.html"
        if link and preferred_prefix in link:
            doc = _best_doc_from_portal(link, query_text, snippet)
        else:
            doc = _best_doc_from_index(query_text, title)

        answer = generate_mcq(message, doc)
        print(answer)
        if not link:
            # Nothing to cite when the web search returned no result.
            return f"{answer}"
        return f"{answer}\n\n**Nguồn tham khảo:**\n1. {link}\n\n"

    print("My Model")
    final_prompt = f"Bạn là một trợ lý AI hỗ trợ tra cứu dịch vụ công.\n## Câu hỏi:\n{message}"

    messages2 = [{"role": "user", "content": final_prompt}]
    text2 = tokenizer1.apply_chat_template(
        messages2,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    inputs2 = tokenizer1(text2, return_tensors="pt").to("cuda")
    outputs2 = model1.generate(
        **inputs2,
        max_new_tokens=2048,
        temperature=0.7,
        top_p=0.8,
        do_sample=True,
    )
    # Decode only the newly generated tokens. Splitting the full transcript on
    # the word "assistant" would truncate any answer that contains that word.
    prompt_length = inputs2["input_ids"].shape[1]
    draft = tokenizer1.decode(outputs2[0][prompt_length:], skip_special_tokens=True)
    draft = draft.replace("<think>", "").replace("</think>", "").strip()
    response = generate(draft)

    print(response)

    return f"{response}"
441
+
442
+
443
+
444
def chat_wrapper(message, model_name, history):
    """Run one chat turn and return the history for both the chatbot and the state.

    The ``[message, reply]`` pair is appended to ``history`` in place; the
    system message passed to ``respond`` is always empty.
    """
    reply = respond(message, "", model_name)
    history.append([message, reply])
    return history, history


# Chat UI: a transcript on top, then the question box, backend selector and
# send button; ``state`` carries the conversation between clicks.
with gr.Blocks() as demo:
    with gr.Row():
        chatbot = gr.Chatbot()

    with gr.Row():
        user_input = gr.Textbox(label="Nhập câu hỏi")
        model_choice = gr.Radio(
            choices=["Gemini", "My model"],
            value="Gemini",
            label="Chọn mô hình trả lời",
        )
        send_button = gr.Button("Gửi")

    state = gr.State([])

    send_button.click(
        fn=chat_wrapper,
        inputs=[user_input, model_choice, state],
        outputs=[chatbot, state],
    )

demo.launch(share=True)
faiss_index_bo_tp_cosine_full2.idx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98114e3e804397389140aba2ff7de88e26a82559374ba5ae7615ac13100b3838
3
+ size 58070061
faiss_metadata_bo_tp_full2.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7c3907ede6446635fd6893720ccda81787cc2b2c1c7b9978784baa87e420a8
3
+ size 65140375
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ unsloth
2
+ faiss-gpu-cu11
3
+ gradio
4
+ google-search-results