Javier Vera
committed on
Update rag_hf.py
Browse files
rag_hf.py
CHANGED
@@ -10,7 +10,7 @@ from rdflib import Graph as RDFGraph, Namespace
|
|
10 |
from sentence_transformers import SentenceTransformer
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
-
# ===
|
14 |
load_dotenv()
|
15 |
ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
|
16 |
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
|
@@ -18,37 +18,70 @@ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
|
|
18 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
19 |
EX = Namespace("http://example.org/lang/")
|
20 |
|
21 |
-
# ===
|
22 |
st.set_page_config(
|
23 |
-
page_title="
|
24 |
page_icon="π",
|
25 |
layout="wide",
|
26 |
-
initial_sidebar_state="expanded"
|
|
|
|
|
|
|
|
|
27 |
)
|
28 |
|
29 |
-
#
|
30 |
st.markdown("""
|
31 |
<style>
|
|
|
32 |
.header {
|
33 |
color: #2c3e50;
|
34 |
-
border-bottom: 2px solid #
|
35 |
-
padding-bottom:
|
36 |
margin-bottom: 1.5rem;
|
37 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
.response-card {
|
39 |
-
background-color:
|
40 |
border-radius: 8px;
|
41 |
padding: 1.5rem;
|
|
|
42 |
margin: 1rem 0;
|
43 |
-
border-left: 4px solid #3498db;
|
44 |
}
|
45 |
-
.language-
|
46 |
-
background-color:
|
47 |
border-radius: 8px;
|
48 |
padding: 1rem;
|
49 |
margin: 0.5rem 0;
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
.metric-badge {
|
53 |
display: inline-block;
|
54 |
background-color: #e8f4fc;
|
@@ -57,11 +90,19 @@ st.markdown("""
|
|
57 |
font-size: 0.85rem;
|
58 |
margin-right: 0.5rem;
|
59 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
</style>
|
61 |
""", unsafe_allow_html=True)
|
62 |
|
63 |
-
# ===
|
64 |
-
@st.cache_resource(show_spinner="Loading models and
|
65 |
def load_all_components():
|
66 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
67 |
methods = {}
|
@@ -79,9 +120,6 @@ def load_all_components():
|
|
79 |
methods[label] = (matrix, id_map, G, rdf)
|
80 |
return methods, embedder
|
81 |
|
82 |
-
methods, embedder = load_all_components()
|
83 |
-
|
84 |
-
# === ORIGINAL CORE FUNCTIONS (unchanged) ===
|
85 |
def get_top_k(matrix, id_map, query, k):
|
86 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
87 |
vec = vec.cpu().numpy().astype("float32")
|
@@ -143,44 +181,142 @@ Answer:
|
|
143 |
except Exception as e:
|
144 |
return str(e), ids, context, rdf_facts
|
145 |
|
146 |
-
# ===
|
147 |
def main():
|
|
|
|
|
|
|
|
|
148 |
st.markdown("""
|
149 |
<div class="header">
|
150 |
-
<h1
|
151 |
-
</div>
|
152 |
-
<div style="background-color: #e8f4fc; border-radius: 8px; padding: 1rem; margin-bottom: 1.5rem;">
|
153 |
-
<b>AI-Powered Analysis:</b> This app uses Mistral-7B-Instruct with RAG (Retrieval-Augmented Generation) to analyze indigenous languages.
|
154 |
</div>
|
155 |
""", unsafe_allow_html=True)
|
156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
with st.sidebar:
|
158 |
-
|
159 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
st.markdown("""
|
161 |
-
- **
|
162 |
-
- **
|
|
|
163 |
""")
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
-
if st.button("Analyze
|
|
|
|
|
|
|
|
|
174 |
col1, col2 = st.columns(2)
|
175 |
-
|
176 |
for col, (label, method) in zip([col1, col2], methods.items()):
|
177 |
with col:
|
178 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
179 |
start = datetime.datetime.now()
|
180 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
181 |
duration = (datetime.datetime.now() - start).total_seconds()
|
182 |
|
183 |
-
#
|
184 |
st.markdown(f"""
|
185 |
<div class="response-card">
|
186 |
{response}
|
@@ -191,19 +327,24 @@ def main():
|
|
191 |
</div>
|
192 |
""", unsafe_allow_html=True)
|
193 |
|
194 |
-
#
|
195 |
-
if show_ids:
|
196 |
-
with st.expander("Language IDs"):
|
197 |
-
st.code("\n".join(lang_ids))
|
198 |
-
|
199 |
if show_ctx:
|
200 |
-
with st.expander("Context
|
201 |
-
for ctx in context:
|
202 |
-
st.markdown(f"<div class='language-
|
203 |
|
204 |
if show_rdf:
|
205 |
-
with st.expander("RDF
|
206 |
st.code("\n".join(rdf_data))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
if __name__ == "__main__":
|
209 |
main()
|
|
|
10 |
from sentence_transformers import SentenceTransformer
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
+
# === CONFIGURATION ===
|
14 |
load_dotenv()
|
15 |
ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
|
16 |
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
|
|
|
18 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
19 |
EX = Namespace("http://example.org/lang/")
|
20 |
|
21 |
+
# === STREAMLIT UI CONFIG ===
|
22 |
st.set_page_config(
|
23 |
+
page_title="Language Atlas: South American Indigenous Languages",
|
24 |
page_icon="π",
|
25 |
layout="wide",
|
26 |
+
initial_sidebar_state="expanded",
|
27 |
+
menu_items={
|
28 |
+
'About': "## AI-powered analysis of endangered indigenous languages\n"
|
29 |
+
"Developed by Departamento Académico de Humanidades"
|
30 |
+
}
|
31 |
)
|
32 |
|
33 |
+
# === CUSTOM CSS ===
|
34 |
st.markdown("""
|
35 |
<style>
|
36 |
+
/* Main styles */
|
37 |
.header {
|
38 |
color: #2c3e50;
|
39 |
+
border-bottom: 2px solid #4f46e5;
|
40 |
+
padding-bottom: 0.5rem;
|
41 |
margin-bottom: 1.5rem;
|
42 |
}
|
43 |
+
.feature-card {
|
44 |
+
background-color: #f8fafc;
|
45 |
+
border-radius: 8px;
|
46 |
+
padding: 1rem;
|
47 |
+
margin: 0.5rem 0;
|
48 |
+
border-left: 3px solid #4f46e5;
|
49 |
+
}
|
50 |
.response-card {
|
51 |
+
background-color: white;
|
52 |
border-radius: 8px;
|
53 |
padding: 1.5rem;
|
54 |
+
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
55 |
margin: 1rem 0;
|
|
|
56 |
}
|
57 |
+
.language-card {
|
58 |
+
background-color: #f9fafb;
|
59 |
border-radius: 8px;
|
60 |
padding: 1rem;
|
61 |
margin: 0.5rem 0;
|
62 |
+
border: 1px solid #e5e7eb;
|
63 |
+
}
|
64 |
+
|
65 |
+
/* Sidebar styles */
|
66 |
+
.sidebar-section {
|
67 |
+
margin-bottom: 1.5rem;
|
68 |
}
|
69 |
+
.sidebar-title {
|
70 |
+
font-weight: 600;
|
71 |
+
color: #4f46e5;
|
72 |
+
}
|
73 |
+
.suggested-question {
|
74 |
+
padding: 0.5rem;
|
75 |
+
margin: 0.25rem 0;
|
76 |
+
border-radius: 4px;
|
77 |
+
cursor: pointer;
|
78 |
+
transition: all 0.2s;
|
79 |
+
}
|
80 |
+
.suggested-question:hover {
|
81 |
+
background-color: #f1f5f9;
|
82 |
+
}
|
83 |
+
|
84 |
+
/* Metrics and badges */
|
85 |
.metric-badge {
|
86 |
display: inline-block;
|
87 |
background-color: #e8f4fc;
|
|
|
90 |
font-size: 0.85rem;
|
91 |
margin-right: 0.5rem;
|
92 |
}
|
93 |
+
.tech-badge {
|
94 |
+
background-color: #ecfdf5;
|
95 |
+
color: #065f46;
|
96 |
+
padding: 0.25rem 0.5rem;
|
97 |
+
border-radius: 4px;
|
98 |
+
font-size: 0.75rem;
|
99 |
+
font-weight: 500;
|
100 |
+
}
|
101 |
</style>
|
102 |
""", unsafe_allow_html=True)
|
103 |
|
104 |
+
# === CORE FUNCTIONS ===
|
105 |
+
@st.cache_resource(show_spinner="Loading AI models and knowledge graphs...")
|
106 |
def load_all_components():
|
107 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
108 |
methods = {}
|
|
|
120 |
methods[label] = (matrix, id_map, G, rdf)
|
121 |
return methods, embedder
|
122 |
|
|
|
|
|
|
|
123 |
def get_top_k(matrix, id_map, query, k):
|
124 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
125 |
vec = vec.cpu().numpy().astype("float32")
|
|
|
181 |
except Exception as e:
|
182 |
return str(e), ids, context, rdf_facts
|
183 |
|
184 |
+
# === MAIN APP ===
|
185 |
def main():
|
186 |
+
# Load components
|
187 |
+
methods, embedder = load_all_components()
|
188 |
+
|
189 |
+
# Main header
|
190 |
st.markdown("""
|
191 |
<div class="header">
|
192 |
+
<h1>π Language Atlas: South American Indigenous Languages</h1>
|
|
|
|
|
|
|
193 |
</div>
|
194 |
""", unsafe_allow_html=True)
|
195 |
+
|
196 |
+
# Overview section
|
197 |
+
with st.expander("π Overview", expanded=True):
|
198 |
+
st.markdown("""
|
199 |
+
This app provides **AI-powered analysis** of endangered indigenous languages in South America,
|
200 |
+
integrating knowledge graphs from **Glottolog, Wikipedia, and Wikidata**.
|
201 |
+
""")
|
202 |
+
|
203 |
+
cols = st.columns(2)
|
204 |
+
with cols[0]:
|
205 |
+
st.markdown("""
|
206 |
+
<div class="feature-card">
|
207 |
+
<h4>πΉ Two AI Methods Available:</h4>
|
208 |
+
<ul>
|
209 |
+
<li><b>InfoMatch</b> (Node2Vec + Textual Data)</li>
|
210 |
+
<li><b>LinkGraph</b> (GraphSAGE + Structured Relations)</li>
|
211 |
+
</ul>
|
212 |
+
<p>πΉ <b>Powered by Mistral-7B</b> for contextual responses</p>
|
213 |
+
</div>
|
214 |
+
""", unsafe_allow_html=True)
|
215 |
+
|
216 |
+
with cols[1]:
|
217 |
+
st.markdown("""
|
218 |
+
<div class="feature-card">
|
219 |
+
<h4>π οΈ Features</h4>
|
220 |
+
<ul>
|
221 |
+
<li>✅ Multisource Knowledge Graph</li>
|
222 |
+
<li>✅ Hybrid AI Analysis</li>
|
223 |
+
<li>✅ Comparative Results</li>
|
224 |
+
<li>✅ Structured & Unstructured Data</li>
|
225 |
+
</ul>
|
226 |
+
</div>
|
227 |
+
""", unsafe_allow_html=True)
|
228 |
+
|
229 |
+
# Sidebar
|
230 |
with st.sidebar:
|
231 |
+
# Logo and academic info
|
232 |
+
st.image("626af4b6fb5b8b79dac12078_logo-dpto-humanidades.png", width=180)
|
233 |
+
st.markdown("### Departamento Académico de Humanidades")
|
234 |
+
st.markdown("---")
|
235 |
+
|
236 |
+
# Quick start guide
|
237 |
+
st.markdown("### π Quick Start")
|
238 |
+
st.markdown("""
|
239 |
+
1. **Type a question** in the input box
|
240 |
+
2. **Click 'Analyze'** to compare methods
|
241 |
+
3. **Explore results** with expandable details
|
242 |
+
""")
|
243 |
+
|
244 |
+
st.markdown("---")
|
245 |
+
|
246 |
+
# Suggested questions
|
247 |
+
st.markdown("### π Example Queries")
|
248 |
+
questions = [
|
249 |
+
"What languages are endangered in Brazil?",
|
250 |
+
"How many speakers does Aymara have?",
|
251 |
+
"Which languages are related to Quechua?",
|
252 |
+
"Where is Mapudungun spoken?"
|
253 |
+
]
|
254 |
+
|
255 |
+
for q in questions:
|
256 |
+
if st.markdown(f"<div class='suggested-question'>{q}</div>", unsafe_allow_html=True):
|
257 |
+
st.session_state.query = q
|
258 |
+
|
259 |
+
st.markdown("---")
|
260 |
+
|
261 |
+
# Technical details
|
262 |
+
st.markdown("### βοΈ Technical Details")
|
263 |
+
st.markdown("""
|
264 |
+
- <span class="tech-badge">Embeddings</span> Node2Vec vs. GraphSAGE
|
265 |
+
- <span class="tech-badge">Language Model</span> Mistral-7B-Instruct
|
266 |
+
- <span class="tech-badge">Knowledge Graph</span> RDF-based integration
|
267 |
+
""", unsafe_allow_html=True)
|
268 |
+
|
269 |
+
st.markdown("---")
|
270 |
+
|
271 |
+
# Data sources
|
272 |
+
st.markdown("### π Data Sources")
|
273 |
st.markdown("""
|
274 |
+
- **Glottolog** (Language classification)
|
275 |
+
- **Wikipedia** (Textual summaries)
|
276 |
+
- **Wikidata** (Structured facts)
|
277 |
""")
|
278 |
|
279 |
+
st.markdown("---")
|
280 |
+
|
281 |
+
# Analysis parameters
|
282 |
+
st.markdown("### π Analysis Parameters")
|
283 |
+
k = st.slider("Number of languages to analyze", 1, 10, 3)
|
284 |
+
st.markdown("---")
|
285 |
+
|
286 |
+
# Debug options
|
287 |
+
st.markdown("### π§ Advanced Options")
|
288 |
+
show_ctx = st.checkbox("Show context information", False)
|
289 |
+
show_rdf = st.checkbox("Show structured facts", False)
|
290 |
|
291 |
+
# Main query interface
|
292 |
+
st.markdown("### π Ask About Indigenous Languages")
|
293 |
+
query = st.text_input(
|
294 |
+
"Enter your question:",
|
295 |
+
value=st.session_state.get("query", ""),
|
296 |
+
label_visibility="collapsed",
|
297 |
+
placeholder="e.g. What languages are spoken in Peru?"
|
298 |
+
)
|
299 |
|
300 |
+
if st.button("Analyze", type="primary", use_container_width=True):
|
301 |
+
if not query:
|
302 |
+
st.warning("Please enter a question")
|
303 |
+
return
|
304 |
+
|
305 |
col1, col2 = st.columns(2)
|
306 |
+
|
307 |
for col, (label, method) in zip([col1, col2], methods.items()):
|
308 |
with col:
|
309 |
+
st.markdown(f"#### {label} Method")
|
310 |
+
st.caption({
|
311 |
+
"InfoMatch": "Node2Vec embeddings combining text and graph structure",
|
312 |
+
"LinkGraph": "GraphSAGE embeddings capturing network patterns"
|
313 |
+
}[label])
|
314 |
+
|
315 |
start = datetime.datetime.now()
|
316 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
317 |
duration = (datetime.datetime.now() - start).total_seconds()
|
318 |
|
319 |
+
# Response display
|
320 |
st.markdown(f"""
|
321 |
<div class="response-card">
|
322 |
{response}
|
|
|
327 |
</div>
|
328 |
""", unsafe_allow_html=True)
|
329 |
|
330 |
+
# Additional information
|
|
|
|
|
|
|
|
|
331 |
if show_ctx:
|
332 |
+
with st.expander(f"π Context from {len(lang_ids)} languages"):
|
333 |
+
for lang_id, ctx in zip(lang_ids, context):
|
334 |
+
st.markdown(f"<div class='language-card'>{ctx}</div>", unsafe_allow_html=True)
|
335 |
|
336 |
if show_rdf:
|
337 |
+
with st.expander("π Structured facts (RDF)"):
|
338 |
st.code("\n".join(rdf_data))
|
339 |
+
|
340 |
+
# Footer note
|
341 |
+
st.markdown("---")
|
342 |
+
st.markdown("""
|
343 |
+
<div style="font-size: 0.8rem; color: #64748b; text-align: center;">
|
344 |
+
<b>π Note:</b> This tool is designed for researchers, linguists, and cultural preservationists.
|
345 |
+
For best results, use specific questions about languages, families, or regions.
|
346 |
+
</div>
|
347 |
+
""", unsafe_allow_html=True)
|
348 |
|
349 |
if __name__ == "__main__":
|
350 |
main()
|