Javier Vera committed on
Commit
2d6d126
·
verified ·
1 Parent(s): 159fe5e

Update rag_hf.py

Browse files
Files changed (1) hide show
  1. rag_hf.py +187 -46
rag_hf.py CHANGED
@@ -10,7 +10,7 @@ from rdflib import Graph as RDFGraph, Namespace
10
  from sentence_transformers import SentenceTransformer
11
  from dotenv import load_dotenv
12
 
13
- # === ORIGINAL CONFIGURATION (unchanged) ===
14
  load_dotenv()
15
  ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
16
  HF_API_TOKEN = os.getenv("HF_API_TOKEN")
@@ -18,37 +18,70 @@ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
18
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
  EX = Namespace("http://example.org/lang/")
20
 
21
- # === IMPROVED UI SETUP ===
22
  st.set_page_config(
23
- page_title="Vanishing Voices: Language Atlas",
24
  page_icon="🌍",
25
  layout="wide",
26
- initial_sidebar_state="expanded"
 
 
 
 
27
  )
28
 
29
- # Professional CSS (visual only)
30
  st.markdown("""
31
  <style>
 
32
  .header {
33
  color: #2c3e50;
34
- border-bottom: 2px solid #3498db;
35
- padding-bottom: 10px;
36
  margin-bottom: 1.5rem;
37
  }
 
 
 
 
 
 
 
38
  .response-card {
39
- background-color: #f8f9fa;
40
  border-radius: 8px;
41
  padding: 1.5rem;
 
42
  margin: 1rem 0;
43
- border-left: 4px solid #3498db;
44
  }
45
- .language-info {
46
- background-color: white;
47
  border-radius: 8px;
48
  padding: 1rem;
49
  margin: 0.5rem 0;
50
- box-shadow: 0 1px 3px rgba(0,0,0,0.1);
 
 
 
 
 
51
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  .metric-badge {
53
  display: inline-block;
54
  background-color: #e8f4fc;
@@ -57,11 +90,19 @@ st.markdown("""
57
  font-size: 0.85rem;
58
  margin-right: 0.5rem;
59
  }
 
 
 
 
 
 
 
 
60
  </style>
61
  """, unsafe_allow_html=True)
62
 
63
- # === ORIGINAL FUNCTIONALITY (unchanged) ===
64
- @st.cache_resource(show_spinner="Loading models and indexes...")
65
  def load_all_components():
66
  embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
67
  methods = {}
@@ -79,9 +120,6 @@ def load_all_components():
79
  methods[label] = (matrix, id_map, G, rdf)
80
  return methods, embedder
81
 
82
- methods, embedder = load_all_components()
83
-
84
- # === ORIGINAL CORE FUNCTIONS (unchanged) ===
85
  def get_top_k(matrix, id_map, query, k):
86
  vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
87
  vec = vec.cpu().numpy().astype("float32")
@@ -143,44 +181,142 @@ Answer:
143
  except Exception as e:
144
  return str(e), ids, context, rdf_facts
145
 
146
- # === IMPROVED MAIN FUNCTION (same functionality, better UI) ===
147
  def main():
 
 
 
 
148
  st.markdown("""
149
  <div class="header">
150
- <h1>Vanishing Voices: South America's Endangered Language Atlas</h1>
151
- </div>
152
- <div style="background-color: #e8f4fc; border-radius: 8px; padding: 1rem; margin-bottom: 1.5rem;">
153
- <b>AI-Powered Analysis:</b> This app uses Mistral-7B-Instruct with RAG (Retrieval-Augmented Generation) to analyze indigenous languages.
154
  </div>
155
  """, unsafe_allow_html=True)
156
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  with st.sidebar:
158
- st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
159
- st.markdown("### Analysis Methods")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  st.markdown("""
161
- - **InfoMatch**: Combines text embeddings with metadata
162
- - **LinkGraph**: Uses graph neural networks (GraphSAGE)
 
163
  """)
164
 
165
- # Original controls with same parameters
166
- k = st.slider("Languages to analyze", 1, 10, 3)
167
- show_ids = st.checkbox("Show Language IDs", True)
168
- show_ctx = st.checkbox("Show Context Info", True)
169
- show_rdf = st.checkbox("Show RDF Facts", False)
 
 
 
 
 
 
170
 
171
- query = st.text_input("Ask about South American languages:", "What languages are spoken in Perú?")
 
 
 
 
 
 
 
172
 
173
- if st.button("Analyze with AI"):
 
 
 
 
174
  col1, col2 = st.columns(2)
175
- results = {}
176
  for col, (label, method) in zip([col1, col2], methods.items()):
177
  with col:
178
- st.subheader(f"{label} Method")
 
 
 
 
 
179
  start = datetime.datetime.now()
180
  response, lang_ids, context, rdf_data = generate_response(*method, query, k)
181
  duration = (datetime.datetime.now() - start).total_seconds()
182
 
183
- # Improved response display
184
  st.markdown(f"""
185
  <div class="response-card">
186
  {response}
@@ -191,19 +327,24 @@ def main():
191
  </div>
192
  """, unsafe_allow_html=True)
193
 
194
- # Original debug info with better presentation
195
- if show_ids:
196
- with st.expander("Language IDs"):
197
- st.code("\n".join(lang_ids))
198
-
199
  if show_ctx:
200
- with st.expander("Context Information"):
201
- for ctx in context:
202
- st.markdown(f"<div class='language-info'>{ctx}</div>", unsafe_allow_html=True)
203
 
204
  if show_rdf:
205
- with st.expander("RDF Relations"):
206
  st.code("\n".join(rdf_data))
 
 
 
 
 
 
 
 
 
207
 
208
  if __name__ == "__main__":
209
  main()
 
10
  from sentence_transformers import SentenceTransformer
11
  from dotenv import load_dotenv
12
 
13
+ # === CONFIGURATION ===
14
  load_dotenv()
15
  ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
16
  HF_API_TOKEN = os.getenv("HF_API_TOKEN")
 
18
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
  EX = Namespace("http://example.org/lang/")
20
 
21
+ # === STREAMLIT UI CONFIG ===
22
  st.set_page_config(
23
+ page_title="Language Atlas: South American Indigenous Languages",
24
  page_icon="🌍",
25
  layout="wide",
26
+ initial_sidebar_state="expanded",
27
+ menu_items={
28
+ 'About': "## AI-powered analysis of endangered indigenous languages\n"
29
+ "Developed by Departamento Académico de Humanidades"
30
+ }
31
  )
32
 
33
+ # === CUSTOM CSS ===
34
  st.markdown("""
35
  <style>
36
+ /* Main styles */
37
  .header {
38
  color: #2c3e50;
39
+ border-bottom: 2px solid #4f46e5;
40
+ padding-bottom: 0.5rem;
41
  margin-bottom: 1.5rem;
42
  }
43
+ .feature-card {
44
+ background-color: #f8fafc;
45
+ border-radius: 8px;
46
+ padding: 1rem;
47
+ margin: 0.5rem 0;
48
+ border-left: 3px solid #4f46e5;
49
+ }
50
  .response-card {
51
+ background-color: white;
52
  border-radius: 8px;
53
  padding: 1.5rem;
54
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
55
  margin: 1rem 0;
 
56
  }
57
+ .language-card {
58
+ background-color: #f9fafb;
59
  border-radius: 8px;
60
  padding: 1rem;
61
  margin: 0.5rem 0;
62
+ border: 1px solid #e5e7eb;
63
+ }
64
+
65
+ /* Sidebar styles */
66
+ .sidebar-section {
67
+ margin-bottom: 1.5rem;
68
  }
69
+ .sidebar-title {
70
+ font-weight: 600;
71
+ color: #4f46e5;
72
+ }
73
+ .suggested-question {
74
+ padding: 0.5rem;
75
+ margin: 0.25rem 0;
76
+ border-radius: 4px;
77
+ cursor: pointer;
78
+ transition: all 0.2s;
79
+ }
80
+ .suggested-question:hover {
81
+ background-color: #f1f5f9;
82
+ }
83
+
84
+ /* Metrics and badges */
85
  .metric-badge {
86
  display: inline-block;
87
  background-color: #e8f4fc;
 
90
  font-size: 0.85rem;
91
  margin-right: 0.5rem;
92
  }
93
+ .tech-badge {
94
+ background-color: #ecfdf5;
95
+ color: #065f46;
96
+ padding: 0.25rem 0.5rem;
97
+ border-radius: 4px;
98
+ font-size: 0.75rem;
99
+ font-weight: 500;
100
+ }
101
  </style>
102
  """, unsafe_allow_html=True)
103
 
104
+ # === CORE FUNCTIONS ===
105
+ @st.cache_resource(show_spinner="Loading AI models and knowledge graphs...")
106
  def load_all_components():
107
  embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
108
  methods = {}
 
120
  methods[label] = (matrix, id_map, G, rdf)
121
  return methods, embedder
122
 
 
 
 
123
  def get_top_k(matrix, id_map, query, k):
124
  vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
125
  vec = vec.cpu().numpy().astype("float32")
 
181
  except Exception as e:
182
  return str(e), ids, context, rdf_facts
183
 
184
+ # === MAIN APP ===
185
  def main():
186
+ # Load components
187
+ methods, embedder = load_all_components()
188
+
189
+ # Main header
190
  st.markdown("""
191
  <div class="header">
192
+ <h1>🌍 Language Atlas: South American Indigenous Languages</h1>
 
 
 
193
  </div>
194
  """, unsafe_allow_html=True)
195
+
196
+ # Overview section
197
+ with st.expander("📌 Overview", expanded=True):
198
+ st.markdown("""
199
+ This app provides **AI-powered analysis** of endangered indigenous languages in South America,
200
+ integrating knowledge graphs from **Glottolog, Wikipedia, and Wikidata**.
201
+ """)
202
+
203
+ cols = st.columns(2)
204
+ with cols[0]:
205
+ st.markdown("""
206
+ <div class="feature-card">
207
+ <h4>🔹 Two AI Methods Available:</h4>
208
+ <ul>
209
+ <li><b>InfoMatch</b> (Node2Vec + Textual Data)</li>
210
+ <li><b>LinkGraph</b> (GraphSAGE + Structured Relations)</li>
211
+ </ul>
212
+ <p>🔹 <b>Powered by Mistral-7B</b> for contextual responses</p>
213
+ </div>
214
+ """, unsafe_allow_html=True)
215
+
216
+ with cols[1]:
217
+ st.markdown("""
218
+ <div class="feature-card">
219
+ <h4>🛠️ Features</h4>
220
+ <ul>
221
+ <li>✅ Multisource Knowledge Graph</li>
222
+ <li>✅ Hybrid AI Analysis</li>
223
+ <li>✅ Comparative Results</li>
224
+ <li>✅ Structured & Unstructured Data</li>
225
+ </ul>
226
+ </div>
227
+ """, unsafe_allow_html=True)
228
+
229
+ # Sidebar
230
  with st.sidebar:
231
+ # Logo and academic info
232
+ st.image("626af4b6fb5b8b79dac12078_logo-dpto-humanidades.png", width=180)
233
+ st.markdown("### Departamento Académico de Humanidades")
234
+ st.markdown("---")
235
+
236
+ # Quick start guide
237
+ st.markdown("### 🚀 Quick Start")
238
+ st.markdown("""
239
+ 1. **Type a question** in the input box
240
+ 2. **Click 'Analyze'** to compare methods
241
+ 3. **Explore results** with expandable details
242
+ """)
243
+
244
+ st.markdown("---")
245
+
246
+ # Suggested questions
247
+ st.markdown("### 🔍 Example Queries")
248
+ questions = [
249
+ "What languages are endangered in Brazil?",
250
+ "How many speakers does Aymara have?",
251
+ "Which languages are related to Quechua?",
252
+ "Where is Mapudungun spoken?"
253
+ ]
254
+
255
+ for q in questions:
256
+ if st.markdown(f"<div class='suggested-question'>{q}</div>", unsafe_allow_html=True):
257
+ st.session_state.query = q
258
+
259
+ st.markdown("---")
260
+
261
+ # Technical details
262
+ st.markdown("### ⚙️ Technical Details")
263
+ st.markdown("""
264
+ - <span class="tech-badge">Embeddings</span> Node2Vec vs. GraphSAGE
265
+ - <span class="tech-badge">Language Model</span> Mistral-7B-Instruct
266
+ - <span class="tech-badge">Knowledge Graph</span> RDF-based integration
267
+ """, unsafe_allow_html=True)
268
+
269
+ st.markdown("---")
270
+
271
+ # Data sources
272
+ st.markdown("### 📂 Data Sources")
273
  st.markdown("""
274
+ - **Glottolog** (Language classification)
275
+ - **Wikipedia** (Textual summaries)
276
+ - **Wikidata** (Structured facts)
277
  """)
278
 
279
+ st.markdown("---")
280
+
281
+ # Analysis parameters
282
+ st.markdown("### 📊 Analysis Parameters")
283
+ k = st.slider("Number of languages to analyze", 1, 10, 3)
284
+ st.markdown("---")
285
+
286
+ # Debug options
287
+ st.markdown("### 🔧 Advanced Options")
288
+ show_ctx = st.checkbox("Show context information", False)
289
+ show_rdf = st.checkbox("Show structured facts", False)
290
 
291
+ # Main query interface
292
+ st.markdown("### 📝 Ask About Indigenous Languages")
293
+ query = st.text_input(
294
+ "Enter your question:",
295
+ value=st.session_state.get("query", ""),
296
+ label_visibility="collapsed",
297
+ placeholder="e.g. What languages are spoken in Peru?"
298
+ )
299
 
300
+ if st.button("Analyze", type="primary", use_container_width=True):
301
+ if not query:
302
+ st.warning("Please enter a question")
303
+ return
304
+
305
  col1, col2 = st.columns(2)
306
+
307
  for col, (label, method) in zip([col1, col2], methods.items()):
308
  with col:
309
+ st.markdown(f"#### {label} Method")
310
+ st.caption({
311
+ "InfoMatch": "Node2Vec embeddings combining text and graph structure",
312
+ "LinkGraph": "GraphSAGE embeddings capturing network patterns"
313
+ }[label])
314
+
315
  start = datetime.datetime.now()
316
  response, lang_ids, context, rdf_data = generate_response(*method, query, k)
317
  duration = (datetime.datetime.now() - start).total_seconds()
318
 
319
+ # Response display
320
  st.markdown(f"""
321
  <div class="response-card">
322
  {response}
 
327
  </div>
328
  """, unsafe_allow_html=True)
329
 
330
+ # Additional information
 
 
 
 
331
  if show_ctx:
332
+ with st.expander(f"📖 Context from {len(lang_ids)} languages"):
333
+ for lang_id, ctx in zip(lang_ids, context):
334
+ st.markdown(f"<div class='language-card'>{ctx}</div>", unsafe_allow_html=True)
335
 
336
  if show_rdf:
337
+ with st.expander("🔗 Structured facts (RDF)"):
338
  st.code("\n".join(rdf_data))
339
+
340
+ # Footer note
341
+ st.markdown("---")
342
+ st.markdown("""
343
+ <div style="font-size: 0.8rem; color: #64748b; text-align: center;">
344
+ <b>📌 Note:</b> This tool is designed for researchers, linguists, and cultural preservationists.
345
+ For best results, use specific questions about languages, families, or regions.
346
+ </div>
347
+ """, unsafe_allow_html=True)
348
 
349
  if __name__ == "__main__":
350
  main()