javiervz committed on
Commit
0640991
·
verified ·
1 Parent(s): 842db18

Update rag_hf.py

Browse files
Files changed (1) hide show
  1. rag_hf.py +138 -122
rag_hf.py CHANGED
@@ -20,13 +20,13 @@ EX = Namespace("http://example.org/lang/")
20
 
21
  # === STREAMLIT UI CONFIG ===
22
# Page-level Streamlit settings: browser tab title/icon, wide layout, an
# initially expanded sidebar, and the Markdown text of the "About" menu entry.
st.set_page_config(
    page_title="Language Atlas: South American Indigenous Languages",
    page_icon="🌍",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        "About": (
            "## AI-powered analysis of endangered indigenous languages\n"
            "Developed by Departamento Académico de Humanidades"
        ),
    },
)
32
 
@@ -100,22 +100,20 @@ st.markdown("""
100
  """, unsafe_allow_html=True)
101
 
102
  # === CORE FUNCTIONS ===
103
@st.cache_resource(show_spinner="Loading AI models and knowledge graphs...")
def load_all_components():
    """Load the sentence embedder and the artifacts of both retrieval methods.

    The result is wrapped by ``st.cache_resource`` (see the decorator).

    Returns:
        tuple: ``(methods, embedder)`` where ``methods`` maps a method label
        (``"InfoMatch"`` or ``"LinkGraph"``) to the tuple
        ``(matrix, id_map, G, rdf)`` and ``embedder`` is the
        ``SentenceTransformer`` built from ``EMBEDDING_MODEL`` on ``DEVICE``.
    """
    embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)

    # One row per method: (label, file-name suffix, Turtle file, matrix file).
    method_specs = (
        ("InfoMatch", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
        ("LinkGraph", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy"),
    )

    methods = {}
    for label, suffix, ttl, matrix_path in method_specs:
        # NOTE(review): pickle.load can execute code embedded in the file;
        # presumably these artifacts ship with the app -- confirm they are trusted.
        with open(f"id_map{suffix}.pkl", "rb") as id_map_file:
            id_map = pickle.load(id_map_file)
        with open(f"grafo_embed{suffix}.pickle", "rb") as graph_file:
            G = pickle.load(graph_file)

        matrix = np.load(matrix_path)

        rdf = RDFGraph()
        rdf.parse(ttl, format="ttl")

        methods[label] = (matrix, id_map, G, rdf)

    return methods, embedder
120
 
121
  def get_top_k(matrix, id_map, query, k, embedder):
@@ -127,13 +125,13 @@ def get_top_k(matrix, id_map, query, k, embedder):
127
 
128
def get_context(G, lang_id):
    """Build a Markdown summary for one language node.

    Args:
        G: Object whose ``nodes`` attribute supports ``.get(lang_id, default)``
            and yields a mapping of node attributes.
        lang_id: Identifier of the node to describe; also used as the display
            name when the node has no ``label`` attribute.

    Returns:
        str: The language line followed by the Wikipedia, Wikidata and
        Countries lines for every attribute that is present and truthy,
        separated by blank lines.
    """
    info = G.nodes.get(lang_id, {})
    summary = info.get("wikipedia_summary")
    description = info.get("wikidata_description")
    countries = info.get("wikidata_countries")

    sections = [f"**Language:** {info.get('label', lang_id)}"]
    if summary:
        sections.append(f"**Wikipedia:** {summary}")
    if description:
        sections.append(f"**Wikidata:** {description}")
    if countries:
        sections.append(f"**Countries:** {countries}")
    return "\n\n".join(sections)
138
 
139
  def query_rdf(rdf, lang_id):
@@ -152,32 +150,56 @@ def generate_response(matrix, id_map, G, rdf, user_question, k, embedder):
152
  rdf_facts = []
153
  for i in ids:
154
  rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
155
- prompt = f"""<s>[INST]
156
- You are an expert in South American indigenous languages.
157
- Use strictly and only the information below to answer the user question in **English**.
 
 
 
 
 
 
 
 
 
 
158
  - Do not infer or assume facts that are not explicitly stated.
159
  - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
160
  - Limit your answer to 100 words.
161
- ### CONTEXT:
162
- {chr(10).join(context)}
163
- ### RDF RELATIONS:
164
- {chr(10).join(rdf_facts)}
165
- ### QUESTION:
166
- {user_question}
167
- Answer:
168
- [/INST]"""
169
  try:
170
- res = requests.post(
 
 
 
 
 
 
 
 
 
 
 
171
  ENDPOINT_URL,
172
  headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
173
- json={"inputs": prompt}, timeout=60
174
  )
175
- out = res.json()
176
- if isinstance(out, list) and "generated_text" in out[0]:
177
- return out[0]["generated_text"].replace(prompt.strip(), "").strip(), ids, context, rdf_facts
178
- return str(out), ids, context, rdf_facts
 
 
 
179
  except Exception as e:
180
- return str(e), ids, context, rdf_facts
181
 
182
  # === MAIN APP ===
183
  def main():
@@ -185,124 +207,118 @@ def main():
185
 
186
  st.markdown("""
187
  <div class="header">
188
- <h1>🌍 Language Atlas: South American Indigenous Languages</h1>
189
  </div>
190
  """, unsafe_allow_html=True)
191
-
192
- with st.expander("📌 **Overview**", expanded=True):
193
  st.markdown("""
194
- This app provides **AI-powered analysis** of endangered indigenous languages in South America,
195
- integrating knowledge graphs from **Glottolog, Wikipedia, and Wikidata**.
196
- \n\n*This is version 1 and currently English-only. Spanish version coming soon!*
197
  """)
198
-
 
199
  with st.sidebar:
200
- st.markdown("### 📚 Pontificia Universidad Católica del Perú")
201
  st.markdown("""
202
- - <span class="tech-badge">Departamento de Humanidades</span>
203
- - <span class="tech-badge">jveraz@pucp.edu.pe</span>
204
- - <span class="tech-badge">Suggestions? Contact us</span>
205
  """, unsafe_allow_html=True)
206
  st.markdown("---")
207
- st.markdown("### 🚀 Quick Start")
208
  st.markdown("""
209
- 1. **Type a question** in the input box
210
- 2. **Click 'Analyze'** to compare methods
211
- 3. **Explore results** with expandable details
212
  """)
213
-
214
  st.markdown("---")
215
- st.markdown("### 🔍 Example Queries")
216
  questions = [
217
- "What languages are endangered in Brazil?",
218
- "What languages are spoken in Perú?",
219
- "Which languages are related to Quechua?",
220
- "Where is Mapudungun spoken?"
221
  ]
222
-
223
  for q in questions:
224
- if st.markdown(f"<div class='suggested-question'>{q}</div>", unsafe_allow_html=True):
225
- st.session_state.query = q
226
-
227
  st.markdown("---")
228
- st.markdown("### ⚙️ Technical Details")
229
  st.markdown("""
230
- - <span class="tech-badge">Embeddings</span> Node2Vec vs. GraphSAGE
231
- - <span class="tech-badge">Language Model</span> Mistral-7B-Instruct
232
- - <span class="tech-badge">Knowledge Graph</span> RDF-based integration
233
  """, unsafe_allow_html=True)
234
-
235
  st.markdown("---")
236
- st.markdown("### 📂 Data Sources")
237
  st.markdown("""
238
- - **Glottolog** (Language classification)
239
- - **Wikipedia** (Textual summaries)
240
- - **Wikidata** (Structured facts)
241
  """)
242
-
243
  st.markdown("---")
244
- st.markdown("### 📊 Analysis Parameters")
245
- k = st.slider("Number of languages to analyze", 1, 10, 3)
246
  st.markdown("---")
247
- st.markdown("### 🔧 Advanced Options")
248
- show_ctx = st.checkbox("Show context information", False)
249
- show_rdf = st.checkbox("Show structured facts", False)
250
 
251
- st.markdown("### 📝 Ask About Indigenous Languages")
 
252
  query = st.text_input(
253
- "Enter your question:",
254
  value=st.session_state.get("query", ""),
255
  label_visibility="collapsed",
256
- placeholder="e.g. What languages are spoken in Peru?"
257
  )
258
 
259
- if st.button("Analyze", type="primary", use_container_width=True):
260
  if not query:
261
- st.warning("Please enter a question")
262
  return
263
-
264
- col1, col2 = st.columns(2)
265
-
266
- for col, (label, method) in zip([col1, col2], methods.items()):
267
- with col:
268
- st.markdown(f"#### {label} Method")
269
- st.caption({
270
- "InfoMatch": "Node2Vec embeddings combining text and graph structure",
271
- "LinkGraph": "GraphSAGE embeddings capturing network patterns"
272
- }[label])
273
-
274
- start = datetime.datetime.now()
275
- response, lang_ids, context, rdf_data = generate_response(*method, query, k, embedder)
276
- duration = (datetime.datetime.now() - start).total_seconds()
277
-
278
- st.markdown(f"""
279
- <div class="response-card">
280
- {response}
281
- <div style="margin-top: 1rem;">
282
- <span class="metric-badge">⏱️ {duration:.2f}s</span>
283
- <span class="metric-badge">🌐 {len(lang_ids)} languages</span>
284
- </div>
285
- </div>
286
- """, unsafe_allow_html=True)
287
-
288
- if show_ctx:
289
- with st.expander(f"📖 Context from {len(lang_ids)} languages"):
290
- for lang_id, ctx in zip(lang_ids, context):
291
- st.markdown(f"<div class='language-card'>{ctx}</div>", unsafe_allow_html=True)
292
-
293
- if show_rdf:
294
- with st.expander("🔗 Structured facts (RDF)"):
295
- st.code("\n".join(rdf_data))
296
-
297
- st.markdown("---")
298
- st.markdown("""
299
- <div style="font-size: 0.8rem; color: #64748b; text-align: center;">
300
- <b>📌 Note:</b> This tool is designed for researchers, linguists, and cultural preservationists.
301
- For best results, use specific questions about languages, families, or regions.
302
  </div>
303
  """, unsafe_allow_html=True)
304
 
305
- if __name__ == "__main__":
306
- main()
 
 
307
 
 
 
 
308
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # === STREAMLIT UI CONFIG ===
22
# Page-level Streamlit settings: browser tab title/icon, wide layout, an
# initially expanded sidebar, and the Markdown text of the "About" menu entry.
st.set_page_config(
    page_title="Atlas de Lenguas: Lenguas Indígenas Sudamericanas",
    page_icon="🌍",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        "About": (
            "## Análisis con IA de lenguas indígenas en peligro\n"
            "Esta aplicación integra grafos de conocimiento de Glottolog, Wikipedia y Wikidata."
        ),
    },
)
32
 
 
100
  """, unsafe_allow_html=True)
101
 
102
  # === CORE FUNCTIONS ===
103
@st.cache_resource(show_spinner="Cargando modelos de IA y grafos de conocimiento...")
def load_all_components():
    """Load the sentence embedder and the LinkGraph retrieval artifacts.

    The result is wrapped by ``st.cache_resource`` (see the decorator).

    Returns:
        tuple: ``(methods, embedder)`` where ``methods`` is a dict with the
        single key ``"LinkGraph"`` mapped to ``(matrix, id_map, G, rdf)`` and
        ``embedder`` is the ``SentenceTransformer`` built from
        ``EMBEDDING_MODEL`` on ``DEVICE``.
    """
    embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)

    # Only the LinkGraph method is loaded.
    label = "LinkGraph"
    suffix = "_hybrid_graphsage"
    ttl = "grafo_ttl_hibrido_graphsage.ttl"
    matrix_path = "embed_matrix_hybrid_graphsage.npy"

    # NOTE(review): pickle.load can execute code embedded in the file;
    # presumably these artifacts ship with the app -- confirm they are trusted.
    with open(f"id_map{suffix}.pkl", "rb") as id_map_file:
        id_map = pickle.load(id_map_file)
    with open(f"grafo_embed{suffix}.pickle", "rb") as graph_file:
        G = pickle.load(graph_file)

    matrix = np.load(matrix_path)

    rdf = RDFGraph()
    rdf.parse(ttl, format="ttl")

    methods = {label: (matrix, id_map, G, rdf)}
    return methods, embedder
118
 
119
  def get_top_k(matrix, id_map, query, k, embedder):
 
125
 
126
def get_context(G, lang_id):
    """Build a Markdown summary (Spanish labels) for one language node.

    Args:
        G: Object whose ``nodes`` attribute supports ``.get(lang_id, default)``
            and yields a mapping of node attributes.
        lang_id: Identifier of the node to describe; also used as the display
            name when the node has no ``label`` attribute.

    Returns:
        str: The "Lengua" line followed by the Wikipedia, Wikidata and
        "Países" lines for every attribute that is present and truthy,
        separated by blank lines.
    """
    info = G.nodes.get(lang_id, {})
    summary = info.get("wikipedia_summary")
    description = info.get("wikidata_description")
    countries = info.get("wikidata_countries")

    sections = [f"**Lengua:** {info.get('label', lang_id)}"]
    if summary:
        sections.append(f"**Wikipedia:** {summary}")
    if description:
        sections.append(f"**Wikidata:** {description}")
    if countries:
        sections.append(f"**Países:** {countries}")
    return "\n\n".join(sections)
136
 
137
  def query_rdf(rdf, lang_id):
 
150
  rdf_facts = []
151
  for i in ids:
152
  rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
153
+
154
+ # Prompt para generar respuesta en español
155
+ prompt_es = f"""<s>[INST] Eres un experto en lenguas indígenas sudamericanas. Utiliza estricta y únicamente la información a continuación para responder la pregunta del usuario en **español**.
156
+ - No infieras ni asumas hechos que no estén explícitamente establecidos.
157
+ - Si la respuesta es desconocida o insuficiente, di "No puedo responder con los datos disponibles."
158
+ - Limita tu respuesta a 100 palabras.
159
+ ### CONTEXTO: {chr(10).join(context)}
160
+ ### RELACIONES RDF: {chr(10).join(rdf_facts)}
161
+ ### PREGUNTA: {user_question}
162
+ Respuesta: [/INST]"""
163
+
164
+ # Prompt para generar respuesta en inglés
165
+ prompt_en = f"""<s>[INST] You are an expert in South American indigenous languages. Use strictly and only the information below to answer the user question in **English**.
166
  - Do not infer or assume facts that are not explicitly stated.
167
  - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
168
  - Limit your answer to 100 words.
169
+ ### CONTEXT: {chr(10).join(context)}
170
+ ### RDF RELATIONS: {chr(10).join(rdf_facts)}
171
+ ### QUESTION: {user_question}
172
+ Answer: [/INST]"""
173
+
174
+ response_es = "Error al generar respuesta en español."
175
+ response_en = "Error generating response in English."
176
+
177
  try:
178
+ # Generar respuesta en español
179
+ res_es = requests.post(
180
+ ENDPOINT_URL,
181
+ headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
182
+ json={"inputs": prompt_es}, timeout=60
183
+ )
184
+ out_es = res_es.json()
185
+ if isinstance(out_es, list) and "generated_text" in out_es[0]:
186
+ response_es = out_es[0]["generated_text"].replace(prompt_es.strip(), "").strip()
187
+
188
+ # Generar respuesta en inglés
189
+ res_en = requests.post(
190
  ENDPOINT_URL,
191
  headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
192
+ json={"inputs": prompt_en}, timeout=60
193
  )
194
+ out_en = res_en.json()
195
+ if isinstance(out_en, list) and "generated_text" in out_en[0]:
196
+ response_en = out_en[0]["generated_text"].replace(prompt_en.strip(), "").strip()
197
+
198
+ # Concatenar ambas respuestas
199
+ full_response = f"**Respuesta en español:**\n{response_es}\n\n**Answer in English:**\n{response_en}"
200
+ return full_response, ids, context, rdf_facts
201
  except Exception as e:
202
+ return f"Ocurrió un error al generar la respuesta: {str(e)}", ids, context, rdf_facts
203
 
204
  # === MAIN APP ===
205
  def main():
 
207
 
208
  st.markdown("""
209
  <div class="header">
210
+ <h1>🌍 Atlas de Lenguas: Lenguas Indígenas Sudamericanas</h1>
211
  </div>
212
  """, unsafe_allow_html=True)
213
+
214
+ with st.expander("📌 **Resumen General**", expanded=True):
215
  st.markdown("""
216
+ Esta aplicación ofrece **análisis impulsado por IA** de lenguas indígenas en peligro de extinción en América del Sur,
217
+ integrando grafos de conocimiento de **Glottolog, Wikipedia y Wikidata**.
 
218
  """)
219
+ st.markdown("*Puedes preguntar en **español o inglés**, y el modelo responderá en **ambos idiomas**.*")
220
+
221
  with st.sidebar:
222
+ st.markdown("### 📚 Información de Contacto")
223
  st.markdown("""
224
+ - <span class="tech-badge">Correo: jxvera@gmail.com</span>
 
 
225
  """, unsafe_allow_html=True)
226
  st.markdown("---")
227
+ st.markdown("### 🚀 Inicio Rápido")
228
  st.markdown("""
229
+ 1. **Escribe una pregunta** en el cuadro de entrada
230
+ 2. **Haz clic en 'Analizar'** para obtener la respuesta
231
+ 3. **Explora los resultados** con los detalles expandibles
232
  """)
233
+
234
  st.markdown("---")
235
+ st.markdown("### 🔍 Preguntas de Ejemplo")
236
  questions = [
237
+ "¿Qué idiomas están en peligro en Brasil? (What languages are endangered in Brazil?)",
238
+ "¿Qué idiomas se hablan en Perú? (What languages are spoken in Perú?)",
239
+ "¿Cuáles idiomas están relacionados con el Quechua? (Which languages are related to Quechua?)",
240
+ "¿Dónde se habla el Mapudungun? (Where is Mapudungun spoken?)"
241
  ]
242
+
243
  for q in questions:
244
+ if st.button(q, key=f"suggested_{q}", use_container_width=True):
245
+ st.session_state.query = q.split(" (")[0]
246
+
247
  st.markdown("---")
248
+ st.markdown("### ⚙️ Detalles Técnicos")
249
  st.markdown("""
250
+ - <span class="tech-badge">Embeddings</span> GraphSAGE
251
+ - <span class="tech-badge">Modelo de Lenguaje</span> Mistral-7B-Instruct
252
+ - <span class="tech-badge">Grafo de Conocimiento</span> Integración basada en RDF
253
  """, unsafe_allow_html=True)
254
+
255
  st.markdown("---")
256
+ st.markdown("### 📂 Fuentes de Datos")
257
  st.markdown("""
258
+ - **Glottolog** (Clasificación de idiomas)
259
+ - **Wikipedia** (Resúmenes textuales)
260
+ - **Wikidata** (Hechos estructurados)
261
  """)
262
+
263
  st.markdown("---")
264
+ st.markdown("### 📊 Parámetros de Análisis")
265
+ k = st.slider("Número de idiomas a analizar", 1, 10, 3)
266
  st.markdown("---")
267
+ st.markdown("### 🔧 Opciones Avanzadas")
268
+ show_ctx = st.checkbox("Mostrar información de contexto", False)
269
+ show_rdf = st.checkbox("Mostrar hechos estructurados", False)
270
 
271
+ st.markdown("### 📝 Haz una pregunta sobre lenguas indígenas")
272
+ st.markdown("*(Puedes preguntar en español o inglés, y el modelo responderá en **ambos idiomas**.)*")
273
  query = st.text_input(
274
+ "Ingresa tu pregunta:",
275
  value=st.session_state.get("query", ""),
276
  label_visibility="collapsed",
277
+ placeholder="Ej. ¿Qué lenguas se hablan en Perú?"
278
  )
279
 
280
+ if st.button("Analizar", type="primary", use_container_width=True):
281
  if not query:
282
+ st.warning("Por favor, ingresa una pregunta")
283
  return
284
+
285
+ label = "LinkGraph"
286
+ method = methods[label]
287
+
288
+ st.markdown(f"#### Método {label}")
289
+ st.caption("Embeddings de GraphSAGE que capturan patrones de red")
290
+
291
+ start = datetime.datetime.now()
292
+ # Llamamos a generate_response sin un código de idioma, ya que generará ambos
293
+ response, lang_ids, context, rdf_data = generate_response(*method, query, k, embedder)
294
+ duration = (datetime.datetime.now() - start).total_seconds()
295
+
296
+ st.markdown(f"""
297
+ <div class="response-card">
298
+ {response}
299
+ <div style="margin-top: 1rem;">
300
+ <span class="metric-badge">⏱️ {duration:.2f}s</span>
301
+ <span class="metric-badge">🌐 {len(lang_ids)} idiomas</span>
302
+ </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  </div>
304
  """, unsafe_allow_html=True)
305
 
306
+ if show_ctx:
307
+ with st.expander(f"📖 Contexto de {len(lang_ids)} idiomas"):
308
+ for lang_id, ctx in zip(lang_ids, context):
309
+ st.markdown(f"<div class='language-card'>{ctx}</div>", unsafe_allow_html=True)
310
 
311
+ if show_rdf:
312
+ with st.expander("🔗 Hechos estructurados (RDF)"):
313
+ st.code("\n".join(rdf_data))
314
 
315
+ st.markdown("---")
316
+ st.markdown("""
317
+ <div style="font-size: 0.8rem; color: #64748b; text-align: center;">
318
+ <b>📌 Nota:</b> Esta herramienta está diseñada para investigadores, lingüistas y preservacionistas culturales.
319
+ Para mejores resultados, usa preguntas específicas sobre idiomas, familias o regiones.
320
+ </div>
321
+ """, unsafe_allow_html=True)
322
+
323
+ if __name__ == "__main__":
324
+ main()