Update rag_hf.py
Browse files
rag_hf.py
CHANGED
@@ -101,7 +101,7 @@ st.markdown("""
|
|
101 |
</style>
|
102 |
""", unsafe_allow_html=True)
|
103 |
|
104 |
-
# ===
|
105 |
@st.cache_resource(show_spinner="Cargando modelos de IA y grafos de conocimiento...")
|
106 |
def load_all_components():
|
107 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
@@ -117,6 +117,7 @@ def load_all_components():
|
|
117 |
methods[label] = (matrix, id_map, G, rdf)
|
118 |
return methods, embedder
|
119 |
|
|
|
120 |
def get_top_k(matrix, id_map, query, k, embedder):
|
121 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
122 |
vec = vec.cpu().numpy().astype("float32")
|
@@ -145,29 +146,9 @@ def query_rdf(rdf, lang_id):
|
|
145 |
except Exception as e:
|
146 |
return [("error", str(e))]
|
147 |
|
148 |
-
def query_llm(prompt):
|
149 |
-
try:
|
150 |
-
res = requests.post(
|
151 |
-
ENDPOINT_URL,
|
152 |
-
headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
|
153 |
-
json={"inputs": prompt}, timeout=30
|
154 |
-
)
|
155 |
-
res.raise_for_status()
|
156 |
-
out = res.json()
|
157 |
-
if isinstance(out, list):
|
158 |
-
if len(out) > 0 and isinstance(out[0], dict) and "generated_text" in out[0]:
|
159 |
-
return out[0]["generated_text"].strip()
|
160 |
-
elif isinstance(out, dict) and "generated_text" in out:
|
161 |
-
return out["generated_text"].strip()
|
162 |
-
elif isinstance(out, dict) and "text" in out:
|
163 |
-
return out["text"].strip()
|
164 |
-
return "Sin respuesta del modelo."
|
165 |
-
except Exception as e:
|
166 |
-
return f"Error al consultar el modelo: {str(e)}"
|
167 |
-
|
168 |
# === PROMPT PARA MODELO MISTRAL ===
|
169 |
-
def generate_response(matrix, id_map, G, rdf, user_question, k
|
170 |
-
ids = get_top_k(matrix, id_map, user_question, k)
|
171 |
context = [get_context(G, i) for i in ids]
|
172 |
rdf_facts = []
|
173 |
for i in ids:
|
@@ -233,21 +214,27 @@ Answer:
|
|
233 |
except Exception as e:
|
234 |
return f"Error al consultar el modelo: {str(e)}", ids, context, rdf_facts
|
235 |
|
236 |
-
# ===
|
237 |
def main():
|
238 |
methods, embedder = load_all_components()
|
|
|
239 |
st.markdown("""
|
240 |
<div class="header">
|
241 |
<h1>🌍 Atlas de Lenguas: Lenguas Indígenas Sudamericanas</h1>
|
242 |
</div>
|
243 |
""", unsafe_allow_html=True)
|
244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
with st.sidebar:
|
246 |
st.markdown("### 📚 Información de Contacto")
|
247 |
st.markdown("""
|
248 |
- <span class="tech-badge">Correo: jxvera@gmail.com</span>
|
249 |
""", unsafe_allow_html=True)
|
250 |
-
|
251 |
st.markdown("---")
|
252 |
st.markdown("### 🚀 Inicio Rápido")
|
253 |
st.markdown("""
|
@@ -264,20 +251,37 @@ def main():
|
|
264 |
"¿Cuáles idiomas están relacionados con el Quechua?",
|
265 |
"¿Dónde se habla el Mapudungun?"
|
266 |
]
|
|
|
267 |
for q in questions:
|
268 |
if st.button(q, key=f"suggested_{q}", use_container_width=True):
|
269 |
st.session_state.query = q
|
270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
st.markdown("---")
|
272 |
st.markdown("### 📊 Parámetros de Análisis")
|
273 |
k = st.slider("Número de idiomas a analizar", 1, 10, 3)
|
274 |
-
|
275 |
st.markdown("---")
|
276 |
st.markdown("### 🔧 Opciones Avanzadas")
|
277 |
show_ctx = st.checkbox("Mostrar información de contexto", False)
|
278 |
show_rdf = st.checkbox("Mostrar hechos estructurados", False)
|
279 |
|
280 |
st.markdown("### 📝 Haz una pregunta sobre lenguas indígenas")
|
|
|
281 |
query = st.text_input(
|
282 |
"Ingresa tu pregunta:",
|
283 |
value=st.session_state.get("query", ""),
|
@@ -289,11 +293,14 @@ def main():
|
|
289 |
if not query:
|
290 |
st.warning("Por favor, ingresa una pregunta")
|
291 |
return
|
|
|
292 |
label = "LinkGraph"
|
293 |
method = methods[label]
|
|
|
294 |
start = datetime.datetime.now()
|
295 |
-
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
296 |
duration = (datetime.datetime.now() - start).total_seconds()
|
|
|
297 |
st.markdown(f"""
|
298 |
<div class="response-card">
|
299 |
{response}
|
@@ -317,7 +324,7 @@ def main():
|
|
317 |
st.markdown("""
|
318 |
<div style="font-size: 0.8rem; color: #64748b; text-align: center;">
|
319 |
<b>📌 Nota:</b> Esta herramienta está diseñada para investigadores, lingüistas y preservacionistas culturales.
|
320 |
-
|
321 |
</div>
|
322 |
""", unsafe_allow_html=True)
|
323 |
|
|
|
101 |
</style>
|
102 |
""", unsafe_allow_html=True)
|
103 |
|
104 |
+
# === CARGA COMPONENTES ===
|
105 |
@st.cache_resource(show_spinner="Cargando modelos de IA y grafos de conocimiento...")
|
106 |
def load_all_components():
|
107 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
|
|
117 |
methods[label] = (matrix, id_map, G, rdf)
|
118 |
return methods, embedder
|
119 |
|
120 |
+
# === FUNCIONES BASE ===
|
121 |
def get_top_k(matrix, id_map, query, k, embedder):
|
122 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
123 |
vec = vec.cpu().numpy().astype("float32")
|
|
|
146 |
except Exception as e:
|
147 |
return [("error", str(e))]
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
# === PROMPT PARA MODELO MISTRAL ===
|
150 |
+
def generate_response(matrix, id_map, G, rdf, user_question, k, embedder):
|
151 |
+
ids = get_top_k(matrix, id_map, user_question, k, embedder)
|
152 |
context = [get_context(G, i) for i in ids]
|
153 |
rdf_facts = []
|
154 |
for i in ids:
|
|
|
214 |
except Exception as e:
|
215 |
return f"Error al consultar el modelo: {str(e)}", ids, context, rdf_facts
|
216 |
|
217 |
+
# === MAIN ===
|
218 |
def main():
|
219 |
methods, embedder = load_all_components()
|
220 |
+
|
221 |
st.markdown("""
|
222 |
<div class="header">
|
223 |
<h1>🌍 Atlas de Lenguas: Lenguas Indígenas Sudamericanas</h1>
|
224 |
</div>
|
225 |
""", unsafe_allow_html=True)
|
226 |
|
227 |
+
with st.expander("📌 **Resumen General**", expanded=True):
|
228 |
+
st.markdown("""
|
229 |
+
Esta aplicación ofrece **análisis impulsado por IA, Grafos y RAGs (GraphRAGs)** de lenguas indígenas de América del Sur,
|
230 |
+
integrando información de **Glottolog, Wikipedia y Wikidata**.
|
231 |
+
""")
|
232 |
+
|
233 |
with st.sidebar:
|
234 |
st.markdown("### 📚 Información de Contacto")
|
235 |
st.markdown("""
|
236 |
- <span class="tech-badge">Correo: jxvera@gmail.com</span>
|
237 |
""", unsafe_allow_html=True)
|
|
|
238 |
st.markdown("---")
|
239 |
st.markdown("### 🚀 Inicio Rápido")
|
240 |
st.markdown("""
|
|
|
251 |
"¿Cuáles idiomas están relacionados con el Quechua?",
|
252 |
"¿Dónde se habla el Mapudungun?"
|
253 |
]
|
254 |
+
|
255 |
for q in questions:
|
256 |
if st.button(q, key=f"suggested_{q}", use_container_width=True):
|
257 |
st.session_state.query = q
|
258 |
|
259 |
+
st.markdown("---")
|
260 |
+
st.markdown("### ⚙️ Detalles Técnicos")
|
261 |
+
st.markdown("""
|
262 |
+
- <span class="tech-badge">Embeddings</span> GraphSAGE
|
263 |
+
- <span class="tech-badge">Modelo de Lenguaje</span> Mistral (Inference Endpoint)
|
264 |
+
- <span class="tech-badge">Grafo de Conocimiento</span> Integración basada en RDF
|
265 |
+
""", unsafe_allow_html=True)
|
266 |
+
|
267 |
+
st.markdown("---")
|
268 |
+
st.markdown("### 📂 Fuentes de Datos")
|
269 |
+
st.markdown("""
|
270 |
+
- **Glottolog** (Clasificación de idiomas)
|
271 |
+
- **Wikipedia** (Resúmenes textuales)
|
272 |
+
- **Wikidata** (Hechos estructurados)
|
273 |
+
""")
|
274 |
+
|
275 |
st.markdown("---")
|
276 |
st.markdown("### 📊 Parámetros de Análisis")
|
277 |
k = st.slider("Número de idiomas a analizar", 1, 10, 3)
|
|
|
278 |
st.markdown("---")
|
279 |
st.markdown("### 🔧 Opciones Avanzadas")
|
280 |
show_ctx = st.checkbox("Mostrar información de contexto", False)
|
281 |
show_rdf = st.checkbox("Mostrar hechos estructurados", False)
|
282 |
|
283 |
st.markdown("### 📝 Haz una pregunta sobre lenguas indígenas")
|
284 |
+
st.markdown("*(Puedes preguntar en español o inglés, y el modelo responderá en **ambos idiomas**.)*")
|
285 |
query = st.text_input(
|
286 |
"Ingresa tu pregunta:",
|
287 |
value=st.session_state.get("query", ""),
|
|
|
293 |
if not query:
|
294 |
st.warning("Por favor, ingresa una pregunta")
|
295 |
return
|
296 |
+
|
297 |
label = "LinkGraph"
|
298 |
method = methods[label]
|
299 |
+
|
300 |
start = datetime.datetime.now()
|
301 |
+
response, lang_ids, context, rdf_data = generate_response(*method, query, k, embedder)
|
302 |
duration = (datetime.datetime.now() - start).total_seconds()
|
303 |
+
|
304 |
st.markdown(f"""
|
305 |
<div class="response-card">
|
306 |
{response}
|
|
|
324 |
st.markdown("""
|
325 |
<div style="font-size: 0.8rem; color: #64748b; text-align: center;">
|
326 |
<b>📌 Nota:</b> Esta herramienta está diseñada para investigadores, lingüistas y preservacionistas culturales.
|
327 |
+
Para mejores resultados, usa preguntas específicas sobre idiomas, familias o regiones.
|
328 |
</div>
|
329 |
""", unsafe_allow_html=True)
|
330 |
|