Spaces:

javiervz
/

RAG-SA

Running

App Files Community

javiervz committed on Jun 2

Commit

0640991

verified ·

1 Parent(s): 842db18

Update rag_hf.py

Browse files

Files changed (1) hide show

rag_hf.py +138 -122

rag_hf.py CHANGED Viewed

@@ -20,13 +20,13 @@ EX = Namespace("http://example.org/lang/")
 # === STREAMLIT UI CONFIG ===
 st.set_page_config(
-    page_title="Language Atlas: South American Indigenous Languages",
     page_icon="🌍",
     layout="wide",
     initial_sidebar_state="expanded",
     menu_items={
-        'About': "## AI-powered analysis of endangered indigenous languages\n"
-                "Developed by Departamento Académico de Humanidades"
     }
 )
@@ -100,22 +100,20 @@ st.markdown("""
 """, unsafe_allow_html=True)
 # === CORE FUNCTIONS ===
-@st.cache_resource(show_spinner="Loading AI models and knowledge graphs...")
 def load_all_components():
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
     methods = {}
-    for label, suffix, ttl, matrix_path in [
-        ("InfoMatch", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
-        ("LinkGraph", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
-    ]:
-        with open(f"id_map{suffix}.pkl", "rb") as f:
-            id_map = pickle.load(f)
-        with open(f"grafo_embed{suffix}.pickle", "rb") as f:
-            G = pickle.load(f)
-        matrix = np.load(matrix_path)
-        rdf = RDFGraph()
-        rdf.parse(ttl, format="ttl")
-        methods[label] = (matrix, id_map, G, rdf)
     return methods, embedder
 def get_top_k(matrix, id_map, query, k, embedder):
@@ -127,13 +125,13 @@ def get_top_k(matrix, id_map, query, k, embedder):
 def get_context(G, lang_id):
     node = G.nodes.get(lang_id, {})
-    lines = [f"**Language:** {node.get('label', lang_id)}"]
     if node.get("wikipedia_summary"):
         lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
     if node.get("wikidata_description"):
         lines.append(f"**Wikidata:** {node['wikidata_description']}")
     if node.get("wikidata_countries"):
-        lines.append(f"**Countries:** {node['wikidata_countries']}")
     return "\n\n".join(lines)
 def query_rdf(rdf, lang_id):
@@ -152,32 +150,56 @@ def generate_response(matrix, id_map, G, rdf, user_question, k, embedder):
     rdf_facts = []
     for i in ids:
         rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
-    prompt = f"""<s>[INST]
-You are an expert in South American indigenous languages.
-Use strictly and only the information below to answer the user question in **English**.
 - Do not infer or assume facts that are not explicitly stated.
 - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
 - Limit your answer to 100 words.
-### CONTEXT:
-{chr(10).join(context)}
-### RDF RELATIONS:
-{chr(10).join(rdf_facts)}
-### QUESTION:
-{user_question}
-Answer:
-[/INST]"""
     try:
-        res = requests.post(
             ENDPOINT_URL,
             headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
-            json={"inputs": prompt}, timeout=60
         )
-        out = res.json()
-        if isinstance(out, list) and "generated_text" in out[0]:
-            return out[0]["generated_text"].replace(prompt.strip(), "").strip(), ids, context, rdf_facts
-        return str(out), ids, context, rdf_facts
     except Exception as e:
-        return str(e), ids, context, rdf_facts
 # === MAIN APP ===
 def main():
@@ -185,124 +207,118 @@ def main():
     st.markdown("""
     <div class="header">
-        <h1>🌍 Language Atlas: South American Indigenous Languages</h1>
     </div>
     """, unsafe_allow_html=True)
-    with st.expander("📌 **Overview**", expanded=True):
         st.markdown("""
-        This app provides **AI-powered analysis** of endangered indigenous languages in South America,
-        integrating knowledge graphs from **Glottolog, Wikipedia, and Wikidata**.
-        \n\n*This is version 1 and currently English-only. Spanish version coming soon!*
         """)
     with st.sidebar:
-        st.markdown("### 📚 Pontificia Universidad Católica del Perú")
         st.markdown("""
-        - <span class="tech-badge">Departamento de Humanidades</span>
-        - <span class="tech-badge">jveraz@pucp.edu.pe</span>
-        - <span class="tech-badge">Suggestions? Contact us</span>
         """, unsafe_allow_html=True)
         st.markdown("---")
-        st.markdown("### 🚀 Quick Start")
         st.markdown("""
-        1. **Type a question** in the input box
-        2. **Click 'Analyze'** to compare methods
-        3. **Explore results** with expandable details
         """)
         st.markdown("---")
-        st.markdown("### 🔍 Example Queries")
         questions = [
-            "What languages are endangered in Brazil?",
-            "What languages are spoken in Perú?",
-            "Which languages are related to Quechua?",
-            "Where is Mapudungun spoken?"
         ]
         for q in questions:
-            if st.markdown(f"<div class='suggested-question'>{q}</div>", unsafe_allow_html=True):
-                st.session_state.query = q
         st.markdown("---")
-        st.markdown("### ⚙️ Technical Details")
         st.markdown("""
-        - <span class="tech-badge">Embeddings</span> Node2Vec vs. GraphSAGE
-        - <span class="tech-badge">Language Model</span> Mistral-7B-Instruct
-        - <span class="tech-badge">Knowledge Graph</span> RDF-based integration
         """, unsafe_allow_html=True)
         st.markdown("---")
-        st.markdown("### 📂 Data Sources")
         st.markdown("""
-        - **Glottolog** (Language classification)
-        - **Wikipedia** (Textual summaries)
-        - **Wikidata** (Structured facts)
         """)
         st.markdown("---")
-        st.markdown("### 📊 Analysis Parameters")
-        k = st.slider("Number of languages to analyze", 1, 10, 3)
         st.markdown("---")
-        st.markdown("### 🔧 Advanced Options")
-        show_ctx = st.checkbox("Show context information", False)
-        show_rdf = st.checkbox("Show structured facts", False)
-    st.markdown("### 📝 Ask About Indigenous Languages")
     query = st.text_input(
-        "Enter your question:",
         value=st.session_state.get("query", ""),
         label_visibility="collapsed",
-        placeholder="e.g. What languages are spoken in Peru?"
     )
-    if st.button("Analyze", type="primary", use_container_width=True):
         if not query:
-            st.warning("Please enter a question")
             return
-        col1, col2 = st.columns(2)
-        for col, (label, method) in zip([col1, col2], methods.items()):
-            with col:
-                st.markdown(f"#### {label} Method")
-                st.caption({
-                    "InfoMatch": "Node2Vec embeddings combining text and graph structure",
-                    "LinkGraph": "GraphSAGE embeddings capturing network patterns"
-                }[label])
-                start = datetime.datetime.now()
-                response, lang_ids, context, rdf_data = generate_response(*method, query, k, embedder)
-                duration = (datetime.datetime.now() - start).total_seconds()
-                st.markdown(f"""
-                <div class="response-card">
-                    {response}
-                    <div style="margin-top: 1rem;">
-                        <span class="metric-badge">⏱️ {duration:.2f}s</span>
-                        <span class="metric-badge">🌐 {len(lang_ids)} languages</span>
-                    </div>
-                </div>
-                """, unsafe_allow_html=True)
-                if show_ctx:
-                    with st.expander(f"📖 Context from {len(lang_ids)} languages"):
-                        for lang_id, ctx in zip(lang_ids, context):
-                            st.markdown(f"<div class='language-card'>{ctx}</div>", unsafe_allow_html=True)
-                if show_rdf:
-                    with st.expander("🔗 Structured facts (RDF)"):
-                        st.code("\n".join(rdf_data))
-        st.markdown("---")
-        st.markdown("""
-        <div style="font-size: 0.8rem; color: #64748b; text-align: center;">
-        <b>📌 Note:</b> This tool is designed for researchers, linguists, and cultural preservationists.
-        For best results, use specific questions about languages, families, or regions.
         </div>
         """, unsafe_allow_html=True)
-if __name__ == "__main__":
-    main()

 # === STREAMLIT UI CONFIG ===
 st.set_page_config(
+    page_title="Atlas de Lenguas: Lenguas Indígenas Sudamericanas",
     page_icon="🌍",
     layout="wide",
     initial_sidebar_state="expanded",
     menu_items={
+        'About': "## Análisis con IA de lenguas indígenas en peligro\n"
+                 "Esta aplicación integra grafos de conocimiento de Glottolog, Wikipedia y Wikidata."
     }
 )
 """, unsafe_allow_html=True)
 # === CORE FUNCTIONS ===
+@st.cache_resource(show_spinner="Cargando modelos de IA y grafos de conocimiento...")
 def load_all_components():
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
     methods = {}
+    # Solo carga el método LinkGraph
+    label, suffix, ttl, matrix_path = ("LinkGraph", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
+    with open(f"id_map{suffix}.pkl", "rb") as f:
+        id_map = pickle.load(f)
+    with open(f"grafo_embed{suffix}.pickle", "rb") as f:
+        G = pickle.load(f)
+    matrix = np.load(matrix_path)
+    rdf = RDFGraph()
+    rdf.parse(ttl, format="ttl")
+    methods[label] = (matrix, id_map, G, rdf)
     return methods, embedder
 def get_top_k(matrix, id_map, query, k, embedder):
 def get_context(G, lang_id):
     node = G.nodes.get(lang_id, {})
+    lines = [f"**Lengua:** {node.get('label', lang_id)}"]
     if node.get("wikipedia_summary"):
         lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
     if node.get("wikidata_description"):
         lines.append(f"**Wikidata:** {node['wikidata_description']}")
     if node.get("wikidata_countries"):
+        lines.append(f"**Países:** {node['wikidata_countries']}")
     return "\n\n".join(lines)
 def query_rdf(rdf, lang_id):
     rdf_facts = []
     for i in ids:
         rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
+    # Prompt para generar respuesta en español
+    prompt_es = f"""<s>[INST] Eres un experto en lenguas indígenas sudamericanas. Utiliza estricta y únicamente la información a continuación para responder la pregunta del usuario en **español**.
+- No infieras ni asumas hechos que no estén explícitamente establecidos.
+- Si la respuesta es desconocida o insuficiente, di "No puedo responder con los datos disponibles."
+- Limita tu respuesta a 100 palabras.
+### CONTEXTO: {chr(10).join(context)}
+### RELACIONES RDF: {chr(10).join(rdf_facts)}
+### PREGUNTA: {user_question}
+Respuesta: [/INST]"""
+    # Prompt para generar respuesta en inglés
+    prompt_en = f"""<s>[INST] You are an expert in South American indigenous languages. Use strictly and only the information below to answer the user question in **English**.
 - Do not infer or assume facts that are not explicitly stated.
 - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
 - Limit your answer to 100 words.
+### CONTEXT: {chr(10).join(context)}
+### RDF RELATIONS: {chr(10).join(rdf_facts)}
+### QUESTION: {user_question}
+Answer: [/INST]"""
+    response_es = "Error al generar respuesta en español."
+    response_en = "Error generating response in English."
     try:
+        # Generar respuesta en español
+        res_es = requests.post(
+            ENDPOINT_URL,
+            headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
+            json={"inputs": prompt_es}, timeout=60
+        )
+        out_es = res_es.json()
+        if isinstance(out_es, list) and "generated_text" in out_es[0]:
+            response_es = out_es[0]["generated_text"].replace(prompt_es.strip(), "").strip()
+        # Generar respuesta en inglés
+        res_en = requests.post(
             ENDPOINT_URL,
             headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
+            json={"inputs": prompt_en}, timeout=60
         )
+        out_en = res_en.json()
+        if isinstance(out_en, list) and "generated_text" in out_en[0]:
+            response_en = out_en[0]["generated_text"].replace(prompt_en.strip(), "").strip()
+        # Concatenar ambas respuestas
+        full_response = f"**Respuesta en español:**\n{response_es}\n\n**Answer in English:**\n{response_en}"
+        return full_response, ids, context, rdf_facts
     except Exception as e:
+        return f"Ocurrió un error al generar la respuesta: {str(e)}", ids, context, rdf_facts
 # === MAIN APP ===
 def main():
     st.markdown("""
     <div class="header">
+        <h1>🌍 Atlas de Lenguas: Lenguas Indígenas Sudamericanas</h1>
     </div>
     """, unsafe_allow_html=True)
+    with st.expander("📌 **Resumen General**", expanded=True):
         st.markdown("""
+        Esta aplicación ofrece **análisis impulsado por IA** de lenguas indígenas en peligro de extinción en América del Sur,
+        integrando grafos de conocimiento de **Glottolog, Wikipedia y Wikidata**.
         """)
+        st.markdown("*Puedes preguntar en **español o inglés**, y el modelo responderá en **ambos idiomas**.*")
     with st.sidebar:
+        st.markdown("### 📚 Información de Contacto")
         st.markdown("""
+        - <span class="tech-badge">Correo: jxvera@gmail.com</span>
         """, unsafe_allow_html=True)
         st.markdown("---")
+        st.markdown("### 🚀 Inicio Rápido")
         st.markdown("""
+        1. **Escribe una pregunta** en el cuadro de entrada
+        2. **Haz clic en 'Analizar'** para obtener la respuesta
+        3. **Explora los resultados** con los detalles expandibles
         """)
         st.markdown("---")
+        st.markdown("### 🔍 Preguntas de Ejemplo")
         questions = [
+            "¿Qué idiomas están en peligro en Brasil? (What languages are endangered in Brazil?)",
+            "¿Qué idiomas se hablan en Perú? (What languages are spoken in Perú?)",
+            "¿Cuáles idiomas están relacionados con el Quechua? (Which languages are related to Quechua?)",
+            "¿Dónde se habla el Mapudungun? (Where is Mapudungun spoken?)"
         ]
         for q in questions:
+            if st.button(q, key=f"suggested_{q}", use_container_width=True):
+                st.session_state.query = q.split(" (")[0]
         st.markdown("---")
+        st.markdown("### ⚙️ Detalles Técnicos")
         st.markdown("""
+        - <span class="tech-badge">Embeddings</span> GraphSAGE
+        - <span class="tech-badge">Modelo de Lenguaje</span> Mistral-7B-Instruct
+        - <span class="tech-badge">Grafo de Conocimiento</span> Integración basada en RDF
         """, unsafe_allow_html=True)
         st.markdown("---")
+        st.markdown("### 📂 Fuentes de Datos")
         st.markdown("""
+        - **Glottolog** (Clasificación de idiomas)
+        - **Wikipedia** (Resúmenes textuales)
+        - **Wikidata** (Hechos estructurados)
         """)
         st.markdown("---")
+        st.markdown("### 📊 Parámetros de Análisis")
+        k = st.slider("Número de idiomas a analizar", 1, 10, 3)
         st.markdown("---")
+        st.markdown("### 🔧 Opciones Avanzadas")
+        show_ctx = st.checkbox("Mostrar información de contexto", False)
+        show_rdf = st.checkbox("Mostrar hechos estructurados", False)
+    st.markdown("### 📝 Haz una pregunta sobre lenguas indígenas")
+    st.markdown("*(Puedes preguntar en español o inglés, y el modelo responderá en **ambos idiomas**.)*")
     query = st.text_input(
+        "Ingresa tu pregunta:",
         value=st.session_state.get("query", ""),
         label_visibility="collapsed",
+        placeholder="Ej. ¿Qué lenguas se hablan en Perú?"
     )
+    if st.button("Analizar", type="primary", use_container_width=True):
         if not query:
+            st.warning("Por favor, ingresa una pregunta")
             return
+        label = "LinkGraph"
+        method = methods[label]
+        st.markdown(f"#### Método {label}")
+        st.caption("Embeddings de GraphSAGE que capturan patrones de red")
+        start = datetime.datetime.now()
+        # Llamamos a generate_response sin un código de idioma, ya que generará ambos
+        response, lang_ids, context, rdf_data = generate_response(*method, query, k, embedder)
+        duration = (datetime.datetime.now() - start).total_seconds()
+        st.markdown(f"""
+        <div class="response-card">
+            {response}
+            <div style="margin-top: 1rem;">
+                <span class="metric-badge">⏱️ {duration:.2f}s</span>
+                <span class="metric-badge">🌐 {len(lang_ids)} idiomas</span>
+            </div>
         </div>
         """, unsafe_allow_html=True)
+        if show_ctx:
+            with st.expander(f"📖 Contexto de {len(lang_ids)} idiomas"):
+                for lang_id, ctx in zip(lang_ids, context):
+                    st.markdown(f"<div class='language-card'>{ctx}</div>", unsafe_allow_html=True)
+        if show_rdf:
+            with st.expander("🔗 Hechos estructurados (RDF)"):
+                st.code("\n".join(rdf_data))
+    st.markdown("---")
+    st.markdown("""
+    <div style="font-size: 0.8rem; color: #64748b; text-align: center;">
+    <b>📌 Nota:</b> Esta herramienta está diseñada para investigadores, lingüistas y preservacionistas culturales.
+      Para mejores resultados, usa preguntas específicas sobre idiomas, familias o regiones.
+    </div>
+    """, unsafe_allow_html=True)
+if __name__ == "__main__":
+    main()