Javier Vera
committed on
Update rag_hf.py
Browse files
rag_hf.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# rag_interface.py (
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
@@ -20,44 +20,16 @@ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
|
|
20 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
EX = Namespace("http://example.org/lang/")
|
22 |
|
23 |
-
st.set_page_config(
|
24 |
-
page_title="Vanishing Voices: Language Atlas",
|
25 |
-
page_icon="🌍",
|
26 |
-
layout="wide",
|
27 |
-
initial_sidebar_state="expanded"
|
28 |
-
)
|
29 |
-
|
30 |
-
# Custom CSS
|
31 |
-
st.markdown("""
|
32 |
-
<style>
|
33 |
-
.header {
|
34 |
-
color: #2c3e50;
|
35 |
-
border-bottom: 2px solid #3498db;
|
36 |
-
padding-bottom: 10px;
|
37 |
-
margin-bottom: 1.5rem;
|
38 |
-
}
|
39 |
-
.info-box {
|
40 |
-
background-color: #e8f4fc;
|
41 |
-
border-radius: 8px;
|
42 |
-
padding: 1rem;
|
43 |
-
margin-bottom: 1.5rem;
|
44 |
-
border-left: 4px solid #3498db;
|
45 |
-
}
|
46 |
-
.sidebar-title {
|
47 |
-
font-size: 1.1rem;
|
48 |
-
font-weight: 600;
|
49 |
-
margin-top: 1rem;
|
50 |
-
}
|
51 |
-
</style>
|
52 |
-
""", unsafe_allow_html=True)
|
53 |
|
54 |
@st.cache_resource(show_spinner="Loading models and indexes...")
|
55 |
def load_all_components():
|
56 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
57 |
methods = {}
|
58 |
for label, suffix, ttl, matrix_path in [
|
59 |
-
("
|
60 |
-
("
|
|
|
61 |
]:
|
62 |
with open(f"id_map{suffix}.pkl", "rb") as f:
|
63 |
id_map = pickle.load(f)
|
@@ -71,7 +43,6 @@ def load_all_components():
|
|
71 |
|
72 |
methods, embedder = load_all_components()
|
73 |
|
74 |
-
# === CORE FUNCTIONS ===
|
75 |
def get_top_k(matrix, id_map, query, k):
|
76 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
77 |
vec = vec.cpu().numpy().astype("float32")
|
@@ -81,14 +52,14 @@ def get_top_k(matrix, id_map, query, k):
|
|
81 |
|
82 |
def get_context(G, lang_id):
|
83 |
node = G.nodes.get(lang_id, {})
|
84 |
-
|
85 |
if node.get("wikipedia_summary"):
|
86 |
-
|
87 |
if node.get("wikidata_description"):
|
88 |
-
|
89 |
if node.get("wikidata_countries"):
|
90 |
-
|
91 |
-
return "\n\n".join(
|
92 |
|
93 |
def query_rdf(rdf, lang_id):
|
94 |
q = f"""
|
@@ -110,10 +81,9 @@ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
|
|
110 |
You are an expert in South American indigenous languages.
|
111 |
Use strictly and only the information below to answer the user question in **English**.
|
112 |
- Do not infer or assume facts that are not explicitly stated.
|
113 |
-
- If the answer is unknown or insufficient, say
|
114 |
- Limit your answer to 100 words.
|
115 |
|
116 |
-
|
117 |
### CONTEXT:
|
118 |
{chr(10).join(context)}
|
119 |
|
@@ -138,53 +108,53 @@ Answer:
|
|
138 |
except Exception as e:
|
139 |
return str(e), ids, context, rdf_facts
|
140 |
|
141 |
-
# === MAIN FUNCTION ===
|
142 |
def main():
|
143 |
st.markdown("""
|
144 |
<h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
|
145 |
<div class='info-box'>
|
146 |
-
<b>
|
|
|
147 |
</div>
|
148 |
""", unsafe_allow_html=True)
|
149 |
|
150 |
with st.sidebar:
|
151 |
st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
|
152 |
|
153 |
-
st.markdown("###
|
154 |
st.markdown("""
|
155 |
-
- **
|
156 |
-
- **
|
|
|
157 |
""")
|
158 |
|
159 |
-
st.
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
show_rdf = st.checkbox("Show Extra Facts", value=True)
|
164 |
|
165 |
-
query = st.text_input("Ask something about South American languages:", "
|
166 |
|
167 |
-
if st.button("Analyze")
|
168 |
-
|
169 |
-
|
170 |
-
for col, (label, method) in zip([col1, col2], methods.items()):
|
171 |
with col:
|
172 |
-
st.subheader(
|
173 |
start = datetime.datetime.now()
|
174 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
175 |
duration = (datetime.datetime.now() - start).total_seconds()
|
176 |
st.markdown(response)
|
177 |
st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
|
178 |
if show_ids:
|
179 |
-
st.markdown("**
|
180 |
st.code("\n".join(lang_ids))
|
181 |
if show_ctx:
|
182 |
-
st.markdown("**
|
183 |
st.markdown("\n\n---\n\n".join(context))
|
184 |
if show_rdf:
|
185 |
-
st.markdown("**
|
186 |
st.code("\n".join(rdf_data))
|
187 |
|
188 |
if __name__ == "__main__":
|
189 |
main()
|
190 |
|
|
|
|
1 |
+
# rag_interface.py (tres métodos, descripciones técnicas)
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
|
|
20 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
EX = Namespace("http://example.org/lang/")
|
22 |
|
23 |
+
st.set_page_config(page_title="Vanishing Voices: Language Atlas", page_icon="🌍", layout="wide")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
@st.cache_resource(show_spinner="Loading models and indexes...")
|
26 |
def load_all_components():
|
27 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
28 |
methods = {}
|
29 |
for label, suffix, ttl, matrix_path in [
|
30 |
+
("Standard", "", "grafo_ttl_no_hibrido.ttl", "embed_matrix.npy"),
|
31 |
+
("Hybrid", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
|
32 |
+
("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
|
33 |
]:
|
34 |
with open(f"id_map{suffix}.pkl", "rb") as f:
|
35 |
id_map = pickle.load(f)
|
|
|
43 |
|
44 |
methods, embedder = load_all_components()
|
45 |
|
|
|
46 |
def get_top_k(matrix, id_map, query, k):
|
47 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
48 |
vec = vec.cpu().numpy().astype("float32")
|
|
|
52 |
|
53 |
def get_context(G, lang_id):
    """Build a markdown context snippet for one language node of graph *G*.

    Always starts with the node label (falling back to *lang_id* when the
    node is missing or unlabeled), then appends whichever of the Wikipedia
    summary, Wikidata description and Wikidata countries are present.
    """
    node = G.nodes.get(lang_id, {})
    sections = [f"**Language:** {node.get('label', lang_id)}"]
    # Optional attributes, emitted in a fixed order when non-empty.
    for attr, heading in (
        ("wikipedia_summary", "Wikipedia"),
        ("wikidata_description", "Wikidata"),
        ("wikidata_countries", "Countries"),
    ):
        if node.get(attr):
            sections.append(f"**{heading}:** {node[attr]}")
    return "\n\n".join(sections)
|
63 |
|
64 |
def query_rdf(rdf, lang_id):
|
65 |
q = f"""
|
|
|
81 |
You are an expert in South American indigenous languages.
|
82 |
Use strictly and only the information below to answer the user question in **English**.
|
83 |
- Do not infer or assume facts that are not explicitly stated.
|
84 |
+
- If the answer is unknown or insufficient, say "I cannot answer with the available data."
|
85 |
- Limit your answer to 100 words.
|
86 |
|
|
|
87 |
### CONTEXT:
|
88 |
{chr(10).join(context)}
|
89 |
|
|
|
108 |
except Exception as e:
|
109 |
return str(e), ids, context, rdf_facts
|
110 |
|
|
|
111 |
def main():
    """Render the Streamlit page: header, sidebar controls, and one result
    column per retrieval method for the user's question."""
    # Page header and intro banner (HTML styled by the CSS injected at module load).
    st.markdown("""
    <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
    <div class='info-box'>
    <b>Linguistic Emergency:</b> Over 40% of South America's indigenous languages face extinction.
    This tool documents these cultural treasures before they disappear forever.
    </div>
    """, unsafe_allow_html=True)

    with st.sidebar:
        st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)

        st.markdown("### About This Tool")
        st.markdown("""
        - **Standard**: Semantic search based on text-only embeddings.
        - **Hybrid**: Uses node2vec to combine graph structure with descriptive features.
        - **GraphSAGE**: Employs deep graph learning (GraphSAGE) for relational patterns.
        """)

        # k = number of nearest-neighbour languages retrieved per method.
        k = st.slider("Languages to analyze per query", 1, 10, 3)
        # Toggles for the optional detail sections shown under each answer.
        show_ids = st.checkbox("Language IDs", value=True)
        show_ctx = st.checkbox("Contextual Info", value=True)
        show_rdf = st.checkbox("RDF Relations", value=True)

    query = st.text_input("Ask something about South American languages:", "Which Amazonian languages are most at risk?")

    if st.button("Analyze"):
        # One column per retrieval method (module-level `methods` mapping),
        # so the three approaches can be compared side by side.
        cols = st.columns(len(methods))
        for col, (label, method) in zip(cols, methods.items()):
            with col:
                st.subheader(label)
                start = datetime.datetime.now()
                # `method` is the (matrix, id_map, G, rdf) tuple expected by
                # generate_response; unpack it positionally.
                response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                duration = (datetime.datetime.now() - start).total_seconds()
                st.markdown(response)
                st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
                if show_ids:
                    st.markdown("**IDs:**")
                    st.code("\n".join(lang_ids))
                if show_ctx:
                    st.markdown("**Context:**")
                    st.markdown("\n\n---\n\n".join(context))
                if show_rdf:
                    st.markdown("**RDF:**")
                    st.code("\n".join(rdf_data))
|
156 |
|
157 |
if __name__ == "__main__":
|
158 |
main()
|
159 |
|
160 |
+
|