Spaces:

javiervz
/

RAG-SA

Running

App Files Community

Javier Vera committed on Apr 9

Commit

9e91c10

verified ·

1 Parent(s): cf45911

Update rag_hf.py

Browse files

Files changed (1) hide show

rag_hf.py +33 -97

rag_hf.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# rag_interface.py (with numpy instead of faiss)
 import streamlit as st
 import pickle
 import numpy as np
@@ -14,7 +14,8 @@ from dotenv import load_dotenv
 # === CONFIGURATION ===
 load_dotenv()
-MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
 EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 EX = Namespace("http://example.org/lang/")
@@ -42,28 +43,10 @@ st.markdown("""
         margin-bottom: 1.5rem;
         border-left: 4px solid #3498db;
     }
-    .sidebar-section {
-        margin-bottom: 2rem;
-    }
     .sidebar-title {
-        color: #2c3e50;
         font-size: 1.1rem;
         font-weight: 600;
-        margin-bottom: 0.5rem;
-        border-bottom: 1px solid #eee;
-        padding-bottom: 0.5rem;
-    }
-    .method-card {
-        background-color: #f8f9fa;
-        border-radius: 8px;
-        padding: 0.8rem;
-        margin-bottom: 0.8rem;
-        border-left: 3px solid #3498db;
-    }
-    .method-title {
-        font-weight: 600;
-        color: #3498db;
-        margin-bottom: 0.3rem;
     }
 </style>
 """, unsafe_allow_html=True)
@@ -73,9 +56,8 @@ def load_all_components():
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
     methods = {}
     for label, suffix, ttl, matrix_path in [
-        ("Standard", "", "grafo_ttl_no_hibrido.ttl", "embed_matrix.npy"),
-        ("Hybrid", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
-        ("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
     ]:
         with open(f"id_map{suffix}.pkl", "rb") as f:
             id_map = pickle.load(f)
@@ -114,10 +96,7 @@ def query_rdf(rdf, lang_id):
     SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
     """
     try:
-        return [
-            (str(row[0]).split("/")[-1], str(row[1]))
-            for row in rdf.query(q)
-        ]
     except Exception as e:
         return [("error", str(e))]
@@ -131,7 +110,7 @@ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
 You are an expert in South American indigenous languages.
 Use strictly and only the information below to answer the user question in **English**.
 - Do not infer or assume facts that are not explicitly stated.
-- If the answer is unknown or insufficient, say "I cannot answer with the available data."
 - Limit your answer to 100 words.
@@ -148,9 +127,9 @@ Answer:
 [/INST]"""
     try:
         res = requests.post(
-            f"https://api-inference.huggingface.co/models/{MODEL_ID}",
-            headers={"Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}", "Content-Type": "application/json"},
-            json={"inputs": prompt}, timeout=30
         )
         out = res.json()
         if isinstance(out, list) and "generated_text" in out[0]:
@@ -164,90 +143,47 @@ def main():
     st.markdown("""
     <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
     <div class='info-box'>
-    <b>Linguistic Emergency:</b> Over 40% of South America's indigenous languages face extinction.
-    This tool documents these cultural treasures before they disappear forever.
     </div>
     """, unsafe_allow_html=True)
     with st.sidebar:
         st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
-        with st.container():
-            st.markdown('<div class="sidebar-title">About This Tool</div>', unsafe_allow_html=True)
-            st.markdown("""
-            <div class="method-card">
-                <div class="method-title">Standard Search</div>
-                Semantic retrieval based on text-only embeddings. Identifies languages using purely linguistic similarity from Wikipedia summaries and labels.
-            </div>
-            <div class="method-card">
-                <div class="method-title">Hybrid Search</div>
-                Combines semantic embeddings with structured data from knowledge graphs. Enriches language representation with contextual facts.
-            </div>
-            <div class="method-card">
-                <div class="method-title">GraphSAGE Search</div>
-                Leverages deep graph neural networks to learn relational patterns across languages. Captures complex cultural and genealogical connections.
-            </div>
-            """, unsafe_allow_html=True)
-        with st.container():
-            st.markdown('<div class="sidebar-title">Research Settings</div>', unsafe_allow_html=True)
-            k = st.slider("Languages to analyze per query", 1, 10, 3)
-            st.markdown("**Display Options:**")
-            show_ids = st.checkbox("Language IDs", value=True, key="show_ids")
-            show_ctx = st.checkbox("Cultural Context", value=True, key="show_ctx")
-            show_rdf = st.checkbox("RDF Relations", value=True, key="show_rdf")
-        with st.container():
-            st.markdown('<div class="sidebar-title">Data Sources</div>', unsafe_allow_html=True)
-            st.markdown("""
-            - Glottolog
-            - Wikidata
-            - Wikipedia
-            - Ethnologue
-            """)
-    query = st.text_input("Ask about indigenous languages:", "Which Amazonian languages are most at risk?")
-    if st.button("Analyze with All Methods") and query:
-        col1, col2, col3 = st.columns(3)
         results = {}
-        for col, (label, method) in zip([col1, col2, col3], methods.items()):
             with col:
-                st.subheader(f"{label} Analysis")
                 start = datetime.datetime.now()
                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                 duration = (datetime.datetime.now() - start).total_seconds()
                 st.markdown(response)
                 st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
                 if show_ids:
-                    st.markdown("**Language Identifiers:**")
                     st.code("\n".join(lang_ids))
                 if show_ctx:
-                    st.markdown("**Cultural Context:**")
                     st.markdown("\n\n---\n\n".join(context))
                 if show_rdf:
-                    st.markdown("**RDF Knowledge:**")
                     st.code("\n".join(rdf_data))
-                results[label] = response
-        log = f"""
-[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]
-QUERY: {query}
-STANDARD:
-{results.get('Standard', '')}
-HYBRID:
-{results.get('Hybrid', '')}
-GRAPH-SAGE:
-{results.get('GraphSAGE', '')}
-{'='*60}
-"""
-        try:
-            with open("language_analysis_logs.txt", "a", encoding="utf-8") as f:
-                f.write(log)
-        except Exception as e:
-            st.warning(f"Failed to log: {str(e)}")
 if __name__ == "__main__":
     main()

+# rag_interface.py (Hybrid & GraphSAGE only, simplified explanations, renamed methods)
 import streamlit as st
 import pickle
 import numpy as np
 # === CONFIGURATION ===
 load_dotenv()
+ENDPOINT_URL = os.getenv("HF_ENDPOINT")
+HF_API_TOKEN = os.getenv("HF_API_TOKEN")
 EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 EX = Namespace("http://example.org/lang/")
         margin-bottom: 1.5rem;
         border-left: 4px solid #3498db;
     }
     .sidebar-title {
         font-size: 1.1rem;
         font-weight: 600;
+        margin-top: 1rem;
     }
 </style>
 """, unsafe_allow_html=True)
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
     methods = {}
     for label, suffix, ttl, matrix_path in [
+        ("InfoMatch", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
+        ("LinkGraph", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
     ]:
         with open(f"id_map{suffix}.pkl", "rb") as f:
             id_map = pickle.load(f)
     SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
     """
     try:
+        return [(str(row[0]).split("/")[-1], str(row[1])) for row in rdf.query(q)]
     except Exception as e:
         return [("error", str(e))]
 You are an expert in South American indigenous languages.
 Use strictly and only the information below to answer the user question in **English**.
 - Do not infer or assume facts that are not explicitly stated.
+- If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
 - Limit your answer to 100 words.
 [/INST]"""
     try:
         res = requests.post(
+            ENDPOINT_URL,
+            headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
+            json={"inputs": prompt}, timeout=60
         )
         out = res.json()
         if isinstance(out, list) and "generated_text" in out[0]:
     st.markdown("""
     <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
     <div class='info-box'>
+    <b>Why this matters:</b> Many indigenous languages in South America are disappearing. This app helps understand and preserve them using artificial intelligence.
     </div>
     """, unsafe_allow_html=True)
     with st.sidebar:
         st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
+        st.markdown("### What are the methods?")
+        st.markdown("""
+        - **Graph A**: Combines descriptions, country info, and speaker data using classic node2vec embeddings.
+        - **Graph B**: Uses graph learning (GraphSAGE) to detect patterns in how languages relate to each other.
+        """)
+        st.markdown("### Options")
+        k = st.slider("How many languages to analyze?", 1, 10, 3)
+        show_ids = st.checkbox("Show IDs", value=True)
+        show_ctx = st.checkbox("Show Text Info", value=True)
+        show_rdf = st.checkbox("Show Extra Facts", value=True)
+    query = st.text_input("Ask something about South American languages:", "What languages are spoken in Perú?")
+    if st.button("Analyze") and query:
+        col1, col2 = st.columns(2)
         results = {}
+        for col, (label, method) in zip([col1, col2], methods.items()):
             with col:
+                st.subheader(f"{label} Method")
                 start = datetime.datetime.now()
                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                 duration = (datetime.datetime.now() - start).total_seconds()
                 st.markdown(response)
                 st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
                 if show_ids:
+                    st.markdown("**Language IDs:**")
                     st.code("\n".join(lang_ids))
                 if show_ctx:
+                    st.markdown("**Text Info:**")
                     st.markdown("\n\n---\n\n".join(context))
                 if show_rdf:
+                    st.markdown("**Extra Facts:**")
                     st.code("\n".join(rdf_data))
 if __name__ == "__main__":
     main()