Javier Vera
committed on
Update rag_hf.py
Browse files
rag_hf.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# rag_interface.py (
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
@@ -14,13 +14,59 @@ from dotenv import load_dotenv
|
|
14 |
# === CONFIGURATION ===
|
15 |
load_dotenv()
|
16 |
|
17 |
-
|
18 |
-
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
|
19 |
EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
|
20 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
EX = Namespace("http://example.org/lang/")
|
22 |
|
23 |
-
st.set_page_config(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
@st.cache_resource(show_spinner="Loading models and indexes...")
|
26 |
def load_all_components():
|
@@ -43,6 +89,7 @@ def load_all_components():
|
|
43 |
|
44 |
methods, embedder = load_all_components()
|
45 |
|
|
|
46 |
def get_top_k(matrix, id_map, query, k):
|
47 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
48 |
vec = vec.cpu().numpy().astype("float32")
|
@@ -52,14 +99,14 @@ def get_top_k(matrix, id_map, query, k):
|
|
52 |
|
53 |
def get_context(G, lang_id):
|
54 |
node = G.nodes.get(lang_id, {})
|
55 |
-
|
56 |
if node.get("wikipedia_summary"):
|
57 |
-
|
58 |
if node.get("wikidata_description"):
|
59 |
-
|
60 |
if node.get("wikidata_countries"):
|
61 |
-
|
62 |
-
return "\n\n".join(
|
63 |
|
64 |
def query_rdf(rdf, lang_id):
|
65 |
q = f"""
|
@@ -67,7 +114,10 @@ def query_rdf(rdf, lang_id):
|
|
67 |
SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
|
68 |
"""
|
69 |
try:
|
70 |
-
return [
|
|
|
|
|
|
|
71 |
except Exception as e:
|
72 |
return [("error", str(e))]
|
73 |
|
@@ -84,6 +134,7 @@ Use strictly and only the information below to answer the user question in **Eng
|
|
84 |
- If the answer is unknown or insufficient, say "I cannot answer with the available data."
|
85 |
- Limit your answer to 100 words.
|
86 |
|
|
|
87 |
### CONTEXT:
|
88 |
{chr(10).join(context)}
|
89 |
|
@@ -97,9 +148,9 @@ Answer:
|
|
97 |
[/INST]"""
|
98 |
try:
|
99 |
res = requests.post(
|
100 |
-
|
101 |
-
headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
|
102 |
-
json={"inputs": prompt}, timeout=
|
103 |
)
|
104 |
out = res.json()
|
105 |
if isinstance(out, list) and "generated_text" in out[0]:
|
@@ -108,6 +159,7 @@ Answer:
|
|
108 |
except Exception as e:
|
109 |
return str(e), ids, context, rdf_facts
|
110 |
|
|
|
111 |
def main():
|
112 |
st.markdown("""
|
113 |
<h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
|
@@ -120,39 +172,82 @@ def main():
|
|
120 |
with st.sidebar:
|
121 |
st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
|
122 |
|
123 |
-
st.
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
with col:
|
141 |
-
st.subheader(label)
|
142 |
start = datetime.datetime.now()
|
143 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
144 |
duration = (datetime.datetime.now() - start).total_seconds()
|
145 |
st.markdown(response)
|
146 |
st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
|
147 |
if show_ids:
|
148 |
-
st.markdown("**
|
149 |
st.code("\n".join(lang_ids))
|
150 |
if show_ctx:
|
151 |
-
st.markdown("**Context:**")
|
152 |
st.markdown("\n\n---\n\n".join(context))
|
153 |
if show_rdf:
|
154 |
-
st.markdown("**RDF:**")
|
155 |
st.code("\n".join(rdf_data))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
if __name__ == "__main__":
|
158 |
main()
|
|
|
1 |
+
# rag_interface.py (with numpy instead of faiss)
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
|
|
14 |
# === CONFIGURATION ===
|
15 |
load_dotenv()
|
16 |
|
17 |
+
# Generation model served through the Hugging Face Inference API.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
# Sentence-embedding model; E5 models expect a "query: " / "passage: " prefix.
EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
# Prefer GPU when available; falls back to CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# RDF namespace under which language entities are addressed in SPARQL queries.
EX = Namespace("http://example.org/lang/")
|
21 |
|
22 |
+
# Streamlit page setup — must be the first st.* call executed in the script.
st.set_page_config(
    page_title="Vanishing Voices: Language Atlas",
    page_icon="🌍",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS
# Injected once at module load; classes below are referenced by the markdown
# rendered in main() and the sidebar (header, info-box, sidebar-title,
# method-card, method-title).
st.markdown("""
<style>
.header {
    color: #2c3e50;
    border-bottom: 2px solid #3498db;
    padding-bottom: 10px;
    margin-bottom: 1.5rem;
}
.info-box {
    background-color: #e8f4fc;
    border-radius: 8px;
    padding: 1rem;
    margin-bottom: 1.5rem;
    border-left: 4px solid #3498db;
}
.sidebar-section {
    margin-bottom: 2rem;
}
.sidebar-title {
    color: #2c3e50;
    font-size: 1.1rem;
    font-weight: 600;
    margin-bottom: 0.5rem;
    border-bottom: 1px solid #eee;
    padding-bottom: 0.5rem;
}
.method-card {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 0.8rem;
    margin-bottom: 0.8rem;
    border-left: 3px solid #3498db;
}
.method-title {
    font-weight: 600;
    color: #3498db;
    margin-bottom: 0.3rem;
}
</style>
""", unsafe_allow_html=True)
|
70 |
|
71 |
@st.cache_resource(show_spinner="Loading models and indexes...")
|
72 |
def load_all_components():
|
|
|
89 |
|
90 |
methods, embedder = load_all_components()
|
91 |
|
92 |
+
# === CORE FUNCTIONS ===
|
93 |
def get_top_k(matrix, id_map, query, k):
|
94 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
95 |
vec = vec.cpu().numpy().astype("float32")
|
|
|
99 |
|
100 |
def get_context(G, lang_id):
    """Build a markdown context blurb for one language node of graph *G*.

    Starts with the node's label (falling back to *lang_id* when the node
    or its label is absent) and appends one titled section per populated
    metadata attribute. Sections are joined by blank lines.
    """
    node = G.nodes.get(lang_id, {})
    parts = [f"**Language:** {node.get('label', lang_id)}"]
    # Optional enrichment fields, rendered in a fixed order when present.
    for attr, title in (
        ("wikipedia_summary", "Wikipedia"),
        ("wikidata_description", "Wikidata"),
        ("wikidata_countries", "Countries"),
    ):
        value = node.get(attr)
        if value:
            parts.append(f"**{title}:** {value}")
    return "\n\n".join(parts)
|
110 |
|
111 |
def query_rdf(rdf, lang_id):
|
112 |
q = f"""
|
|
|
114 |
SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
|
115 |
"""
|
116 |
try:
|
117 |
+
return [
|
118 |
+
(str(row[0]).split("/")[-1], str(row[1]))
|
119 |
+
for row in rdf.query(q)
|
120 |
+
]
|
121 |
except Exception as e:
|
122 |
return [("error", str(e))]
|
123 |
|
|
|
134 |
- If the answer is unknown or insufficient, say "I cannot answer with the available data."
|
135 |
- Limit your answer to 100 words.
|
136 |
|
137 |
+
|
138 |
### CONTEXT:
|
139 |
{chr(10).join(context)}
|
140 |
|
|
|
148 |
[/INST]"""
|
149 |
try:
|
150 |
res = requests.post(
|
151 |
+
f"https://api-inference.huggingface.co/models/{MODEL_ID}",
|
152 |
+
headers={"Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}", "Content-Type": "application/json"},
|
153 |
+
json={"inputs": prompt}, timeout=30
|
154 |
)
|
155 |
out = res.json()
|
156 |
if isinstance(out, list) and "generated_text" in out[0]:
|
|
|
159 |
except Exception as e:
|
160 |
return str(e), ids, context, rdf_facts
|
161 |
|
162 |
+
# === MAIN FUNCTION ===
|
163 |
def main():
|
164 |
st.markdown("""
|
165 |
<h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
|
|
|
172 |
with st.sidebar:
|
173 |
st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
|
174 |
|
175 |
+
with st.container():
|
176 |
+
st.markdown('<div class="sidebar-title">About This Tool</div>', unsafe_allow_html=True)
|
177 |
+
st.markdown("""
|
178 |
+
<div class="method-card">
|
179 |
+
<div class="method-title">Standard Search</div>
|
180 |
+
Semantic retrieval based on text-only embeddings. Identifies languages using purely linguistic similarity from Wikipedia summaries and labels.
|
181 |
+
</div>
|
182 |
+
<div class="method-card">
|
183 |
+
<div class="method-title">Hybrid Search</div>
|
184 |
+
Combines semantic embeddings with structured data from knowledge graphs. Enriches language representation with contextual facts.
|
185 |
+
</div>
|
186 |
+
<div class="method-card">
|
187 |
+
<div class="method-title">GraphSAGE Search</div>
|
188 |
+
Leverages deep graph neural networks to learn relational patterns across languages. Captures complex cultural and genealogical connections.
|
189 |
+
</div>
|
190 |
+
""", unsafe_allow_html=True)
|
191 |
+
|
192 |
+
with st.container():
|
193 |
+
st.markdown('<div class="sidebar-title">Research Settings</div>', unsafe_allow_html=True)
|
194 |
+
k = st.slider("Languages to analyze per query", 1, 10, 3)
|
195 |
+
st.markdown("**Display Options:**")
|
196 |
+
show_ids = st.checkbox("Language IDs", value=True, key="show_ids")
|
197 |
+
show_ctx = st.checkbox("Cultural Context", value=True, key="show_ctx")
|
198 |
+
show_rdf = st.checkbox("RDF Relations", value=True, key="show_rdf")
|
199 |
+
|
200 |
+
with st.container():
|
201 |
+
st.markdown('<div class="sidebar-title">Data Sources</div>', unsafe_allow_html=True)
|
202 |
+
st.markdown("""
|
203 |
+
- Glottolog
|
204 |
+
- Wikidata
|
205 |
+
- Wikipedia
|
206 |
+
- Ethnologue
|
207 |
+
""")
|
208 |
+
|
209 |
+
query = st.text_input("Ask about indigenous languages:", "Which Amazonian languages are most at risk?")
|
210 |
+
|
211 |
+
if st.button("Analyze with All Methods") and query:
|
212 |
+
col1, col2, col3 = st.columns(3)
|
213 |
+
results = {}
|
214 |
+
for col, (label, method) in zip([col1, col2, col3], methods.items()):
|
215 |
with col:
|
216 |
+
st.subheader(f"{label} Analysis")
|
217 |
start = datetime.datetime.now()
|
218 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
219 |
duration = (datetime.datetime.now() - start).total_seconds()
|
220 |
st.markdown(response)
|
221 |
st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
|
222 |
if show_ids:
|
223 |
+
st.markdown("**Language Identifiers:**")
|
224 |
st.code("\n".join(lang_ids))
|
225 |
if show_ctx:
|
226 |
+
st.markdown("**Cultural Context:**")
|
227 |
st.markdown("\n\n---\n\n".join(context))
|
228 |
if show_rdf:
|
229 |
+
st.markdown("**RDF Knowledge:**")
|
230 |
st.code("\n".join(rdf_data))
|
231 |
+
results[label] = response
|
232 |
+
|
233 |
+
log = f"""
|
234 |
+
[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]
|
235 |
+
QUERY: {query}
|
236 |
+
STANDARD:
|
237 |
+
{results.get('Standard', '')}
|
238 |
+
|
239 |
+
HYBRID:
|
240 |
+
{results.get('Hybrid', '')}
|
241 |
+
|
242 |
+
GRAPH-SAGE:
|
243 |
+
{results.get('GraphSAGE', '')}
|
244 |
+
{'='*60}
|
245 |
+
"""
|
246 |
+
try:
|
247 |
+
with open("language_analysis_logs.txt", "a", encoding="utf-8") as f:
|
248 |
+
f.write(log)
|
249 |
+
except Exception as e:
|
250 |
+
st.warning(f"Failed to log: {str(e)}")
|
251 |
|
252 |
if __name__ == "__main__":
|
253 |
main()
|