Javier Vera committed on
Commit b276dbe · verified · 1 Parent(s): 43bac59

Update rag_hf.py

Files changed (1)
  1. rag_hf.py +190 -253
rag_hf.py CHANGED
@@ -1,253 +1,190 @@
1
- # rag_interface.py (with numpy instead of faiss)
2
- import streamlit as st
3
- import pickle
4
- import numpy as np
5
- import rdflib
6
- import torch
7
- import datetime
8
- import os
9
- import requests
10
- from rdflib import Graph as RDFGraph, Namespace
11
- from sentence_transformers import SentenceTransformer
12
- from dotenv import load_dotenv
13
-
14
- # === CONFIGURATION ===
15
- load_dotenv()
16
-
17
- MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
18
- EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
19
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
20
- EX = Namespace("http://example.org/lang/")
21
-
22
- st.set_page_config(
23
- page_title="Vanishing Voices: Language Atlas",
24
- page_icon="🌍",
25
- layout="wide",
26
- initial_sidebar_state="expanded"
27
- )
28
-
29
- # Custom CSS
30
- st.markdown("""
31
- <style>
32
- .header {
33
- color: #2c3e50;
34
- border-bottom: 2px solid #3498db;
35
- padding-bottom: 10px;
36
- margin-bottom: 1.5rem;
37
- }
38
- .info-box {
39
- background-color: #e8f4fc;
40
- border-radius: 8px;
41
- padding: 1rem;
42
- margin-bottom: 1.5rem;
43
- border-left: 4px solid #3498db;
44
- }
45
- .sidebar-section {
46
- margin-bottom: 2rem;
47
- }
48
- .sidebar-title {
49
- color: #2c3e50;
50
- font-size: 1.1rem;
51
- font-weight: 600;
52
- margin-bottom: 0.5rem;
53
- border-bottom: 1px solid #eee;
54
- padding-bottom: 0.5rem;
55
- }
56
- .method-card {
57
- background-color: #f8f9fa;
58
- border-radius: 8px;
59
- padding: 0.8rem;
60
- margin-bottom: 0.8rem;
61
- border-left: 3px solid #3498db;
62
- }
63
- .method-title {
64
- font-weight: 600;
65
- color: #3498db;
66
- margin-bottom: 0.3rem;
67
- }
68
- </style>
69
- """, unsafe_allow_html=True)
70
-
71
- @st.cache_resource(show_spinner="Loading models and indexes...")
72
- def load_all_components():
73
- embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
74
- methods = {}
75
- for label, suffix, ttl, matrix_path in [
76
- ("Standard", "", "grafo_ttl_no_hibrido.ttl", "embed_matrix.npy"),
77
- ("Hybrid", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
78
- ("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
79
- ]:
80
- with open(f"id_map{suffix}.pkl", "rb") as f:
81
- id_map = pickle.load(f)
82
- with open(f"grafo_embed{suffix}.pickle", "rb") as f:
83
- G = pickle.load(f)
84
- matrix = np.load(matrix_path)
85
- rdf = RDFGraph()
86
- rdf.parse(ttl, format="ttl")
87
- methods[label] = (matrix, id_map, G, rdf)
88
- return methods, embedder
89
-
90
- methods, embedder = load_all_components()
91
-
92
- # === CORE FUNCTIONS ===
93
- def get_top_k(matrix, id_map, query, k):
94
- vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
95
- vec = vec.cpu().numpy().astype("float32")
96
- sims = np.dot(matrix, vec) / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10)
97
- top_k_idx = np.argsort(sims)[-k:][::-1]
98
- return [id_map[i] for i in top_k_idx]
99
-
100
- def get_context(G, lang_id):
101
- node = G.nodes.get(lang_id, {})
102
- lines = [f"**Language:** {node.get('label', lang_id)}"]
103
- if node.get("wikipedia_summary"):
104
- lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
105
- if node.get("wikidata_description"):
106
- lines.append(f"**Wikidata:** {node['wikidata_description']}")
107
- if node.get("wikidata_countries"):
108
- lines.append(f"**Countries:** {node['wikidata_countries']}")
109
- return "\n\n".join(lines)
110
-
111
- def query_rdf(rdf, lang_id):
112
- q = f"""
113
- PREFIX ex: <http://example.org/lang/>
114
- SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
115
- """
116
- try:
117
- return [
118
- (str(row[0]).split("/")[-1], str(row[1]))
119
- for row in rdf.query(q)
120
- ]
121
- except Exception as e:
122
- return [("error", str(e))]
123
-
124
- def generate_response(matrix, id_map, G, rdf, user_question, k=3):
125
- ids = get_top_k(matrix, id_map, user_question, k)
126
- context = [get_context(G, i) for i in ids]
127
- rdf_facts = []
128
- for i in ids:
129
- rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
130
- prompt = f"""<s>[INST]
131
- You are an expert in South American indigenous languages.
132
- Use strictly and only the information below to answer the user question in **English**.
133
- - Do not infer or assume facts that are not explicitly stated.
134
- - If the answer is unknown or insufficient, say "I cannot answer with the available data."
135
- - Limit your answer to 100 words.
136
-
137
-
138
- ### CONTEXT:
139
- {chr(10).join(context)}
140
-
141
- ### RDF RELATIONS:
142
- {chr(10).join(rdf_facts)}
143
-
144
- ### QUESTION:
145
- {user_question}
146
-
147
- Answer:
148
- [/INST]"""
149
- try:
150
- res = requests.post(
151
- f"https://api-inference.huggingface.co/models/{MODEL_ID}",
152
- headers={"Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}", "Content-Type": "application/json"},
153
- json={"inputs": prompt}, timeout=30
154
- )
155
- out = res.json()
156
- if isinstance(out, list) and "generated_text" in out[0]:
157
- return out[0]["generated_text"].replace(prompt.strip(), "").strip(), ids, context, rdf_facts
158
- return str(out), ids, context, rdf_facts
159
- except Exception as e:
160
- return str(e), ids, context, rdf_facts
161
-
162
- # === MAIN FUNCTION ===
163
- def main():
164
- st.markdown("""
165
- <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
166
- <div class='info-box'>
167
- <b>Linguistic Emergency:</b> Over 40% of South America's indigenous languages face extinction.
168
- This tool documents these cultural treasures before they disappear forever.
169
- </div>
170
- """, unsafe_allow_html=True)
171
-
172
- with st.sidebar:
173
- st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
174
-
175
- with st.container():
176
- st.markdown('<div class="sidebar-title">About This Tool</div>', unsafe_allow_html=True)
177
- st.markdown("""
178
- <div class="method-card">
179
- <div class="method-title">Standard Search</div>
180
- Semantic retrieval based on text-only embeddings. Identifies languages using purely linguistic similarity from Wikipedia summaries and labels.
181
- </div>
182
- <div class="method-card">
183
- <div class="method-title">Hybrid Search</div>
184
- Combines semantic embeddings with structured data from knowledge graphs. Enriches language representation with contextual facts.
185
- </div>
186
- <div class="method-card">
187
- <div class="method-title">GraphSAGE Search</div>
188
- Leverages deep graph neural networks to learn relational patterns across languages. Captures complex cultural and genealogical connections.
189
- </div>
190
- """, unsafe_allow_html=True)
191
-
192
- with st.container():
193
- st.markdown('<div class="sidebar-title">Research Settings</div>', unsafe_allow_html=True)
194
- k = st.slider("Languages to analyze per query", 1, 10, 3)
195
- st.markdown("**Display Options:**")
196
- show_ids = st.checkbox("Language IDs", value=True, key="show_ids")
197
- show_ctx = st.checkbox("Cultural Context", value=True, key="show_ctx")
198
- show_rdf = st.checkbox("RDF Relations", value=True, key="show_rdf")
199
-
200
- with st.container():
201
- st.markdown('<div class="sidebar-title">Data Sources</div>', unsafe_allow_html=True)
202
- st.markdown("""
203
- - Glottolog
204
- - Wikidata
205
- - Wikipedia
206
- - Ethnologue
207
- """)
208
-
209
- query = st.text_input("Ask about indigenous languages:", "Which Amazonian languages are most at risk?")
210
-
211
- if st.button("Analyze with All Methods") and query:
212
- col1, col2, col3 = st.columns(3)
213
- results = {}
214
- for col, (label, method) in zip([col1, col2, col3], methods.items()):
215
- with col:
216
- st.subheader(f"{label} Analysis")
217
- start = datetime.datetime.now()
218
- response, lang_ids, context, rdf_data = generate_response(*method, query, k)
219
- duration = (datetime.datetime.now() - start).total_seconds()
220
- st.markdown(response)
221
- st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
222
- if show_ids:
223
- st.markdown("**Language Identifiers:**")
224
- st.code("\n".join(lang_ids))
225
- if show_ctx:
226
- st.markdown("**Cultural Context:**")
227
- st.markdown("\n\n---\n\n".join(context))
228
- if show_rdf:
229
- st.markdown("**RDF Knowledge:**")
230
- st.code("\n".join(rdf_data))
231
- results[label] = response
232
-
233
- log = f"""
234
- [{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]
235
- QUERY: {query}
236
- STANDARD:
237
- {results.get('Standard', '')}
238
-
239
- HYBRID:
240
- {results.get('Hybrid', '')}
241
-
242
- GRAPH-SAGE:
243
- {results.get('GraphSAGE', '')}
244
- {'='*60}
245
- """
246
- try:
247
- with open("language_analysis_logs.txt", "a", encoding="utf-8") as f:
248
- f.write(log)
249
- except Exception as e:
250
- st.warning(f"Failed to log: {str(e)}")
251
-
252
- if __name__ == "__main__":
253
- main()
 
1
+ # rag_interface.py (Hybrid & GraphSAGE only, simplified explanations, renamed methods)
2
+ import streamlit as st
3
+ import pickle
4
+ import numpy as np
5
+ import rdflib
6
+ import torch
7
+ import datetime
8
+ import os
9
+ import requests
10
+ from rdflib import Graph as RDFGraph, Namespace
11
+ from sentence_transformers import SentenceTransformer
12
+ from dotenv import load_dotenv
13
+
14
+ # === CONFIGURATION ===
15
+ load_dotenv()
16
+
17
+ ENDPOINT_URL = os.getenv("HF_ENDPOINT")
18
+ HF_API_TOKEN = os.getenv("HF_API_TOKEN")
19
+ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
20
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
+ EX = Namespace("http://example.org/lang/")
22
+
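> The configuration block above pulls `HF_ENDPOINT` and `HF_API_TOKEN` from the environment via `python-dotenv`. A minimal sketch of a fail-fast check for those two variables, assuming a `.env` file with placeholder-style values (the real endpoint URL and token are not part of this commit):

```python
# Sketch: fail fast if the variables read by the configuration block are
# missing. The .env file is assumed to contain placeholder-style entries:
#   HF_ENDPOINT=https://<your-endpoint>.endpoints.huggingface.cloud
#   HF_API_TOKEN=hf_xxx
import os
from dotenv import load_dotenv

load_dotenv()
for var in ("HF_ENDPOINT", "HF_API_TOKEN"):
    if not os.getenv(var):
        raise RuntimeError(f"Missing required environment variable: {var}")
```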
23
+ st.set_page_config(
24
+ page_title="Vanishing Voices: Language Atlas",
25
+ page_icon="🌍",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded"
28
+ )
29
+
30
+ # Custom CSS
31
+ st.markdown("""
32
+ <style>
33
+ .header {
34
+ color: #2c3e50;
35
+ border-bottom: 2px solid #3498db;
36
+ padding-bottom: 10px;
37
+ margin-bottom: 1.5rem;
38
+ }
39
+ .info-box {
40
+ background-color: #e8f4fc;
41
+ border-radius: 8px;
42
+ padding: 1rem;
43
+ margin-bottom: 1.5rem;
44
+ border-left: 4px solid #3498db;
45
+ }
46
+ .sidebar-title {
47
+ font-size: 1.1rem;
48
+ font-weight: 600;
49
+ margin-top: 1rem;
50
+ }
51
+ </style>
52
+ """, unsafe_allow_html=True)
53
+
54
+ @st.cache_resource(show_spinner="Loading models and indexes...")
55
+ def load_all_components():
56
+ embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
57
+ methods = {}
58
+ for label, suffix, ttl, matrix_path in [
59
+ ("InfoMatch", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
60
+ ("LinkGraph", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
61
+ ]:
62
+ with open(f"id_map{suffix}.pkl", "rb") as f:
63
+ id_map = pickle.load(f)
64
+ with open(f"grafo_embed{suffix}.pickle", "rb") as f:
65
+ G = pickle.load(f)
66
+ matrix = np.load(matrix_path)
67
+ rdf = RDFGraph()
68
+ rdf.parse(ttl, format="ttl")
69
+ methods[label] = (matrix, id_map, G, rdf)
70
+ return methods, embedder
71
+
72
+ methods, embedder = load_all_components()
73
+
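> `load_all_components` expects three aligned artifacts per method: a pickled id map whose position `i` names the language in row `i` of the `.npy` embedding matrix, a pickled graph with per-node metadata, and a Turtle file for RDF lookups. A hypothetical build step showing how such a matrix/id-map pair could be produced so that alignment holds (file names mirror the `_hybrid` variant; the toy descriptions and the `passage:` prefix, which follows the e5 convention, are assumptions):

```python
# Hypothetical build step: encode one "passage" per language and save the
# matrix together with an id_map whose position i corresponds to row i.
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("intfloat/multilingual-e5-base")
languages = {  # toy id -> description mapping
    "lang_001": "Quechua, a language family spoken in the Andes.",
    "lang_002": "Aymara, spoken around Lake Titicaca.",
}

id_map = list(languages.keys())
matrix = np.asarray(
    embedder.encode([f"passage: {languages[i]}" for i in id_map])
).astype("float32")

np.save("embed_matrix_hybrid.npy", matrix)
with open("id_map_hybrid.pkl", "wb") as f:
    pickle.dump(id_map, f)
```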
74
+ # === CORE FUNCTIONS ===
75
+ def get_top_k(matrix, id_map, query, k):
76
+ vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
77
+ vec = vec.cpu().numpy().astype("float32")
78
+ sims = np.dot(matrix, vec) / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10)
79
+ top_k_idx = np.argsort(sims)[-k:][::-1]
80
+ return [id_map[i] for i in top_k_idx]
81
+
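> `get_top_k` ranks every row of the matrix by cosine similarity against the query embedding (the `1e-10` term guards against division by zero) and returns the ids of the `k` best rows; the `query:` prefix matches the query/passage convention of the e5 models. A self-contained check of the same ranking logic on random vectors:

```python
# Self-contained sketch of the ranking used in get_top_k, on random data.
import numpy as np

rng = np.random.default_rng(0)
matrix = rng.normal(size=(100, 768)).astype("float32")  # 100 fake language embeddings
vec = rng.normal(size=768).astype("float32")            # fake query embedding

sims = matrix @ vec / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10)
top_k_idx = np.argsort(sims)[-3:][::-1]                 # 3 most similar rows, best first
print(top_k_idx, sims[top_k_idx])
```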
82
+ def get_context(G, lang_id):
83
+ node = G.nodes.get(lang_id, {})
84
+ lines = [f"**Language:** {node.get('label', lang_id)}"]
85
+ if node.get("wikipedia_summary"):
86
+ lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
87
+ if node.get("wikidata_description"):
88
+ lines.append(f"**Wikidata:** {node['wikidata_description']}")
89
+ if node.get("wikidata_countries"):
90
+ lines.append(f"**Countries:** {node['wikidata_countries']}")
91
+ return "\n\n".join(lines)
92
+
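> `get_context` reads the node attributes `label`, `wikipedia_summary`, `wikidata_description`, and `wikidata_countries`, skipping any that are absent. The `G.nodes.get(...)` access pattern suggests the pickled graph is a networkx graph; under that assumption, a toy node shows the expected shape (the id and attribute values are illustrative):

```python
# Toy node carrying the metadata fields that get_context looks for
# (networkx assumed; "lang_001" and the attribute values are made up).
import networkx as nx

G = nx.Graph()
G.add_node(
    "lang_001",
    label="Quechua",
    wikipedia_summary="Quechua is a family of languages spoken in the Andes.",
    wikidata_description="language family of South America",
    wikidata_countries="Peru, Bolivia, Ecuador",
)
print(G.nodes.get("lang_001", {}).get("label"))  # -> Quechua
```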
93
+ def query_rdf(rdf, lang_id):
94
+ q = f"""
95
+ PREFIX ex: <http://example.org/lang/>
96
+ SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
97
+ """
98
+ try:
99
+ return [(str(row[0]).split("/")[-1], str(row[1])) for row in rdf.query(q)]
100
+ except Exception as e:
101
+ return [("error", str(e))]
102
+
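> `query_rdf` interpolates the language id directly into a SPARQL SELECT over all triples with subject `ex:<lang_id>`, then shortens each property URI to its last path segment, so ids are assumed to be valid local names. A minimal rdflib round trip of the same pattern against a toy graph (the triples are illustrative):

```python
# Minimal rdflib sketch of the query pattern used in query_rdf.
from rdflib import Graph, Literal, Namespace

EX = Namespace("http://example.org/lang/")
g = Graph()
g.add((EX["lang_001"], EX["spokenIn"], Literal("Peru")))      # toy triple
g.add((EX["lang_001"], EX["speakers"], Literal("8000000")))   # toy triple

q = """
PREFIX ex: <http://example.org/lang/>
SELECT ?property ?value WHERE { ex:lang_001 ?property ?value }
"""
for row in g.query(q):
    print(str(row[0]).split("/")[-1], str(row[1]))  # e.g. spokenIn Peru
```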
103
+ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
104
+ ids = get_top_k(matrix, id_map, user_question, k)
105
+ context = [get_context(G, i) for i in ids]
106
+ rdf_facts = []
107
+ for i in ids:
108
+ rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])
109
+ prompt = f"""<s>[INST]
110
+ You are an expert in South American indigenous languages.
111
+ Use strictly and only the information below to answer the user question in **English**.
112
+ - Do not infer or assume facts that are not explicitly stated.
113
+ - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
114
+ - Limit your answer to 100 words.
115
+
116
+
117
+ ### CONTEXT:
118
+ {chr(10).join(context)}
119
+
120
+ ### RDF RELATIONS:
121
+ {chr(10).join(rdf_facts)}
122
+
123
+ ### QUESTION:
124
+ {user_question}
125
+
126
+ Answer:
127
+ [/INST]"""
128
+ try:
129
+ res = requests.post(
130
+ ENDPOINT_URL,
131
+ headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
132
+ json={"inputs": prompt}, timeout=60
133
+ )
134
+ out = res.json()
135
+ if isinstance(out, list) and "generated_text" in out[0]:
136
+ return out[0]["generated_text"].replace(prompt.strip(), "").strip(), ids, context, rdf_facts
137
+ return str(out), ids, context, rdf_facts
138
+ except Exception as e:
139
+ return str(e), ids, context, rdf_facts
140
+
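> `generate_response` posts the bare `{"inputs": prompt}` payload to the dedicated endpoint and afterwards strips the echoed prompt out of `generated_text`. Text-generation endpoints commonly also accept a `parameters` object (for example `max_new_tokens` and `return_full_text`), which would make the stripping step unnecessary; whether this particular deployment honours those parameters is an assumption. A hedged sketch of that variant:

```python
# Hedged sketch: the same endpoint call with explicit generation parameters.
# "return_full_text": False asks the server not to echo the prompt; support
# for these parameters depends on how the endpoint was deployed.
import os
import requests

def call_endpoint(prompt: str) -> str:
    res = requests.post(
        os.getenv("HF_ENDPOINT"),
        headers={
            "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
            "Content-Type": "application/json",
        },
        json={
            "inputs": prompt,
            "parameters": {"max_new_tokens": 200, "return_full_text": False},
        },
        timeout=60,
    )
    out = res.json()
    if isinstance(out, list) and out and "generated_text" in out[0]:
        return out[0]["generated_text"].strip()
    return str(out)
```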
141
+ # === MAIN FUNCTION ===
142
+ def main():
143
+ st.markdown("""
144
+ <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
145
+ <div class='info-box'>
146
+ <b>Why this matters:</b> Many indigenous languages in South America are disappearing. This app helps understand and preserve them using artificial intelligence.
147
+ </div>
148
+ """, unsafe_allow_html=True)
149
+
150
+ with st.sidebar:
151
+ st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
152
+
153
+ st.markdown("### What are the methods?")
154
+ st.markdown("""
155
+ - **InfoMatch**: Combines descriptions, country info, and speaker data using classic node2vec embeddings.
156
+ - **LinkGraph**: Uses graph learning (GraphSAGE) to detect patterns in how languages relate to each other.
157
+ """)
158
+
159
+ st.markdown("### Options")
160
+ k = st.slider("How many languages to analyze?", 1, 10, 3)
161
+ show_ids = st.checkbox("Show IDs", value=True)
162
+ show_ctx = st.checkbox("Show Text Info", value=True)
163
+ show_rdf = st.checkbox("Show Extra Facts", value=True)
164
+
165
+ query = st.text_input("Ask something about South American languages:", "What languages are spoken in Perú?")
166
+
167
+ if st.button("Analyze") and query:
168
+ col1, col2 = st.columns(2)
169
+ results = {}
170
+ for col, (label, method) in zip([col1, col2], methods.items()):
171
+ with col:
172
+ st.subheader(f"{label} Method")
173
+ start = datetime.datetime.now()
174
+ response, lang_ids, context, rdf_data = generate_response(*method, query, k)
175
+ duration = (datetime.datetime.now() - start).total_seconds()
176
+ st.markdown(response)
177
+ st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
178
+ if show_ids:
179
+ st.markdown("**Language IDs:**")
180
+ st.code("\n".join(lang_ids))
181
+ if show_ctx:
182
+ st.markdown("**Text Info:**")
183
+ st.markdown("\n\n---\n\n".join(context))
184
+ if show_rdf:
185
+ st.markdown("**Extra Facts:**")
186
+ st.code("\n".join(rdf_data))
187
+
188
+ if __name__ == "__main__":
189
+ main()
190
+
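> To try the updated script locally, the standard Streamlit entry point applies: `streamlit run rag_hf.py`, with the pickled id maps and graphs, the `.npy` matrices, and the `.ttl` files in the working directory and the environment variables described above populated.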