Spaces:

javiervz
/

RAG-SA

Running

App Files Files Community

Javier Vera commited on Apr 10

Commit

159fe5e

verified ·

1 Parent(s): f45ad41

Update rag_hf.py

Browse files

Files changed (1) hide show

rag_hf.py +63 -44

rag_hf.py CHANGED Viewed

@@ -1,26 +1,24 @@
-# rag_interface.py (Hybrid & GraphSAGE only, simplified explanations, renamed methods)
 import streamlit as st
 import pickle
 import numpy as np
 import rdflib
 import torch
-import datetime
 import os
 import requests
 from rdflib import Graph as RDFGraph, Namespace
 from sentence_transformers import SentenceTransformer
 from dotenv import load_dotenv
-# === CONFIGURATION ===
 load_dotenv()
 ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
 EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 EX = Namespace("http://example.org/lang/")
 st.set_page_config(
     page_title="Vanishing Voices: Language Atlas",
     page_icon="🌍",
@@ -28,7 +26,7 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
-# Custom CSS
 st.markdown("""
 <style>
     .header {
@@ -37,21 +35,32 @@ st.markdown("""
         padding-bottom: 10px;
         margin-bottom: 1.5rem;
     }
-    .info-box {
-        background-color: #e8f4fc;
         border-radius: 8px;
-        padding: 1rem;
-        margin-bottom: 1.5rem;
         border-left: 4px solid #3498db;
     }
-    .sidebar-title {
-        font-size: 1.1rem;
-        font-weight: 600;
-        margin-top: 1rem;
     }
 </style>
 """, unsafe_allow_html=True)
 @st.cache_resource(show_spinner="Loading models and indexes...")
 def load_all_components():
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
@@ -72,7 +81,7 @@ def load_all_components():
 methods, embedder = load_all_components()
-# === CORE FUNCTIONS ===
 def get_top_k(matrix, id_map, query, k):
     vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
     vec = vec.cpu().numpy().astype("float32")
@@ -113,17 +122,12 @@ Use strictly and only the information below to answer the user question in **Eng
 - Do not infer or assume facts that are not explicitly stated.
 - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
 - Limit your answer to 100 words.
 ### CONTEXT:
 {chr(10).join(context)}
 ### RDF RELATIONS:
 {chr(10).join(rdf_facts)}
 ### QUESTION:
 {user_question}
 Answer:
 [/INST]"""
     try:
@@ -139,33 +143,34 @@ Answer:
     except Exception as e:
         return str(e), ids, context, rdf_facts
-# === MAIN FUNCTION ===
 def main():
     st.markdown("""
-    <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
-    <div class='info-box'>
-    <b>Why this matters:</b> Many indigenous languages in South America are disappearing. This app helps understand and preserve them using artificial intelligence.
     </div>
     """, unsafe_allow_html=True)
     with st.sidebar:
         st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
-        st.markdown("### What are the methods?")
         st.markdown("""
-        - **Graph A**: Combines descriptions, country info, and speaker data using classic node2vec embeddings.
-        - **Graph B**: Uses graph learning (GraphSAGE) to detect patterns in how languages relate to each other.
         """)
-        st.markdown("### Options")
-        k = st.slider("How many languages to analyze?", 1, 10, 3)
-        show_ids = st.checkbox("Show IDs", value=True)
-        show_ctx = st.checkbox("Show Text Info", value=True)
-        show_rdf = st.checkbox("Show Extra Facts", value=True)
-    query = st.text_input("Ask something about South American languages:", "What languages are spoken in Perú?")
-    if st.button("Analyze") and query:
         col1, col2 = st.columns(2)
         results = {}
         for col, (label, method) in zip([col1, col2], methods.items()):
@@ -174,17 +179,31 @@ def main():
                 start = datetime.datetime.now()
                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                 duration = (datetime.datetime.now() - start).total_seconds()
-                st.markdown(response)
-                st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
                 if show_ids:
-                    st.markdown("**Language IDs:**")
-                    st.code("\n".join(lang_ids))
                 if show_ctx:
-                    st.markdown("**Text Info:**")
-                    st.markdown("\n\n---\n\n".join(context))
                 if show_rdf:
-                    st.markdown("**Extra Facts:**")
-                    st.code("\n".join(rdf_data))
 if __name__ == "__main__":
     main()

 import streamlit as st
+import datetime
 import pickle
 import numpy as np
 import rdflib
 import torch
 import os
 import requests
 from rdflib import Graph as RDFGraph, Namespace
 from sentence_transformers import SentenceTransformer
 from dotenv import load_dotenv
+# === ORIGINAL CONFIGURATION (unchanged) ===
 load_dotenv()
 ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
 EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 EX = Namespace("http://example.org/lang/")
+# === IMPROVED UI SETUP ===
 st.set_page_config(
     page_title="Vanishing Voices: Language Atlas",
     page_icon="🌍",
     initial_sidebar_state="expanded"
 )
+# Professional CSS (visual only)
 st.markdown("""
 <style>
     .header {
         padding-bottom: 10px;
         margin-bottom: 1.5rem;
     }
+    .response-card {
+        background-color: #f8f9fa;
         border-radius: 8px;
+        padding: 1.5rem;
+        margin: 1rem 0;
         border-left: 4px solid #3498db;
     }
+    .language-info {
+        background-color: white;
+        border-radius: 8px;
+        padding: 1rem;
+        margin: 0.5rem 0;
+        box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+    }
+    .metric-badge {
+        display: inline-block;
+        background-color: #e8f4fc;
+        padding: 0.25rem 0.5rem;
+        border-radius: 4px;
+        font-size: 0.85rem;
+        margin-right: 0.5rem;
     }
 </style>
 """, unsafe_allow_html=True)
+# === ORIGINAL FUNCTIONALITY (unchanged) ===
 @st.cache_resource(show_spinner="Loading models and indexes...")
 def load_all_components():
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
 methods, embedder = load_all_components()
+# === ORIGINAL CORE FUNCTIONS (unchanged) ===
 def get_top_k(matrix, id_map, query, k):
     vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
     vec = vec.cpu().numpy().astype("float32")
 - Do not infer or assume facts that are not explicitly stated.
 - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
 - Limit your answer to 100 words.
 ### CONTEXT:
 {chr(10).join(context)}
 ### RDF RELATIONS:
 {chr(10).join(rdf_facts)}
 ### QUESTION:
 {user_question}
 Answer:
 [/INST]"""
     try:
     except Exception as e:
         return str(e), ids, context, rdf_facts
+# === IMPROVED MAIN FUNCTION (same functionality, better UI) ===
 def main():
     st.markdown("""
+    <div class="header">
+        <h1>Vanishing Voices: South America's Endangered Language Atlas</h1>
+    </div>
+    <div style="background-color: #e8f4fc; border-radius: 8px; padding: 1rem; margin-bottom: 1.5rem;">
+        <b>AI-Powered Analysis:</b> This app uses Mistral-7B-Instruct with RAG (Retrieval-Augmented Generation) to analyze indigenous languages.
     </div>
     """, unsafe_allow_html=True)
     with st.sidebar:
         st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
+        st.markdown("### Analysis Methods")
         st.markdown("""
+        - **InfoMatch**: Combines text embeddings with metadata
+        - **LinkGraph**: Uses graph neural networks (GraphSAGE)
         """)
+        # Original controls with same parameters
+        k = st.slider("Languages to analyze", 1, 10, 3)
+        show_ids = st.checkbox("Show Language IDs", True)
+        show_ctx = st.checkbox("Show Context Info", True)
+        show_rdf = st.checkbox("Show RDF Facts", False)
+    query = st.text_input("Ask about South American languages:", "What languages are spoken in Perú?")
+    if st.button("Analyze with AI"):
         col1, col2 = st.columns(2)
         results = {}
         for col, (label, method) in zip([col1, col2], methods.items()):
                 start = datetime.datetime.now()
                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                 duration = (datetime.datetime.now() - start).total_seconds()
+                # Improved response display
+                st.markdown(f"""
+                <div class="response-card">
+                    {response}
+                    <div style="margin-top: 1rem;">
+                        <span class="metric-badge">⏱️ {duration:.2f}s</span>
+                        <span class="metric-badge">🌐 {len(lang_ids)} languages</span>
+                    </div>
+                </div>
+                """, unsafe_allow_html=True)
+                # Original debug info with better presentation
                 if show_ids:
+                    with st.expander("Language IDs"):
+                        st.code("\n".join(lang_ids))
                 if show_ctx:
+                    with st.expander("Context Information"):
+                        for ctx in context:
+                            st.markdown(f"<div class='language-info'>{ctx}</div>", unsafe_allow_html=True)
                 if show_rdf:
+                    with st.expander("RDF Relations"):
+                        st.code("\n".join(rdf_data))
 if __name__ == "__main__":
     main()