Spaces:

javiervz
/

RAG-SA

Sleeping

App Files Files Community

Javier Vera committed on Apr 10

Commit

2d6d126

verified ·

1 Parent(s): 159fe5e

Update rag_hf.py

Browse files

Files changed (1) hide show

rag_hf.py +187 -46

rag_hf.py CHANGED Viewed

@@ -10,7 +10,7 @@ from rdflib import Graph as RDFGraph, Namespace
 from sentence_transformers import SentenceTransformer
 from dotenv import load_dotenv
-# === ORIGINAL CONFIGURATION (unchanged) ===
 load_dotenv()
 ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
@@ -18,37 +18,70 @@ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 EX = Namespace("http://example.org/lang/")
-# === IMPROVED UI SETUP ===
 st.set_page_config(
-    page_title="Vanishing Voices: Language Atlas",
     page_icon="🌍",
     layout="wide",
-    initial_sidebar_state="expanded"
 )
-# Professional CSS (visual only)
 st.markdown("""
 <style>
     .header {
         color: #2c3e50;
-        border-bottom: 2px solid #3498db;
-        padding-bottom: 10px;
         margin-bottom: 1.5rem;
     }
     .response-card {
-        background-color: #f8f9fa;
         border-radius: 8px;
         padding: 1.5rem;
         margin: 1rem 0;
-        border-left: 4px solid #3498db;
     }
-    .language-info {
-        background-color: white;
         border-radius: 8px;
         padding: 1rem;
         margin: 0.5rem 0;
-        box-shadow: 0 1px 3px rgba(0,0,0,0.1);
     }
     .metric-badge {
         display: inline-block;
         background-color: #e8f4fc;
@@ -57,11 +90,19 @@ st.markdown("""
         font-size: 0.85rem;
         margin-right: 0.5rem;
     }
 </style>
 """, unsafe_allow_html=True)
-# === ORIGINAL FUNCTIONALITY (unchanged) ===
-@st.cache_resource(show_spinner="Loading models and indexes...")
 def load_all_components():
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
     methods = {}
@@ -79,9 +120,6 @@ def load_all_components():
         methods[label] = (matrix, id_map, G, rdf)
     return methods, embedder
-methods, embedder = load_all_components()
-# === ORIGINAL CORE FUNCTIONS (unchanged) ===
 def get_top_k(matrix, id_map, query, k):
     vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
     vec = vec.cpu().numpy().astype("float32")
@@ -143,44 +181,142 @@ Answer:
     except Exception as e:
         return str(e), ids, context, rdf_facts
-# === IMPROVED MAIN FUNCTION (same functionality, better UI) ===
 def main():
     st.markdown("""
     <div class="header">
-        <h1>Vanishing Voices: South America's Endangered Language Atlas</h1>
-    </div>
-    <div style="background-color: #e8f4fc; border-radius: 8px; padding: 1rem; margin-bottom: 1.5rem;">
-        <b>AI-Powered Analysis:</b> This app uses Mistral-7B-Instruct with RAG (Retrieval-Augmented Generation) to analyze indigenous languages.
     </div>
     """, unsafe_allow_html=True)
     with st.sidebar:
-        st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
-        st.markdown("### Analysis Methods")
         st.markdown("""
-        - **InfoMatch**: Combines text embeddings with metadata
-        - **LinkGraph**: Uses graph neural networks (GraphSAGE)
         """)
-        # Original controls with same parameters
-        k = st.slider("Languages to analyze", 1, 10, 3)
-        show_ids = st.checkbox("Show Language IDs", True)
-        show_ctx = st.checkbox("Show Context Info", True)
-        show_rdf = st.checkbox("Show RDF Facts", False)
-    query = st.text_input("Ask about South American languages:", "What languages are spoken in Perú?")
-    if st.button("Analyze with AI"):
         col1, col2 = st.columns(2)
-        results = {}
         for col, (label, method) in zip([col1, col2], methods.items()):
             with col:
-                st.subheader(f"{label} Method")
                 start = datetime.datetime.now()
                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                 duration = (datetime.datetime.now() - start).total_seconds()
-                # Improved response display
                 st.markdown(f"""
                 <div class="response-card">
                     {response}
@@ -191,19 +327,24 @@ def main():
                 </div>
                 """, unsafe_allow_html=True)
-                # Original debug info with better presentation
-                if show_ids:
-                    with st.expander("Language IDs"):
-                        st.code("\n".join(lang_ids))
                 if show_ctx:
-                    with st.expander("Context Information"):
-                        for ctx in context:
-                            st.markdown(f"<div class='language-info'>{ctx}</div>", unsafe_allow_html=True)
                 if show_rdf:
-                    with st.expander("RDF Relations"):
                         st.code("\n".join(rdf_data))
 if __name__ == "__main__":
     main()

 from sentence_transformers import SentenceTransformer
 from dotenv import load_dotenv
+# === CONFIGURATION ===
 load_dotenv()
 ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 EX = Namespace("http://example.org/lang/")
+# === STREAMLIT UI CONFIG ===
 st.set_page_config(
+    page_title="Language Atlas: South American Indigenous Languages",
     page_icon="🌍",
     layout="wide",
+    initial_sidebar_state="expanded",
+    menu_items={
+        'About': "## AI-powered analysis of endangered indigenous languages\n"
+                "Developed by Departamento Académico de Humanidades"
+    }
 )
+# === CUSTOM CSS ===
 st.markdown("""
 <style>
+    /* Main styles */
     .header {
         color: #2c3e50;
+        border-bottom: 2px solid #4f46e5;
+        padding-bottom: 0.5rem;
         margin-bottom: 1.5rem;
     }
+    .feature-card {
+        background-color: #f8fafc;
+        border-radius: 8px;
+        padding: 1rem;
+        margin: 0.5rem 0;
+        border-left: 3px solid #4f46e5;
+    }
     .response-card {
+        background-color: white;
         border-radius: 8px;
         padding: 1.5rem;
+        box-shadow: 0 1px 3px rgba(0,0,0,0.1);
         margin: 1rem 0;
     }
+    .language-card {
+        background-color: #f9fafb;
         border-radius: 8px;
         padding: 1rem;
         margin: 0.5rem 0;
+        border: 1px solid #e5e7eb;
+    }
+    /* Sidebar styles */
+    .sidebar-section {
+        margin-bottom: 1.5rem;
     }
+    .sidebar-title {
+        font-weight: 600;
+        color: #4f46e5;
+    }
+    .suggested-question {
+        padding: 0.5rem;
+        margin: 0.25rem 0;
+        border-radius: 4px;
+        cursor: pointer;
+        transition: all 0.2s;
+    }
+    .suggested-question:hover {
+        background-color: #f1f5f9;
+    }
+    /* Metrics and badges */
     .metric-badge {
         display: inline-block;
         background-color: #e8f4fc;
         font-size: 0.85rem;
         margin-right: 0.5rem;
     }
+    .tech-badge {
+        background-color: #ecfdf5;
+        color: #065f46;
+        padding: 0.25rem 0.5rem;
+        border-radius: 4px;
+        font-size: 0.75rem;
+        font-weight: 500;
+    }
 </style>
 """, unsafe_allow_html=True)
+# === CORE FUNCTIONS ===
+@st.cache_resource(show_spinner="Loading AI models and knowledge graphs...")
 def load_all_components():
     embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
     methods = {}
         methods[label] = (matrix, id_map, G, rdf)
     return methods, embedder
 def get_top_k(matrix, id_map, query, k):
     vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
     vec = vec.cpu().numpy().astype("float32")
     except Exception as e:
         return str(e), ids, context, rdf_facts
+# === MAIN APP ===
 def main():
+    # Load components
+    methods, embedder = load_all_components()
+    # Main header
     st.markdown("""
     <div class="header">
+        <h1>🌍 Language Atlas: South American Indigenous Languages</h1>
     </div>
     """, unsafe_allow_html=True)
+    # Overview section
+    with st.expander("📌 Overview", expanded=True):
+        st.markdown("""
+        This app provides **AI-powered analysis** of endangered indigenous languages in South America,
+        integrating knowledge graphs from **Glottolog, Wikipedia, and Wikidata**.
+        """)
+        cols = st.columns(2)
+        with cols[0]:
+            st.markdown("""
+            <div class="feature-card">
+                <h4>🔹 Two AI Methods Available:</h4>
+                <ul>
+                    <li><b>InfoMatch</b> (Node2Vec + Textual Data)</li>
+                    <li><b>LinkGraph</b> (GraphSAGE + Structured Relations)</li>
+                </ul>
+                <p>🔹 <b>Powered by Mistral-7B</b> for contextual responses</p>
+            </div>
+            """, unsafe_allow_html=True)
+        with cols[1]:
+            st.markdown("""
+            <div class="feature-card">
+                <h4>🛠️ Features</h4>
+                <ul>
+                    <li>✅ Multisource Knowledge Graph</li>
+                    <li>✅ Hybrid AI Analysis</li>
+                    <li>✅ Comparative Results</li>
+                    <li>✅ Structured & Unstructured Data</li>
+                </ul>
+            </div>
+            """, unsafe_allow_html=True)
+    # Sidebar
     with st.sidebar:
+        # Logo and academic info
+        st.image("626af4b6fb5b8b79dac12078_logo-dpto-humanidades.png", width=180)
+        st.markdown("### Departamento Académico de Humanidades")
+        st.markdown("---")
+        # Quick start guide
+        st.markdown("### 🚀 Quick Start")
+        st.markdown("""
+        1. **Type a question** in the input box
+        2. **Click 'Analyze'** to compare methods
+        3. **Explore results** with expandable details
+        """)
+        st.markdown("---")
+        # Suggested questions
+        st.markdown("### 🔍 Example Queries")
+        questions = [
+            "What languages are endangered in Brazil?",
+            "How many speakers does Aymara have?",
+            "Which languages are related to Quechua?",
+            "Where is Mapudungun spoken?"
+        ]
+        for q in questions:
+            if st.markdown(f"<div class='suggested-question'>{q}</div>", unsafe_allow_html=True):
+                st.session_state.query = q
+        st.markdown("---")
+        # Technical details
+        st.markdown("### ⚙️ Technical Details")
+        st.markdown("""
+        - <span class="tech-badge">Embeddings</span> Node2Vec vs. GraphSAGE
+        - <span class="tech-badge">Language Model</span> Mistral-7B-Instruct
+        - <span class="tech-badge">Knowledge Graph</span> RDF-based integration
+        """, unsafe_allow_html=True)
+        st.markdown("---")
+        # Data sources
+        st.markdown("### 📂 Data Sources")
         st.markdown("""
+        - **Glottolog** (Language classification)
+        - **Wikipedia** (Textual summaries)
+        - **Wikidata** (Structured facts)
         """)
+        st.markdown("---")
+        # Analysis parameters
+        st.markdown("### 📊 Analysis Parameters")
+        k = st.slider("Number of languages to analyze", 1, 10, 3)
+        st.markdown("---")
+        # Debug options
+        st.markdown("### 🔧 Advanced Options")
+        show_ctx = st.checkbox("Show context information", False)
+        show_rdf = st.checkbox("Show structured facts", False)
+    # Main query interface
+    st.markdown("### 📝 Ask About Indigenous Languages")
+    query = st.text_input(
+        "Enter your question:",
+        value=st.session_state.get("query", ""),
+        label_visibility="collapsed",
+        placeholder="e.g. What languages are spoken in Peru?"
+    )
+    if st.button("Analyze", type="primary", use_container_width=True):
+        if not query:
+            st.warning("Please enter a question")
+            return
         col1, col2 = st.columns(2)
         for col, (label, method) in zip([col1, col2], methods.items()):
             with col:
+                st.markdown(f"#### {label} Method")
+                st.caption({
+                    "InfoMatch": "Node2Vec embeddings combining text and graph structure",
+                    "LinkGraph": "GraphSAGE embeddings capturing network patterns"
+                }[label])
                 start = datetime.datetime.now()
                 response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                 duration = (datetime.datetime.now() - start).total_seconds()
+                # Response display
                 st.markdown(f"""
                 <div class="response-card">
                     {response}
                 </div>
                 """, unsafe_allow_html=True)
+                # Additional information
                 if show_ctx:
+                    with st.expander(f"📖 Context from {len(lang_ids)} languages"):
+                        for lang_id, ctx in zip(lang_ids, context):
+                            st.markdown(f"<div class='language-card'>{ctx}</div>", unsafe_allow_html=True)
                 if show_rdf:
+                    with st.expander("🔗 Structured facts (RDF)"):
                         st.code("\n".join(rdf_data))
+        # Footer note
+        st.markdown("---")
+        st.markdown("""
+        <div style="font-size: 0.8rem; color: #64748b; text-align: center;">
+        <b>📌 Note:</b> This tool is designed for researchers, linguists, and cultural preservationists.
+        For best results, use specific questions about languages, families, or regions.
+        </div>
+        """, unsafe_allow_html=True)
 if __name__ == "__main__":
     main()