"""Streamlit app "Vanishing Voices": a retrieval-augmented language atlas.

For each retrieval method the app embeds the user question, selects the most
similar language nodes, gathers their node attributes and RDF triples, and asks
a hosted Mistral-7B-Instruct endpoint to answer using only that material.
"""

import datetime
import html
import os
import pickle

import numpy as np
import rdflib
import requests
import streamlit as st
import torch
from dotenv import load_dotenv
from rdflib import Graph as RDFGraph, Namespace
from sentence_transformers import SentenceTransformer

# === Configuration ===
load_dotenv()
ENDPOINT_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EX = Namespace("http://example.org/lang/")

# === UI setup ===
st.set_page_config(
    page_title="Vanishing Voices: Language Atlas",
    page_icon="🌍",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Slot for custom CSS (visual only); the stylesheet body is currently blank.
st.markdown(""" """, unsafe_allow_html=True)


# === Resource loading ===
@st.cache_resource(show_spinner="Loading models and indexes...")
def load_all_components():
    """Load the sentence embedder and the per-method retrieval artifacts.

    Returns:
        A ``(methods, embedder)`` tuple. ``methods`` maps a method label
        ("InfoMatch", "LinkGraph") to ``(matrix, id_map, G, rdf)``: the
        embedding matrix loaded with ``np.load``, the unpickled id map, the
        unpickled graph, and the RDF graph parsed from the Turtle file.
    """
    embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
    methods = {}
    for label, suffix, ttl, matrix_path in [
        ("InfoMatch", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
        (
            "LinkGraph",
            "_hybrid_graphsage",
            "grafo_ttl_hibrido_graphsage.ttl",
            "embed_matrix_hybrid_graphsage.npy",
        ),
    ]:
        # NOTE(security): pickle.load can execute arbitrary code. These files
        # must be trusted artifacts shipped with the app, never user uploads.
        with open(f"id_map{suffix}.pkl", "rb") as f:
            id_map = pickle.load(f)
        with open(f"grafo_embed{suffix}.pickle", "rb") as f:
            G = pickle.load(f)
        matrix = np.load(matrix_path)
        rdf = RDFGraph()
        rdf.parse(ttl, format="ttl")
        methods[label] = (matrix, id_map, G, rdf)
    return methods, embedder


methods, embedder = load_all_components()


# === Core functions ===
def get_top_k(matrix, id_map, query, k):
    """Return the ids of the ``k`` rows of ``matrix`` most similar to ``query``.

    The query text is prefixed with ``"query: "``, encoded with the module-level
    ``embedder`` and compared to every row by cosine similarity.

    Args:
        matrix: 2-D array with one embedding per row.
        id_map: Lookup from row index to language id.
        query: Free-text user question.
        k: Number of ids to return.

    Returns:
        Language ids ordered from most to least similar; empty when ``k <= 0``.
    """
    if k <= 0:
        # Without this guard ``[-0:]`` would select every row instead of none.
        return []
    vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
    vec = vec.cpu().numpy().astype("float32")
    # The small epsilon keeps the division defined for all-zero vectors.
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10
    sims = np.dot(matrix, vec) / norms
    top_k_idx = np.argsort(sims)[-k:][::-1]
    return [id_map[i] for i in top_k_idx]


def get_context(G, lang_id):
    """Build a Markdown summary of the attributes stored on a graph node.

    Args:
        G: Graph whose ``nodes`` mapping holds per-language attribute dicts
            (presumably a networkx graph; confirm against the pickled data).
        lang_id: Node key of the language.

    Returns:
        Markdown text with the language label (falling back to ``lang_id``)
        and, when present, the Wikipedia summary, Wikidata description and
        countries, separated by blank lines.
    """
    node = G.nodes.get(lang_id, {})
    lines = [f"**Language:** {node.get('label', lang_id)}"]
    if node.get("wikipedia_summary"):
        lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
    if node.get("wikidata_description"):
        lines.append(f"**Wikidata:** {node['wikidata_description']}")
    if node.get("wikidata_countries"):
        lines.append(f"**Countries:** {node['wikidata_countries']}")
    return "\n\n".join(lines)


def query_rdf(rdf, lang_id):
    """Return the ``(property, value)`` pairs asserted about ``ex:<lang_id>``.

    Args:
        rdf: Parsed rdflib graph to query.
        lang_id: Local name of the language resource in the ``EX`` namespace.

    Returns:
        A list of ``(property_local_name, value)`` string pairs, where the
        property name is the last ``/``-separated segment of its IRI. If the
        query fails, a single ``("error", message)`` pair is returned.
    """
    # The prefix must be bound to an IRI; it is taken from EX so the query and
    # the namespace constant cannot drift apart.
    q = f"""
    PREFIX ex: <{EX}>
    SELECT ?property ?value WHERE {{
        ex:{lang_id} ?property ?value
    }}
    """
    try:
        return [(str(row[0]).split("/")[-1], str(row[1])) for row in rdf.query(q)]
    except Exception as e:
        # Deliberately best-effort: a malformed id or query must not break the
        # page, so the failure is surfaced as a pseudo-fact instead.
        return [("error", str(e))]


def generate_response(matrix, id_map, G, rdf, user_question, k=3):
    """Answer ``user_question`` with retrieval-augmented generation.

    Args:
        matrix: Embedding matrix used for retrieval.
        id_map: Lookup from matrix row index to language id.
        G: Graph holding per-language node attributes.
        rdf: RDF graph holding per-language triples.
        user_question: Free-text question from the user.
        k: Number of languages to retrieve.

    Returns:
        ``(answer, ids, context, rdf_facts)``. ``answer`` is the generated text
        with the prompt removed, or the stringified endpoint payload / error
        message when no generated text is available.
    """
    ids = get_top_k(matrix, id_map, user_question, k)
    context = [get_context(G, i) for i in ids]
    rdf_facts = []
    for i in ids:
        rdf_facts.extend([f"{p}: {v}" for p, v in query_rdf(rdf, i)])

    context_block = "\n".join(context)
    facts_block = "\n".join(rdf_facts)
    prompt = (
        "[INST] You are an expert in South American indigenous languages.\n"
        "Use strictly and only the information below to answer the user question in **English**.\n"
        "- Do not infer or assume facts that are not explicitly stated.\n"
        '- If the answer is unknown or insufficient, say "I cannot answer with the available data."\n'
        "- Limit your answer to 100 words.\n"
        "\n"
        "### CONTEXT:\n"
        f"{context_block}\n"
        "\n"
        "### RDF RELATIONS:\n"
        f"{facts_block}\n"
        "\n"
        "### QUESTION:\n"
        f"{user_question}\n"
        "\n"
        "Answer: [/INST]"
    )

    try:
        res = requests.post(
            ENDPOINT_URL,
            headers={
                "Authorization": f"Bearer {HF_API_TOKEN}",
                "Content-Type": "application/json",
            },
            json={"inputs": prompt},
            timeout=60,
        )
        out = res.json()
    except (requests.RequestException, ValueError) as e:
        # Network failures and non-JSON bodies are reported as the answer text.
        return str(e), ids, context, rdf_facts

    if (
        isinstance(out, list)
        and out
        and isinstance(out[0], dict)
        and "generated_text" in out[0]
    ):
        # The endpoint may echo the prompt in front of the completion.
        answer = str(out[0]["generated_text"]).replace(prompt.strip(), "").strip()
        return answer, ids, context, rdf_facts
    # Anything else (e.g. an error object) is shown verbatim for debugging.
    return str(out), ids, context, rdf_facts


# === Streamlit page ===
def main():
    """Render the page: header, sidebar controls, and per-method results."""
    st.markdown(
        "\n\nVanishing Voices: South America's Endangered Language Atlas\n\n"
        "AI-Powered Analysis: This app uses Mistral-7B-Instruct with RAG "
        "(Retrieval-Augmented Generation) to analyze indigenous languages.\n",
        unsafe_allow_html=True,
    )

    # NOTE(review): the controls are assumed to live in the sidebar; confirm
    # against the intended layout.
    with st.sidebar:
        st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
        st.markdown("### Analysis Methods")
        st.markdown(
            "- **InfoMatch**: Combines text embeddings with metadata\n"
            "- **LinkGraph**: Uses graph neural networks (GraphSAGE)"
        )
        k = st.slider("Languages to analyze", 1, 10, 3)
        show_ids = st.checkbox("Show Language IDs", True)
        show_ctx = st.checkbox("Show Context Info", True)
        show_rdf = st.checkbox("Show RDF Facts", False)

    query = st.text_input(
        "Ask about South American languages:",
        "What languages are spoken in Perú?",
    )

    if not st.button("Analyze with AI"):
        return
    if not query.strip():
        # Avoid a retrieval + LLM round trip for a blank question.
        st.warning("Please enter a question.")
        return

    col1, col2 = st.columns(2)
    for col, (label, method) in zip([col1, col2], methods.items()):
        with col:
            st.subheader(f"{label} Method")
            start = datetime.datetime.now()
            response, lang_ids, context, rdf_data = generate_response(*method, query, k)
            duration = (datetime.datetime.now() - start).total_seconds()

            # Model output is untrusted text: escape it so it cannot inject
            # markup while HTML rendering is enabled.
            safe_response = html.escape(response, quote=False)
            st.markdown(
                f"\n{safe_response}\n⏱️ {duration:.2f}s 🌐 {len(lang_ids)} languages\n",
                unsafe_allow_html=True,
            )

            if show_ids:
                with st.expander("Language IDs"):
                    st.code("\n".join(lang_ids))
            if show_ctx:
                with st.expander("Context Information"):
                    for ctx in context:
                        # Context comes from scraped data; escape it as well.
                        st.markdown(
                            f"\n{html.escape(ctx, quote=False)}\n",
                            unsafe_allow_html=True,
                        )
            if show_rdf:
                with st.expander("RDF Relations"):
                    st.code("\n".join(rdf_data))


if __name__ == "__main__":
    main()