|
|
|
import streamlit as st |
|
import pickle |
|
import numpy as np |
|
import rdflib |
|
import torch |
|
import datetime |
|
import os |
|
import requests |
|
from rdflib import Graph as RDFGraph, Namespace |
|
from sentence_transformers import SentenceTransformer |
|
from dotenv import load_dotenv |
|
|
|
|
|
# Read settings from a local .env file (if present) into the process environment.
load_dotenv()

# Hosted text-generation endpoint and its bearer token; each is None when the
# corresponding environment variable is not set.
ENDPOINT_URL = os.environ.get("HF_ENDPOINT")
HF_API_TOKEN = os.environ.get("HF_API_TOKEN")

# Sentence-embedding model used to encode user questions.
EMBEDDING_MODEL = "intfloat/multilingual-e5-base"

# Prefer the GPU whenever torch reports one is available.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Namespace under which language resources are identified in the RDF graphs.
EX = Namespace("http://example.org/lang/")
|
|
|
# Browser-tab title/icon and overall layout of the Streamlit page.
_PAGE_SETTINGS = {
    "page_title": "Vanishing Voices: Language Atlas",
    "page_icon": "🌍",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
|
|
|
|
|
# Stylesheet for the page header, the info box and sidebar titles. It is
# injected as raw HTML, hence unsafe_allow_html=True on the call below.
_CUSTOM_CSS = """
<style>
.header {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-bottom: 1.5rem;
}
.info-box {
background-color: #e8f4fc;
border-radius: 8px;
padding: 1rem;
margin-bottom: 1.5rem;
border-left: 4px solid #3498db;
}
.sidebar-title {
font-size: 1.1rem;
font-weight: 600;
margin-top: 1rem;
}
</style>
"""

st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
|
|
@st.cache_resource(show_spinner="Loading models and indexes...")
def load_all_components():
    """Load the query embedder and the on-disk assets of each retrieval method.

    Returns:
        A ``(methods, embedder)`` pair. ``methods`` maps a method label
        ("InfoMatch", "LinkGraph") to a ``(matrix, id_map, G, rdf)`` tuple:
        the embedding matrix loaded with ``np.load``, the unpickled id map,
        the unpickled graph and the RDF graph parsed from the Turtle file.
        ``embedder`` is the ``SentenceTransformer`` placed on ``DEVICE``.
    """

    def _unpickle(path):
        # NOTE: pickle executes arbitrary code on load; these files must come
        # from a trusted source.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    # label -> (file-name suffix, Turtle file, embedding-matrix file)
    variants = {
        "InfoMatch": (
            "_hybrid",
            "grafo_ttl_hibrido.ttl",
            "embed_matrix_hybrid.npy",
        ),
        "LinkGraph": (
            "_hybrid_graphsage",
            "grafo_ttl_hibrido_graphsage.ttl",
            "embed_matrix_hybrid_graphsage.npy",
        ),
    }

    sentence_model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)

    loaded = {}
    for name, (tag, turtle_file, matrix_file) in variants.items():
        ids = _unpickle(f"id_map{tag}.pkl")
        graph = _unpickle(f"grafo_embed{tag}.pickle")
        vectors = np.load(matrix_file)
        triples = RDFGraph()
        triples.parse(turtle_file, format="ttl")
        loaded[name] = (vectors, ids, graph, triples)

    return loaded, sentence_model


# Loaded once at import time; st.cache_resource hands back the same objects on
# later script reruns instead of reloading them.
methods, embedder = load_all_components()
|
|
|
|
|
def get_top_k(matrix, id_map, query, k):
    """Return the IDs of the ``k`` rows of ``matrix`` most similar to ``query``.

    The query is embedded with the module-level ``embedder`` (using the
    ``"query: "`` prefix) and compared against every row of ``matrix`` by
    cosine similarity.

    Args:
        matrix: 2-D array of embeddings, one row per language.
        id_map: Lookup from row index to language ID.
        query: Free-text user question.
        k: Number of results wanted. Values <= 0 yield an empty list; values
            larger than the number of rows yield every row.

    Returns:
        A list of language IDs, best match first.
    """
    # A slice such as [-0:] selects the whole array, so k == 0 would otherwise
    # return every language instead of none.
    if k <= 0:
        return []

    vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
    vec = vec.cpu().numpy().astype("float32")

    # The small epsilon guards against division by zero for all-zero rows.
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-10
    sims = np.dot(matrix, vec) / norms

    # argsort is ascending: reverse it, then keep the k best.
    top_k_idx = np.argsort(sims)[::-1][:k]
    return [id_map[int(i)] for i in top_k_idx]
|
|
|
def get_context(G, lang_id):
    """Build a Markdown summary of one language node.

    Args:
        G: Graph whose ``nodes`` mapping holds a dict of attributes per node.
        lang_id: ID of the language node to describe.

    Returns:
        Markdown paragraphs separated by blank lines. The language line is
        always present (falling back to ``lang_id`` when the node has no
        ``label``); the Wikipedia, Wikidata and Countries paragraphs are added
        only when the corresponding attribute is non-empty.
    """
    attrs = G.nodes.get(lang_id, {})

    # (heading shown to the user, node attribute that supplies the text)
    optional_fields = (
        ("Wikipedia", "wikipedia_summary"),
        ("Wikidata", "wikidata_description"),
        ("Countries", "wikidata_countries"),
    )

    heading = f"**Language:** {attrs.get('label', lang_id)}"
    extras = [
        f"**{title}:** {attrs[key]}"
        for title, key in optional_fields
        if attrs.get(key)
    ]
    return "\n\n".join([heading, *extras])
|
|
|
def query_rdf(rdf, lang_id):
    """Return the (property, value) pairs stored for one language in ``rdf``.

    Args:
        rdf: RDF graph to query.
        lang_id: Language ID; it is resolved inside the ``EX`` namespace.

    Returns:
        A list of ``(property, value)`` string pairs, where ``property`` is
        the last ``/``-separated segment of the predicate IRI. If the query
        fails, a single ``("error", message)`` pair is returned instead of
        raising.
    """
    # The subject is passed as a pre-bound variable rather than spliced into
    # the query text, so an ID containing characters that are not legal in a
    # SPARQL prefixed name can neither break nor alter the query.
    q = "SELECT ?property ?value WHERE { ?subject ?property ?value }"
    subject = EX.term(str(lang_id))
    try:
        rows = rdf.query(q, initBindings={"subject": subject})
        return [(str(row[0]).split("/")[-1], str(row[1])) for row in rows]
    except Exception as e:
        # Best effort: report the failure as a pseudo-fact rather than abort
        # the whole answer.
        return [("error", str(e))]
|
|
|
def _build_prompt(context, rdf_facts, user_question):
    """Assemble the instruction prompt sent to the text-generation endpoint."""
    return f"""<s>[INST]
You are an expert in South American indigenous languages.
Use strictly and only the information below to answer the user question in **English**.
- Do not infer or assume facts that are not explicitly stated.
- If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
- Limit your answer to 100 words.


### CONTEXT:
{chr(10).join(context)}

### RDF RELATIONS:
{chr(10).join(rdf_facts)}

### QUESTION:
{user_question}

Answer:
[/INST]"""


def _extract_generated_text(payload, prompt):
    """Return the generated answer from the endpoint payload, else its string form."""
    if isinstance(payload, list) and payload:
        first = payload[0]
        if isinstance(first, dict):
            text = first.get("generated_text")
            if isinstance(text, str):
                # Drop the prompt in case the endpoint echoes it back.
                return text.replace(prompt.strip(), "").strip()
    # Anything else (error objects, empty lists, ...) is shown verbatim.
    return str(payload)


def generate_response(matrix, id_map, G, rdf, user_question, k=3):
    """Answer ``user_question`` from the ``k`` best-matching languages.

    Retrieves the top-``k`` language IDs, gathers their text context and RDF
    facts, and sends a prompt built from them to the endpoint configured in
    ``ENDPOINT_URL``.

    Args:
        matrix: Embedding matrix searched by ``get_top_k``.
        id_map: Row-index to language-ID lookup for ``matrix``.
        G: Graph holding the per-language text attributes.
        rdf: RDF graph holding the per-language facts.
        user_question: The user's question.
        k: Number of languages to retrieve.

    Returns:
        A ``(answer, ids, context, rdf_facts)`` tuple. ``answer`` is the
        generated text, or a description of the problem when the endpoint is
        not configured, the request fails, or the reply has an unexpected
        shape; this function does not raise for those cases.
    """
    ids = get_top_k(matrix, id_map, user_question, k)
    context = [get_context(G, i) for i in ids]
    rdf_facts = [f"{p}: {v}" for i in ids for p, v in query_rdf(rdf, i)]
    prompt = _build_prompt(context, rdf_facts, user_question)

    if not ENDPOINT_URL:
        # Without this check requests fails with an opaque "Invalid URL 'None'".
        return "HF_ENDPOINT is not configured.", ids, context, rdf_facts

    try:
        res = requests.post(
            ENDPOINT_URL,
            headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
            json={"inputs": prompt},
            timeout=60,
        )
        out = res.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and non-JSON bodies are shown to the user as
        # the answer rather than crashing the page.
        return str(e), ids, context, rdf_facts

    return _extract_generated_text(out, prompt), ids, context, rdf_facts
|
|
|
|
|
def main():
    """Render the page: header, sidebar options, question box and results.

    When the "Analyze" button is pressed with a non-blank question, every
    entry of the module-level ``methods`` mapping is run through
    ``generate_response`` and shown in its own column, together with timing
    and the optional ID / text / RDF details selected in the sidebar.
    """
    st.markdown("""
<h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
<div class='info-box'>
<b>Why this matters:</b> Many indigenous languages in South America are disappearing. This app helps understand and preserve them using artificial intelligence.
</div>
""", unsafe_allow_html=True)

    with st.sidebar:
        st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)

        st.markdown("### What are the methods?")
        # Names match the column headings produced below from `methods`.
        st.markdown("""
- **InfoMatch**: Combines descriptions, country info, and speaker data using classic node2vec embeddings.
- **LinkGraph**: Uses graph learning (GraphSAGE) to detect patterns in how languages relate to each other.
""")

        st.markdown("### Options")
        k = st.slider("How many languages to analyze?", 1, 10, 3)
        show_ids = st.checkbox("Show IDs", value=True)
        show_ctx = st.checkbox("Show Text Info", value=True)
        show_rdf = st.checkbox("Show Extra Facts", value=True)

    query = st.text_input("Ask something about South American languages:", "What languages are spoken in Perú?")

    # Ignore clicks when the question is empty or only whitespace.
    if st.button("Analyze") and query.strip():
        question = query.strip()
        # One column per configured method rather than a fixed pair.
        columns = st.columns(len(methods))
        for col, (label, method) in zip(columns, methods.items()):
            with col:
                st.subheader(f"{label} Method")
                start = datetime.datetime.now()
                response, lang_ids, context, rdf_data = generate_response(*method, question, k)
                duration = (datetime.datetime.now() - start).total_seconds()
                st.markdown(response)
                st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
                if show_ids:
                    st.markdown("**Language IDs:**")
                    # IDs are stringified so non-string keys cannot break join().
                    st.code("\n".join(map(str, lang_ids)))
                if show_ctx:
                    st.markdown("**Text Info:**")
                    st.markdown("\n\n---\n\n".join(context))
                if show_rdf:
                    st.markdown("**Extra Facts:**")
                    st.code("\n".join(rdf_data))


if __name__ == "__main__":
    main()
|
|
|
|