Javier Vera
committed on
Update rag_hf.py
Browse files
rag_hf.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# rag_interface.py (
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
@@ -20,44 +20,16 @@ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
|
|
20 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
EX = Namespace("http://example.org/lang/")
|
22 |
|
23 |
-
st.set_page_config(
|
24 |
-
page_title="Vanishing Voices: Language Atlas",
|
25 |
-
page_icon="🌍",
|
26 |
-
layout="wide",
|
27 |
-
initial_sidebar_state="expanded"
|
28 |
-
)
|
29 |
-
|
30 |
-
# Custom CSS
|
31 |
-
st.markdown("""
|
32 |
-
<style>
|
33 |
-
.header {
|
34 |
-
color: #2c3e50;
|
35 |
-
border-bottom: 2px solid #3498db;
|
36 |
-
padding-bottom: 10px;
|
37 |
-
margin-bottom: 1.5rem;
|
38 |
-
}
|
39 |
-
.info-box {
|
40 |
-
background-color: #e8f4fc;
|
41 |
-
border-radius: 8px;
|
42 |
-
padding: 1rem;
|
43 |
-
margin-bottom: 1.5rem;
|
44 |
-
border-left: 4px solid #3498db;
|
45 |
-
}
|
46 |
-
.sidebar-title {
|
47 |
-
font-size: 1.1rem;
|
48 |
-
font-weight: 600;
|
49 |
-
margin-top: 1rem;
|
50 |
-
}
|
51 |
-
</style>
|
52 |
-
""", unsafe_allow_html=True)
|
53 |
|
54 |
@st.cache_resource(show_spinner="Loading models and indexes...")
|
55 |
def load_all_components():
|
56 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
57 |
methods = {}
|
58 |
for label, suffix, ttl, matrix_path in [
|
59 |
-
("
|
60 |
-
("
|
|
|
61 |
]:
|
62 |
with open(f"id_map{suffix}.pkl", "rb") as f:
|
63 |
id_map = pickle.load(f)
|
@@ -71,7 +43,6 @@ def load_all_components():
|
|
71 |
|
72 |
methods, embedder = load_all_components()
|
73 |
|
74 |
-
# === CORE FUNCTIONS ===
|
75 |
def get_top_k(matrix, id_map, query, k):
|
76 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
77 |
vec = vec.cpu().numpy().astype("float32")
|
@@ -81,14 +52,14 @@ def get_top_k(matrix, id_map, query, k):
|
|
81 |
|
82 |
def get_context(G, lang_id):
|
83 |
node = G.nodes.get(lang_id, {})
|
84 |
-
|
85 |
if node.get("wikipedia_summary"):
|
86 |
-
|
87 |
if node.get("wikidata_description"):
|
88 |
-
|
89 |
if node.get("wikidata_countries"):
|
90 |
-
|
91 |
-
return "\n\n".join(
|
92 |
|
93 |
def query_rdf(rdf, lang_id):
|
94 |
q = f"""
|
@@ -110,10 +81,9 @@ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
|
|
110 |
You are an expert in South American indigenous languages.
|
111 |
Use strictly and only the information below to answer the user question in **English**.
|
112 |
- Do not infer or assume facts that are not explicitly stated.
|
113 |
-
- If the answer is unknown or insufficient, say
|
114 |
- Limit your answer to 100 words.
|
115 |
|
116 |
-
|
117 |
### CONTEXT:
|
118 |
{chr(10).join(context)}
|
119 |
|
@@ -138,53 +108,53 @@ Answer:
|
|
138 |
except Exception as e:
|
139 |
return str(e), ids, context, rdf_facts
|
140 |
|
141 |
-
# === MAIN FUNCTION ===
|
142 |
def main():
|
143 |
st.markdown("""
|
144 |
<h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
|
145 |
<div class='info-box'>
|
146 |
-
<b>
|
|
|
147 |
</div>
|
148 |
""", unsafe_allow_html=True)
|
149 |
|
150 |
with st.sidebar:
|
151 |
st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
|
152 |
|
153 |
-
st.markdown("###
|
154 |
st.markdown("""
|
155 |
-
- **
|
156 |
-
- **
|
|
|
157 |
""")
|
158 |
|
159 |
-
st.
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
show_rdf = st.checkbox("Show Extra Facts", value=True)
|
164 |
|
165 |
-
query = st.text_input("Ask something about South American languages:", "
|
166 |
|
167 |
-
if st.button("Analyze")
|
168 |
-
|
169 |
-
|
170 |
-
for col, (label, method) in zip([col1, col2], methods.items()):
|
171 |
with col:
|
172 |
-
st.subheader(
|
173 |
start = datetime.datetime.now()
|
174 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
175 |
duration = (datetime.datetime.now() - start).total_seconds()
|
176 |
st.markdown(response)
|
177 |
st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
|
178 |
if show_ids:
|
179 |
-
st.markdown("**
|
180 |
st.code("\n".join(lang_ids))
|
181 |
if show_ctx:
|
182 |
-
st.markdown("**
|
183 |
st.markdown("\n\n---\n\n".join(context))
|
184 |
if show_rdf:
|
185 |
-
st.markdown("**
|
186 |
st.code("\n".join(rdf_data))
|
187 |
|
188 |
if __name__ == "__main__":
|
189 |
main()
|
190 |
|
|
|
|
1 |
+
# rag_interface.py (tres métodos, descripciones técnicas)
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
|
|
20 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
EX = Namespace("http://example.org/lang/")
|
22 |
|
23 |
+
st.set_page_config(page_title="Vanishing Voices: Language Atlas", page_icon="🌍", layout="wide")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
@st.cache_resource(show_spinner="Loading models and indexes...")
|
26 |
def load_all_components():
|
27 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
28 |
methods = {}
|
29 |
for label, suffix, ttl, matrix_path in [
|
30 |
+
("Standard", "", "grafo_ttl_no_hibrido.ttl", "embed_matrix.npy"),
|
31 |
+
("Hybrid", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
|
32 |
+
("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
|
33 |
]:
|
34 |
with open(f"id_map{suffix}.pkl", "rb") as f:
|
35 |
id_map = pickle.load(f)
|
|
|
43 |
|
44 |
methods, embedder = load_all_components()
|
45 |
|
|
|
46 |
def get_top_k(matrix, id_map, query, k):
|
47 |
vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
|
48 |
vec = vec.cpu().numpy().astype("float32")
|
|
|
52 |
|
53 |
def get_context(G, lang_id):
    """Build a markdown context snippet for one language node of graph *G*.

    Always starts with the node label (falling back to *lang_id* when the
    node is missing or unlabeled), then appends whichever of the Wikipedia
    summary, Wikidata description and Wikidata countries are present.
    """
    node = G.nodes.get(lang_id, {})
    sections = [f"**Language:** {node.get('label', lang_id)}"]
    # Optional attributes, emitted in a fixed order when non-empty.
    for attr, heading in (
        ("wikipedia_summary", "Wikipedia"),
        ("wikidata_description", "Wikidata"),
        ("wikidata_countries", "Countries"),
    ):
        if node.get(attr):
            sections.append(f"**{heading}:** {node[attr]}")
    return "\n\n".join(sections)
|
63 |
|
64 |
def query_rdf(rdf, lang_id):
|
65 |
q = f"""
|
|
|
81 |
You are an expert in South American indigenous languages.
|
82 |
Use strictly and only the information below to answer the user question in **English**.
|
83 |
- Do not infer or assume facts that are not explicitly stated.
|
84 |
+
- If the answer is unknown or insufficient, say "I cannot answer with the available data."
|
85 |
- Limit your answer to 100 words.
|
86 |
|
|
|
87 |
### CONTEXT:
|
88 |
{chr(10).join(context)}
|
89 |
|
|
|
108 |
except Exception as e:
|
109 |
return str(e), ids, context, rdf_facts
|
110 |
|
|
|
111 |
def main():
    """Render the Streamlit page: header, sidebar controls, and one result
    column per retrieval method for the user's question."""
    # Page header and intro banner (HTML styled by the CSS injected at module load).
    st.markdown("""
    <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
    <div class='info-box'>
    <b>Linguistic Emergency:</b> Over 40% of South America's indigenous languages face extinction.
    This tool documents these cultural treasures before they disappear forever.
    </div>
    """, unsafe_allow_html=True)

    with st.sidebar:
        st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)

        st.markdown("### About This Tool")
        st.markdown("""
        - **Standard**: Semantic search based on text-only embeddings.
        - **Hybrid**: Uses node2vec to combine graph structure with descriptive features.
        - **GraphSAGE**: Employs deep graph learning (GraphSAGE) for relational patterns.
        """)

        # k = number of nearest-neighbour languages retrieved per method.
        k = st.slider("Languages to analyze per query", 1, 10, 3)
        # Toggles for the optional detail sections shown under each answer.
        show_ids = st.checkbox("Language IDs", value=True)
        show_ctx = st.checkbox("Contextual Info", value=True)
        show_rdf = st.checkbox("RDF Relations", value=True)

    query = st.text_input("Ask something about South American languages:", "Which Amazonian languages are most at risk?")

    if st.button("Analyze"):
        # One column per retrieval method (module-level `methods` mapping),
        # so the three approaches can be compared side by side.
        cols = st.columns(len(methods))
        for col, (label, method) in zip(cols, methods.items()):
            with col:
                st.subheader(label)
                start = datetime.datetime.now()
                # `method` is the (matrix, id_map, G, rdf) tuple expected by
                # generate_response; unpack it positionally.
                response, lang_ids, context, rdf_data = generate_response(*method, query, k)
                duration = (datetime.datetime.now() - start).total_seconds()
                st.markdown(response)
                st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
                if show_ids:
                    st.markdown("**IDs:**")
                    st.code("\n".join(lang_ids))
                if show_ctx:
                    st.markdown("**Context:**")
                    st.markdown("\n\n---\n\n".join(context))
                if show_rdf:
                    st.markdown("**RDF:**")
                    st.code("\n".join(rdf_data))
|
156 |
|
157 |
if __name__ == "__main__":
|
158 |
main()
|
159 |
|
160 |
+
|