Javier Vera
committed on
Update rag_hf.py
Browse files
rag_hf.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# rag_interface.py (
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
@@ -14,7 +14,8 @@ from dotenv import load_dotenv
|
|
14 |
# === CONFIGURATION ===
|
15 |
load_dotenv()
|
16 |
|
17 |
-
|
|
|
18 |
EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
|
19 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
20 |
EX = Namespace("http://example.org/lang/")
|
@@ -42,28 +43,10 @@ st.markdown("""
|
|
42 |
margin-bottom: 1.5rem;
|
43 |
border-left: 4px solid #3498db;
|
44 |
}
|
45 |
-
.sidebar-section {
|
46 |
-
margin-bottom: 2rem;
|
47 |
-
}
|
48 |
.sidebar-title {
|
49 |
-
color: #2c3e50;
|
50 |
font-size: 1.1rem;
|
51 |
font-weight: 600;
|
52 |
-
margin-
|
53 |
-
border-bottom: 1px solid #eee;
|
54 |
-
padding-bottom: 0.5rem;
|
55 |
-
}
|
56 |
-
.method-card {
|
57 |
-
background-color: #f8f9fa;
|
58 |
-
border-radius: 8px;
|
59 |
-
padding: 0.8rem;
|
60 |
-
margin-bottom: 0.8rem;
|
61 |
-
border-left: 3px solid #3498db;
|
62 |
-
}
|
63 |
-
.method-title {
|
64 |
-
font-weight: 600;
|
65 |
-
color: #3498db;
|
66 |
-
margin-bottom: 0.3rem;
|
67 |
}
|
68 |
</style>
|
69 |
""", unsafe_allow_html=True)
|
@@ -73,9 +56,8 @@ def load_all_components():
|
|
73 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
74 |
methods = {}
|
75 |
for label, suffix, ttl, matrix_path in [
|
76 |
-
("
|
77 |
-
("
|
78 |
-
("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
|
79 |
]:
|
80 |
with open(f"id_map{suffix}.pkl", "rb") as f:
|
81 |
id_map = pickle.load(f)
|
@@ -114,10 +96,7 @@ def query_rdf(rdf, lang_id):
|
|
114 |
SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
|
115 |
"""
|
116 |
try:
|
117 |
-
return [
|
118 |
-
(str(row[0]).split("/")[-1], str(row[1]))
|
119 |
-
for row in rdf.query(q)
|
120 |
-
]
|
121 |
except Exception as e:
|
122 |
return [("error", str(e))]
|
123 |
|
@@ -131,7 +110,7 @@ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
|
|
131 |
You are an expert in South American indigenous languages.
|
132 |
Use strictly and only the information below to answer the user question in **English**.
|
133 |
- Do not infer or assume facts that are not explicitly stated.
|
134 |
-
- If the answer is unknown or insufficient, say "I cannot answer with the available data
|
135 |
- Limit your answer to 100 words.
|
136 |
|
137 |
|
@@ -148,9 +127,9 @@ Answer:
|
|
148 |
[/INST]"""
|
149 |
try:
|
150 |
res = requests.post(
|
151 |
-
|
152 |
-
headers={"Authorization": f"Bearer {
|
153 |
-
json={"inputs": prompt}, timeout=
|
154 |
)
|
155 |
out = res.json()
|
156 |
if isinstance(out, list) and "generated_text" in out[0]:
|
@@ -164,90 +143,47 @@ def main():
|
|
164 |
st.markdown("""
|
165 |
<h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
|
166 |
<div class='info-box'>
|
167 |
-
<b>
|
168 |
-
This tool documents these cultural treasures before they disappear forever.
|
169 |
</div>
|
170 |
""", unsafe_allow_html=True)
|
171 |
|
172 |
with st.sidebar:
|
173 |
st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
with st.container():
|
193 |
-
st.markdown('<div class="sidebar-title">Research Settings</div>', unsafe_allow_html=True)
|
194 |
-
k = st.slider("Languages to analyze per query", 1, 10, 3)
|
195 |
-
st.markdown("**Display Options:**")
|
196 |
-
show_ids = st.checkbox("Language IDs", value=True, key="show_ids")
|
197 |
-
show_ctx = st.checkbox("Cultural Context", value=True, key="show_ctx")
|
198 |
-
show_rdf = st.checkbox("RDF Relations", value=True, key="show_rdf")
|
199 |
-
|
200 |
-
with st.container():
|
201 |
-
st.markdown('<div class="sidebar-title">Data Sources</div>', unsafe_allow_html=True)
|
202 |
-
st.markdown("""
|
203 |
-
- Glottolog
|
204 |
-
- Wikidata
|
205 |
-
- Wikipedia
|
206 |
-
- Ethnologue
|
207 |
-
""")
|
208 |
-
|
209 |
-
query = st.text_input("Ask about indigenous languages:", "Which Amazonian languages are most at risk?")
|
210 |
-
|
211 |
-
if st.button("Analyze with All Methods") and query:
|
212 |
-
col1, col2, col3 = st.columns(3)
|
213 |
results = {}
|
214 |
-
for col, (label, method) in zip([col1, col2
|
215 |
with col:
|
216 |
-
st.subheader(f"{label}
|
217 |
start = datetime.datetime.now()
|
218 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
219 |
duration = (datetime.datetime.now() - start).total_seconds()
|
220 |
st.markdown(response)
|
221 |
st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
|
222 |
if show_ids:
|
223 |
-
st.markdown("**Language
|
224 |
st.code("\n".join(lang_ids))
|
225 |
if show_ctx:
|
226 |
-
st.markdown("**
|
227 |
st.markdown("\n\n---\n\n".join(context))
|
228 |
if show_rdf:
|
229 |
-
st.markdown("**
|
230 |
st.code("\n".join(rdf_data))
|
231 |
-
results[label] = response
|
232 |
-
|
233 |
-
log = f"""
|
234 |
-
[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]
|
235 |
-
QUERY: {query}
|
236 |
-
STANDARD:
|
237 |
-
{results.get('Standard', '')}
|
238 |
-
|
239 |
-
HYBRID:
|
240 |
-
{results.get('Hybrid', '')}
|
241 |
-
|
242 |
-
GRAPH-SAGE:
|
243 |
-
{results.get('GraphSAGE', '')}
|
244 |
-
{'='*60}
|
245 |
-
"""
|
246 |
-
try:
|
247 |
-
with open("language_analysis_logs.txt", "a", encoding="utf-8") as f:
|
248 |
-
f.write(log)
|
249 |
-
except Exception as e:
|
250 |
-
st.warning(f"Failed to log: {str(e)}")
|
251 |
|
252 |
if __name__ == "__main__":
|
253 |
main()
|
|
|
1 |
+
# rag_interface.py (Hybrid & GraphSAGE only, simplified explanations, renamed methods)
|
2 |
import streamlit as st
|
3 |
import pickle
|
4 |
import numpy as np
|
|
|
14 |
# === CONFIGURATION ===
|
15 |
load_dotenv()
|
16 |
|
17 |
+
ENDPOINT_URL = os.getenv("HF_ENDPOINT")
|
18 |
+
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
|
19 |
EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
|
20 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
21 |
EX = Namespace("http://example.org/lang/")
|
|
|
43 |
margin-bottom: 1.5rem;
|
44 |
border-left: 4px solid #3498db;
|
45 |
}
|
|
|
|
|
|
|
46 |
.sidebar-title {
|
|
|
47 |
font-size: 1.1rem;
|
48 |
font-weight: 600;
|
49 |
+
margin-top: 1rem;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
}
|
51 |
</style>
|
52 |
""", unsafe_allow_html=True)
|
|
|
56 |
embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
|
57 |
methods = {}
|
58 |
for label, suffix, ttl, matrix_path in [
|
59 |
+
("InfoMatch", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
|
60 |
+
("LinkGraph", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
|
|
|
61 |
]:
|
62 |
with open(f"id_map{suffix}.pkl", "rb") as f:
|
63 |
id_map = pickle.load(f)
|
|
|
96 |
SELECT ?property ?value WHERE {{ ex:{lang_id} ?property ?value }}
|
97 |
"""
|
98 |
try:
|
99 |
+
return [(str(row[0]).split("/")[-1], str(row[1])) for row in rdf.query(q)]
|
|
|
|
|
|
|
100 |
except Exception as e:
|
101 |
return [("error", str(e))]
|
102 |
|
|
|
110 |
You are an expert in South American indigenous languages.
|
111 |
Use strictly and only the information below to answer the user question in **English**.
|
112 |
- Do not infer or assume facts that are not explicitly stated.
|
113 |
+
- If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
|
114 |
- Limit your answer to 100 words.
|
115 |
|
116 |
|
|
|
127 |
[/INST]"""
|
128 |
try:
|
129 |
res = requests.post(
|
130 |
+
ENDPOINT_URL,
|
131 |
+
headers={"Authorization": f"Bearer {HF_API_TOKEN}", "Content-Type": "application/json"},
|
132 |
+
json={"inputs": prompt}, timeout=60
|
133 |
)
|
134 |
out = res.json()
|
135 |
if isinstance(out, list) and "generated_text" in out[0]:
|
|
|
143 |
st.markdown("""
|
144 |
<h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
|
145 |
<div class='info-box'>
|
146 |
+
<b>Why this matters:</b> Many indigenous languages in South America are disappearing. This app helps understand and preserve them using artificial intelligence.
|
|
|
147 |
</div>
|
148 |
""", unsafe_allow_html=True)
|
149 |
|
150 |
with st.sidebar:
|
151 |
st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
|
152 |
|
153 |
+
st.markdown("### What are the methods?")
|
154 |
+
st.markdown("""
|
155 |
+
- **Graph A**: Combines descriptions, country info, and speaker data using classic node2vec embeddings.
|
156 |
+
- **Graph B**: Uses graph learning (GraphSAGE) to detect patterns in how languages relate to each other.
|
157 |
+
""")
|
158 |
+
|
159 |
+
st.markdown("### Options")
|
160 |
+
k = st.slider("How many languages to analyze?", 1, 10, 3)
|
161 |
+
show_ids = st.checkbox("Show IDs", value=True)
|
162 |
+
show_ctx = st.checkbox("Show Text Info", value=True)
|
163 |
+
show_rdf = st.checkbox("Show Extra Facts", value=True)
|
164 |
+
|
165 |
+
query = st.text_input("Ask something about South American languages:", "What languages are spoken in Perú?")
|
166 |
+
|
167 |
+
if st.button("Analyze") and query:
|
168 |
+
col1, col2 = st.columns(2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
results = {}
|
170 |
+
for col, (label, method) in zip([col1, col2], methods.items()):
|
171 |
with col:
|
172 |
+
st.subheader(f"{label} Method")
|
173 |
start = datetime.datetime.now()
|
174 |
response, lang_ids, context, rdf_data = generate_response(*method, query, k)
|
175 |
duration = (datetime.datetime.now() - start).total_seconds()
|
176 |
st.markdown(response)
|
177 |
st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
|
178 |
if show_ids:
|
179 |
+
st.markdown("**Language IDs:**")
|
180 |
st.code("\n".join(lang_ids))
|
181 |
if show_ctx:
|
182 |
+
st.markdown("**Text Info:**")
|
183 |
st.markdown("\n\n---\n\n".join(context))
|
184 |
if show_rdf:
|
185 |
+
st.markdown("**Extra Facts:**")
|
186 |
st.code("\n".join(rdf_data))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
if __name__ == "__main__":
|
189 |
main()
|