Javier Vera committed on
Commit
d5a9a0d
·
verified ·
1 Parent(s): b276dbe

Update rag_hf.py

Browse files
Files changed (1) hide show
  1. rag_hf.py +30 -60
rag_hf.py CHANGED
@@ -1,4 +1,4 @@
1
- # rag_interface.py (Hybrid & GraphSAGE only, simplified explanations, renamed methods)
2
  import streamlit as st
3
  import pickle
4
  import numpy as np
@@ -20,44 +20,16 @@ EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
20
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
  EX = Namespace("http://example.org/lang/")
22
 
23
- st.set_page_config(
24
- page_title="Vanishing Voices: Language Atlas",
25
- page_icon="🌍",
26
- layout="wide",
27
- initial_sidebar_state="expanded"
28
- )
29
-
30
- # Custom CSS
31
- st.markdown("""
32
- <style>
33
- .header {
34
- color: #2c3e50;
35
- border-bottom: 2px solid #3498db;
36
- padding-bottom: 10px;
37
- margin-bottom: 1.5rem;
38
- }
39
- .info-box {
40
- background-color: #e8f4fc;
41
- border-radius: 8px;
42
- padding: 1rem;
43
- margin-bottom: 1.5rem;
44
- border-left: 4px solid #3498db;
45
- }
46
- .sidebar-title {
47
- font-size: 1.1rem;
48
- font-weight: 600;
49
- margin-top: 1rem;
50
- }
51
- </style>
52
- """, unsafe_allow_html=True)
53
 
54
  @st.cache_resource(show_spinner="Loading models and indexes...")
55
  def load_all_components():
56
  embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
57
  methods = {}
58
  for label, suffix, ttl, matrix_path in [
59
- ("InfoMatch", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
60
- ("LinkGraph", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
 
61
  ]:
62
  with open(f"id_map{suffix}.pkl", "rb") as f:
63
  id_map = pickle.load(f)
@@ -71,7 +43,6 @@ def load_all_components():
71
 
72
  methods, embedder = load_all_components()
73
 
74
- # === CORE FUNCTIONS ===
75
  def get_top_k(matrix, id_map, query, k):
76
  vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
77
  vec = vec.cpu().numpy().astype("float32")
@@ -81,14 +52,14 @@ def get_top_k(matrix, id_map, query, k):
81
 
82
  def get_context(G, lang_id):
83
  node = G.nodes.get(lang_id, {})
84
- lines = [f"**Language:** {node.get('label', lang_id)}"]
85
  if node.get("wikipedia_summary"):
86
- lines.append(f"**Wikipedia:** {node['wikipedia_summary']}")
87
  if node.get("wikidata_description"):
88
- lines.append(f"**Wikidata:** {node['wikidata_description']}")
89
  if node.get("wikidata_countries"):
90
- lines.append(f"**Countries:** {node['wikidata_countries']}")
91
- return "\n\n".join(lines)
92
 
93
  def query_rdf(rdf, lang_id):
94
  q = f"""
@@ -110,10 +81,9 @@ def generate_response(matrix, id_map, G, rdf, user_question, k=3):
110
  You are an expert in South American indigenous languages.
111
  Use strictly and only the information below to answer the user question in **English**.
112
  - Do not infer or assume facts that are not explicitly stated.
113
- - If the answer is unknown or insufficient, say \"I cannot answer with the available data.\"
114
  - Limit your answer to 100 words.
115
 
116
-
117
  ### CONTEXT:
118
  {chr(10).join(context)}
119
 
@@ -138,53 +108,53 @@ Answer:
138
  except Exception as e:
139
  return str(e), ids, context, rdf_facts
140
 
141
- # === MAIN FUNCTION ===
142
  def main():
143
  st.markdown("""
144
  <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
145
  <div class='info-box'>
146
- <b>Why this matters:</b> Many indigenous languages in South America are disappearing. This app helps understand and preserve them using artificial intelligence.
 
147
  </div>
148
  """, unsafe_allow_html=True)
149
 
150
  with st.sidebar:
151
  st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
152
 
153
- st.markdown("### What are the methods?")
154
  st.markdown("""
155
- - **Graph A**: Combines descriptions, country info, and speaker data using classic node2vec embeddings.
156
- - **Graph B**: Uses graph learning (GraphSAGE) to detect patterns in how languages relate to each other.
 
157
  """)
158
 
159
- st.markdown("### Options")
160
- k = st.slider("How many languages to analyze?", 1, 10, 3)
161
- show_ids = st.checkbox("Show IDs", value=True)
162
- show_ctx = st.checkbox("Show Text Info", value=True)
163
- show_rdf = st.checkbox("Show Extra Facts", value=True)
164
 
165
- query = st.text_input("Ask something about South American languages:", "What languages are spoken in Perú?")
166
 
167
- if st.button("Analyze") and query:
168
- col1, col2 = st.columns(2)
169
- results = {}
170
- for col, (label, method) in zip([col1, col2], methods.items()):
171
  with col:
172
- st.subheader(f"{label} Method")
173
  start = datetime.datetime.now()
174
  response, lang_ids, context, rdf_data = generate_response(*method, query, k)
175
  duration = (datetime.datetime.now() - start).total_seconds()
176
  st.markdown(response)
177
  st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
178
  if show_ids:
179
- st.markdown("**Language IDs:**")
180
  st.code("\n".join(lang_ids))
181
  if show_ctx:
182
- st.markdown("**Text Info:**")
183
  st.markdown("\n\n---\n\n".join(context))
184
  if show_rdf:
185
- st.markdown("**Extra Facts:**")
186
  st.code("\n".join(rdf_data))
187
 
188
  if __name__ == "__main__":
189
  main()
190
 
 
 
1
+ # rag_interface.py (tres métodos, descripciones técnicas)
2
  import streamlit as st
3
  import pickle
4
  import numpy as np
 
20
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
  EX = Namespace("http://example.org/lang/")
22
 
23
+ st.set_page_config(page_title="Vanishing Voices: Language Atlas", page_icon="🌍", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  @st.cache_resource(show_spinner="Loading models and indexes...")
26
  def load_all_components():
27
  embedder = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)
28
  methods = {}
29
  for label, suffix, ttl, matrix_path in [
30
+ ("Standard", "", "grafo_ttl_no_hibrido.ttl", "embed_matrix.npy"),
31
+ ("Hybrid", "_hybrid", "grafo_ttl_hibrido.ttl", "embed_matrix_hybrid.npy"),
32
+ ("GraphSAGE", "_hybrid_graphsage", "grafo_ttl_hibrido_graphsage.ttl", "embed_matrix_hybrid_graphsage.npy")
33
  ]:
34
  with open(f"id_map{suffix}.pkl", "rb") as f:
35
  id_map = pickle.load(f)
 
43
 
44
  methods, embedder = load_all_components()
45
 
 
46
  def get_top_k(matrix, id_map, query, k):
47
  vec = embedder.encode(f"query: {query}", convert_to_tensor=True, device=DEVICE)
48
  vec = vec.cpu().numpy().astype("float32")
 
52
 
53
  def get_context(G, lang_id):
54
  node = G.nodes.get(lang_id, {})
55
+ parts = [f"**Language:** {node.get('label', lang_id)}"]
56
  if node.get("wikipedia_summary"):
57
+ parts.append(f"**Wikipedia:** {node['wikipedia_summary']}")
58
  if node.get("wikidata_description"):
59
+ parts.append(f"**Wikidata:** {node['wikidata_description']}")
60
  if node.get("wikidata_countries"):
61
+ parts.append(f"**Countries:** {node['wikidata_countries']}")
62
+ return "\n\n".join(parts)
63
 
64
  def query_rdf(rdf, lang_id):
65
  q = f"""
 
81
  You are an expert in South American indigenous languages.
82
  Use strictly and only the information below to answer the user question in **English**.
83
  - Do not infer or assume facts that are not explicitly stated.
84
+ - If the answer is unknown or insufficient, say "I cannot answer with the available data."
85
  - Limit your answer to 100 words.
86
 
 
87
  ### CONTEXT:
88
  {chr(10).join(context)}
89
 
 
108
  except Exception as e:
109
  return str(e), ids, context, rdf_facts
110
 
 
111
  def main():
112
  st.markdown("""
113
  <h1 class='header'>Vanishing Voices: South America's Endangered Language Atlas</h1>
114
  <div class='info-box'>
115
+ <b>Linguistic Emergency:</b> Over 40% of South America's indigenous languages face extinction.
116
+ This tool documents these cultural treasures before they disappear forever.
117
  </div>
118
  """, unsafe_allow_html=True)
119
 
120
  with st.sidebar:
121
  st.image("https://glottolog.org/static/img/glottolog_lod.png", width=180)
122
 
123
+ st.markdown("### About This Tool")
124
  st.markdown("""
125
+ - **Standard**: Semantic search based on text-only embeddings.
126
+ - **Hybrid**: Uses node2vec to combine graph structure with descriptive features.
127
+ - **GraphSAGE**: Employs deep graph learning (GraphSAGE) for relational patterns.
128
  """)
129
 
130
+ k = st.slider("Languages to analyze per query", 1, 10, 3)
131
+ show_ids = st.checkbox("Language IDs", value=True)
132
+ show_ctx = st.checkbox("Contextual Info", value=True)
133
+ show_rdf = st.checkbox("RDF Relations", value=True)
 
134
 
135
+ query = st.text_input("Ask something about South American languages:", "Which Amazonian languages are most at risk?")
136
 
137
+ if st.button("Analyze"):
138
+ cols = st.columns(len(methods))
139
+ for col, (label, method) in zip(cols, methods.items()):
 
140
  with col:
141
+ st.subheader(label)
142
  start = datetime.datetime.now()
143
  response, lang_ids, context, rdf_data = generate_response(*method, query, k)
144
  duration = (datetime.datetime.now() - start).total_seconds()
145
  st.markdown(response)
146
  st.markdown(f"⏱️ {duration:.2f}s | 🌐 {len(lang_ids)} languages")
147
  if show_ids:
148
+ st.markdown("**IDs:**")
149
  st.code("\n".join(lang_ids))
150
  if show_ctx:
151
+ st.markdown("**Context:**")
152
  st.markdown("\n\n---\n\n".join(context))
153
  if show_rdf:
154
+ st.markdown("**RDF:**")
155
  st.code("\n".join(rdf_data))
156
 
157
  if __name__ == "__main__":
158
  main()
159
 
160
+