Upload folder using huggingface_hub
- .gitattributes +1 -0
- 1_Pooling/config.json +10 -0
- README.md +414 -0
- config.json +37 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +20 -0
- sentence_bert_config.json +4 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +63 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}
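
This pooling configuration selects mean pooling: each sentence embedding is the average of its token embeddings, with padding positions masked out. A minimal sketch of that computation in PyTorch (tensor names and the toy shapes are illustrative, not taken from this repository):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, 1024); attention_mask: (batch, seq_len) of 0/1
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)    # sum over real (non-padding) tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)         # number of real tokens per sentence
    return summed / counts                           # (batch, 1024) mean-pooled embeddings

# toy check: two sentences padded to length 4
emb = torch.randn(2, 4, 1024)
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(mean_pool(emb, mask).shape)  # torch.Size([2, 1024])
```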
README.md
ADDED
@@ -0,0 +1,414 @@
+---
+tags:
+- sentence-transformers
+- sentence-similarity
+- feature-extraction
+- generated_from_trainer
+- dataset_size:21769
+- loss:MultipleNegativesRankingLoss
+base_model: Lajavaness/bilingual-embedding-large
+widget:
+- source_sentence: 'Go bobO..... Take this one! Slap in the face! MAP Leda Beck 5h-
+    A Spanish biologist, researcher: - You pay 1 million euros a month to a football
+    player and 1,800 euros for a Biology researcher. Now you want one treatment. Will
+    you ask Cristiano Ronaldo or to Messi and they will find a cure for you.'
+  sentences:
+  - Treat stroke using a needle Doctors warn against ‘dangerously misleading’ posts
+    claiming you can treat a stroke with a needle
+  - This Spanish biologist said that Cristiano Ronaldo and Messi must find a cure
+    for the new coronavirus as they earn much more than scientists The woman in these
+    posts is a Spanish politician who has made no statements about Messi, CR7, or
+    the cure for COVID-19.
+  - The Simpsons predicted 2022 Canada trucker protests Footage from The Simpsons
+    was edited to look like the show predicted Canada’s Covid truckers protest
+- source_sentence: 'This is what the rivers of Colombia are becoming... Disappeared
+    people floating in the rivers, killed by Duque''s assassins This is what the rivers
+    of Cali are becoming In transportation of our dead at hands of duqueh''s henchmen: 242'
+  sentences:
+  - A photograph shows bodies floated in a river in Cali The photo of black bags in
+    a river is a tribute to those killed in the protests in Colombia
+  - Sri Lankan doctor created COVID-19 rapid test kits The doctor interviewed in this
+    report did not say he was involved in the development of COVID-19 test kits
+  - Masks are meant to protect the vaccinated Face mask requirements aim to protect
+    unvaccinated people
+- source_sentence: 'How can you say it proudly that you are leaders of SA... When
+    it looks like a dumping site nd you living high lavishly life in your porsh houses
+    built out of hard earned Tax payers money? CRY SA OUR BELOVED COUNTRY '
+  sentences:
+  - Donald Trump next to a stack of declassified files Trump did not pose in this
+    photo with declassified files, but with federal regulations
+  - Images show trash-strewn streets in South Africa These photos of messy Johannesburg
+    streets are old and taken out of context
+  - SBT will again air the program "A Semana do Presidente" with Bolsonaro There is
+    no forecast for the return of "A Semana do Presidente" on SBT, despite a project
+    in 2020
+- source_sentence: 'First photos of Earth sent by India''s Chadrayan2 space mission.
+    Breathtaking. '
+  sentences:
+  - This nest of bats is the source of the coronavirus in Wuhan China The video of
+    a roof infested with bats was recorded in 2011 in the United States
+  - Australia recalled 50 million doses of Covid vaccine No, Australia has not recalled
+    50 million doses of a Covid vaccine
+  - First photos of Earth sent by India's Chadrayan2 space mission These alleged photos
+    of Earth have no connection to the Chandrayaan-2 lunar mission.
+- source_sentence: Even if you remove JIMENEZ ....... IF THERE IS STILL A SMARTMATIC
+    THAT IS GOOD AT MAGIC THERE IS ALSO NO ..... SMARTMATIC SHOULD BE REMOVED FROM
+    THE COMELEC CONTRACT ..... BECAUSE THE DEMON COMELEC HAS LONG HONORED THE VOTE
+    OF MANY PEOPLE ..... AS LONG AS THERE ARE COMMISSIONERS IN THE COMELEC WHO LOOK
+    LIKE MONEY, WE WILL NOT HAVE A CLEAN ELECTION ....... JUST IMAGINE HOW LONG THE
+    ISSUE SPREADS THAT IF A CANDIDATE WANTS TO WIN, IT WILL PAY THE COMELEC 25 MILLION
+    ???????????????????????????? ? SO ARE THE ELECTION RESULTS HOKOS POKOS ??????????????????????
+    DEMONS ...... SO ALL THE PUNISHMENT OF HEAVEN HAS BEEN GIVEN IN THE PHILIPPINES
+    BECAUSE TANING LIVES WITH US ...... THE THOUGHT IS PURE MONEY ..... SO EVEN ELECTIONS
+    ARE MONEY ..... ..... 7:08 AM 4G 51% FINALLY, COMELEC OFFICIAL JIMENEZ, REMOVED
+    IN PLACE. BY PRRD AND OTHERS AGAIN THIS. FOR CLEAN NOW ELECTION TO COMING 2022
+    ELECTION
+  sentences:
+  - The WHO declared covid-19 an endemic disease Although it considers it probable,
+    the WHO has not yet declared covid-19 an endemic disease
+  - Israel, the only country with four vaccines, broke the record for covid-19 cases
+    Israel has not immunized its entire population with 4 doses in January 2022 and
+    the vaccines are effective
+  - Philippine President Rodrigo Duterte fired Comelec spokesman James Jimenez in
+    May 2021 Posts misleadingly claim Philippine president fired poll body spokesman
+pipeline_tag: sentence-similarity
+library_name: sentence-transformers
+---
+
+# SentenceTransformer based on Lajavaness/bilingual-embedding-large
+
+This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [Lajavaness/bilingual-embedding-large](https://huggingface.co/Lajavaness/bilingual-embedding-large). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+
+## Model Details
+
+### Model Description
+- **Model Type:** Sentence Transformer
+- **Base model:** [Lajavaness/bilingual-embedding-large](https://huggingface.co/Lajavaness/bilingual-embedding-large) <!-- at revision e83179d7a66e8aed1b3015e98bb5ae234ed89598 -->
+- **Maximum Sequence Length:** 512 tokens
+- **Output Dimensionality:** 1024 dimensions
+- **Similarity Function:** Cosine Similarity
+<!-- - **Training Dataset:** Unknown -->
+<!-- - **Language:** Unknown -->
+<!-- - **License:** Unknown -->
+
+### Model Sources
+
+- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+
+### Full Model Architecture
+
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BilingualModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
+)
+```
+
+## Usage
+
+### Direct Usage (Sentence Transformers)
+
+First install the Sentence Transformers library:
+
+```bash
+pip install -U sentence-transformers
+```
+
+Then you can load this model and run inference.
+```python
+from sentence_transformers import SentenceTransformer
+
+# Download from the 🤗 Hub
+model = SentenceTransformer("sentence_transformers_model_id")
+# Run inference
+sentences = [
+    'Even if you remove JIMENEZ ....... IF THERE IS STILL A SMARTMATIC THAT IS GOOD AT MAGIC THERE IS ALSO NO ..... SMARTMATIC SHOULD BE REMOVED FROM THE COMELEC CONTRACT ..... BECAUSE THE DEMON COMELEC HAS LONG HONORED THE VOTE OF MANY PEOPLE ..... AS LONG AS THERE ARE COMMISSIONERS IN THE COMELEC WHO LOOK LIKE MONEY, WE WILL NOT HAVE A CLEAN ELECTION ....... JUST IMAGINE HOW LONG THE ISSUE SPREADS THAT IF A CANDIDATE WANTS TO WIN, IT WILL PAY THE COMELEC 25 MILLION ???????????????????????????? ? SO ARE THE ELECTION RESULTS HOKOS POKOS ?????????????????????? DEMONS ...... SO ALL THE PUNISHMENT OF HEAVEN HAS BEEN GIVEN IN THE PHILIPPINES BECAUSE TANING LIVES WITH US ...... THE THOUGHT IS PURE MONEY ..... SO EVEN ELECTIONS ARE MONEY ..... ..... 7:08 AM 4G 51% FINALLY, COMELEC OFFICIAL JIMENEZ, REMOVED IN PLACE. BY PRRD AND OTHERS AGAIN THIS. FOR CLEAN NOW ELECTION TO COMING 2022 ELECTION',
+    'Philippine President Rodrigo Duterte fired Comelec spokesman James Jimenez in May 2021 Posts misleadingly claim Philippine president fired poll body spokesman',
+    'The WHO declared covid-19 an endemic disease Although it considers it probable, the WHO has not yet declared covid-19 an endemic disease',
+]
+embeddings = model.encode(sentences)
+print(embeddings.shape)
+# [3, 1024]
+
+# Get the similarity scores for the embeddings
+similarities = model.similarity(embeddings, embeddings)
+print(similarities.shape)
+# [3, 3]
+```
+
+<!--
+### Direct Usage (Transformers)
+
+<details><summary>Click to see the direct usage in Transformers</summary>
+
+</details>
+-->
+
+<!--
+### Downstream Usage (Sentence Transformers)
+
+You can finetune this model on your own dataset.
+
+<details><summary>Click to expand</summary>
+
+</details>
+-->
+
+<!--
+### Out-of-Scope Use
+
+*List how the model may foreseeably be misused and address what users ought not to do with the model.*
+-->
+
+<!--
+## Bias, Risks and Limitations
+
+*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+-->
+
+<!--
+### Recommendations
+
+*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+-->
+
+## Training Details
+
+### Training Dataset
+
+#### Unnamed Dataset
+
+* Size: 21,769 training samples
+* Columns: <code>sentence_0</code> and <code>sentence_1</code>
+* Approximate statistics based on the first 1000 samples:
+  |         | sentence_0                                                                           | sentence_1                                                                          |
+  |:--------|:-------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|
+  | type    | string                                                                               | string                                                                              |
+  | details | <ul><li>min: 6 tokens</li><li>mean: 120.55 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 14 tokens</li><li>mean: 38.75 tokens</li><li>max: 148 tokens</li></ul> |
+* Samples:
+  | sentence_0 | sentence_1 |
+  |:-----------|:-----------|
+  | <code>"On January 1, 1979 New York billionaire Brandon Torrent allowed himself to be photographed while urinating on a homeless man sleeping on the street. This image explains, better than many words, the division of the world into social classes that we must eliminate . Meanwhile, in 21st century Brazil, many 'good citizens', just above the homeless condition, applaud politicians and politicians* who support the predatory elite represented by this abject and unworthy human being, who urinates on people who, in the final analysis, are the builders of the fortune he enjoys. Until we realize which side of this stream of urine we are on, we will not be able to build a truly just society. Class consciousness is the true and most urgent education." </code> | <code>This photo shows a billionaire named Brandon Torrent urinating on a homeless man The real story behind the image of a man who appears to urinate on a homeless person</code> |
+  | <code>French secret service officer jean claude returns from his mission as imam with deash (isis) like others from several countries in Syria.. there are questions </code> | <code>This man is a French intelligence officer No, this man is not a French intelligence officer</code> |
+  | <code>Oh yes! Rohit Sharma Mumbai Indians Burj Khalifa DIEL 82 SAMSUNG MUMBAI INDIANS</code> | <code>Dubai’s Burj Khalifa skyscraper displays photo of Indian cricketer Rohit Sharma This image of the Burj Khalifa has been doctored – the original does not show a projection of Indian cricketer Rohit Sharma</code> |
+* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
+  ```json
+  {
+      "scale": 20.0,
+      "similarity_fct": "cos_sim"
+  }
+  ```
+
+### Training Hyperparameters
+#### Non-Default Hyperparameters
+
+- `per_device_train_batch_size`: 2
+- `per_device_eval_batch_size`: 2
+- `num_train_epochs`: 1
+- `multi_dataset_batch_sampler`: round_robin
+
+#### All Hyperparameters
+<details><summary>Click to expand</summary>
+
+- `overwrite_output_dir`: False
+- `do_predict`: False
+- `eval_strategy`: no
+- `prediction_loss_only`: True
+- `per_device_train_batch_size`: 2
+- `per_device_eval_batch_size`: 2
+- `per_gpu_train_batch_size`: None
+- `per_gpu_eval_batch_size`: None
+- `gradient_accumulation_steps`: 1
+- `eval_accumulation_steps`: None
+- `torch_empty_cache_steps`: None
+- `learning_rate`: 5e-05
+- `weight_decay`: 0.0
+- `adam_beta1`: 0.9
+- `adam_beta2`: 0.999
+- `adam_epsilon`: 1e-08
+- `max_grad_norm`: 1
+- `num_train_epochs`: 1
+- `max_steps`: -1
+- `lr_scheduler_type`: linear
+- `lr_scheduler_kwargs`: {}
+- `warmup_ratio`: 0.0
+- `warmup_steps`: 0
+- `log_level`: passive
+- `log_level_replica`: warning
+- `log_on_each_node`: True
+- `logging_nan_inf_filter`: True
+- `save_safetensors`: True
+- `save_on_each_node`: False
+- `save_only_model`: False
+- `restore_callback_states_from_checkpoint`: False
+- `no_cuda`: False
+- `use_cpu`: False
+- `use_mps_device`: False
+- `seed`: 42
+- `data_seed`: None
+- `jit_mode_eval`: False
+- `use_ipex`: False
+- `bf16`: False
+- `fp16`: False
+- `fp16_opt_level`: O1
+- `half_precision_backend`: auto
+- `bf16_full_eval`: False
+- `fp16_full_eval`: False
+- `tf32`: None
+- `local_rank`: 0
+- `ddp_backend`: None
+- `tpu_num_cores`: None
+- `tpu_metrics_debug`: False
+- `debug`: []
+- `dataloader_drop_last`: False
+- `dataloader_num_workers`: 0
+- `dataloader_prefetch_factor`: None
+- `past_index`: -1
+- `disable_tqdm`: False
+- `remove_unused_columns`: True
+- `label_names`: None
+- `load_best_model_at_end`: False
+- `ignore_data_skip`: False
+- `fsdp`: []
+- `fsdp_min_num_params`: 0
+- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
+- `fsdp_transformer_layer_cls_to_wrap`: None
+- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
+- `deepspeed`: None
+- `label_smoothing_factor`: 0.0
+- `optim`: adamw_torch
+- `optim_args`: None
+- `adafactor`: False
+- `group_by_length`: False
+- `length_column_name`: length
+- `ddp_find_unused_parameters`: None
+- `ddp_bucket_cap_mb`: None
+- `ddp_broadcast_buffers`: False
+- `dataloader_pin_memory`: True
+- `dataloader_persistent_workers`: False
+- `skip_memory_metrics`: True
+- `use_legacy_prediction_loop`: False
+- `push_to_hub`: False
+- `resume_from_checkpoint`: None
+- `hub_model_id`: None
+- `hub_strategy`: every_save
+- `hub_private_repo`: None
+- `hub_always_push`: False
+- `gradient_checkpointing`: False
+- `gradient_checkpointing_kwargs`: None
+- `include_inputs_for_metrics`: False
+- `include_for_metrics`: []
+- `eval_do_concat_batches`: True
+- `fp16_backend`: auto
+- `push_to_hub_model_id`: None
+- `push_to_hub_organization`: None
+- `mp_parameters`: 
+- `auto_find_batch_size`: False
+- `full_determinism`: False
+- `torchdynamo`: None
+- `ray_scope`: last
+- `ddp_timeout`: 1800
+- `torch_compile`: False
+- `torch_compile_backend`: None
+- `torch_compile_mode`: None
+- `dispatch_batches`: None
+- `split_batches`: None
+- `include_tokens_per_second`: False
+- `include_num_input_tokens_seen`: False
+- `neftune_noise_alpha`: None
+- `optim_target_modules`: None
+- `batch_eval_metrics`: False
+- `eval_on_start`: False
+- `use_liger_kernel`: False
+- `eval_use_gather_object`: False
+- `average_tokens_across_devices`: False
+- `prompts`: None
+- `batch_sampler`: batch_sampler
+- `multi_dataset_batch_sampler`: round_robin
+
+</details>
+
+### Training Logs
+| Epoch  | Step  | Training Loss |
+|:------:|:-----:|:-------------:|
+| 0.0459 | 500   | 0.0329        |
+| 0.0919 | 1000  | 0.0296        |
+| 0.1378 | 1500  | 0.0314        |
+| 0.1837 | 2000  | 0.0199        |
+| 0.2297 | 2500  | 0.0435        |
+| 0.2756 | 3000  | 0.0213        |
+| 0.3215 | 3500  | 0.0293        |
+| 0.3675 | 4000  | 0.0387        |
+| 0.4134 | 4500  | 0.0064        |
+| 0.4593 | 5000  | 0.0338        |
+| 0.5053 | 5500  | 0.0317        |
+| 0.5512 | 6000  | 0.0395        |
+| 0.5972 | 6500  | 0.0129        |
+| 0.6431 | 7000  | 0.036         |
+| 0.6890 | 7500  | 0.0292        |
+| 0.7350 | 8000  | 0.0215        |
+| 0.7809 | 8500  | 0.02          |
+| 0.8268 | 9000  | 0.0215        |
+| 0.8728 | 9500  | 0.0139        |
+| 0.9187 | 10000 | 0.0273        |
+| 0.9646 | 10500 | 0.0138        |
+
+
+### Framework Versions
+- Python: 3.11.11
+- Sentence Transformers: 3.4.1
+- Transformers: 4.48.3
+- PyTorch: 2.5.1+cu124
+- Accelerate: 1.3.0
+- Datasets: 3.3.2
+- Tokenizers: 0.21.0
+
+## Citation
+
+### BibTeX
+
+#### Sentence Transformers
+```bibtex
+@inproceedings{reimers-2019-sentence-bert,
+    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+    author = "Reimers, Nils and Gurevych, Iryna",
+    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+    month = "11",
+    year = "2019",
+    publisher = "Association for Computational Linguistics",
+    url = "https://arxiv.org/abs/1908.10084",
+}
+```
+
+#### MultipleNegativesRankingLoss
+```bibtex
+@misc{henderson2017efficient,
+    title={Efficient Natural Language Response Suggestion for Smart Reply},
+    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
+    year={2017},
+    eprint={1705.00652},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+<!--
+## Glossary
+
+*Clearly define terms in order to be accessible across audiences.*
+-->
+
+<!--
+## Model Card Authors
+
+*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+-->
+
+<!--
+## Model Card Contact
+
+*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+-->
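
The loss and hyperparameters in the README correspond to in-batch-negatives training: each sentence_0/sentence_1 pair is a positive, and the other pairs in the same batch act as negatives. A minimal, hypothetical reproduction sketch with the Sentence Transformers trainer API (the toy dataset and output path are placeholders, not the card's actual data):

```python
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss

# base model from the card; its custom code must be trusted (see config.json below)
model = SentenceTransformer("Lajavaness/bilingual-embedding-large", trust_remote_code=True)

# illustrative two-column dataset matching the card's sentence_0 / sentence_1 layout
train_dataset = Dataset.from_dict({
    "sentence_0": ["claim text ..."],
    "sentence_1": ["matched fact-check title ..."],
})

args = SentenceTransformerTrainingArguments(
    output_dir="out",                  # placeholder
    num_train_epochs=1,                # as in the card
    per_device_train_batch_size=2,     # as in the card; MNRL usually benefits from larger batches
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=MultipleNegativesRankingLoss(model, scale=20.0),  # cos_sim is the default similarity_fct
)
trainer.train()
```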
config.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "_name_or_path": "Lajavaness/bilingual-embedding-large",
+  "architectures": [
+    "BilingualModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "dangvantuan/bilingual_impl--config.BilingualConfig",
+    "AutoModel": "dangvantuan/bilingual_impl--modeling.BilingualModel",
+    "AutoModelForMaskedLM": "dangvantuan/bilingual_impl--modeling.BilingualForMaskedLM",
+    "AutoModelForMultipleChoice": "dangvantuan/bilingual_impl--modeling.BilingualForMultipleChoice",
+    "AutoModelForQuestionAnswering": "dangvantuan/bilingual_impl--modeling.BilingualForQuestionAnswering",
+    "AutoModelForSequenceClassification": "dangvantuan/bilingual_impl--modeling.BilingualForSequenceClassification",
+    "AutoModelForTokenClassification": "dangvantuan/bilingual_impl--modeling.BilingualForTokenClassification"
+  },
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "bilingual",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
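
Because auto_map routes every Auto class to remote code in dangvantuan/bilingual_impl, loading the backbone with plain transformers requires trust_remote_code. A short sketch, assuming the uploaded files sit in a local folder ./model (the path is illustrative):

```python
from transformers import AutoConfig, AutoModel

# auto_map points at dangvantuan/bilingual_impl, so remote code must be trusted
config = AutoConfig.from_pretrained("./model", trust_remote_code=True)
model = AutoModel.from_pretrained("./model", trust_remote_code=True)
print(config.model_type, config.hidden_size)  # bilingual 1024
```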
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "__version__": {
+    "sentence_transformers": "3.4.1",
+    "transformers": "4.48.3",
+    "pytorch": "2.5.1+cu124"
+  },
+  "prompts": {},
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2de89203028f3eb5f449e4e4cb1dc4ba721753507ad0904d83fda7104b47d650
+size 2239607176
modules.json
ADDED
@@ -0,0 +1,20 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]
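
modules.json chains three stages: the transformer backbone, mean pooling, and L2 normalization (which makes dot product coincide with cosine similarity). The same pipeline can be assembled by hand; a sketch built from the base model id standing in for this repository, with the same trust_remote_code assumption as above:

```python
from sentence_transformers import SentenceTransformer, models

# stage 0: transformer backbone (512-token window, per sentence_bert_config.json)
word = models.Transformer(
    "Lajavaness/bilingual-embedding-large",
    max_seq_length=512,
    model_args={"trust_remote_code": True},
    config_args={"trust_remote_code": True},
)
# stage 1: mean pooling over the 1024-dim token embeddings (1_Pooling/config.json)
pooling = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="mean")
# stage 2: unit-normalize the sentence embedding
normalize = models.Normalize()

model = SentenceTransformer(modules=[word, pooling, normalize])
print(model.encode(["hello world"]).shape)  # (1, 1024)
```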
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 512,
+  "do_lower_case": false
+}
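
max_seq_length caps inputs at 512 tokens; longer texts are truncated rather than rejected. The limit is exposed at runtime (the model id below is the placeholder from the README's usage example):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence_transformers_model_id")
print(model.max_seq_length)  # 512
model.max_seq_length = 256   # can be lowered, trading recall on long texts for speed
```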
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
+size 17082987
tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "max_length": 512,
+  "model_max_length": 512,
+  "pad_to_multiple_of": null,
+  "pad_token": "<pad>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "</s>",
+  "stride": 0,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<unk>"
+}
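
tokenizer_config.json declares an XLMRobertaTokenizer over the SentencePiece model above, with <s> and </s> wrapping every encoded input and <mask> at id 250001. A quick sketch of loading and encoding (./model is an illustrative local path, and the token split shown is approximate):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./model")  # resolves to XLMRobertaTokenizer
enc = tok("First photos of Earth", truncation=True, max_length=512)
print(tok.convert_ids_to_tokens(enc["input_ids"]))
# roughly: ['<s>', '▁First', '▁photos', '▁of', '▁Earth', '</s>']
```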