Spaces:
Running
Running
Commit
·
767e3da
1
Parent(s):
62e40e6
Upload folder using huggingface_hub
Browse files- .gitattributes +15 -0
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json +38 -0
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl +3 -0
- data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json +34 -0
- data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json +39 -0
- data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json +48 -0
- data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json +62 -0
- data/datasets/lilac/mmlu_professional_law/config.yml +35 -0
- data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/manifest.json +26 -0
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json +35 -0
- data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl +3 -0
- data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json +31 -0
- data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json +36 -0
- data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json +45 -0
- data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json +59 -0
.gitattributes
CHANGED
@@ -33,3 +33,18 @@ data/datasets/lilac/enron-emails/text/gte-small/spans.pkl filter=lfs diff=lfs me
|
|
33 |
data/datasets/lilac/enron-emails/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
34 |
data/datasets/lilac/enron-emails/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
35 |
data/datasets/lilac/enron-emails/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
data/datasets/lilac/enron-emails/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
34 |
data/datasets/lilac/enron-emails/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
35 |
data/datasets/lilac/enron-emails/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
39 |
+
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
40 |
+
data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
41 |
+
data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
42 |
+
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
43 |
+
data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
44 |
+
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
45 |
+
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
46 |
+
data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
47 |
+
data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
48 |
+
data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
49 |
+
data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
50 |
+
data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df9d6e2f5df4b8693544f31ca78a9d1936a4caf47acc2babeb1cb766131b7636
|
3 |
+
size 684360968
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2081ce5d760026fe341e0553cd9e40747ca902e4e7edb851cb747f350f19bb0d
|
3 |
+
size 11174465
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(choices)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"choices": {
|
10 |
+
"repeated_field": {
|
11 |
+
"fields": {
|
12 |
+
"gte-small": {
|
13 |
+
"repeated_field": {
|
14 |
+
"fields": {
|
15 |
+
"embedding": {
|
16 |
+
"dtype": "embedding"
|
17 |
+
}
|
18 |
+
},
|
19 |
+
"dtype": "string_span"
|
20 |
+
},
|
21 |
+
"signal": {
|
22 |
+
"signal_name": "gte-small"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"signal": {
|
31 |
+
"signal_name": "gte-small"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"choices",
|
35 |
+
"*"
|
36 |
+
],
|
37 |
+
"vector_store": "hnsw"
|
38 |
+
}
|
data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02fb1662da21f33ea1429a0f9adf1301185da46f642a722717fe7c523314fa57
|
3 |
+
size 11173475
|
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:968d4f87c7b51b995d9e3a96423a06b91984e5ee4a47062cd53fe87cca5cafbe
|
3 |
+
size 3469413
|
data/datasets/lilac/mmlu_professional_law/choices/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(choices)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"choices": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"lang_detection": {
|
15 |
+
"dtype": "string",
|
16 |
+
"signal": {
|
17 |
+
"split_by_paragraph": false,
|
18 |
+
"signal_name": "lang_detection"
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"signal": {
|
27 |
+
"split_by_paragraph": false,
|
28 |
+
"signal_name": "lang_detection"
|
29 |
+
},
|
30 |
+
"enriched_path": [
|
31 |
+
"choices",
|
32 |
+
"*"
|
33 |
+
]
|
34 |
+
}
|
data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:316f3be499fcbb960bc1e83a201838ca0b3047a71d8e1c302fe4e0d833a3bf90
|
3 |
+
size 5544176
|
data/datasets/lilac/mmlu_professional_law/choices/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(choices)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"choices": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"near_dup": {
|
15 |
+
"fields": {
|
16 |
+
"cluster_id": {
|
17 |
+
"dtype": "uint32",
|
18 |
+
"categorical": true
|
19 |
+
}
|
20 |
+
},
|
21 |
+
"signal": {
|
22 |
+
"threshold": 0.85,
|
23 |
+
"signal_name": "near_dup"
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"threshold": 0.85,
|
33 |
+
"signal_name": "near_dup"
|
34 |
+
},
|
35 |
+
"enriched_path": [
|
36 |
+
"choices",
|
37 |
+
"*"
|
38 |
+
]
|
39 |
+
}
|
data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7cb41d4e9d0d82bd824abfa733d5be3a599e011098c5d41ebadeb1166a15f722
|
3 |
+
size 3393096
|
data/datasets/lilac/mmlu_professional_law/choices/pii/signal_manifest.json
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(choices)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"choices": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"pii": {
|
15 |
+
"fields": {
|
16 |
+
"emails": {
|
17 |
+
"repeated_field": {
|
18 |
+
"dtype": "string_span"
|
19 |
+
}
|
20 |
+
},
|
21 |
+
"ip_addresses": {
|
22 |
+
"repeated_field": {
|
23 |
+
"dtype": "string_span"
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"secrets": {
|
27 |
+
"repeated_field": {
|
28 |
+
"dtype": "string_span"
|
29 |
+
}
|
30 |
+
}
|
31 |
+
},
|
32 |
+
"signal": {
|
33 |
+
"signal_name": "pii"
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
}
|
39 |
+
}
|
40 |
+
},
|
41 |
+
"signal": {
|
42 |
+
"signal_name": "pii"
|
43 |
+
},
|
44 |
+
"enriched_path": [
|
45 |
+
"choices",
|
46 |
+
"*"
|
47 |
+
]
|
48 |
+
}
|
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc00a68e0f835e25b214d90e7e48251b39d748f1e836af713440cd0ea2517ead
|
3 |
+
size 4634821
|
data/datasets/lilac/mmlu_professional_law/choices/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(choices)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"choices": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"text_statistics": {
|
15 |
+
"fields": {
|
16 |
+
"num_characters": {
|
17 |
+
"dtype": "int32"
|
18 |
+
},
|
19 |
+
"readability": {
|
20 |
+
"dtype": "float32"
|
21 |
+
},
|
22 |
+
"log(type_token_ratio)": {
|
23 |
+
"dtype": "float32"
|
24 |
+
},
|
25 |
+
"frac_non_ascii": {
|
26 |
+
"dtype": "float32",
|
27 |
+
"bins": [
|
28 |
+
[
|
29 |
+
"Low",
|
30 |
+
null,
|
31 |
+
0.15
|
32 |
+
],
|
33 |
+
[
|
34 |
+
"Medium",
|
35 |
+
0.15,
|
36 |
+
0.3
|
37 |
+
],
|
38 |
+
[
|
39 |
+
"High",
|
40 |
+
0.3,
|
41 |
+
null
|
42 |
+
]
|
43 |
+
]
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"signal": {
|
47 |
+
"signal_name": "text_statistics"
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
}
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"signal": {
|
56 |
+
"signal_name": "text_statistics"
|
57 |
+
},
|
58 |
+
"enriched_path": [
|
59 |
+
"choices",
|
60 |
+
"*"
|
61 |
+
]
|
62 |
+
}
|
data/datasets/lilac/mmlu_professional_law/config.yml
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- embedding: gte-small
|
3 |
+
path: [choices, '*']
|
4 |
+
- {embedding: gte-small, path: question}
|
5 |
+
name: mmlu_professional_law
|
6 |
+
namespace: local
|
7 |
+
settings:
|
8 |
+
preferred_embedding: gte-small
|
9 |
+
ui:
|
10 |
+
media_paths:
|
11 |
+
- question
|
12 |
+
- [choices, '*']
|
13 |
+
signals:
|
14 |
+
- path: question
|
15 |
+
signal: {signal_name: text_statistics}
|
16 |
+
- path: question
|
17 |
+
signal: {signal_name: pii}
|
18 |
+
- path: question
|
19 |
+
signal: {signal_name: near_dup}
|
20 |
+
- path: [choices, '*']
|
21 |
+
signal: {signal_name: text_statistics}
|
22 |
+
- path: [choices, '*']
|
23 |
+
signal: {signal_name: spacy_ner}
|
24 |
+
- path: question
|
25 |
+
signal: {signal_name: lang_detection}
|
26 |
+
- path: [choices, '*']
|
27 |
+
signal: {signal_name: near_dup}
|
28 |
+
- path: [choices, '*']
|
29 |
+
signal: {signal_name: pii}
|
30 |
+
- path: [choices, '*']
|
31 |
+
signal: {signal_name: lang_detection}
|
32 |
+
- path: question
|
33 |
+
signal: {signal_name: spacy_ner}
|
34 |
+
source: {config_name: professional_law, dataset_name: cais/mmlu, source_name: huggingface}
|
35 |
+
tags: [legal]
|
data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65cd2771cf0bb88dbed9ad66ceaff472115f07c9dfea866c7e3f65b68392e745
|
3 |
+
size 50699938
|
data/datasets/lilac/mmlu_professional_law/manifest.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"question": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"choices": {
|
11 |
+
"repeated_field": {
|
12 |
+
"dtype": "string"
|
13 |
+
}
|
14 |
+
},
|
15 |
+
"answer": {
|
16 |
+
"dtype": "string"
|
17 |
+
},
|
18 |
+
"__hfsplit__": {
|
19 |
+
"dtype": "string"
|
20 |
+
},
|
21 |
+
"__rowid__": {
|
22 |
+
"dtype": "string"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b02300405fccc3011294e15ee869933dd81578173435defbcb19e3b40a65e93
|
3 |
+
size 771802212
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f72169740d80ee2b2ea66589d7ebcc58c83381978a4640a27510c416a02bf6c7
|
3 |
+
size 11296648
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(question)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"question": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "gte-small"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"question"
|
33 |
+
],
|
34 |
+
"vector_store": "hnsw"
|
35 |
+
}
|
data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b51cad455e94b167bc9cf130c262ed1b143a8f386c7074a61983e01cd93d277
|
3 |
+
size 7911602
|
data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf6cf8cdc246ce4406599aec8782d3be02f2585f1fbad74173faf0ffcb453a49
|
3 |
+
size 3361922
|
data/datasets/lilac/mmlu_professional_law/question/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"question"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c4139f699d1a248cf5378c442ef6f17970913394d5d0c79bd7c6e6801ab548a
|
3 |
+
size 3697516
|
data/datasets/lilac/mmlu_professional_law/question/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.85,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"question"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2735c4a2c5d40973652d369140533af74425db6dd753f8a25850d4efeee4928e
|
3 |
+
size 3369080
|
data/datasets/lilac/mmlu_professional_law/question/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"question"
|
44 |
+
]
|
45 |
+
}
|
data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:995b3ac42907ea244d9cb04c68a4715af8ddb7d72dcced056bc58dc9a9f05e7e
|
3 |
+
size 4389031
|
data/datasets/lilac/mmlu_professional_law/question/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(question)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"question": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"signal": {
|
54 |
+
"signal_name": "text_statistics"
|
55 |
+
},
|
56 |
+
"enriched_path": [
|
57 |
+
"question"
|
58 |
+
]
|
59 |
+
}
|