Spaces:
Running
Running
Commit
·
a1a0c11
1
Parent(s):
fc402db
Upload folder using huggingface_hub
Browse files- .gitattributes +10 -0
- data/datasets/lilac/piqa/config.yml +43 -0
- data/datasets/lilac/piqa/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/piqa/goal/gte-small/signal_manifest.json +35 -0
- data/datasets/lilac/piqa/goal/gte-small/spans.pkl +3 -0
- data/datasets/lilac/piqa/goal/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/goal/lang_detection/signal_manifest.json +31 -0
- data/datasets/lilac/piqa/goal/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/goal/near_dup/signal_manifest.json +36 -0
- data/datasets/lilac/piqa/goal/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/goal/pii/signal_manifest.json +45 -0
- data/datasets/lilac/piqa/goal/text_statistics/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/goal/text_statistics/signal_manifest.json +59 -0
- data/datasets/lilac/piqa/manifest.json +27 -0
- data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/piqa/sol1/gte-small/signal_manifest.json +35 -0
- data/datasets/lilac/piqa/sol1/gte-small/spans.pkl +3 -0
- data/datasets/lilac/piqa/sol1/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol1/lang_detection/signal_manifest.json +31 -0
- data/datasets/lilac/piqa/sol1/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol1/near_dup/signal_manifest.json +36 -0
- data/datasets/lilac/piqa/sol1/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol1/pii/signal_manifest.json +45 -0
- data/datasets/lilac/piqa/sol1/text_statistics/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol1/text_statistics/signal_manifest.json +59 -0
- data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/piqa/sol2/gte-small/signal_manifest.json +35 -0
- data/datasets/lilac/piqa/sol2/gte-small/spans.pkl +3 -0
- data/datasets/lilac/piqa/sol2/lang_detection/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol2/lang_detection/signal_manifest.json +31 -0
- data/datasets/lilac/piqa/sol2/near_dup/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol2/near_dup/signal_manifest.json +36 -0
- data/datasets/lilac/piqa/sol2/pii/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol2/pii/signal_manifest.json +45 -0
- data/datasets/lilac/piqa/sol2/text_statistics/data-00000-of-00001.parquet +0 -0
- data/datasets/lilac/piqa/sol2/text_statistics/signal_manifest.json +59 -0
.gitattributes
CHANGED
@@ -16,3 +16,13 @@ data/.cache/lilac/concept/lilac/toxicity/gte-small.pkl filter=lfs diff=lfs merge
|
|
16 |
data/.cache/lilac/concept/lilac/toxicity/openai.pkl filter=lfs diff=lfs merge=lfs -text
|
17 |
data/.cache/lilac/concept/lilac/toxicity/palm.pkl filter=lfs diff=lfs merge=lfs -text
|
18 |
data/.cache/lilac/concept/lilac/toxicity/sbert.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
data/.cache/lilac/concept/lilac/toxicity/openai.pkl filter=lfs diff=lfs merge=lfs -text
|
17 |
data/.cache/lilac/concept/lilac/toxicity/palm.pkl filter=lfs diff=lfs merge=lfs -text
|
18 |
data/.cache/lilac/concept/lilac/toxicity/sbert.pkl filter=lfs diff=lfs merge=lfs -text
|
19 |
+
data/datasets/lilac/piqa/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
+
data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
21 |
+
data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
data/datasets/lilac/piqa/goal/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
23 |
+
data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
24 |
+
data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
25 |
+
data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
26 |
+
data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
27 |
+
data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
28 |
+
data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/piqa/config.yml
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- {embedding: gte-small, path: goal}
|
3 |
+
- {embedding: gte-small, path: sol2}
|
4 |
+
- {embedding: gte-small, path: sol1}
|
5 |
+
name: piqa
|
6 |
+
namespace: local
|
7 |
+
settings:
|
8 |
+
preferred_embedding: gte-small
|
9 |
+
ui:
|
10 |
+
media_paths: [sol1, sol2, goal]
|
11 |
+
signals:
|
12 |
+
- path: sol1
|
13 |
+
signal: {signal_name: near_dup}
|
14 |
+
- path: sol1
|
15 |
+
signal: {signal_name: text_statistics}
|
16 |
+
- path: sol1
|
17 |
+
signal: {signal_name: pii}
|
18 |
+
- path: sol1
|
19 |
+
signal: {signal_name: lang_detection}
|
20 |
+
- path: sol1
|
21 |
+
signal: {signal_name: spacy_ner}
|
22 |
+
- path: sol2
|
23 |
+
signal: {signal_name: near_dup}
|
24 |
+
- path: sol2
|
25 |
+
signal: {signal_name: pii}
|
26 |
+
- path: sol2
|
27 |
+
signal: {signal_name: spacy_ner}
|
28 |
+
- path: sol2
|
29 |
+
signal: {signal_name: lang_detection}
|
30 |
+
- path: sol2
|
31 |
+
signal: {signal_name: text_statistics}
|
32 |
+
- path: goal
|
33 |
+
signal: {signal_name: near_dup}
|
34 |
+
- path: goal
|
35 |
+
signal: {signal_name: text_statistics}
|
36 |
+
- path: goal
|
37 |
+
signal: {signal_name: spacy_ner}
|
38 |
+
- path: goal
|
39 |
+
signal: {signal_name: lang_detection}
|
40 |
+
- path: goal
|
41 |
+
signal: {signal_name: pii}
|
42 |
+
source: {dataset_name: piqa, source_name: huggingface}
|
43 |
+
tags: [machine-learning]
|
data/datasets/lilac/piqa/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1530d99cdc0988b355d3dcbcd6f7e29439e4048535b598eaf08f2c372d5a76a4
|
3 |
+
size 4040510
|
data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e46018f32560457c6bb77c8cb6270be81c48730a37e640d70de62b5e75d9b0ad
|
3 |
+
size 35437836
|
data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:151e20cb34bc6ff100936ca9da15e7443e8ecc23cf68aa10977f90a5f5e3802b
|
3 |
+
size 1010253
|
data/datasets/lilac/piqa/goal/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(goal)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"goal": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "gte-small"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"goal"
|
33 |
+
],
|
34 |
+
"vector_store": "hnsw"
|
35 |
+
}
|
data/datasets/lilac/piqa/goal/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af17a91bcfa9ef113f360a6a6a625741b81ba887a577393c23aff0621535b52b
|
3 |
+
size 1009873
|
data/datasets/lilac/piqa/goal/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (732 kB). View file
|
|
data/datasets/lilac/piqa/goal/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(goal)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"goal": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"goal"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/lilac/piqa/goal/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (842 kB). View file
|
|
data/datasets/lilac/piqa/goal/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(goal)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"goal": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.85,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"goal"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/piqa/goal/pii/data-00000-of-00001.parquet
ADDED
Binary file (728 kB). View file
|
|
data/datasets/lilac/piqa/goal/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(goal)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"goal": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"goal"
|
44 |
+
]
|
45 |
+
}
|
data/datasets/lilac/piqa/goal/text_statistics/data-00000-of-00001.parquet
ADDED
Binary file (787 kB). View file
|
|
data/datasets/lilac/piqa/goal/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(goal)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"goal": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"signal": {
|
54 |
+
"signal_name": "text_statistics"
|
55 |
+
},
|
56 |
+
"enriched_path": [
|
57 |
+
"goal"
|
58 |
+
]
|
59 |
+
}
|
data/datasets/lilac/piqa/manifest.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"goal": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"sol1": {
|
11 |
+
"dtype": "string"
|
12 |
+
},
|
13 |
+
"sol2": {
|
14 |
+
"dtype": "string"
|
15 |
+
},
|
16 |
+
"label": {
|
17 |
+
"dtype": "string"
|
18 |
+
},
|
19 |
+
"__hfsplit__": {
|
20 |
+
"dtype": "string"
|
21 |
+
},
|
22 |
+
"__rowid__": {
|
23 |
+
"dtype": "string"
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91976ea520234c445b947e745fabddaae0cf455d2a8e8d18ccb945905c642ed2
|
3 |
+
size 36174988
|
data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1333c87108731a3851176d7751a918e8a170de5a6d4e0e0a5b79f8663b37494e
|
3 |
+
size 1018110
|
data/datasets/lilac/piqa/sol1/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(sol1)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"sol1": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "gte-small"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"sol1"
|
33 |
+
],
|
34 |
+
"vector_store": "hnsw"
|
35 |
+
}
|
data/datasets/lilac/piqa/sol1/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b043969052fae73d232c222d10362ce093ca26b9224b6ea14c87cf65affd04df
|
3 |
+
size 1015221
|
data/datasets/lilac/piqa/sol1/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (731 kB). View file
|
|
data/datasets/lilac/piqa/sol1/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(sol1)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol1": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"sol1"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/lilac/piqa/sol1/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (848 kB). View file
|
|
data/datasets/lilac/piqa/sol1/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(sol1)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol1": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.85,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"sol1"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/piqa/sol1/pii/data-00000-of-00001.parquet
ADDED
Binary file (728 kB). View file
|
|
data/datasets/lilac/piqa/sol1/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(sol1)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol1": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"sol1"
|
44 |
+
]
|
45 |
+
}
|
data/datasets/lilac/piqa/sol1/text_statistics/data-00000-of-00001.parquet
ADDED
Binary file (842 kB). View file
|
|
data/datasets/lilac/piqa/sol1/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(sol1)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol1": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"signal": {
|
54 |
+
"signal_name": "text_statistics"
|
55 |
+
},
|
56 |
+
"enriched_path": [
|
57 |
+
"sol1"
|
58 |
+
]
|
59 |
+
}
|
data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b61683bccd432e615553a1e508b236f265663e74ec67c60edb4f368e169d7b65
|
3 |
+
size 36180028
|
data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d814ae715f918b41488589fabed6fba617820b1619c484e437a6d575a89692c3
|
3 |
+
size 1018164
|
data/datasets/lilac/piqa/sol2/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(sol2)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"sol2": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "gte-small"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"sol2"
|
33 |
+
],
|
34 |
+
"vector_store": "hnsw"
|
35 |
+
}
|
data/datasets/lilac/piqa/sol2/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1bae45d635be51218f7dbf110a5fd4935d7afb802dd79425d18164161e2f1645
|
3 |
+
size 1015232
|
data/datasets/lilac/piqa/sol2/lang_detection/data-00000-of-00001.parquet
ADDED
Binary file (732 kB). View file
|
|
data/datasets/lilac/piqa/sol2/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(sol2)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol2": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"sol2"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/lilac/piqa/sol2/near_dup/data-00000-of-00001.parquet
ADDED
Binary file (848 kB). View file
|
|
data/datasets/lilac/piqa/sol2/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(sol2)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol2": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.85,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"sol2"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/piqa/sol2/pii/data-00000-of-00001.parquet
ADDED
Binary file (728 kB). View file
|
|
data/datasets/lilac/piqa/sol2/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(sol2)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol2": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"sol2"
|
44 |
+
]
|
45 |
+
}
|
data/datasets/lilac/piqa/sol2/text_statistics/data-00000-of-00001.parquet
ADDED
Binary file (842 kB). View file
|
|
data/datasets/lilac/piqa/sol2/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(sol2)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"sol2": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"signal": {
|
54 |
+
"signal_name": "text_statistics"
|
55 |
+
},
|
56 |
+
"enriched_path": [
|
57 |
+
"sol2"
|
58 |
+
]
|
59 |
+
}
|