nsthorat-lilac committed on
Commit
a1a0c11
·
1 Parent(s): fc402db

Upload folder using huggingface_hub

Browse files
Files changed (40) hide show
  1. .gitattributes +10 -0
  2. data/datasets/lilac/piqa/config.yml +43 -0
  3. data/datasets/lilac/piqa/data-00000-of-00001.parquet +3 -0
  4. data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin +3 -0
  5. data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl +3 -0
  6. data/datasets/lilac/piqa/goal/gte-small/signal_manifest.json +35 -0
  7. data/datasets/lilac/piqa/goal/gte-small/spans.pkl +3 -0
  8. data/datasets/lilac/piqa/goal/lang_detection/data-00000-of-00001.parquet +0 -0
  9. data/datasets/lilac/piqa/goal/lang_detection/signal_manifest.json +31 -0
  10. data/datasets/lilac/piqa/goal/near_dup/data-00000-of-00001.parquet +0 -0
  11. data/datasets/lilac/piqa/goal/near_dup/signal_manifest.json +36 -0
  12. data/datasets/lilac/piqa/goal/pii/data-00000-of-00001.parquet +0 -0
  13. data/datasets/lilac/piqa/goal/pii/signal_manifest.json +45 -0
  14. data/datasets/lilac/piqa/goal/text_statistics/data-00000-of-00001.parquet +0 -0
  15. data/datasets/lilac/piqa/goal/text_statistics/signal_manifest.json +59 -0
  16. data/datasets/lilac/piqa/manifest.json +27 -0
  17. data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin +3 -0
  18. data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl +3 -0
  19. data/datasets/lilac/piqa/sol1/gte-small/signal_manifest.json +35 -0
  20. data/datasets/lilac/piqa/sol1/gte-small/spans.pkl +3 -0
  21. data/datasets/lilac/piqa/sol1/lang_detection/data-00000-of-00001.parquet +0 -0
  22. data/datasets/lilac/piqa/sol1/lang_detection/signal_manifest.json +31 -0
  23. data/datasets/lilac/piqa/sol1/near_dup/data-00000-of-00001.parquet +0 -0
  24. data/datasets/lilac/piqa/sol1/near_dup/signal_manifest.json +36 -0
  25. data/datasets/lilac/piqa/sol1/pii/data-00000-of-00001.parquet +0 -0
  26. data/datasets/lilac/piqa/sol1/pii/signal_manifest.json +45 -0
  27. data/datasets/lilac/piqa/sol1/text_statistics/data-00000-of-00001.parquet +0 -0
  28. data/datasets/lilac/piqa/sol1/text_statistics/signal_manifest.json +59 -0
  29. data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin +3 -0
  30. data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl +3 -0
  31. data/datasets/lilac/piqa/sol2/gte-small/signal_manifest.json +35 -0
  32. data/datasets/lilac/piqa/sol2/gte-small/spans.pkl +3 -0
  33. data/datasets/lilac/piqa/sol2/lang_detection/data-00000-of-00001.parquet +0 -0
  34. data/datasets/lilac/piqa/sol2/lang_detection/signal_manifest.json +31 -0
  35. data/datasets/lilac/piqa/sol2/near_dup/data-00000-of-00001.parquet +0 -0
  36. data/datasets/lilac/piqa/sol2/near_dup/signal_manifest.json +36 -0
  37. data/datasets/lilac/piqa/sol2/pii/data-00000-of-00001.parquet +0 -0
  38. data/datasets/lilac/piqa/sol2/pii/signal_manifest.json +45 -0
  39. data/datasets/lilac/piqa/sol2/text_statistics/data-00000-of-00001.parquet +0 -0
  40. data/datasets/lilac/piqa/sol2/text_statistics/signal_manifest.json +59 -0
.gitattributes CHANGED
@@ -16,3 +16,13 @@ data/.cache/lilac/concept/lilac/toxicity/gte-small.pkl filter=lfs diff=lfs merge
16
  data/.cache/lilac/concept/lilac/toxicity/openai.pkl filter=lfs diff=lfs merge=lfs -text
17
  data/.cache/lilac/concept/lilac/toxicity/palm.pkl filter=lfs diff=lfs merge=lfs -text
18
  data/.cache/lilac/concept/lilac/toxicity/sbert.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
16
  data/.cache/lilac/concept/lilac/toxicity/openai.pkl filter=lfs diff=lfs merge=lfs -text
17
  data/.cache/lilac/concept/lilac/toxicity/palm.pkl filter=lfs diff=lfs merge=lfs -text
18
  data/.cache/lilac/concept/lilac/toxicity/sbert.pkl filter=lfs diff=lfs merge=lfs -text
19
+ data/datasets/lilac/piqa/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
20
+ data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
21
+ data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
22
+ data/datasets/lilac/piqa/goal/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
23
+ data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
24
+ data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
25
+ data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
26
+ data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
27
+ data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
28
+ data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/piqa/config.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - {embedding: gte-small, path: goal}
3
+ - {embedding: gte-small, path: sol2}
4
+ - {embedding: gte-small, path: sol1}
5
+ name: piqa
6
+ namespace: local
7
+ settings:
8
+ preferred_embedding: gte-small
9
+ ui:
10
+ media_paths: [sol1, sol2, goal]
11
+ signals:
12
+ - path: sol1
13
+ signal: {signal_name: near_dup}
14
+ - path: sol1
15
+ signal: {signal_name: text_statistics}
16
+ - path: sol1
17
+ signal: {signal_name: pii}
18
+ - path: sol1
19
+ signal: {signal_name: lang_detection}
20
+ - path: sol1
21
+ signal: {signal_name: spacy_ner}
22
+ - path: sol2
23
+ signal: {signal_name: near_dup}
24
+ - path: sol2
25
+ signal: {signal_name: pii}
26
+ - path: sol2
27
+ signal: {signal_name: spacy_ner}
28
+ - path: sol2
29
+ signal: {signal_name: lang_detection}
30
+ - path: sol2
31
+ signal: {signal_name: text_statistics}
32
+ - path: goal
33
+ signal: {signal_name: near_dup}
34
+ - path: goal
35
+ signal: {signal_name: text_statistics}
36
+ - path: goal
37
+ signal: {signal_name: spacy_ner}
38
+ - path: goal
39
+ signal: {signal_name: lang_detection}
40
+ - path: goal
41
+ signal: {signal_name: pii}
42
+ source: {dataset_name: piqa, source_name: huggingface}
43
+ tags: [machine-learning]
data/datasets/lilac/piqa/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1530d99cdc0988b355d3dcbcd6f7e29439e4048535b598eaf08f2c372d5a76a4
3
+ size 4040510
data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e46018f32560457c6bb77c8cb6270be81c48730a37e640d70de62b5e75d9b0ad
3
+ size 35437836
data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:151e20cb34bc6ff100936ca9da15e7443e8ecc23cf68aa10977f90a5f5e3802b
3
+ size 1010253
data/datasets/lilac/piqa/goal/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(goal)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "goal": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "goal"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/lilac/piqa/goal/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af17a91bcfa9ef113f360a6a6a625741b81ba887a577393c23aff0621535b52b
3
+ size 1009873
data/datasets/lilac/piqa/goal/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (732 kB). View file
 
data/datasets/lilac/piqa/goal/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(goal)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "goal": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "goal"
30
+ ]
31
+ }
data/datasets/lilac/piqa/goal/near_dup/data-00000-of-00001.parquet ADDED
Binary file (842 kB). View file
 
data/datasets/lilac/piqa/goal/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(goal)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "goal": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "goal"
35
+ ]
36
+ }
data/datasets/lilac/piqa/goal/pii/data-00000-of-00001.parquet ADDED
Binary file (728 kB). View file
 
data/datasets/lilac/piqa/goal/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(goal)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "goal": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "goal"
44
+ ]
45
+ }
data/datasets/lilac/piqa/goal/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (787 kB). View file
 
data/datasets/lilac/piqa/goal/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(goal)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "goal": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "goal"
58
+ ]
59
+ }
data/datasets/lilac/piqa/manifest.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "goal": {
8
+ "dtype": "string"
9
+ },
10
+ "sol1": {
11
+ "dtype": "string"
12
+ },
13
+ "sol2": {
14
+ "dtype": "string"
15
+ },
16
+ "label": {
17
+ "dtype": "string"
18
+ },
19
+ "__hfsplit__": {
20
+ "dtype": "string"
21
+ },
22
+ "__rowid__": {
23
+ "dtype": "string"
24
+ }
25
+ }
26
+ }
27
+ }
data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91976ea520234c445b947e745fabddaae0cf455d2a8e8d18ccb945905c642ed2
3
+ size 36174988
data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1333c87108731a3851176d7751a918e8a170de5a6d4e0e0a5b79f8663b37494e
3
+ size 1018110
data/datasets/lilac/piqa/sol1/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(sol1)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "sol1": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "sol1"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/lilac/piqa/sol1/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b043969052fae73d232c222d10362ce093ca26b9224b6ea14c87cf65affd04df
3
+ size 1015221
data/datasets/lilac/piqa/sol1/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (731 kB). View file
 
data/datasets/lilac/piqa/sol1/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(sol1)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol1": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "sol1"
30
+ ]
31
+ }
data/datasets/lilac/piqa/sol1/near_dup/data-00000-of-00001.parquet ADDED
Binary file (848 kB). View file
 
data/datasets/lilac/piqa/sol1/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(sol1)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol1": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "sol1"
35
+ ]
36
+ }
data/datasets/lilac/piqa/sol1/pii/data-00000-of-00001.parquet ADDED
Binary file (728 kB). View file
 
data/datasets/lilac/piqa/sol1/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(sol1)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol1": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "sol1"
44
+ ]
45
+ }
data/datasets/lilac/piqa/sol1/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (842 kB). View file
 
data/datasets/lilac/piqa/sol1/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(sol1)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol1": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "sol1"
58
+ ]
59
+ }
data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b61683bccd432e615553a1e508b236f265663e74ec67c60edb4f368e169d7b65
3
+ size 36180028
data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d814ae715f918b41488589fabed6fba617820b1619c484e437a6d575a89692c3
3
+ size 1018164
data/datasets/lilac/piqa/sol2/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(sol2)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "sol2": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "sol2"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/lilac/piqa/sol2/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bae45d635be51218f7dbf110a5fd4935d7afb802dd79425d18164161e2f1645
3
+ size 1015232
data/datasets/lilac/piqa/sol2/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (732 kB). View file
 
data/datasets/lilac/piqa/sol2/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(sol2)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol2": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "sol2"
30
+ ]
31
+ }
data/datasets/lilac/piqa/sol2/near_dup/data-00000-of-00001.parquet ADDED
Binary file (848 kB). View file
 
data/datasets/lilac/piqa/sol2/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(sol2)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol2": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "sol2"
35
+ ]
36
+ }
data/datasets/lilac/piqa/sol2/pii/data-00000-of-00001.parquet ADDED
Binary file (728 kB). View file
 
data/datasets/lilac/piqa/sol2/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(sol2)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol2": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "sol2"
44
+ ]
45
+ }
data/datasets/lilac/piqa/sol2/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (842 kB). View file
 
data/datasets/lilac/piqa/sol2/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text_statistics(sol2)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "sol2": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ },
53
+ "signal": {
54
+ "signal_name": "text_statistics"
55
+ },
56
+ "enriched_path": [
57
+ "sol2"
58
+ ]
59
+ }