nsthorat-lilac commited on
Commit
41f36bc
·
1 Parent(s): 4e587fd

Upload folder using huggingface_hub

Browse files
Files changed (19) hide show
  1. data/datasets/lilac/opus100-en-us-validation/config.yml +26 -0
  2. data/datasets/lilac/opus100-en-us-validation/data-00000-of-00001.parquet +0 -0
  3. data/datasets/lilac/opus100-en-us-validation/manifest.json +22 -0
  4. data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/data-00000-of-00001.parquet +0 -0
  5. data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/signal_manifest.json +33 -0
  6. data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/data-00000-of-00001.parquet +0 -0
  7. data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/signal_manifest.json +38 -0
  8. data/datasets/lilac/opus100-en-us-validation/translation/en/pii/data-00000-of-00001.parquet +0 -0
  9. data/datasets/lilac/opus100-en-us-validation/translation/en/pii/signal_manifest.json +47 -0
  10. data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/data-00000-of-00001.parquet +0 -0
  11. data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/signal_manifest.json +61 -0
  12. data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/data-00000-of-00001.parquet +0 -0
  13. data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/signal_manifest.json +33 -0
  14. data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/data-00000-of-00001.parquet +0 -0
  15. data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/signal_manifest.json +38 -0
  16. data/datasets/lilac/opus100-en-us-validation/translation/es/pii/data-00000-of-00001.parquet +0 -0
  17. data/datasets/lilac/opus100-en-us-validation/translation/es/pii/signal_manifest.json +47 -0
  18. data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/data-00000-of-00001.parquet +0 -0
  19. data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/signal_manifest.json +61 -0
data/datasets/lilac/opus100-en-us-validation/config.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: opus100-en-us-validation
2
+ namespace: lilac
3
+ settings:
4
+ ui:
5
+ media_paths:
6
+ - [translation, es]
7
+ - [translation, en]
8
+ signals:
9
+ - path: [translation, es]
10
+ signal: {signal_name: near_dup}
11
+ - path: [translation, es]
12
+ signal: {signal_name: pii}
13
+ - path: [translation, es]
14
+ signal: {signal_name: lang_detection}
15
+ - path: [translation, es]
16
+ signal: {signal_name: text_statistics}
17
+ - path: [translation, en]
18
+ signal: {signal_name: near_dup}
19
+ - path: [translation, en]
20
+ signal: {signal_name: text_statistics}
21
+ - path: [translation, en]
22
+ signal: {signal_name: pii}
23
+ - path: [translation, en]
24
+ signal: {signal_name: lang_detection}
25
+ source: {config_name: en-es, dataset_name: opus100, source_name: huggingface, split: validation}
26
+ tags: [machine-learning]
data/datasets/lilac/opus100-en-us-validation/data-00000-of-00001.parquet ADDED
Binary file (304 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/manifest.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "translation": {
8
+ "fields": {
9
+ "en": {
10
+ "dtype": "string"
11
+ },
12
+ "es": {
13
+ "dtype": "string"
14
+ }
15
+ }
16
+ },
17
+ "__hfsplit__": {
18
+ "dtype": "string"
19
+ }
20
+ }
21
+ }
22
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (70 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "lang_detection": {
13
+ "dtype": "string",
14
+ "signal": {
15
+ "split_by_paragraph": false,
16
+ "signal_name": "lang_detection"
17
+ }
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ },
25
+ "signal": {
26
+ "split_by_paragraph": false,
27
+ "signal_name": "lang_detection"
28
+ },
29
+ "enriched_path": [
30
+ "translation",
31
+ "en"
32
+ ]
33
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/data-00000-of-00001.parquet ADDED
Binary file (80 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "near_dup": {
13
+ "fields": {
14
+ "cluster_id": {
15
+ "dtype": "uint32",
16
+ "categorical": true
17
+ }
18
+ },
19
+ "signal": {
20
+ "threshold": 0.85,
21
+ "signal_name": "near_dup"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "threshold": 0.85,
32
+ "signal_name": "near_dup"
33
+ },
34
+ "enriched_path": [
35
+ "translation",
36
+ "en"
37
+ ]
38
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/pii/data-00000-of-00001.parquet ADDED
Binary file (71.7 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/pii/signal_manifest.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "pii": {
13
+ "fields": {
14
+ "emails": {
15
+ "repeated_field": {
16
+ "dtype": "string_span"
17
+ }
18
+ },
19
+ "ip_addresses": {
20
+ "repeated_field": {
21
+ "dtype": "string_span"
22
+ }
23
+ },
24
+ "secrets": {
25
+ "repeated_field": {
26
+ "dtype": "string_span"
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "signal_name": "pii"
32
+ }
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ },
40
+ "signal": {
41
+ "signal_name": "pii"
42
+ },
43
+ "enriched_path": [
44
+ "translation",
45
+ "en"
46
+ ]
47
+ }
data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (84.2 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/en/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.en.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "en": {
11
+ "fields": {
12
+ "text_statistics": {
13
+ "fields": {
14
+ "num_characters": {
15
+ "dtype": "int32"
16
+ },
17
+ "readability": {
18
+ "dtype": "float32"
19
+ },
20
+ "log(type_token_ratio)": {
21
+ "dtype": "float32"
22
+ },
23
+ "frac_non_ascii": {
24
+ "dtype": "float32",
25
+ "bins": [
26
+ [
27
+ "Low",
28
+ null,
29
+ 0.15
30
+ ],
31
+ [
32
+ "Medium",
33
+ 0.15,
34
+ 0.3
35
+ ],
36
+ [
37
+ "High",
38
+ 0.3,
39
+ null
40
+ ]
41
+ ]
42
+ }
43
+ },
44
+ "signal": {
45
+ "signal_name": "text_statistics"
46
+ }
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ },
54
+ "signal": {
55
+ "signal_name": "text_statistics"
56
+ },
57
+ "enriched_path": [
58
+ "translation",
59
+ "en"
60
+ ]
61
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/data-00000-of-00001.parquet ADDED
Binary file (69.9 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "lang_detection": {
13
+ "dtype": "string",
14
+ "signal": {
15
+ "split_by_paragraph": false,
16
+ "signal_name": "lang_detection"
17
+ }
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ },
25
+ "signal": {
26
+ "split_by_paragraph": false,
27
+ "signal_name": "lang_detection"
28
+ },
29
+ "enriched_path": [
30
+ "translation",
31
+ "es"
32
+ ]
33
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/data-00000-of-00001.parquet ADDED
Binary file (80 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "near_dup": {
13
+ "fields": {
14
+ "cluster_id": {
15
+ "dtype": "uint32",
16
+ "categorical": true
17
+ }
18
+ },
19
+ "signal": {
20
+ "threshold": 0.85,
21
+ "signal_name": "near_dup"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "threshold": 0.85,
32
+ "signal_name": "near_dup"
33
+ },
34
+ "enriched_path": [
35
+ "translation",
36
+ "es"
37
+ ]
38
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/pii/data-00000-of-00001.parquet ADDED
Binary file (71.7 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/pii/signal_manifest.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "pii": {
13
+ "fields": {
14
+ "emails": {
15
+ "repeated_field": {
16
+ "dtype": "string_span"
17
+ }
18
+ },
19
+ "ip_addresses": {
20
+ "repeated_field": {
21
+ "dtype": "string_span"
22
+ }
23
+ },
24
+ "secrets": {
25
+ "repeated_field": {
26
+ "dtype": "string_span"
27
+ }
28
+ }
29
+ },
30
+ "signal": {
31
+ "signal_name": "pii"
32
+ }
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ },
40
+ "signal": {
41
+ "signal_name": "pii"
42
+ },
43
+ "enriched_path": [
44
+ "translation",
45
+ "es"
46
+ ]
47
+ }
data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/data-00000-of-00001.parquet ADDED
Binary file (87.4 kB). View file
 
data/datasets/lilac/opus100-en-us-validation/translation/es/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "translation.es.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "translation": {
9
+ "fields": {
10
+ "es": {
11
+ "fields": {
12
+ "text_statistics": {
13
+ "fields": {
14
+ "num_characters": {
15
+ "dtype": "int32"
16
+ },
17
+ "readability": {
18
+ "dtype": "float32"
19
+ },
20
+ "log(type_token_ratio)": {
21
+ "dtype": "float32"
22
+ },
23
+ "frac_non_ascii": {
24
+ "dtype": "float32",
25
+ "bins": [
26
+ [
27
+ "Low",
28
+ null,
29
+ 0.15
30
+ ],
31
+ [
32
+ "Medium",
33
+ 0.15,
34
+ 0.3
35
+ ],
36
+ [
37
+ "High",
38
+ 0.3,
39
+ null
40
+ ]
41
+ ]
42
+ }
43
+ },
44
+ "signal": {
45
+ "signal_name": "text_statistics"
46
+ }
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ },
54
+ "signal": {
55
+ "signal_name": "text_statistics"
56
+ },
57
+ "enriched_path": [
58
+ "translation",
59
+ "es"
60
+ ]
61
+ }