Spaces:
Running
Running
Commit
·
fcf8b49
1
Parent(s):
9440712
Upload folder using huggingface_hub
Browse files- .gitattributes +9 -0
- data/datasets/local/imdb/config.yml +10 -0
- data/datasets/local/imdb/data-00000-of-00001.parquet +3 -0
- data/datasets/local/imdb/manifest.json +21 -0
- data/datasets/local/imdb/settings.json +1 -0
- data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/local/imdb/text/gte-small/signal_manifest.json +35 -0
- data/datasets/local/imdb/text/gte-small/spans.pkl +3 -0
- data/datasets/local/imdb/text/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/local/imdb/text/lang_detection/signal_manifest.json +31 -0
- data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/local/imdb/text/near_dup/signal_manifest.json +36 -0
- data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/local/imdb/text/pii/signal_manifest.json +45 -0
- data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet +3 -0
- data/datasets/local/imdb/text/spacy_ner/signal_manifest.json +38 -0
- data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/local/imdb/text/text_statistics/signal_manifest.json +59 -0
.gitattributes
CHANGED
@@ -7,3 +7,12 @@ data/datasets/local/open-asssistant-conversations/text/near_dup/data-00000-of-00
|
|
7 |
data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
8 |
data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
9 |
data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
data/datasets/local/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
8 |
data/datasets/local/open-asssistant-conversations/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
9 |
data/datasets/local/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
10 |
+
data/datasets/local/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
11 |
+
data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
12 |
+
data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
13 |
+
data/datasets/local/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
14 |
+
data/datasets/local/imdb/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
15 |
+
data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
16 |
+
data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
17 |
+
data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
18 |
+
data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
data/datasets/local/imdb/config.yml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: imdb
|
2 |
+
namespace: local
|
3 |
+
settings:
|
4 |
+
preferred_embedding: gte-small
|
5 |
+
ui:
|
6 |
+
media_paths:
|
7 |
+
- text
|
8 |
+
source:
|
9 |
+
source_name: huggingface
|
10 |
+
dataset_name: imdb
|
data/datasets/local/imdb/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5cf3f121bae8b8d8c12af8bebe4cda35c2a84750470fff57ea37a4930c257d6f
|
3 |
+
size 86160733
|
data/datasets/local/imdb/manifest.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"text": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"label": {
|
11 |
+
"dtype": "string"
|
12 |
+
},
|
13 |
+
"__hfsplit__": {
|
14 |
+
"dtype": "string"
|
15 |
+
},
|
16 |
+
"__rowid__": {
|
17 |
+
"dtype": "string"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
data/datasets/local/imdb/settings.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"ui": {"media_paths": [["text"]], "markdown_paths": []}, "preferred_embedding": "gte-small"}
|
data/datasets/local/imdb/text/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4659a623093a2ef1646885a6ecb6ef86c56c2dcd0b10900d7b46d193dfb69e7f
|
3 |
+
size 691432464
|
data/datasets/local/imdb/text/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7cbf4a5777b0cd1f8bb5061a6177b27cc0f5a8a6349c487c0c5c52fe60697d64
|
3 |
+
size 10390846
|
data/datasets/local/imdb/text/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(text)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"text": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "gte-small"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"text"
|
33 |
+
],
|
34 |
+
"vector_store": "hnsw"
|
35 |
+
}
|
data/datasets/local/imdb/text/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:526e8505beb2386e3ff30367968685fd2229f76af2c0c86d50afaa7da3018dbc
|
3 |
+
size 7476546
|
data/datasets/local/imdb/text/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45651752b9f1178504ed253e070243e2782a79faf0e1272c3b9e3ba4ed8a717d
|
3 |
+
size 3309640
|
data/datasets/local/imdb/text/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"text"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/local/imdb/text/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a1ccd12fc66d0c31a19554fcb5f442751807745e51c3a9336cec637525a422fc
|
3 |
+
size 3916036
|
data/datasets/local/imdb/text/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.75,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.75,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"text"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/local/imdb/text/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4f1f559281ca4e3efcafd4b10c51cbe2f5039d86ce95d3dc07156671fd8b824
|
3 |
+
size 3313984
|
data/datasets/local/imdb/text/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"text"
|
44 |
+
]
|
45 |
+
}
|
data/datasets/local/imdb/text/spacy_ner/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19ce0e0966a4db29b7b862aa3fa87ef3b02997e57efcdd722023819caa1be7bb
|
3 |
+
size 8483750
|
data/datasets/local/imdb/text/spacy_ner/signal_manifest.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "spacy_ner(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"spacy_ner": {
|
14 |
+
"repeated_field": {
|
15 |
+
"fields": {
|
16 |
+
"label": {
|
17 |
+
"dtype": "string"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"dtype": "string_span"
|
21 |
+
},
|
22 |
+
"signal": {
|
23 |
+
"model": "en_core_web_sm",
|
24 |
+
"signal_name": "spacy_ner"
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"model": "en_core_web_sm",
|
33 |
+
"signal_name": "spacy_ner"
|
34 |
+
},
|
35 |
+
"enriched_path": [
|
36 |
+
"text"
|
37 |
+
]
|
38 |
+
}
|
data/datasets/local/imdb/text/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:672357a255fecf4e29604674ff3ceb11b6772d0388293f5267f608a6163faf49
|
3 |
+
size 4404092
|
data/datasets/local/imdb/text/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text_statistics(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"signal": {
|
54 |
+
"signal_name": "text_statistics"
|
55 |
+
},
|
56 |
+
"enriched_path": [
|
57 |
+
"text"
|
58 |
+
]
|
59 |
+
}
|