Spaces:
Running
Running
Commit
·
62e40e6
1
Parent(s):
a1a0c11
Upload folder using huggingface_hub
Browse files- .gitattributes +7 -0
- data/datasets/lilac/enron-emails/config.yml +18 -0
- data/datasets/lilac/enron-emails/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/enron-emails/manifest.json +18 -0
- data/datasets/lilac/enron-emails/text/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/enron-emails/text/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/enron-emails/text/gte-small/signal_manifest.json +32 -0
- data/datasets/lilac/enron-emails/text/gte-small/spans.pkl +3 -0
- data/datasets/lilac/enron-emails/text/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/enron-emails/text/lang_detection/signal_manifest.json +28 -0
- data/datasets/lilac/enron-emails/text/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/enron-emails/text/near_dup/signal_manifest.json +33 -0
- data/datasets/lilac/enron-emails/text/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/enron-emails/text/pii/signal_manifest.json +42 -0
.gitattributes
CHANGED
@@ -26,3 +26,10 @@ data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs
|
|
26 |
data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
27 |
data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
28 |
data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
27 |
data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
28 |
data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
29 |
+
data/datasets/lilac/enron-emails/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
30 |
+
data/datasets/lilac/enron-emails/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
31 |
+
data/datasets/lilac/enron-emails/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
32 |
+
data/datasets/lilac/enron-emails/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
33 |
+
data/datasets/lilac/enron-emails/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
34 |
+
data/datasets/lilac/enron-emails/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
35 |
+
data/datasets/lilac/enron-emails/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/enron-emails/config.yml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- {embedding: gte-small, path: text}
|
3 |
+
name: enron-emails
|
4 |
+
namespace: lilac
|
5 |
+
settings:
|
6 |
+
preferred_embedding: gte-small
|
7 |
+
ui:
|
8 |
+
media_paths: [text]
|
9 |
+
signals:
|
10 |
+
- path: text
|
11 |
+
signal: {signal_name: near_dup}
|
12 |
+
- path: text
|
13 |
+
signal: {signal_name: lang_detection}
|
14 |
+
- path: text
|
15 |
+
signal: {signal_name: pii}
|
16 |
+
source: {config_name: enron_emails, dataset_name: EleutherAI/pile, sample_size: 100000,
|
17 |
+
source_name: huggingface}
|
18 |
+
tags: [business]
|
data/datasets/lilac/enron-emails/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e9a6c560a9a4956d031d62b0ca7bb0832c270e1aa83114eab76685221946a2c
|
3 |
+
size 164578444
|
data/datasets/lilac/enron-emails/manifest.json
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"text": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"meta": {
|
11 |
+
"dtype": "string"
|
12 |
+
},
|
13 |
+
"__hfsplit__": {
|
14 |
+
"dtype": "string"
|
15 |
+
}
|
16 |
+
}
|
17 |
+
}
|
18 |
+
}
|
data/datasets/lilac/enron-emails/text/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af45aa50af8fcc9c4a95b7558c39a19033e7481e55ae660b1b5addd2fd937998
|
3 |
+
size 1912888576
|
data/datasets/lilac/enron-emails/text/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db97c5da2cc259cdb4410d0c8dfb30ae06dc3b98489d7041e62ac4350b2241f7
|
3 |
+
size 23507190
|
data/datasets/lilac/enron-emails/text/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "text.gte-small",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"text": {
|
7 |
+
"fields": {
|
8 |
+
"gte-small": {
|
9 |
+
"repeated_field": {
|
10 |
+
"fields": {
|
11 |
+
"embedding": {
|
12 |
+
"dtype": "embedding"
|
13 |
+
}
|
14 |
+
},
|
15 |
+
"dtype": "string_span"
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"signal_name": "gte-small"
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"signal": {
|
26 |
+
"signal_name": "gte-small"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"text"
|
30 |
+
],
|
31 |
+
"vector_store": "hnsw"
|
32 |
+
}
|
data/datasets/lilac/enron-emails/text/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5bc8ce021c57a9ee2c6548eda4d941b691b5604ddff277483dccccae35a6860b
|
3 |
+
size 13603458
|
data/datasets/lilac/enron-emails/text/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2d2ee7230f452d35c24b842357274a63b70b3037cc91cfc86613bce169e2cbf
|
3 |
+
size 3309649
|
data/datasets/lilac/enron-emails/text/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"text": {
|
9 |
+
"fields": {
|
10 |
+
"lang_detection": {
|
11 |
+
"dtype": "string",
|
12 |
+
"signal": {
|
13 |
+
"split_by_paragraph": false,
|
14 |
+
"signal_name": "lang_detection"
|
15 |
+
}
|
16 |
+
}
|
17 |
+
}
|
18 |
+
}
|
19 |
+
}
|
20 |
+
},
|
21 |
+
"signal": {
|
22 |
+
"split_by_paragraph": false,
|
23 |
+
"signal_name": "lang_detection"
|
24 |
+
},
|
25 |
+
"enriched_path": [
|
26 |
+
"text"
|
27 |
+
]
|
28 |
+
}
|
data/datasets/lilac/enron-emails/text/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a12f04a049959c53570677eb42e60c63182d3f5dc8ab9a6619591418d813289
|
3 |
+
size 3792281
|
data/datasets/lilac/enron-emails/text/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"text": {
|
9 |
+
"fields": {
|
10 |
+
"near_dup": {
|
11 |
+
"fields": {
|
12 |
+
"cluster_id": {
|
13 |
+
"dtype": "uint32",
|
14 |
+
"categorical": true
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"signal": {
|
18 |
+
"threshold": 0.85,
|
19 |
+
"signal_name": "near_dup"
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"signal": {
|
27 |
+
"threshold": 0.85,
|
28 |
+
"signal_name": "near_dup"
|
29 |
+
},
|
30 |
+
"enriched_path": [
|
31 |
+
"text"
|
32 |
+
]
|
33 |
+
}
|
data/datasets/lilac/enron-emails/text/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:32930c920525dcee9901aa16b5c7c427f95b955556e3a343059c6302cf86caa9
|
3 |
+
size 10302224
|
data/datasets/lilac/enron-emails/text/pii/signal_manifest.json
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "text.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"text": {
|
9 |
+
"fields": {
|
10 |
+
"pii": {
|
11 |
+
"fields": {
|
12 |
+
"emails": {
|
13 |
+
"repeated_field": {
|
14 |
+
"dtype": "string_span"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"ip_addresses": {
|
18 |
+
"repeated_field": {
|
19 |
+
"dtype": "string_span"
|
20 |
+
}
|
21 |
+
},
|
22 |
+
"secrets": {
|
23 |
+
"repeated_field": {
|
24 |
+
"dtype": "string_span"
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "pii"
|
30 |
+
}
|
31 |
+
}
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"signal": {
|
37 |
+
"signal_name": "pii"
|
38 |
+
},
|
39 |
+
"enriched_path": [
|
40 |
+
"text"
|
41 |
+
]
|
42 |
+
}
|