nsthorat-lilac committed on
Commit
62e40e6
·
1 Parent(s): a1a0c11

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -26,3 +26,10 @@ data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs
26
  data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
27
  data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
28
  data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
26
  data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
27
  data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
28
  data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
29
+ data/datasets/lilac/enron-emails/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
30
+ data/datasets/lilac/enron-emails/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
31
+ data/datasets/lilac/enron-emails/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
32
+ data/datasets/lilac/enron-emails/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
33
+ data/datasets/lilac/enron-emails/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
34
+ data/datasets/lilac/enron-emails/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
35
+ data/datasets/lilac/enron-emails/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/enron-emails/config.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - {embedding: gte-small, path: text}
3
+ name: enron-emails
4
+ namespace: lilac
5
+ settings:
6
+ preferred_embedding: gte-small
7
+ ui:
8
+ media_paths: [text]
9
+ signals:
10
+ - path: text
11
+ signal: {signal_name: near_dup}
12
+ - path: text
13
+ signal: {signal_name: lang_detection}
14
+ - path: text
15
+ signal: {signal_name: pii}
16
+ source: {config_name: enron_emails, dataset_name: EleutherAI/pile, sample_size: 100000,
17
+ source_name: huggingface}
18
+ tags: [business]
data/datasets/lilac/enron-emails/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9a6c560a9a4956d031d62b0ca7bb0832c270e1aa83114eab76685221946a2c
3
+ size 164578444
data/datasets/lilac/enron-emails/manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "text": {
8
+ "dtype": "string"
9
+ },
10
+ "meta": {
11
+ "dtype": "string"
12
+ },
13
+ "__hfsplit__": {
14
+ "dtype": "string"
15
+ }
16
+ }
17
+ }
18
+ }
data/datasets/lilac/enron-emails/text/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af45aa50af8fcc9c4a95b7558c39a19033e7481e55ae660b1b5addd2fd937998
3
+ size 1912888576
data/datasets/lilac/enron-emails/text/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db97c5da2cc259cdb4410d0c8dfb30ae06dc3b98489d7041e62ac4350b2241f7
3
+ size 23507190
data/datasets/lilac/enron-emails/text/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "text.gte-small",
4
+ "data_schema": {
5
+ "fields": {
6
+ "text": {
7
+ "fields": {
8
+ "gte-small": {
9
+ "repeated_field": {
10
+ "fields": {
11
+ "embedding": {
12
+ "dtype": "embedding"
13
+ }
14
+ },
15
+ "dtype": "string_span"
16
+ },
17
+ "signal": {
18
+ "signal_name": "gte-small"
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ },
25
+ "signal": {
26
+ "signal_name": "gte-small"
27
+ },
28
+ "enriched_path": [
29
+ "text"
30
+ ],
31
+ "vector_store": "hnsw"
32
+ }
data/datasets/lilac/enron-emails/text/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bc8ce021c57a9ee2c6548eda4d941b691b5604ddff277483dccccae35a6860b
3
+ size 13603458
data/datasets/lilac/enron-emails/text/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2d2ee7230f452d35c24b842357274a63b70b3037cc91cfc86613bce169e2cbf
3
+ size 3309649
data/datasets/lilac/enron-emails/text/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "text": {
9
+ "fields": {
10
+ "lang_detection": {
11
+ "dtype": "string",
12
+ "signal": {
13
+ "split_by_paragraph": false,
14
+ "signal_name": "lang_detection"
15
+ }
16
+ }
17
+ }
18
+ }
19
+ }
20
+ },
21
+ "signal": {
22
+ "split_by_paragraph": false,
23
+ "signal_name": "lang_detection"
24
+ },
25
+ "enriched_path": [
26
+ "text"
27
+ ]
28
+ }
data/datasets/lilac/enron-emails/text/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a12f04a049959c53570677eb42e60c63182d3f5dc8ab9a6619591418d813289
3
+ size 3792281
data/datasets/lilac/enron-emails/text/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "text": {
9
+ "fields": {
10
+ "near_dup": {
11
+ "fields": {
12
+ "cluster_id": {
13
+ "dtype": "uint32",
14
+ "categorical": true
15
+ }
16
+ },
17
+ "signal": {
18
+ "threshold": 0.85,
19
+ "signal_name": "near_dup"
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ },
26
+ "signal": {
27
+ "threshold": 0.85,
28
+ "signal_name": "near_dup"
29
+ },
30
+ "enriched_path": [
31
+ "text"
32
+ ]
33
+ }
data/datasets/lilac/enron-emails/text/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32930c920525dcee9901aa16b5c7c427f95b955556e3a343059c6302cf86caa9
3
+ size 10302224
data/datasets/lilac/enron-emails/text/pii/signal_manifest.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "text.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "text": {
9
+ "fields": {
10
+ "pii": {
11
+ "fields": {
12
+ "emails": {
13
+ "repeated_field": {
14
+ "dtype": "string_span"
15
+ }
16
+ },
17
+ "ip_addresses": {
18
+ "repeated_field": {
19
+ "dtype": "string_span"
20
+ }
21
+ },
22
+ "secrets": {
23
+ "repeated_field": {
24
+ "dtype": "string_span"
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "pii"
30
+ }
31
+ }
32
+ }
33
+ }
34
+ }
35
+ },
36
+ "signal": {
37
+ "signal_name": "pii"
38
+ },
39
+ "enriched_path": [
40
+ "text"
41
+ ]
42
+ }