smilkov committed on
Commit
1ae7212
·
1 Parent(s): 0265d9a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -76,3 +76,10 @@ data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs di
76
  data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
77
  data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
78
  data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
76
  data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
77
  data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
78
  data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
79
+ data/datasets/lilac/pile-of-law-us-bills/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
80
+ data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
81
+ data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
82
+ data/datasets/lilac/pile-of-law-us-bills/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
83
+ data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
84
+ data/datasets/lilac/pile-of-law-us-bills/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
85
+ data/datasets/lilac/pile-of-law-us-bills/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/pile-of-law-us-bills/config.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - embedding: gte-small
3
+ path: text
4
+ name: pile-of-law-us-bills
5
+ namespace: lilac
6
+ settings:
7
+ preferred_embedding: gte-small
8
+ ui:
9
+ media_paths:
10
+ - text
11
+ signals:
12
+ - path: text
13
+ signal:
14
+ signal_name: near_dup
15
+ - path: text
16
+ signal:
17
+ signal_name: lang_detection
18
+ - path: text
19
+ signal:
20
+ signal_name: pii
21
+ source:
22
+ config_name: us_bills
23
+ dataset_name: pile-of-law/pile-of-law
24
+ source_name: huggingface
25
+ tags:
26
+ - legal
data/datasets/lilac/pile-of-law-us-bills/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:335e9bfb1ad3ee69297fc675291cae86036fc484771decb73d89e3dc2c38922d
3
+ size 608269567
data/datasets/lilac/pile-of-law-us-bills/manifest.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "text": {
8
+ "dtype": "string"
9
+ },
10
+ "created_timestamp": {
11
+ "dtype": "string"
12
+ },
13
+ "downloaded_timestamp": {
14
+ "dtype": "string"
15
+ },
16
+ "url": {
17
+ "dtype": "string"
18
+ },
19
+ "__hfsplit__": {
20
+ "dtype": "string"
21
+ },
22
+ "__rowid__": {
23
+ "dtype": "string"
24
+ }
25
+ }
26
+ }
27
+ }
data/datasets/lilac/pile-of-law-us-bills/text/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b51e1405bfac83b231f5042ef6291be46212e13eed21680038c1e6d1dc3a62e2
3
+ size 7759180008
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a69aa05110e06ed97dc761e16f0484697dcd672cb26b6392c243288122c031
3
+ size 87600896
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "gte-small(text)",
4
+ "data_schema": {
5
+ "fields": {
6
+ "__rowid__": {
7
+ "dtype": "string"
8
+ },
9
+ "text": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "signal": {
29
+ "signal_name": "gte-small"
30
+ },
31
+ "enriched_path": [
32
+ "text"
33
+ ],
34
+ "vector_store": "hnsw"
35
+ }
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1decf107ec3590af086c1e64bfbb1b7bfd0f9a378fb246d372fa9b8e5823f5e3
3
+ size 47185382
data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e18d165435a37859117a4ce805ab92c9129d7f9796331302feec275b3ce2476c
3
+ size 3716075
data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "lang_detection(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ },
24
+ "signal": {
25
+ "split_by_paragraph": false,
26
+ "signal_name": "lang_detection"
27
+ },
28
+ "enriched_path": [
29
+ "text"
30
+ ]
31
+ }
data/datasets/lilac/pile-of-law-us-bills/text/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19f85420c88189702b09000676c9d65c597f2e0225d6cb0d72257f67ac86ea13
3
+ size 4357010
data/datasets/lilac/pile-of-law-us-bills/text/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "near_dup(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ },
29
+ "signal": {
30
+ "threshold": 0.85,
31
+ "signal_name": "near_dup"
32
+ },
33
+ "enriched_path": [
34
+ "text"
35
+ ]
36
+ }
data/datasets/lilac/pile-of-law-us-bills/text/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97c25182b5e936da01587739a02210f1e3ee562caf2de204ca7b4ca8c8d63e46
3
+ size 3719352
data/datasets/lilac/pile-of-law-us-bills/text/pii/signal_manifest.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "pii(text)",
6
+ "data_schema": {
7
+ "fields": {
8
+ "__rowid__": {
9
+ "dtype": "string"
10
+ },
11
+ "text": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ },
39
+ "signal": {
40
+ "signal_name": "pii"
41
+ },
42
+ "enriched_path": [
43
+ "text"
44
+ ]
45
+ }