Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- .gitattributes +7 -0
- data/datasets/lilac/pile-of-law-us-bills/config.yml +26 -0
- data/datasets/lilac/pile-of-law-us-bills/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/pile-of-law-us-bills/manifest.json +27 -0
- data/datasets/lilac/pile-of-law-us-bills/text/.DS_Store +0 -0
- data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/pile-of-law-us-bills/text/gte-small/signal_manifest.json +35 -0
- data/datasets/lilac/pile-of-law-us-bills/text/gte-small/spans.pkl +3 -0
- data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/signal_manifest.json +31 -0
- data/datasets/lilac/pile-of-law-us-bills/text/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/pile-of-law-us-bills/text/near_dup/signal_manifest.json +36 -0
- data/datasets/lilac/pile-of-law-us-bills/text/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/pile-of-law-us-bills/text/pii/signal_manifest.json +45 -0
.gitattributes
CHANGED
@@ -76,3 +76,10 @@ data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs di
|
|
76 |
data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
77 |
data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
78 |
data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
77 |
data/datasets/lilac/imdb/text/spacy_ner/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
78 |
data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
79 |
+
data/datasets/lilac/pile-of-law-us-bills/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
80 |
+
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
81 |
+
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
82 |
+
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
83 |
+
data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
84 |
+
data/datasets/lilac/pile-of-law-us-bills/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
85 |
+
data/datasets/lilac/pile-of-law-us-bills/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/pile-of-law-us-bills/config.yml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- embedding: gte-small
|
3 |
+
path: text
|
4 |
+
name: pile-of-law-us-bills
|
5 |
+
namespace: lilac
|
6 |
+
settings:
|
7 |
+
preferred_embedding: gte-small
|
8 |
+
ui:
|
9 |
+
media_paths:
|
10 |
+
- text
|
11 |
+
signals:
|
12 |
+
- path: text
|
13 |
+
signal:
|
14 |
+
signal_name: near_dup
|
15 |
+
- path: text
|
16 |
+
signal:
|
17 |
+
signal_name: lang_detection
|
18 |
+
- path: text
|
19 |
+
signal:
|
20 |
+
signal_name: pii
|
21 |
+
source:
|
22 |
+
config_name: us_bills
|
23 |
+
dataset_name: pile-of-law/pile-of-law
|
24 |
+
source_name: huggingface
|
25 |
+
tags:
|
26 |
+
- legal
|
data/datasets/lilac/pile-of-law-us-bills/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:335e9bfb1ad3ee69297fc675291cae86036fc484771decb73d89e3dc2c38922d
|
3 |
+
size 608269567
|
data/datasets/lilac/pile-of-law-us-bills/manifest.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"text": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"created_timestamp": {
|
11 |
+
"dtype": "string"
|
12 |
+
},
|
13 |
+
"downloaded_timestamp": {
|
14 |
+
"dtype": "string"
|
15 |
+
},
|
16 |
+
"url": {
|
17 |
+
"dtype": "string"
|
18 |
+
},
|
19 |
+
"__hfsplit__": {
|
20 |
+
"dtype": "string"
|
21 |
+
},
|
22 |
+
"__rowid__": {
|
23 |
+
"dtype": "string"
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
data/datasets/lilac/pile-of-law-us-bills/text/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b51e1405bfac83b231f5042ef6291be46212e13eed21680038c1e6d1dc3a62e2
|
3 |
+
size 7759180008
|
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e0a69aa05110e06ed97dc761e16f0484697dcd672cb26b6392c243288122c031
|
3 |
+
size 87600896
|
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "gte-small(text)",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"__rowid__": {
|
7 |
+
"dtype": "string"
|
8 |
+
},
|
9 |
+
"text": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"signal": {
|
29 |
+
"signal_name": "gte-small"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"text"
|
33 |
+
],
|
34 |
+
"vector_store": "hnsw"
|
35 |
+
}
|
data/datasets/lilac/pile-of-law-us-bills/text/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1decf107ec3590af086c1e64bfbb1b7bfd0f9a378fb246d372fa9b8e5823f5e3
|
3 |
+
size 47185382
|
data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e18d165435a37859117a4ce805ab92c9129d7f9796331302feec275b3ce2476c
|
3 |
+
size 3716075
|
data/datasets/lilac/pile-of-law-us-bills/text/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "lang_detection(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"signal": {
|
25 |
+
"split_by_paragraph": false,
|
26 |
+
"signal_name": "lang_detection"
|
27 |
+
},
|
28 |
+
"enriched_path": [
|
29 |
+
"text"
|
30 |
+
]
|
31 |
+
}
|
data/datasets/lilac/pile-of-law-us-bills/text/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19f85420c88189702b09000676c9d65c597f2e0225d6cb0d72257f67ac86ea13
|
3 |
+
size 4357010
|
data/datasets/lilac/pile-of-law-us-bills/text/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "near_dup(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"signal": {
|
30 |
+
"threshold": 0.85,
|
31 |
+
"signal_name": "near_dup"
|
32 |
+
},
|
33 |
+
"enriched_path": [
|
34 |
+
"text"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/pile-of-law-us-bills/text/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:97c25182b5e936da01587739a02210f1e3ee562caf2de204ca7b4ca8c8d63e46
|
3 |
+
size 3719352
|
data/datasets/lilac/pile-of-law-us-bills/text/pii/signal_manifest.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "pii(text)",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"__rowid__": {
|
9 |
+
"dtype": "string"
|
10 |
+
},
|
11 |
+
"text": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"signal": {
|
40 |
+
"signal_name": "pii"
|
41 |
+
},
|
42 |
+
"enriched_path": [
|
43 |
+
"text"
|
44 |
+
]
|
45 |
+
}
|