Spaces:
Running
Running
Commit
·
039dde6
1
Parent(s):
584650f
Upload folder using huggingface_hub
Browse files- .gitattributes +8 -0
- data/datasets/lilac/medical_dialog/config.yml +46 -0
- data/datasets/lilac/medical_dialog/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.hnswlib.bin +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.lookup.pkl +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/signal_manifest.json +40 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/spans.pkl +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/signal_manifest.json +36 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/signal_manifest.json +41 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/signal_manifest.json +50 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/data-00000-of-00001.parquet +3 -0
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/signal_manifest.json +64 -0
- data/datasets/lilac/medical_dialog/manifest.json +35 -0
.gitattributes
CHANGED
@@ -87,3 +87,11 @@ data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.pa
|
|
87 |
data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
88 |
data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
89 |
data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
88 |
data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
89 |
data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
90 |
+
data/datasets/lilac/medical_dialog/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
91 |
+
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
|
92 |
+
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
|
93 |
+
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
|
94 |
+
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
95 |
+
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
96 |
+
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
97 |
+
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
|
data/datasets/lilac/medical_dialog/config.yml
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
embeddings:
|
2 |
+
- embedding: gte-small
|
3 |
+
path:
|
4 |
+
- dialogue_turns
|
5 |
+
- utterance
|
6 |
+
- '*'
|
7 |
+
name: medical_dialog
|
8 |
+
namespace: lilac
|
9 |
+
settings:
|
10 |
+
preferred_embedding: gte-small
|
11 |
+
ui:
|
12 |
+
media_paths:
|
13 |
+
- - dialogue_turns
|
14 |
+
- utterance
|
15 |
+
- '*'
|
16 |
+
signals:
|
17 |
+
- path:
|
18 |
+
- dialogue_turns
|
19 |
+
- utterance
|
20 |
+
- '*'
|
21 |
+
signal:
|
22 |
+
signal_name: near_dup
|
23 |
+
- path:
|
24 |
+
- dialogue_turns
|
25 |
+
- utterance
|
26 |
+
- '*'
|
27 |
+
signal:
|
28 |
+
signal_name: text_statistics
|
29 |
+
- path:
|
30 |
+
- dialogue_turns
|
31 |
+
- utterance
|
32 |
+
- '*'
|
33 |
+
signal:
|
34 |
+
signal_name: pii
|
35 |
+
- path:
|
36 |
+
- dialogue_turns
|
37 |
+
- utterance
|
38 |
+
- '*'
|
39 |
+
signal:
|
40 |
+
signal_name: lang_detection
|
41 |
+
source:
|
42 |
+
config_name: en
|
43 |
+
dataset_name: medical_dialog
|
44 |
+
source_name: huggingface
|
45 |
+
tags:
|
46 |
+
- medical
|
data/datasets/lilac/medical_dialog/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91f506cf1e86dbf133857b13ccdbe1dbbb2c5e616f83197502790739257d2f06
|
3 |
+
size 134394415
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.hnswlib.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88b70375d7904939284f054ce7a116b86d670a9b467edf045e9a1263b8bc740c
|
3 |
+
size 1271359144
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.lookup.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b523a5b28a70edaa92063c5a151b887037fb8d14ca91583fd989c0484f8f1855
|
3 |
+
size 20992247
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/signal_manifest.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [],
|
3 |
+
"parquet_id": "dialogue_turns.utterance.gte-small",
|
4 |
+
"data_schema": {
|
5 |
+
"fields": {
|
6 |
+
"dialogue_turns": {
|
7 |
+
"fields": {
|
8 |
+
"utterance": {
|
9 |
+
"repeated_field": {
|
10 |
+
"fields": {
|
11 |
+
"gte-small": {
|
12 |
+
"repeated_field": {
|
13 |
+
"fields": {
|
14 |
+
"embedding": {
|
15 |
+
"dtype": "embedding"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"dtype": "string_span"
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"signal_name": "gte-small"
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "gte-small"
|
33 |
+
},
|
34 |
+
"enriched_path": [
|
35 |
+
"dialogue_turns",
|
36 |
+
"utterance",
|
37 |
+
"*"
|
38 |
+
],
|
39 |
+
"vector_store": "hnsw"
|
40 |
+
}
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/spans.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50c7a00e5d244549fca543f29703a79d79b0be8dbd934e724020aa7acd5c7cf0
|
3 |
+
size 17593826
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c5263a90c6ab29d6b39d6a6b5393118ce58aef0d3c2d414301eb4e2f3f3c4e3
|
3 |
+
size 6380428
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/signal_manifest.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "dialogue_turns.utterance.lang_detection",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"dialogue_turns": {
|
9 |
+
"fields": {
|
10 |
+
"utterance": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"lang_detection": {
|
14 |
+
"dtype": "string",
|
15 |
+
"signal": {
|
16 |
+
"split_by_paragraph": false,
|
17 |
+
"signal_name": "lang_detection"
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
}
|
22 |
+
}
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
},
|
27 |
+
"signal": {
|
28 |
+
"split_by_paragraph": false,
|
29 |
+
"signal_name": "lang_detection"
|
30 |
+
},
|
31 |
+
"enriched_path": [
|
32 |
+
"dialogue_turns",
|
33 |
+
"utterance",
|
34 |
+
"*"
|
35 |
+
]
|
36 |
+
}
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c08f38f01f169afac104875890e33f829f50d0a6582c1838681d8fc836a57a1
|
3 |
+
size 8687974
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/signal_manifest.json
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "dialogue_turns.utterance.near_dup",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"dialogue_turns": {
|
9 |
+
"fields": {
|
10 |
+
"utterance": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"near_dup": {
|
14 |
+
"fields": {
|
15 |
+
"cluster_id": {
|
16 |
+
"dtype": "uint32",
|
17 |
+
"categorical": true
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"signal": {
|
21 |
+
"threshold": 0.85,
|
22 |
+
"signal_name": "near_dup"
|
23 |
+
}
|
24 |
+
}
|
25 |
+
}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
}
|
30 |
+
}
|
31 |
+
},
|
32 |
+
"signal": {
|
33 |
+
"threshold": 0.85,
|
34 |
+
"signal_name": "near_dup"
|
35 |
+
},
|
36 |
+
"enriched_path": [
|
37 |
+
"dialogue_turns",
|
38 |
+
"utterance",
|
39 |
+
"*"
|
40 |
+
]
|
41 |
+
}
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28bcf251e88f21fefa5df0417cc9c706730e1b4777ac6591323f209611f37f0f
|
3 |
+
size 6428720
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/signal_manifest.json
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "dialogue_turns.utterance.pii",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"dialogue_turns": {
|
9 |
+
"fields": {
|
10 |
+
"utterance": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"pii": {
|
14 |
+
"fields": {
|
15 |
+
"emails": {
|
16 |
+
"repeated_field": {
|
17 |
+
"dtype": "string_span"
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"ip_addresses": {
|
21 |
+
"repeated_field": {
|
22 |
+
"dtype": "string_span"
|
23 |
+
}
|
24 |
+
},
|
25 |
+
"secrets": {
|
26 |
+
"repeated_field": {
|
27 |
+
"dtype": "string_span"
|
28 |
+
}
|
29 |
+
}
|
30 |
+
},
|
31 |
+
"signal": {
|
32 |
+
"signal_name": "pii"
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|
36 |
+
}
|
37 |
+
}
|
38 |
+
}
|
39 |
+
}
|
40 |
+
}
|
41 |
+
},
|
42 |
+
"signal": {
|
43 |
+
"signal_name": "pii"
|
44 |
+
},
|
45 |
+
"enriched_path": [
|
46 |
+
"dialogue_turns",
|
47 |
+
"utterance",
|
48 |
+
"*"
|
49 |
+
]
|
50 |
+
}
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/data-00000-of-00001.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a77052cab1e9dbb312c92d4e735aa3b18fd4706b0f3098d20f59b99ea8068e67
|
3 |
+
size 9486961
|
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/signal_manifest.json
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"parquet_id": "dialogue_turns.utterance.text_statistics",
|
6 |
+
"data_schema": {
|
7 |
+
"fields": {
|
8 |
+
"dialogue_turns": {
|
9 |
+
"fields": {
|
10 |
+
"utterance": {
|
11 |
+
"repeated_field": {
|
12 |
+
"fields": {
|
13 |
+
"text_statistics": {
|
14 |
+
"fields": {
|
15 |
+
"num_characters": {
|
16 |
+
"dtype": "int32"
|
17 |
+
},
|
18 |
+
"readability": {
|
19 |
+
"dtype": "float32"
|
20 |
+
},
|
21 |
+
"log(type_token_ratio)": {
|
22 |
+
"dtype": "float32"
|
23 |
+
},
|
24 |
+
"frac_non_ascii": {
|
25 |
+
"dtype": "float32",
|
26 |
+
"bins": [
|
27 |
+
[
|
28 |
+
"Low",
|
29 |
+
null,
|
30 |
+
0.15
|
31 |
+
],
|
32 |
+
[
|
33 |
+
"Medium",
|
34 |
+
0.15,
|
35 |
+
0.3
|
36 |
+
],
|
37 |
+
[
|
38 |
+
"High",
|
39 |
+
0.3,
|
40 |
+
null
|
41 |
+
]
|
42 |
+
]
|
43 |
+
}
|
44 |
+
},
|
45 |
+
"signal": {
|
46 |
+
"signal_name": "text_statistics"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
}
|
50 |
+
}
|
51 |
+
}
|
52 |
+
}
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"signal": {
|
57 |
+
"signal_name": "text_statistics"
|
58 |
+
},
|
59 |
+
"enriched_path": [
|
60 |
+
"dialogue_turns",
|
61 |
+
"utterance",
|
62 |
+
"*"
|
63 |
+
]
|
64 |
+
}
|
data/datasets/lilac/medical_dialog/manifest.json
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"files": [
|
3 |
+
"data-00000-of-00001.parquet"
|
4 |
+
],
|
5 |
+
"data_schema": {
|
6 |
+
"fields": {
|
7 |
+
"file_name": {
|
8 |
+
"dtype": "string"
|
9 |
+
},
|
10 |
+
"dialogue_id": {
|
11 |
+
"dtype": "int32"
|
12 |
+
},
|
13 |
+
"dialogue_url": {
|
14 |
+
"dtype": "string"
|
15 |
+
},
|
16 |
+
"dialogue_turns": {
|
17 |
+
"fields": {
|
18 |
+
"speaker": {
|
19 |
+
"repeated_field": {
|
20 |
+
"dtype": "int32"
|
21 |
+
}
|
22 |
+
},
|
23 |
+
"utterance": {
|
24 |
+
"repeated_field": {
|
25 |
+
"dtype": "string"
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"__hfsplit__": {
|
31 |
+
"dtype": "string"
|
32 |
+
}
|
33 |
+
}
|
34 |
+
}
|
35 |
+
}
|