nsthorat-lilac commited on
Commit
039dde6
·
1 Parent(s): 584650f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -87,3 +87,11 @@ data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.pa
87
  data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
88
  data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
89
  data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
87
  data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
88
  data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
89
  data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
90
+ data/datasets/lilac/medical_dialog/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
91
+ data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
92
+ data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
93
+ data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
94
+ data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
95
+ data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
96
+ data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
97
+ data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
data/datasets/lilac/medical_dialog/config.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ embeddings:
2
+ - embedding: gte-small
3
+ path:
4
+ - dialogue_turns
5
+ - utterance
6
+ - '*'
7
+ name: medical_dialog
8
+ namespace: lilac
9
+ settings:
10
+ preferred_embedding: gte-small
11
+ ui:
12
+ media_paths:
13
+ - - dialogue_turns
14
+ - utterance
15
+ - '*'
16
+ signals:
17
+ - path:
18
+ - dialogue_turns
19
+ - utterance
20
+ - '*'
21
+ signal:
22
+ signal_name: near_dup
23
+ - path:
24
+ - dialogue_turns
25
+ - utterance
26
+ - '*'
27
+ signal:
28
+ signal_name: text_statistics
29
+ - path:
30
+ - dialogue_turns
31
+ - utterance
32
+ - '*'
33
+ signal:
34
+ signal_name: pii
35
+ - path:
36
+ - dialogue_turns
37
+ - utterance
38
+ - '*'
39
+ signal:
40
+ signal_name: lang_detection
41
+ source:
42
+ config_name: en
43
+ dataset_name: medical_dialog
44
+ source_name: huggingface
45
+ tags:
46
+ - medical
data/datasets/lilac/medical_dialog/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91f506cf1e86dbf133857b13ccdbe1dbbb2c5e616f83197502790739257d2f06
3
+ size 134394415
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.hnswlib.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88b70375d7904939284f054ce7a116b86d670a9b467edf045e9a1263b8bc740c
3
+ size 1271359144
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.lookup.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b523a5b28a70edaa92063c5a151b887037fb8d14ca91583fd989c0484f8f1855
3
+ size 20992247
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/signal_manifest.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "parquet_id": "dialogue_turns.utterance.gte-small",
4
+ "data_schema": {
5
+ "fields": {
6
+ "dialogue_turns": {
7
+ "fields": {
8
+ "utterance": {
9
+ "repeated_field": {
10
+ "fields": {
11
+ "gte-small": {
12
+ "repeated_field": {
13
+ "fields": {
14
+ "embedding": {
15
+ "dtype": "embedding"
16
+ }
17
+ },
18
+ "dtype": "string_span"
19
+ },
20
+ "signal": {
21
+ "signal_name": "gte-small"
22
+ }
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "gte-small"
33
+ },
34
+ "enriched_path": [
35
+ "dialogue_turns",
36
+ "utterance",
37
+ "*"
38
+ ],
39
+ "vector_store": "hnsw"
40
+ }
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/spans.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50c7a00e5d244549fca543f29703a79d79b0be8dbd934e724020aa7acd5c7cf0
3
+ size 17593826
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c5263a90c6ab29d6b39d6a6b5393118ce58aef0d3c2d414301eb4e2f3f3c4e3
3
+ size 6380428
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/signal_manifest.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "dialogue_turns.utterance.lang_detection",
6
+ "data_schema": {
7
+ "fields": {
8
+ "dialogue_turns": {
9
+ "fields": {
10
+ "utterance": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "lang_detection": {
14
+ "dtype": "string",
15
+ "signal": {
16
+ "split_by_paragraph": false,
17
+ "signal_name": "lang_detection"
18
+ }
19
+ }
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
26
+ },
27
+ "signal": {
28
+ "split_by_paragraph": false,
29
+ "signal_name": "lang_detection"
30
+ },
31
+ "enriched_path": [
32
+ "dialogue_turns",
33
+ "utterance",
34
+ "*"
35
+ ]
36
+ }
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c08f38f01f169afac104875890e33f829f50d0a6582c1838681d8fc836a57a1
3
+ size 8687974
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/signal_manifest.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "dialogue_turns.utterance.near_dup",
6
+ "data_schema": {
7
+ "fields": {
8
+ "dialogue_turns": {
9
+ "fields": {
10
+ "utterance": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "near_dup": {
14
+ "fields": {
15
+ "cluster_id": {
16
+ "dtype": "uint32",
17
+ "categorical": true
18
+ }
19
+ },
20
+ "signal": {
21
+ "threshold": 0.85,
22
+ "signal_name": "near_dup"
23
+ }
24
+ }
25
+ }
26
+ }
27
+ }
28
+ }
29
+ }
30
+ }
31
+ },
32
+ "signal": {
33
+ "threshold": 0.85,
34
+ "signal_name": "near_dup"
35
+ },
36
+ "enriched_path": [
37
+ "dialogue_turns",
38
+ "utterance",
39
+ "*"
40
+ ]
41
+ }
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28bcf251e88f21fefa5df0417cc9c706730e1b4777ac6591323f209611f37f0f
3
+ size 6428720
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/signal_manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "dialogue_turns.utterance.pii",
6
+ "data_schema": {
7
+ "fields": {
8
+ "dialogue_turns": {
9
+ "fields": {
10
+ "utterance": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "pii": {
14
+ "fields": {
15
+ "emails": {
16
+ "repeated_field": {
17
+ "dtype": "string_span"
18
+ }
19
+ },
20
+ "ip_addresses": {
21
+ "repeated_field": {
22
+ "dtype": "string_span"
23
+ }
24
+ },
25
+ "secrets": {
26
+ "repeated_field": {
27
+ "dtype": "string_span"
28
+ }
29
+ }
30
+ },
31
+ "signal": {
32
+ "signal_name": "pii"
33
+ }
34
+ }
35
+ }
36
+ }
37
+ }
38
+ }
39
+ }
40
+ }
41
+ },
42
+ "signal": {
43
+ "signal_name": "pii"
44
+ },
45
+ "enriched_path": [
46
+ "dialogue_turns",
47
+ "utterance",
48
+ "*"
49
+ ]
50
+ }
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/data-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a77052cab1e9dbb312c92d4e735aa3b18fd4706b0f3098d20f59b99ea8068e67
3
+ size 9486961
data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/signal_manifest.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "parquet_id": "dialogue_turns.utterance.text_statistics",
6
+ "data_schema": {
7
+ "fields": {
8
+ "dialogue_turns": {
9
+ "fields": {
10
+ "utterance": {
11
+ "repeated_field": {
12
+ "fields": {
13
+ "text_statistics": {
14
+ "fields": {
15
+ "num_characters": {
16
+ "dtype": "int32"
17
+ },
18
+ "readability": {
19
+ "dtype": "float32"
20
+ },
21
+ "log(type_token_ratio)": {
22
+ "dtype": "float32"
23
+ },
24
+ "frac_non_ascii": {
25
+ "dtype": "float32",
26
+ "bins": [
27
+ [
28
+ "Low",
29
+ null,
30
+ 0.15
31
+ ],
32
+ [
33
+ "Medium",
34
+ 0.15,
35
+ 0.3
36
+ ],
37
+ [
38
+ "High",
39
+ 0.3,
40
+ null
41
+ ]
42
+ ]
43
+ }
44
+ },
45
+ "signal": {
46
+ "signal_name": "text_statistics"
47
+ }
48
+ }
49
+ }
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
55
+ },
56
+ "signal": {
57
+ "signal_name": "text_statistics"
58
+ },
59
+ "enriched_path": [
60
+ "dialogue_turns",
61
+ "utterance",
62
+ "*"
63
+ ]
64
+ }
data/datasets/lilac/medical_dialog/manifest.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [
3
+ "data-00000-of-00001.parquet"
4
+ ],
5
+ "data_schema": {
6
+ "fields": {
7
+ "file_name": {
8
+ "dtype": "string"
9
+ },
10
+ "dialogue_id": {
11
+ "dtype": "int32"
12
+ },
13
+ "dialogue_url": {
14
+ "dtype": "string"
15
+ },
16
+ "dialogue_turns": {
17
+ "fields": {
18
+ "speaker": {
19
+ "repeated_field": {
20
+ "dtype": "int32"
21
+ }
22
+ },
23
+ "utterance": {
24
+ "repeated_field": {
25
+ "dtype": "string"
26
+ }
27
+ }
28
+ }
29
+ },
30
+ "__hfsplit__": {
31
+ "dtype": "string"
32
+ }
33
+ }
34
+ }
35
+ }