nsthorat commited on
Commit
47b8bbe
·
1 Parent(s): 8e3df60
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -83
  2. data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml +0 -140
  3. data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet +0 -3
  4. data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json +0 -87
  5. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet +0 -0
  6. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json +0 -36
  7. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet +0 -0
  8. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json +0 -41
  9. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet +0 -0
  10. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json +0 -50
  11. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet +0 -0
  12. data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json +0 -64
  13. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet +0 -0
  14. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json +0 -36
  15. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet +0 -0
  16. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json +0 -41
  17. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet +0 -0
  18. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json +0 -50
  19. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet +0 -0
  20. data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json +0 -64
  21. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet +0 -0
  22. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json +0 -36
  23. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet +0 -0
  24. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json +0 -41
  25. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet +0 -0
  26. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json +0 -50
  27. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet +0 -0
  28. data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json +0 -64
  29. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet +0 -0
  30. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json +0 -28
  31. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet +0 -0
  32. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json +0 -33
  33. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet +0 -0
  34. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json +0 -42
  35. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet +0 -0
  36. data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json +0 -56
  37. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet +0 -0
  38. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json +0 -28
  39. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet +0 -0
  40. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json +0 -33
  41. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet +0 -0
  42. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json +0 -42
  43. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/data-00000-of-00001.parquet +0 -0
  44. data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/signal_manifest.json +0 -56
  45. data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/data-00000-of-00001.parquet +0 -0
  46. data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/signal_manifest.json +0 -28
  47. data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/data-00000-of-00001.parquet +0 -0
  48. data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/signal_manifest.json +0 -33
  49. data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/data-00000-of-00001.parquet +0 -0
  50. data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/signal_manifest.json +0 -42
.gitattributes DELETED
@@ -1,83 +0,0 @@
1
- data/datasets/lilac/piqa/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
2
- data/datasets/lilac/piqa/goal/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
3
- data/datasets/lilac/piqa/goal/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
4
- data/datasets/lilac/piqa/goal/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
5
- data/datasets/lilac/piqa/sol1/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
6
- data/datasets/lilac/piqa/sol1/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
7
- data/datasets/lilac/piqa/sol1/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
8
- data/datasets/lilac/piqa/sol2/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
9
- data/datasets/lilac/piqa/sol2/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
10
- data/datasets/lilac/piqa/sol2/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
11
- data/datasets/lilac/pile-of-law-atticus-contracts/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
12
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
13
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
14
- data/datasets/lilac/mmlu_professional_law/choices/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
15
- data/datasets/lilac/mmlu_professional_law/choices/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
16
- data/datasets/lilac/mmlu_professional_law/choices/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
17
- data/datasets/lilac/mmlu_professional_law/choices/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
18
- data/datasets/lilac/mmlu_professional_law/choices/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
19
- data/datasets/lilac/mmlu_professional_law/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
20
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
21
- data/datasets/lilac/mmlu_professional_law/question/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
22
- data/datasets/lilac/mmlu_professional_law/question/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
23
- data/datasets/lilac/mmlu_professional_law/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
24
- data/datasets/lilac/mmlu_professional_law/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
25
- data/datasets/lilac/mmlu_professional_law/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
26
- data/datasets/lilac/mmlu_professional_law/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
27
- data/datasets/lilac/pile-of-law-r-legaladvice/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
28
- data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
29
- data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
30
- data/datasets/lilac/pile-of-law-r-legaladvice/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
31
- data/datasets/lilac/pile-of-law-r-legaladvice/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
32
- data/datasets/lilac/pile-of-law-r-legaladvice/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
33
- data/datasets/lilac/pile-of-law-r-legaladvice/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
34
- data/datasets/lilac/pile-of-law-r-legaladvice/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
35
- data/datasets/lilac/open-asssistant-conversations/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
36
- data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
37
- data/datasets/lilac/open-asssistant-conversations/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
38
- data/datasets/lilac/open-asssistant-conversations/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
39
- data/datasets/lilac/open-asssistant-conversations/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
40
- data/datasets/lilac/open-asssistant-conversations/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
41
- data/datasets/lilac/open-asssistant-conversations/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
42
- data/datasets/lilac/open-asssistant-conversations/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
43
- data/datasets/lilac/squad_v2/answers/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
44
- data/datasets/lilac/squad_v2/answers/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
45
- data/datasets/lilac/squad_v2/answers/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
46
- data/datasets/lilac/squad_v2/answers/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
47
- data/datasets/lilac/squad_v2/context/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
48
- data/datasets/lilac/squad_v2/context/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
49
- data/datasets/lilac/squad_v2/context/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
50
- data/datasets/lilac/squad_v2/context/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
51
- data/datasets/lilac/squad_v2/context/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
52
- data/datasets/lilac/squad_v2/context/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
53
- data/datasets/lilac/squad_v2/context/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
54
- data/datasets/lilac/squad_v2/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
55
- data/datasets/lilac/squad_v2/question/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
56
- data/datasets/lilac/squad_v2/question/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
57
- data/datasets/lilac/squad_v2/question/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
58
- data/datasets/lilac/squad_v2/question/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
59
- data/datasets/lilac/imdb/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
60
- data/datasets/lilac/imdb/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
61
- data/datasets/lilac/imdb/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
62
- data/datasets/lilac/imdb/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
63
- data/datasets/lilac/imdb/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
64
- data/datasets/lilac/imdb/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
65
- data/datasets/lilac/imdb/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
66
- data/datasets/lilac/imdb/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
67
- data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
68
- data/datasets/lilac/wikitext-2-raw-v1/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
69
- data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
70
- data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
71
- data/datasets/lilac/wikitext-2-raw-v1/text/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
72
- data/datasets/lilac/wikitext-2-raw-v1/text/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
73
- data/datasets/lilac/wikitext-2-raw-v1/text/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
74
- data/datasets/lilac/wikitext-2-raw-v1/text/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
75
- data/datasets/lilac/wikitext-2-raw-v1/text/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
76
- data/datasets/lilac/medical_dialog/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
77
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.hnswlib.bin filter=lfs diff=lfs merge=lfs -text
78
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/hnsw.lookup.pkl filter=lfs diff=lfs merge=lfs -text
79
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/gte-small/spans.pkl filter=lfs diff=lfs merge=lfs -text
80
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/lang_detection/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
81
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/near_dup/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
82
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/pii/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
83
- data/datasets/lilac/medical_dialog/dialogue_turns/utterance/text_statistics/data-00000-of-00001.parquet filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/config.yml DELETED
@@ -1,140 +0,0 @@
1
- embeddings:
2
- - embedding: gte-small
3
- path: original-context
4
- - embedding: gte-small
5
- path:
6
- - new-context
7
- - value
8
- - '*'
9
- name: databricks-dolly-15k-curated-en
10
- namespace: lilac
11
- settings:
12
- preferred_embedding: gte-small
13
- ui:
14
- media_paths:
15
- - original-instruction
16
- - original-context
17
- - original-response
18
- - - new-instruction
19
- - value
20
- - '*'
21
- - - new-context
22
- - value
23
- - '*'
24
- - - new-response
25
- - value
26
- - '*'
27
- signals:
28
- - path: original-instruction
29
- signal:
30
- signal_name: near_dup
31
- - path: original-instruction
32
- signal:
33
- signal_name: text_statistics
34
- - path: original-instruction
35
- signal:
36
- signal_name: pii
37
- - path: original-instruction
38
- signal:
39
- signal_name: lang_detection
40
- - path: original-context
41
- signal:
42
- signal_name: near_dup
43
- - path: original-context
44
- signal:
45
- signal_name: text_statistics
46
- - path: original-context
47
- signal:
48
- signal_name: lang_detection
49
- - path: original-context
50
- signal:
51
- signal_name: pii
52
- - path: original-response
53
- signal:
54
- signal_name: near_dup
55
- - path: original-response
56
- signal:
57
- signal_name: text_statistics
58
- - path: original-response
59
- signal:
60
- signal_name: pii
61
- - path: original-response
62
- signal:
63
- signal_name: lang_detection
64
- - path:
65
- - new-instruction
66
- - value
67
- - '*'
68
- signal:
69
- signal_name: near_dup
70
- - path:
71
- - new-instruction
72
- - value
73
- - '*'
74
- signal:
75
- signal_name: text_statistics
76
- - path:
77
- - new-instruction
78
- - value
79
- - '*'
80
- signal:
81
- signal_name: pii
82
- - path:
83
- - new-instruction
84
- - value
85
- - '*'
86
- signal:
87
- signal_name: lang_detection
88
- - path:
89
- - new-context
90
- - value
91
- - '*'
92
- signal:
93
- signal_name: near_dup
94
- - path:
95
- - new-context
96
- - value
97
- - '*'
98
- signal:
99
- signal_name: text_statistics
100
- - path:
101
- - new-context
102
- - value
103
- - '*'
104
- signal:
105
- signal_name: lang_detection
106
- - path:
107
- - new-context
108
- - value
109
- - '*'
110
- signal:
111
- signal_name: pii
112
- - path:
113
- - new-response
114
- - value
115
- - '*'
116
- signal:
117
- signal_name: near_dup
118
- - path:
119
- - new-response
120
- - value
121
- - '*'
122
- signal:
123
- signal_name: text_statistics
124
- - path:
125
- - new-response
126
- - value
127
- - '*'
128
- signal:
129
- signal_name: pii
130
- - path:
131
- - new-response
132
- - value
133
- - '*'
134
- signal:
135
- signal_name: lang_detection
136
- source:
137
- dataset_name: argilla/databricks-dolly-15k-curated-en
138
- source_name: huggingface
139
- tags:
140
- - machine-learning
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/data-00000-of-00001.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad225b50d5880a097ea66eb4ca70fc529c0321cf8a5652bd8fbe7a638d016851
3
- size 15882489
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/manifest.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "data_schema": {
6
- "fields": {
7
- "id": {
8
- "dtype": "string"
9
- },
10
- "category": {
11
- "dtype": "string"
12
- },
13
- "original-instruction": {
14
- "dtype": "string"
15
- },
16
- "original-context": {
17
- "dtype": "string"
18
- },
19
- "original-response": {
20
- "dtype": "string"
21
- },
22
- "new-instruction": {
23
- "fields": {
24
- "user_id": {
25
- "repeated_field": {
26
- "dtype": "string"
27
- }
28
- },
29
- "value": {
30
- "repeated_field": {
31
- "dtype": "string"
32
- }
33
- },
34
- "status": {
35
- "repeated_field": {
36
- "dtype": "string"
37
- }
38
- }
39
- }
40
- },
41
- "new-context": {
42
- "fields": {
43
- "user_id": {
44
- "repeated_field": {
45
- "dtype": "string"
46
- }
47
- },
48
- "value": {
49
- "repeated_field": {
50
- "dtype": "string"
51
- }
52
- },
53
- "status": {
54
- "repeated_field": {
55
- "dtype": "string"
56
- }
57
- }
58
- }
59
- },
60
- "new-response": {
61
- "fields": {
62
- "user_id": {
63
- "repeated_field": {
64
- "dtype": "string"
65
- }
66
- },
67
- "value": {
68
- "repeated_field": {
69
- "dtype": "string"
70
- }
71
- },
72
- "status": {
73
- "repeated_field": {
74
- "dtype": "string"
75
- }
76
- }
77
- }
78
- },
79
- "external_id": {
80
- "dtype": "string"
81
- },
82
- "__hfsplit__": {
83
- "dtype": "string"
84
- }
85
- }
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/data-00000-of-00001.parquet DELETED
Binary file (521 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/lang_detection/signal_manifest.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-context.value.lang_detection",
6
- "data_schema": {
7
- "fields": {
8
- "new-context": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "lang_detection": {
14
- "dtype": "string",
15
- "signal": {
16
- "split_by_paragraph": false,
17
- "signal_name": "lang_detection"
18
- }
19
- }
20
- }
21
- }
22
- }
23
- }
24
- }
25
- }
26
- },
27
- "signal": {
28
- "split_by_paragraph": false,
29
- "signal_name": "lang_detection"
30
- },
31
- "enriched_path": [
32
- "new-context",
33
- "value",
34
- "*"
35
- ]
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/data-00000-of-00001.parquet DELETED
Binary file (550 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/near_dup/signal_manifest.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-context.value.near_dup",
6
- "data_schema": {
7
- "fields": {
8
- "new-context": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "near_dup": {
14
- "fields": {
15
- "cluster_id": {
16
- "dtype": "uint32",
17
- "categorical": true
18
- }
19
- },
20
- "signal": {
21
- "threshold": 0.85,
22
- "signal_name": "near_dup"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- }
29
- }
30
- }
31
- },
32
- "signal": {
33
- "threshold": 0.85,
34
- "signal_name": "near_dup"
35
- },
36
- "enriched_path": [
37
- "new-context",
38
- "value",
39
- "*"
40
- ]
41
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/data-00000-of-00001.parquet DELETED
Binary file (519 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/pii/signal_manifest.json DELETED
@@ -1,50 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-context.value.pii",
6
- "data_schema": {
7
- "fields": {
8
- "new-context": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "pii": {
14
- "fields": {
15
- "emails": {
16
- "repeated_field": {
17
- "dtype": "string_span"
18
- }
19
- },
20
- "ip_addresses": {
21
- "repeated_field": {
22
- "dtype": "string_span"
23
- }
24
- },
25
- "secrets": {
26
- "repeated_field": {
27
- "dtype": "string_span"
28
- }
29
- }
30
- },
31
- "signal": {
32
- "signal_name": "pii"
33
- }
34
- }
35
- }
36
- }
37
- }
38
- }
39
- }
40
- }
41
- },
42
- "signal": {
43
- "signal_name": "pii"
44
- },
45
- "enriched_path": [
46
- "new-context",
47
- "value",
48
- "*"
49
- ]
50
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/data-00000-of-00001.parquet DELETED
Binary file (603 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-context/value/text_statistics/signal_manifest.json DELETED
@@ -1,64 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-context.value.text_statistics",
6
- "data_schema": {
7
- "fields": {
8
- "new-context": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "text_statistics": {
14
- "fields": {
15
- "num_characters": {
16
- "dtype": "int32"
17
- },
18
- "readability": {
19
- "dtype": "float32"
20
- },
21
- "log(type_token_ratio)": {
22
- "dtype": "float32"
23
- },
24
- "frac_non_ascii": {
25
- "dtype": "float32",
26
- "bins": [
27
- [
28
- "Low",
29
- null,
30
- 0.15
31
- ],
32
- [
33
- "Medium",
34
- 0.15,
35
- 0.3
36
- ],
37
- [
38
- "High",
39
- 0.3,
40
- null
41
- ]
42
- ]
43
- }
44
- },
45
- "signal": {
46
- "signal_name": "text_statistics"
47
- }
48
- }
49
- }
50
- }
51
- }
52
- }
53
- }
54
- }
55
- },
56
- "signal": {
57
- "signal_name": "text_statistics"
58
- },
59
- "enriched_path": [
60
- "new-context",
61
- "value",
62
- "*"
63
- ]
64
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/data-00000-of-00001.parquet DELETED
Binary file (521 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/lang_detection/signal_manifest.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-instruction.value.lang_detection",
6
- "data_schema": {
7
- "fields": {
8
- "new-instruction": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "lang_detection": {
14
- "dtype": "string",
15
- "signal": {
16
- "split_by_paragraph": false,
17
- "signal_name": "lang_detection"
18
- }
19
- }
20
- }
21
- }
22
- }
23
- }
24
- }
25
- }
26
- },
27
- "signal": {
28
- "split_by_paragraph": false,
29
- "signal_name": "lang_detection"
30
- },
31
- "enriched_path": [
32
- "new-instruction",
33
- "value",
34
- "*"
35
- ]
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/data-00000-of-00001.parquet DELETED
Binary file (602 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/near_dup/signal_manifest.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-instruction.value.near_dup",
6
- "data_schema": {
7
- "fields": {
8
- "new-instruction": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "near_dup": {
14
- "fields": {
15
- "cluster_id": {
16
- "dtype": "uint32",
17
- "categorical": true
18
- }
19
- },
20
- "signal": {
21
- "threshold": 0.85,
22
- "signal_name": "near_dup"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- }
29
- }
30
- }
31
- },
32
- "signal": {
33
- "threshold": 0.85,
34
- "signal_name": "near_dup"
35
- },
36
- "enriched_path": [
37
- "new-instruction",
38
- "value",
39
- "*"
40
- ]
41
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/data-00000-of-00001.parquet DELETED
Binary file (519 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/pii/signal_manifest.json DELETED
@@ -1,50 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-instruction.value.pii",
6
- "data_schema": {
7
- "fields": {
8
- "new-instruction": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "pii": {
14
- "fields": {
15
- "emails": {
16
- "repeated_field": {
17
- "dtype": "string_span"
18
- }
19
- },
20
- "ip_addresses": {
21
- "repeated_field": {
22
- "dtype": "string_span"
23
- }
24
- },
25
- "secrets": {
26
- "repeated_field": {
27
- "dtype": "string_span"
28
- }
29
- }
30
- },
31
- "signal": {
32
- "signal_name": "pii"
33
- }
34
- }
35
- }
36
- }
37
- }
38
- }
39
- }
40
- }
41
- },
42
- "signal": {
43
- "signal_name": "pii"
44
- },
45
- "enriched_path": [
46
- "new-instruction",
47
- "value",
48
- "*"
49
- ]
50
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/data-00000-of-00001.parquet DELETED
Binary file (581 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-instruction/value/text_statistics/signal_manifest.json DELETED
@@ -1,64 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-instruction.value.text_statistics",
6
- "data_schema": {
7
- "fields": {
8
- "new-instruction": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "text_statistics": {
14
- "fields": {
15
- "num_characters": {
16
- "dtype": "int32"
17
- },
18
- "readability": {
19
- "dtype": "float32"
20
- },
21
- "log(type_token_ratio)": {
22
- "dtype": "float32"
23
- },
24
- "frac_non_ascii": {
25
- "dtype": "float32",
26
- "bins": [
27
- [
28
- "Low",
29
- null,
30
- 0.15
31
- ],
32
- [
33
- "Medium",
34
- 0.15,
35
- 0.3
36
- ],
37
- [
38
- "High",
39
- 0.3,
40
- null
41
- ]
42
- ]
43
- }
44
- },
45
- "signal": {
46
- "signal_name": "text_statistics"
47
- }
48
- }
49
- }
50
- }
51
- }
52
- }
53
- }
54
- }
55
- },
56
- "signal": {
57
- "signal_name": "text_statistics"
58
- },
59
- "enriched_path": [
60
- "new-instruction",
61
- "value",
62
- "*"
63
- ]
64
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/data-00000-of-00001.parquet DELETED
Binary file (521 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/lang_detection/signal_manifest.json DELETED
@@ -1,36 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-response.value.lang_detection",
6
- "data_schema": {
7
- "fields": {
8
- "new-response": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "lang_detection": {
14
- "dtype": "string",
15
- "signal": {
16
- "split_by_paragraph": false,
17
- "signal_name": "lang_detection"
18
- }
19
- }
20
- }
21
- }
22
- }
23
- }
24
- }
25
- }
26
- },
27
- "signal": {
28
- "split_by_paragraph": false,
29
- "signal_name": "lang_detection"
30
- },
31
- "enriched_path": [
32
- "new-response",
33
- "value",
34
- "*"
35
- ]
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/data-00000-of-00001.parquet DELETED
Binary file (603 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/near_dup/signal_manifest.json DELETED
@@ -1,41 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-response.value.near_dup",
6
- "data_schema": {
7
- "fields": {
8
- "new-response": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "near_dup": {
14
- "fields": {
15
- "cluster_id": {
16
- "dtype": "uint32",
17
- "categorical": true
18
- }
19
- },
20
- "signal": {
21
- "threshold": 0.85,
22
- "signal_name": "near_dup"
23
- }
24
- }
25
- }
26
- }
27
- }
28
- }
29
- }
30
- }
31
- },
32
- "signal": {
33
- "threshold": 0.85,
34
- "signal_name": "near_dup"
35
- },
36
- "enriched_path": [
37
- "new-response",
38
- "value",
39
- "*"
40
- ]
41
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/data-00000-of-00001.parquet DELETED
Binary file (520 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/pii/signal_manifest.json DELETED
@@ -1,50 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-response.value.pii",
6
- "data_schema": {
7
- "fields": {
8
- "new-response": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "pii": {
14
- "fields": {
15
- "emails": {
16
- "repeated_field": {
17
- "dtype": "string_span"
18
- }
19
- },
20
- "ip_addresses": {
21
- "repeated_field": {
22
- "dtype": "string_span"
23
- }
24
- },
25
- "secrets": {
26
- "repeated_field": {
27
- "dtype": "string_span"
28
- }
29
- }
30
- },
31
- "signal": {
32
- "signal_name": "pii"
33
- }
34
- }
35
- }
36
- }
37
- }
38
- }
39
- }
40
- }
41
- },
42
- "signal": {
43
- "signal_name": "pii"
44
- },
45
- "enriched_path": [
46
- "new-response",
47
- "value",
48
- "*"
49
- ]
50
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/data-00000-of-00001.parquet DELETED
Binary file (651 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/new-response/value/text_statistics/signal_manifest.json DELETED
@@ -1,64 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "new-response.value.text_statistics",
6
- "data_schema": {
7
- "fields": {
8
- "new-response": {
9
- "fields": {
10
- "value": {
11
- "repeated_field": {
12
- "fields": {
13
- "text_statistics": {
14
- "fields": {
15
- "num_characters": {
16
- "dtype": "int32"
17
- },
18
- "readability": {
19
- "dtype": "float32"
20
- },
21
- "log(type_token_ratio)": {
22
- "dtype": "float32"
23
- },
24
- "frac_non_ascii": {
25
- "dtype": "float32",
26
- "bins": [
27
- [
28
- "Low",
29
- null,
30
- 0.15
31
- ],
32
- [
33
- "Medium",
34
- 0.15,
35
- 0.3
36
- ],
37
- [
38
- "High",
39
- 0.3,
40
- null
41
- ]
42
- ]
43
- }
44
- },
45
- "signal": {
46
- "signal_name": "text_statistics"
47
- }
48
- }
49
- }
50
- }
51
- }
52
- }
53
- }
54
- }
55
- },
56
- "signal": {
57
- "signal_name": "text_statistics"
58
- },
59
- "enriched_path": [
60
- "new-response",
61
- "value",
62
- "*"
63
- ]
64
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/data-00000-of-00001.parquet DELETED
Binary file (521 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/lang_detection/signal_manifest.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-context.lang_detection",
6
- "data_schema": {
7
- "fields": {
8
- "original-context": {
9
- "fields": {
10
- "lang_detection": {
11
- "dtype": "string",
12
- "signal": {
13
- "split_by_paragraph": false,
14
- "signal_name": "lang_detection"
15
- }
16
- }
17
- }
18
- }
19
- }
20
- },
21
- "signal": {
22
- "split_by_paragraph": false,
23
- "signal_name": "lang_detection"
24
- },
25
- "enriched_path": [
26
- "original-context"
27
- ]
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/data-00000-of-00001.parquet DELETED
Binary file (550 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/near_dup/signal_manifest.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-context.near_dup",
6
- "data_schema": {
7
- "fields": {
8
- "original-context": {
9
- "fields": {
10
- "near_dup": {
11
- "fields": {
12
- "cluster_id": {
13
- "dtype": "uint32",
14
- "categorical": true
15
- }
16
- },
17
- "signal": {
18
- "threshold": 0.85,
19
- "signal_name": "near_dup"
20
- }
21
- }
22
- }
23
- }
24
- }
25
- },
26
- "signal": {
27
- "threshold": 0.85,
28
- "signal_name": "near_dup"
29
- },
30
- "enriched_path": [
31
- "original-context"
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/data-00000-of-00001.parquet DELETED
Binary file (519 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/pii/signal_manifest.json DELETED
@@ -1,42 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-context.pii",
6
- "data_schema": {
7
- "fields": {
8
- "original-context": {
9
- "fields": {
10
- "pii": {
11
- "fields": {
12
- "emails": {
13
- "repeated_field": {
14
- "dtype": "string_span"
15
- }
16
- },
17
- "ip_addresses": {
18
- "repeated_field": {
19
- "dtype": "string_span"
20
- }
21
- },
22
- "secrets": {
23
- "repeated_field": {
24
- "dtype": "string_span"
25
- }
26
- }
27
- },
28
- "signal": {
29
- "signal_name": "pii"
30
- }
31
- }
32
- }
33
- }
34
- }
35
- },
36
- "signal": {
37
- "signal_name": "pii"
38
- },
39
- "enriched_path": [
40
- "original-context"
41
- ]
42
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/data-00000-of-00001.parquet DELETED
Binary file (602 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-context/text_statistics/signal_manifest.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-context.text_statistics",
6
- "data_schema": {
7
- "fields": {
8
- "original-context": {
9
- "fields": {
10
- "text_statistics": {
11
- "fields": {
12
- "num_characters": {
13
- "dtype": "int32"
14
- },
15
- "readability": {
16
- "dtype": "float32"
17
- },
18
- "log(type_token_ratio)": {
19
- "dtype": "float32"
20
- },
21
- "frac_non_ascii": {
22
- "dtype": "float32",
23
- "bins": [
24
- [
25
- "Low",
26
- null,
27
- 0.15
28
- ],
29
- [
30
- "Medium",
31
- 0.15,
32
- 0.3
33
- ],
34
- [
35
- "High",
36
- 0.3,
37
- null
38
- ]
39
- ]
40
- }
41
- },
42
- "signal": {
43
- "signal_name": "text_statistics"
44
- }
45
- }
46
- }
47
- }
48
- }
49
- },
50
- "signal": {
51
- "signal_name": "text_statistics"
52
- },
53
- "enriched_path": [
54
- "original-context"
55
- ]
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/data-00000-of-00001.parquet DELETED
Binary file (521 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/lang_detection/signal_manifest.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-instruction.lang_detection",
6
- "data_schema": {
7
- "fields": {
8
- "original-instruction": {
9
- "fields": {
10
- "lang_detection": {
11
- "dtype": "string",
12
- "signal": {
13
- "split_by_paragraph": false,
14
- "signal_name": "lang_detection"
15
- }
16
- }
17
- }
18
- }
19
- }
20
- },
21
- "signal": {
22
- "split_by_paragraph": false,
23
- "signal_name": "lang_detection"
24
- },
25
- "enriched_path": [
26
- "original-instruction"
27
- ]
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/data-00000-of-00001.parquet DELETED
Binary file (602 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/near_dup/signal_manifest.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-instruction.near_dup",
6
- "data_schema": {
7
- "fields": {
8
- "original-instruction": {
9
- "fields": {
10
- "near_dup": {
11
- "fields": {
12
- "cluster_id": {
13
- "dtype": "uint32",
14
- "categorical": true
15
- }
16
- },
17
- "signal": {
18
- "threshold": 0.85,
19
- "signal_name": "near_dup"
20
- }
21
- }
22
- }
23
- }
24
- }
25
- },
26
- "signal": {
27
- "threshold": 0.85,
28
- "signal_name": "near_dup"
29
- },
30
- "enriched_path": [
31
- "original-instruction"
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/data-00000-of-00001.parquet DELETED
Binary file (519 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/pii/signal_manifest.json DELETED
@@ -1,42 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-instruction.pii",
6
- "data_schema": {
7
- "fields": {
8
- "original-instruction": {
9
- "fields": {
10
- "pii": {
11
- "fields": {
12
- "emails": {
13
- "repeated_field": {
14
- "dtype": "string_span"
15
- }
16
- },
17
- "ip_addresses": {
18
- "repeated_field": {
19
- "dtype": "string_span"
20
- }
21
- },
22
- "secrets": {
23
- "repeated_field": {
24
- "dtype": "string_span"
25
- }
26
- }
27
- },
28
- "signal": {
29
- "signal_name": "pii"
30
- }
31
- }
32
- }
33
- }
34
- }
35
- },
36
- "signal": {
37
- "signal_name": "pii"
38
- },
39
- "enriched_path": [
40
- "original-instruction"
41
- ]
42
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/data-00000-of-00001.parquet DELETED
Binary file (581 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-instruction/text_statistics/signal_manifest.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-instruction.text_statistics",
6
- "data_schema": {
7
- "fields": {
8
- "original-instruction": {
9
- "fields": {
10
- "text_statistics": {
11
- "fields": {
12
- "num_characters": {
13
- "dtype": "int32"
14
- },
15
- "readability": {
16
- "dtype": "float32"
17
- },
18
- "log(type_token_ratio)": {
19
- "dtype": "float32"
20
- },
21
- "frac_non_ascii": {
22
- "dtype": "float32",
23
- "bins": [
24
- [
25
- "Low",
26
- null,
27
- 0.15
28
- ],
29
- [
30
- "Medium",
31
- 0.15,
32
- 0.3
33
- ],
34
- [
35
- "High",
36
- 0.3,
37
- null
38
- ]
39
- ]
40
- }
41
- },
42
- "signal": {
43
- "signal_name": "text_statistics"
44
- }
45
- }
46
- }
47
- }
48
- }
49
- },
50
- "signal": {
51
- "signal_name": "text_statistics"
52
- },
53
- "enriched_path": [
54
- "original-instruction"
55
- ]
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/data-00000-of-00001.parquet DELETED
Binary file (521 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/lang_detection/signal_manifest.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-response.lang_detection",
6
- "data_schema": {
7
- "fields": {
8
- "original-response": {
9
- "fields": {
10
- "lang_detection": {
11
- "dtype": "string",
12
- "signal": {
13
- "split_by_paragraph": false,
14
- "signal_name": "lang_detection"
15
- }
16
- }
17
- }
18
- }
19
- }
20
- },
21
- "signal": {
22
- "split_by_paragraph": false,
23
- "signal_name": "lang_detection"
24
- },
25
- "enriched_path": [
26
- "original-response"
27
- ]
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/data-00000-of-00001.parquet DELETED
Binary file (602 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/near_dup/signal_manifest.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-response.near_dup",
6
- "data_schema": {
7
- "fields": {
8
- "original-response": {
9
- "fields": {
10
- "near_dup": {
11
- "fields": {
12
- "cluster_id": {
13
- "dtype": "uint32",
14
- "categorical": true
15
- }
16
- },
17
- "signal": {
18
- "threshold": 0.85,
19
- "signal_name": "near_dup"
20
- }
21
- }
22
- }
23
- }
24
- }
25
- },
26
- "signal": {
27
- "threshold": 0.85,
28
- "signal_name": "near_dup"
29
- },
30
- "enriched_path": [
31
- "original-response"
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/data-00000-of-00001.parquet DELETED
Binary file (519 kB)
 
data/datasets/lilac/databricks-dolly-15k-curated-en/original-response/pii/signal_manifest.json DELETED
@@ -1,42 +0,0 @@
1
- {
2
- "files": [
3
- "data-00000-of-00001.parquet"
4
- ],
5
- "parquet_id": "original-response.pii",
6
- "data_schema": {
7
- "fields": {
8
- "original-response": {
9
- "fields": {
10
- "pii": {
11
- "fields": {
12
- "emails": {
13
- "repeated_field": {
14
- "dtype": "string_span"
15
- }
16
- },
17
- "ip_addresses": {
18
- "repeated_field": {
19
- "dtype": "string_span"
20
- }
21
- },
22
- "secrets": {
23
- "repeated_field": {
24
- "dtype": "string_span"
25
- }
26
- }
27
- },
28
- "signal": {
29
- "signal_name": "pii"
30
- }
31
- }
32
- }
33
- }
34
- }
35
- },
36
- "signal": {
37
- "signal_name": "pii"
38
- },
39
- "enriched_path": [
40
- "original-response"
41
- ]
42
- }