Commit 5b5f554 · rename tokenizer
Parent: 28698b8
tokenization_luke_bert_japanese.py → tokenization_ubke_bert_japanese.py  RENAMED

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for LUKE."""
+"""Tokenization classes for UBKE."""

 import collections
 import copy
@@ -57,7 +57,7 @@ logger = logging.get_logger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "entity_vocab_file": "entity_vocab.json", "spm_file": "spiece.model"}


-class LukeBertJapaneseTokenizer(PreTrainedTokenizer):
+class UbkeBertJapaneseTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask", "position_ids"]
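Because both the module and the class were renamed, any code that imports the tokenizer directly must switch to the new names. A minimal usage sketch (not part of the commit; the checkpoint path is a placeholder):

```python
# Minimal sketch: direct use of the renamed class. "path/to/checkpoint"
# is a placeholder, not taken from this commit.
from tokenization_ubke_bert_japanese import UbkeBertJapaneseTokenizer

tokenizer = UbkeBertJapaneseTokenizer.from_pretrained("path/to/checkpoint")

# model_input_names, declared on the class above, names the inputs this
# tokenizer is expected to produce for the model.
print(tokenizer.model_input_names)  # ["input_ids", "attention_mask", "position_ids"]
```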
tokenizer_config.json  CHANGED

@@ -69,7 +69,7 @@
   ],
   "auto_map": {
     "AutoTokenizer": [
-      "tokenization_luke_bert_japanese.LukeBertJapaneseTokenizer",
+      "tokenization_ubke_bert_japanese.UbkeBertJapaneseTokenizer",
       null
     ]
   },
@@ -99,7 +99,7 @@
   "subword_tokenizer_type": "wordpiece",
   "sudachi_kwargs": null,
   "task": null,
-  "tokenizer_class": "LukeBertJapaneseTokenizer",
+  "tokenizer_class": "UbkeBertJapaneseTokenizer",
   "unk_token": "[UNK]",
   "word_tokenizer_type": "mecab"
-}
+}
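The `auto_map` entry is what lets `AutoTokenizer` resolve the renamed class from the repository itself (the trailing `null` means no fast tokenizer is registered). A minimal loading sketch, assuming a placeholder repo id:

```python
# Minimal sketch of loading through the updated auto_map. "org/ubke-bert-japanese"
# is a placeholder repo id. trust_remote_code=True is required because
# UbkeBertJapaneseTokenizer lives in the repo's tokenization_ubke_bert_japanese.py
# rather than in the transformers library.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("org/ubke-bert-japanese", trust_remote_code=True)
```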