vietnamese-embedding-onnx / convert_to_onnx.py
from pathlib import Path
import onnx
import shutil
import json
from onnxconverter_common import float16
from onnxruntime.quantization import quantize_dynamic, QuantType
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from tokenizers import Tokenizer
# Configuration
model_name = "dangvantuan/vietnamese-embedding"
output_dir = Path("onnx")
output_dir.mkdir(parents=True, exist_ok=True)
# --------------------------------------------------
# Step 1: Export model to ONNX (FP32)
# --------------------------------------------------
print("Exporting FP32 model...")
model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
model.save_pretrained(output_dir)
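# Quick smoke test (a sketch, not part of the export itself): load the FP32
# graph with onnxruntime and run a dummy batch. This assumes every graph input
# is an int64 token tensor, which holds for standard transformer exports.
import numpy as np
import onnxruntime as ort
sess = ort.InferenceSession(str(output_dir / "model.onnx"))
feed = {i.name: np.ones((1, 8), dtype=np.int64) for i in sess.get_inputs()}
print("βœ“ FP32 model runs, output shape:", sess.run(None, feed)[0].shape)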
# --------------------------------------------------
# Step 2: Convert tokenizer to JSON format
# --------------------------------------------------
print("Processing tokenizer...")
try:
    # First, try to get a fast tokenizer directly
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tokenizer.save_pretrained(output_dir, legacy_format=False)
    print("βœ“ Saved modern tokenizer.json")
except Exception as e:
    print(f"Couldn't create fast tokenizer directly: {e}")
    print("Attempting manual conversion...")
    # Load the slow tokenizer
    slow_tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Save the original files first
    slow_tokenizer.save_pretrained(output_dir)
    # Convert to the fast-tokenizer JSON format
    try:
        # Build a minimal tokenizer.json by hand. Note: the "model" section of
        # the tokenizers schema only accepts vocab, unk_token,
        # continuing_subword_prefix, and max_input_chars_per_word; special
        # tokens such as cls/sep/pad/mask belong in "added_tokens" or the
        # post-processor, not in the model section.
        tokenizer_json = {
            "version": "1.0",
            "truncation": None,
            "padding": None,
            "added_tokens": [],
            "normalizer": {
                "type": "Sequence",
                "normalizers": []
            },
            "pre_tokenizer": {
                "type": "Whitespace"
            },
            "post_processor": None,
            "decoder": None,
            "model": {
                "type": "WordPiece",
                "unk_token": slow_tokenizer.unk_token,
                "continuing_subword_prefix": "##",
                "max_input_chars_per_word": 100,
                "vocab": slow_tokenizer.get_vocab()
            }
        }
        # Save as tokenizer.json
        with open(output_dir / "tokenizer.json", "w", encoding="utf-8") as f:
            json.dump(tokenizer_json, f, ensure_ascii=False, indent=2)
        print("βœ“ Manually created tokenizer.json")
    except Exception as e:
        print(f"Failed to create tokenizer.json: {e}")
        print("Falling back to original tokenizer files")
# --------------------------------------------------
# Step 3: Quantize model to INT8
# --------------------------------------------------
print("Quantizing to INT8...")
quantize_dynamic(
    model_input=output_dir / "model.onnx",
    model_output=output_dir / "model_quantized.onnx",
    weight_type=QuantType.QInt8,
)
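# Optional FP16 export (a sketch using the float16 helper imported above):
# keep_io_types=True leaves graph inputs/outputs in FP32 so callers need not
# cast; weights are halved in size at a small accuracy cost.
print("Converting to FP16...")
fp32_onnx = onnx.load(str(output_dir / "model.onnx"))
fp16_onnx = float16.convert_float_to_float16(fp32_onnx, keep_io_types=True)
onnx.save(fp16_onnx, str(output_dir / "model_fp16.onnx"))
# Report artifact sizes as a quick check that conversion actually shrank the model.
for name in ("model.onnx", "model_fp16.onnx", "model_quantized.onnx"):
    print(f"  {name}: {(output_dir / name).stat().st_size / 1e6:.1f} MB")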
# --------------------------------------------------
# Step 4: Clean up file organization
# --------------------------------------------------
print("Organizing files...")
# Move all JSON files to parent directory
for json_file in output_dir.glob("*.json"):
    shutil.move(str(json_file), str(Path(".") / json_file.name))
print("βœ… Conversion complete!")
print(f"ONNX models saved in: {output_dir}")
print(f"Tokenizer files moved to project root")