import json
import shutil
from pathlib import Path

import onnx
from onnxconverter_common import float16
from onnxruntime.quantization import QuantType, quantize_dynamic
from optimum.onnxruntime import ORTModelForFeatureExtraction
from tokenizers import Tokenizer
from transformers import AutoTokenizer
# Configuration
model_name = "dangvantuan/vietnamese-embedding"
output_dir = Path("onnx")
output_dir.mkdir(parents=True, exist_ok=True)
# --------------------------------------------------
# Step 1: Export model to ONNX (FP32)
# --------------------------------------------------
print("Exporting FP32 model...")
model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
model.save_pretrained(output_dir)
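# --------------------------------------------------
# Optional: FP16 variant (a sketch; the float16 import above is
# otherwise unused, and onnxconverter_common provides this
# conversion — skip this block if you only need FP32/INT8)
# --------------------------------------------------
fp32_model = onnx.load(str(output_dir / "model.onnx"))
# keep_io_types=True keeps FP32 inputs/outputs so callers need no changes
fp16_model = float16.convert_float_to_float16(fp32_model, keep_io_types=True)
onnx.save(fp16_model, str(output_dir / "model_fp16.onnx"))
print("✓ Saved FP16 model")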
# --------------------------------------------------
# Step 2: Convert tokenizer to JSON format
# --------------------------------------------------
print("Processing tokenizer...")
try:
    # First, try to get a fast tokenizer directly
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tokenizer.save_pretrained(output_dir, legacy_format=False)
    print("✓ Saved modern tokenizer.json")
except Exception as e:
    print(f"Couldn't create fast tokenizer directly: {e}")
    print("Attempting manual conversion...")
    # Load the slow tokenizer
    slow_tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Save its original files first
    slow_tokenizer.save_pretrained(output_dir)
    # Convert to fast tokenizer format
    try:
        # Build a minimal tokenizer.json payload from the slow
        # tokenizer's vocab (a rough WordPiece approximation)
        tokenizer_json = {
            "version": "1.0",
            "truncation": None,
            "padding": None,
            "added_tokens": [],
            "normalizer": {
                "type": "Sequence",
                "normalizers": []
            },
            "pre_tokenizer": {
                "type": "Whitespace"
            },
            "post_processor": None,
            "decoder": None,
            "model": {
                # Keep to the fields the tokenizers WordPiece schema
                # accepts; sep/cls/pad/mask tokens are not model fields
                # and are already covered by the files saved above.
                "type": "WordPiece",
                "unk_token": slow_tokenizer.unk_token,
                "continuing_subword_prefix": "##",
                "max_input_chars_per_word": 100,
                "vocab": slow_tokenizer.get_vocab()
            }
        }
        # Save as tokenizer.json
        with open(output_dir / "tokenizer.json", "w", encoding="utf-8") as f:
            json.dump(tokenizer_json, f, ensure_ascii=False, indent=2)
        print("✓ Manually created tokenizer.json")
    except Exception as e:
        print(f"Failed to create tokenizer.json: {e}")
        print("Falling back to original tokenizer files")
# --------------------------------------------------
# Step 3: Quantize model to INT8
# --------------------------------------------------
print("Quantizing to INT8...")
quantize_dynamic(
    model_input=output_dir / "model.onnx",
    model_output=output_dir / "model_quantized.onnx",
    weight_type=QuantType.QInt8,
)
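# --------------------------------------------------
# Optional sanity check: compare FP32 vs INT8 embeddings
# (a minimal sketch, assuming numpy and onnxruntime are installed
# and the graph exposes BERT-style inputs; adjust names if yours
# differ)
# --------------------------------------------------
import numpy as np
import onnxruntime as ort

check_tok = AutoTokenizer.from_pretrained(model_name)
enc = check_tok("Hà Nội là thủ đô của Việt Nam.", return_tensors="np")

def embed(model_path):
    sess = ort.InferenceSession(str(model_path))
    # feed only the inputs this graph actually declares, as int64
    wanted = {i.name for i in sess.get_inputs()}
    feeds = {k: v.astype(np.int64) for k, v in enc.items() if k in wanted}
    # mean-pool the last hidden state into a single vector
    return sess.run(None, feeds)[0].mean(axis=1)[0]

v_fp32 = embed(output_dir / "model.onnx")
v_int8 = embed(output_dir / "model_quantized.onnx")
cos = np.dot(v_fp32, v_int8) / (np.linalg.norm(v_fp32) * np.linalg.norm(v_int8))
print(f"FP32 vs INT8 cosine similarity: {cos:.4f}")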
# --------------------------------------------------
# Step 4: Clean up file organization
# --------------------------------------------------
print("Organizing files...")
# Move all JSON files to parent directory
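# (Note: this also moves config.json out of onnx/; optimum needs
# config.json next to model.onnx to reload the export, so copy it
# back or exclude it here if you plan to reload with optimum.)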
for json_file in output_dir.glob("*.json"):
    shutil.move(str(json_file), str(Path(".") / json_file.name))
print("✅ Conversion complete!")
print(f"ONNX models saved in: {output_dir}")
print(f"Tokenizer files moved to project root") |