from pathlib import Path
import shutil
import json
from onnxruntime.quantization import quantize_dynamic, QuantType
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

# Configuration
model_name = "dangvantuan/vietnamese-embedding"
output_dir = Path("onnx")
output_dir.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# Step 1: Export model to ONNX (FP32)
# --------------------------------------------------
print("Exporting FP32 model...")
model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
model.save_pretrained(output_dir)

# --------------------------------------------------
# Step 2: Convert tokenizer to JSON format
# --------------------------------------------------
print("Processing tokenizer...")
try:
    # First try to get a fast tokenizer directly
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tokenizer.save_pretrained(output_dir, legacy_format=False)
    print("✓ Saved modern tokenizer.json")
except Exception as e:
    print(f"Couldn't create fast tokenizer directly: {e}")
    print("Attempting manual conversion...")
    # Load the slow tokenizer
    slow_tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Save the original tokenizer files first
    slow_tokenizer.save_pretrained(output_dir)
    # Convert to the fast-tokenizer JSON format
    try:
        # Build a minimal tokenizer.json by hand. Note: this skeleton
        # assumes a WordPiece vocabulary. Only the fields below are valid
        # for the `tokenizers` WordPiece model (special tokens such as
        # CLS/SEP are handled by the post-processor, not the model, and
        # including them here would break deserialization).
        tokenizer_json = {
            "version": "1.0",
            "truncation": None,
            "padding": None,
            "added_tokens": [],
            "normalizer": {"type": "Sequence", "normalizers": []},
            "pre_tokenizer": {"type": "Whitespace"},
            "post_processor": None,
            "decoder": None,
            "model": {
                "type": "WordPiece",
                "unk_token": slow_tokenizer.unk_token,
                "continuing_subword_prefix": "##",
                "max_input_chars_per_word": 100,
                "vocab": slow_tokenizer.get_vocab(),
            },
        }
        # Save as tokenizer.json
        with open(output_dir / "tokenizer.json", "w", encoding="utf-8") as f:
            json.dump(tokenizer_json, f, ensure_ascii=False, indent=2)
        print("✓ Manually created tokenizer.json")
    except Exception as e:
        print(f"Failed to create tokenizer.json: {e}")
        print("Falling back to original tokenizer files")

# --------------------------------------------------
# Step 3: Quantize model to INT8
# --------------------------------------------------
print("Quantizing to INT8...")
quantize_dynamic(
    model_input=output_dir / "model.onnx",
    model_output=output_dir / "model_quantized.onnx",
    weight_type=QuantType.QInt8,
)

# --------------------------------------------------
# Step 4: Clean up file organization
# --------------------------------------------------
print("Organizing files...")
# Move all JSON files (tokenizer files and config.json) to the project root
for json_file in output_dir.glob("*.json"):
    shutil.move(str(json_file), str(Path(".") / json_file.name))

print("✅ Conversion complete!")
print(f"ONNX models saved in: {output_dir}")
print("Tokenizer files moved to project root")
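
# --------------------------------------------------
# Optional smoke test (a minimal sketch, not part of the conversion
# itself). It assumes the exported graph uses the usual `input_ids` /
# `attention_mask` input names and returns the last hidden state as its
# first output, which is what the optimum feature-extraction export
# normally produces; adjust the names if your graph differs. Run it
# after the steps above to confirm the quantized model and the
# relocated tokenizer.json load and produce an embedding.
# --------------------------------------------------
import numpy as np
import onnxruntime as ort
from tokenizers import Tokenizer

session = ort.InferenceSession(str(output_dir / "model_quantized.onnx"))
tok = Tokenizer.from_file("tokenizer.json")  # moved to project root in Step 4

encoding = tok.encode("Hà Nội là thủ đô của Việt Nam")
feed = {
    "input_ids": np.array([encoding.ids], dtype=np.int64),
    "attention_mask": np.array([encoding.attention_mask], dtype=np.int64),
}
# Some exports also declare token_type_ids; supply zeros if required
for graph_input in session.get_inputs():
    if graph_input.name == "token_type_ids" and graph_input.name not in feed:
        feed["token_type_ids"] = np.zeros_like(feed["input_ids"])

last_hidden = session.run(None, feed)[0]  # (1, seq_len, hidden)
embedding = last_hidden.mean(axis=1)      # simple mean pooling over tokens
print("Sentence embedding shape:", embedding.shape)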