gpahal committed on
Commit 2b34e84 · verified · 1 Parent(s): 69254c7

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -14,6 +14,7 @@
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
+*.onnx.data filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,111 @@
----
-license: mit
----
+---
+base_model: BAAI/bge-m3
+license: mit
+tags:
+- feature-extraction
+- sentence-similarity
+- multilingual
+- embedding
+- hybrid-retrieval
+- onnx
+- onnxruntime
+- optimum
+- quantization
+---
+
+This model is an ONNX Runtime, int8-quantized version of [BGE-M3](https://huggingface.co/BAAI/bge-m3).
+
+The model outputs dense, sparse, and ColBERT embedding representations all at once, as a list of NumPy arrays in that order.
+
+Note: the dense and ColBERT embeddings are normalized, matching the default behavior of the original FlagEmbedding library. If you want unnormalized outputs, you can modify the code in `export_onnx_int8.py` and re-run the script.
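+
+For example, to export unnormalized dense embeddings, you would change the corresponding line in `BGEM3InferenceModel.forward` (a sketch of the one-line edit, not a separate API):
+
+```python
+# In export_onnx_int8.py, BGEM3InferenceModel.forward:
+output["dense_vecs"] = dense_vecs  # instead of torch.nn.functional.normalize(dense_vecs, dim=-1)
+```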
+
+This model also has "O2"-level graph optimizations applied; you can read more about optimization levels [here](https://huggingface.co/docs/optimum/en/onnxruntime/usage_guides/optimization). If you want an ONNX model with different optimizations, or with none, re-run the export script `export_onnx_int8.py` with the appropriate optimization argument.
+
+## Usage with ONNX Runtime (Python)
+
+If you haven't already, install the [ONNX Runtime](https://onnxruntime.ai/) Python library. The example below loads the model through Optimum's `ORTModelForCustomTasks`, so the simplest route is the Optimum extra, which pulls in `onnxruntime` as a dependency:
+
+```bash
+pip install optimum[onnxruntime]
+```
+
+For tokenization, you can use HF Transformers, for example:
+
+```bash
+pip install transformers
+```
+
+Clone this repository with [Git LFS](https://git-lfs.com/) to get the ONNX model files.
+
+You can then use the model to compute embeddings, as follows:
+
+```python
+import time
+
+from optimum.onnxruntime import ORTModelForCustomTasks
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
+model = ORTModelForCustomTasks.from_pretrained("gpahal/bge-m3-onnx-int8")
+
+questions = ["What is your opening hour?", "Where are your offices?"]
+input_q = tokenizer(
+    questions,
+    padding=True,
+    truncation=True,
+    return_tensors="np",
+)
+print(f"Question input keys: {list(input_q.keys())}, shapes: {[v.shape for v in input_q.values()]}")
+
+t0 = time.perf_counter()
+output_q = model(**input_q)
+print(f"Time taken: {(time.perf_counter() - t0) * 1e3:.1f} ms")
+```
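+
+The output names follow `BGEM3OnnxConfig.outputs` in `export_onnx_int8.py` (`dense_vecs`, `sparse_vecs`, `colbert_vecs`). A minimal sketch of unpacking them, assuming the example above has just run:
+
+```python
+dense_vecs = output_q["dense_vecs"]      # shape (batch_size, 1024), L2-normalized
+sparse_vecs = output_q["sparse_vecs"]    # shape (batch_size, seq_len, 1), per-token weights
+colbert_vecs = output_q["colbert_vecs"]  # shape (batch_size, seq_len - 1, 1024), normalized
+print(dense_vecs.shape, sparse_vecs.shape, colbert_vecs.shape)
+```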
+
+Note: you can use the following sparse token-weight processor, adapted from FlagEmbedding, to get the same output for the sparse representation from the ONNX model:
+
+```python
+from collections import defaultdict
+
+import numpy as np
+
+
+def process_token_weights(token_weights: np.ndarray, input_ids: list):
+    # Convert to a dict of token id -> max weight, skipping special tokens.
+    result = defaultdict(int)
+    unused_tokens = {
+        tokenizer.cls_token_id,
+        tokenizer.eos_token_id,
+        tokenizer.pad_token_id,
+        tokenizer.unk_token_id,
+    }
+    for w, idx in zip(token_weights, input_ids):
+        if idx not in unused_tokens and w > 0:
+            idx = str(idx)
+            if w > result[idx]:
+                result[idx] = w
+    return result
+
+
+token_weights = output_q["sparse_vecs"].squeeze(-1)
+lexical_weights = list(
+    map(process_token_weights, token_weights, input_q["input_ids"].tolist())
+)
+```
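+
+Given two such weight dicts, you can score lexical overlap between two texts by summing the products of weights for token ids that appear in both, which is how FlagEmbedding computes its lexical matching score. A small helper sketch:
+
+```python
+def lexical_matching_score(weights_1: dict[str, float], weights_2: dict[str, float]) -> float:
+    # Dot product over the token ids the two texts share.
+    return sum(w * weights_2[t] for t, w in weights_1.items() if t in weights_2)
+```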
+
+## Export ONNX weights
+
+You can export ONNX weights with the provided export script `export_onnx_int8.py`, which leverages HF Optimum.
+If needed, you can modify the model configuration, for example to remove embedding normalization or to not output all three embedding representations. If you change the number of output representations, you also need to modify the ONNX output config `BGEM3OnnxConfig` in `export_onnx_int8.py`, as sketched below.
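+
+As an illustration, if you kept only the dense representation, `BGEM3OnnxConfig.outputs` would shrink to a single entry (a sketch against the class as defined in the script):
+
+```python
+@property
+def outputs(self) -> dict[str, dict[int, str]]:
+    # Only the dense output remains; sparse_vecs and colbert_vecs are dropped.
+    return copy.deepcopy(OrderedDict({"dense_vecs": {0: "batch_size", 1: "embedding"}}))
+```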
+
+First, install the needed Python requirements:
+
+```bash
+pip install -r requirements.txt
+```
+
+Then you can export ONNX weights as follows:
+
+```bash
+python export_onnx_int8.py --opset 17 --device cpu --optimize O2
+```
+
+You can read more about the optional optimization levels [here](https://huggingface.co/docs/optimum/en/onnxruntime/usage_guides/optimization).
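+
+After the export finishes, you can sanity-check the quantized model directly with ONNX Runtime. A minimal sketch, assuming the default `--output onnx_model` (the script appends `_int8`, and Optimum saves the quantized file as `model_quantized.onnx`):
+
+```python
+import onnxruntime as ort
+
+session = ort.InferenceSession("onnx_model_int8/model_quantized.onnx")
+print([o.name for o in session.get_outputs()])  # expected: dense_vecs, sparse_vecs, colbert_vecs
+```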
config.json ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": ".",
+  "architectures": ["XLMRobertaModel"],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-5,
+  "max_position_embeddings": 8194,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.4",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
export_onnx_int8.py ADDED
@@ -0,0 +1,191 @@
+import argparse
+import copy
+import logging
+import os
+from collections import OrderedDict
+
+import torch
+from huggingface_hub import snapshot_download
+from optimum.exporters.onnx import onnx_export_from_model
+from optimum.exporters.onnx.model_configs import XLMRobertaOnnxConfig
+from optimum.exporters.tasks import TasksManager
+from optimum.onnxruntime import ORTQuantizer
+from optimum.onnxruntime.configuration import AutoQuantizationConfig
+from torch import Tensor, nn
+from transformers import AutoConfig, AutoModel
+
+logger = logging.getLogger(__name__)
+
+
+class BGEM3InferenceModel(nn.Module):
+    def __init__(
+        self,
+        model_name: str = "BAAI/bge-m3",
+        colbert_dim: int = -1,
+    ) -> None:
+        super().__init__()
+
+        model_name = snapshot_download(
+            repo_id=model_name,
+            allow_patterns=[
+                "model.safetensors",
+                "colbert_linear.pt",
+                "sparse_linear.pt",
+                "config.json",
+            ],
+        )
+
+        self.config = AutoConfig.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+        self.colbert_linear = torch.nn.Linear(
+            in_features=self.model.config.hidden_size,
+            out_features=(
+                self.model.config.hidden_size if colbert_dim == -1 else colbert_dim
+            ),
+        )
+        self.sparse_linear = torch.nn.Linear(
+            in_features=self.model.config.hidden_size, out_features=1
+        )
+        colbert_state_dict = torch.load(
+            os.path.join(model_name, "colbert_linear.pt"), map_location="cpu"
+        )
+        sparse_state_dict = torch.load(
+            os.path.join(model_name, "sparse_linear.pt"), map_location="cpu"
+        )
+        self.colbert_linear.load_state_dict(colbert_state_dict)
+        self.sparse_linear.load_state_dict(sparse_state_dict)
+
+    def dense_embedding(self, last_hidden_state: Tensor) -> Tensor:
+        return last_hidden_state[:, 0]
+
+    def sparse_embedding(self, last_hidden_state: Tensor) -> Tensor:
+        with torch.no_grad():
+            return torch.relu(self.sparse_linear(last_hidden_state))
+
+    def colbert_embedding(
+        self, last_hidden_state: Tensor, attention_mask: Tensor
+    ) -> Tensor:
+        with torch.no_grad():
+            colbert_vecs = self.colbert_linear(last_hidden_state[:, 1:])
+            colbert_vecs = colbert_vecs * attention_mask[:, 1:][:, :, None].float()
+        return colbert_vecs
+
+    def forward(self, input_ids: Tensor, attention_mask: Tensor) -> dict[str, Tensor]:
+        with torch.no_grad():
+            last_hidden_state = self.model(
+                input_ids=input_ids, attention_mask=attention_mask, return_dict=True
+            ).last_hidden_state
+
+        output = {}
+        dense_vecs = self.dense_embedding(last_hidden_state)
+        output["dense_vecs"] = torch.nn.functional.normalize(dense_vecs, dim=-1)
+
+        sparse_vecs = self.sparse_embedding(last_hidden_state)
+        output["sparse_vecs"] = sparse_vecs
+
+        colbert_vecs = self.colbert_embedding(last_hidden_state, attention_mask)
+        output["colbert_vecs"] = torch.nn.functional.normalize(colbert_vecs, dim=-1)
+
+        return output
+
+
+class BGEM3OnnxConfig(XLMRobertaOnnxConfig):
+    @property
+    def outputs(self) -> dict[str, dict[int, str]]:
+        """
+        Dict containing the axis definition of the output tensors to provide to the model.
+
+        Returns:
+            `Dict[str, Dict[int, str]]`: A mapping of each output name to a mapping of axis position to the axes symbolic name.
+        """
+        return copy.deepcopy(
+            OrderedDict(
+                {
+                    "dense_vecs": {0: "batch_size", 1: "embedding"},
+                    "sparse_vecs": {0: "batch_size", 1: "token", 2: "weight"},
+                    "colbert_vecs": {0: "batch_size", 1: "token", 2: "embedding"},
+                }
+            )
+        )
+
+
+def main(output: str, opset: int, device: str, optimize: str, atol: float):
+    model = BGEM3InferenceModel()
+    bgem3_onnx_config = BGEM3OnnxConfig(model.config)
+
+    # Export to ONNX first
+    print("Exporting to ONNX...")
+
+    # Monkey-patch the library inference to return 'transformers'
+    original_infer = TasksManager.infer_library_from_model
+    TasksManager.infer_library_from_model = lambda model: "transformers"
+
+    try:
+        onnx_export_from_model(
+            model,  # Use the full custom model
+            output=output,
+            task="feature-extraction",
+            custom_onnx_configs={"model": bgem3_onnx_config},
+            opset=opset,
+            optimize=optimize,
+            atol=atol,
+            device=device,
+        )
+    finally:
+        # Restore original function
+        TasksManager.infer_library_from_model = original_infer
+    print(f"ONNX model saved to: {output}")
+
+    # Apply quantization
+    print("Quantizing model...")
+    quantizer = ORTQuantizer.from_pretrained(output)
+    qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
+    print("Applying dynamic int8 quantization...")
+    quantized_path = f"{output}_int8"
+    quantizer.quantize(
+        save_dir=quantized_path,
+        quantization_config=qconfig,
+    )
+    print(f"Quantized model saved to: {quantized_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="onnx_model",
+        help="Path indicating the directory where to store the generated ONNX model.",
+    )
+    parser.add_argument(
+        "--opset",
+        type=int,
+        default=None,
+        help="If specified, ONNX opset version to export the model with. Otherwise, the default opset for the given model architecture will be used.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        help='The device to use to do the export. Defaults to "cpu".',
+    )
+    parser.add_argument(
+        "--optimize",
+        type=str,
+        default=None,
+        choices=["O1", "O2", "O3", "O4"],
+        help=(
+            "Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to ONNX Runtime, and the resulting ONNX will not be usable with other runtimes such as OpenVINO or TensorRT. Possible options:\n"
+            "  - O1: Basic general optimizations\n"
+            "  - O2: Basic and extended general optimizations, transformers-specific fusions\n"
+            "  - O3: Same as O2 with GELU approximation\n"
+            "  - O4: Same as O3 with mixed precision (fp16, GPU-only, requires `--device cuda`)"
+        ),
+    )
+    parser.add_argument(
+        "--atol",
+        type=float,
+        default=None,
+        help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
+    )
+    args = parser.parse_args()
+
+    main(args.output, args.opset, args.device, args.optimize, args.atol)
model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16de7ea1146ca427e14938ec3e9abfdcaff0e6ac76434cd693ac35d761250bcb
+size 569958496
ort_config.json ADDED
@@ -0,0 +1,33 @@
+{
+  "one_external_file": true,
+  "opset": null,
+  "optimization": {},
+  "quantization": {
+    "activations_dtype": "QUInt8",
+    "activations_symmetric": false,
+    "format": "QOperator",
+    "is_static": false,
+    "mode": "IntegerOps",
+    "nodes_to_exclude": [],
+    "nodes_to_quantize": [],
+    "operators_to_quantize": [
+      "Conv",
+      "MatMul",
+      "Attention",
+      "LSTM",
+      "Gather",
+      "Transpose",
+      "EmbedLayerNormalization"
+    ],
+    "per_channel": false,
+    "qdq_add_pair_to_weight": false,
+    "qdq_dedicated_pair": false,
+    "qdq_op_type_per_channel_support_to_axis": {
+      "MatMul": 1
+    },
+    "reduce_range": false,
+    "weights_dtype": "QInt8",
+    "weights_symmetric": true
+  },
+  "use_external_data_format": false
+}
requirements.txt ADDED
@@ -0,0 +1,6 @@
+accelerate==1.8.1
+huggingface-hub==0.33.0
+onnx==1.18.0
+onnxruntime==1.22.0
+optimum==1.26.1
+transformers==4.52.4
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:249df0778f236f6ece390de0de746838ef25b9d6954b68c2ee71249e0a9d8fd4
+size 17082799
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}