chore: add virtual environment to the repository
- Add the backend_service/venv virtual environment
- Includes all Python dependency packages
- Note: the virtual environment is about 393 MB and contains 12,655 files
@@ -0,0 +1,78 @@
# automatically generated by the FlatBuffers compiler, do not modify

# namespace: CalTableFlatBuffers

import flatbuffers
from flatbuffers.compat import import_numpy

np = import_numpy()


class KeyValue:
    __slots__ = ["_tab"]

    @classmethod
    def GetRootAs(cls, buf, offset=0):  # noqa: N802
        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
        x = KeyValue()
        x.Init(buf, n + offset)
        return x

    @classmethod
    def GetRootAsKeyValue(cls, buf, offset=0):  # noqa: N802
        """This method is deprecated. Please switch to GetRootAs."""
        return cls.GetRootAs(buf, offset)

    # KeyValue
    def Init(self, buf, pos):  # noqa: N802
        self._tab = flatbuffers.table.Table(buf, pos)

    # KeyValue
    def Key(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        if o != 0:
            return self._tab.String(o + self._tab.Pos)
        return None

    # KeyValue
    def Value(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
        if o != 0:
            return self._tab.String(o + self._tab.Pos)
        return None


def Start(builder):  # noqa: N802
    builder.StartObject(2)


def KeyValueStart(builder):  # noqa: N802
    """This method is deprecated. Please switch to Start."""
    return Start(builder)


def AddKey(builder, key):  # noqa: N802
    builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)


def KeyValueAddKey(builder, key):  # noqa: N802
    """This method is deprecated. Please switch to AddKey."""
    return AddKey(builder, key)


def AddValue(builder, value):  # noqa: N802
    builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)


def KeyValueAddValue(builder, value):  # noqa: N802
    """This method is deprecated. Please switch to AddValue."""
    return AddValue(builder, value)


def End(builder):  # noqa: N802
    return builder.EndObject()


def KeyValueEnd(builder):  # noqa: N802
    """This method is deprecated. Please switch to End."""
    return End(builder)
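# Editor's note: the sketch below is not part of the generated file above. It shows how such a
# generated FlatBuffers table is typically written and read with the flatbuffers Python API;
# the key/value strings are invented, and the package layout of CalTableFlatBuffers is assumed.
import flatbuffers
from onnxruntime.quantization.CalTableFlatBuffers import KeyValue as kv

builder = flatbuffers.Builder(0)
key = builder.CreateString("Conv_0_output")   # tensor name (placeholder)
value = builder.CreateString("0.0123")        # serialized calibration value (placeholder)
kv.Start(builder)
kv.AddKey(builder, key)
kv.AddValue(builder, value)
builder.Finish(kv.End(builder))

entry = kv.KeyValue.GetRootAs(builder.Output(), 0)
assert entry.Key() == b"Conv_0_output"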
@@ -0,0 +1,90 @@
# automatically generated by the FlatBuffers compiler, do not modify

# namespace: CalTableFlatBuffers

import flatbuffers
from flatbuffers.compat import import_numpy

np = import_numpy()


class TrtTable:
    __slots__ = ["_tab"]

    @classmethod
    def GetRootAs(cls, buf, offset=0):  # noqa: N802
        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
        x = TrtTable()
        x.Init(buf, n + offset)
        return x

    @classmethod
    def GetRootAsTrtTable(cls, buf, offset=0):  # noqa: N802
        """This method is deprecated. Please switch to GetRootAs."""
        return cls.GetRootAs(buf, offset)

    # TrtTable
    def Init(self, buf, pos):  # noqa: N802
        self._tab = flatbuffers.table.Table(buf, pos)

    # TrtTable
    def Dict(self, j):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        if o != 0:
            x = self._tab.Vector(o)
            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
            x = self._tab.Indirect(x)
            from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import KeyValue  # noqa: PLC0415

            obj = KeyValue()
            obj.Init(self._tab.Bytes, x)
            return obj
        return None

    # TrtTable
    def DictLength(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        if o != 0:
            return self._tab.VectorLen(o)
        return 0

    # TrtTable
    def DictIsNone(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        return o == 0


def Start(builder):  # noqa: N802
    builder.StartObject(1)


def TrtTableStart(builder):  # noqa: N802
    """This method is deprecated. Please switch to Start."""
    return Start(builder)


def AddDict(builder, dict):  # noqa: N802
    builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)


def TrtTableAddDict(builder, dict):  # noqa: N802
    """This method is deprecated. Please switch to AddDict."""
    return AddDict(builder, dict)


def StartDictVector(builder, numElems):  # noqa: N802
    return builder.StartVector(4, numElems, 4)


def TrtTableStartDictVector(builder, numElems):  # noqa: N802
    """This method is deprecated. Please switch to Start."""
    return StartDictVector(builder, numElems)


def End(builder):  # noqa: N802
    return builder.EndObject()


def TrtTableEnd(builder):  # noqa: N802
    """This method is deprecated. Please switch to End."""
    return End(builder)
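# Editor's note: an illustrative continuation of the previous sketch (not part of the generated
# file) that assembles a TrtTable whose Dict vector holds previously created KeyValue offsets.
# `builder` and `kv_offsets` are assumed to exist, and the EndVector() signature varies between
# flatbuffers versions (older releases take the element count as an argument).
from onnxruntime.quantization.CalTableFlatBuffers import TrtTable as tt

tt.StartDictVector(builder, len(kv_offsets))
for off in reversed(kv_offsets):  # FlatBuffers vectors are built back to front
    builder.PrependUOffsetTRelative(off)
dict_vec = builder.EndVector()

tt.Start(builder)
tt.AddDict(builder, dict_vec)
builder.Finish(tt.End(builder))

table = tt.TrtTable.GetRootAs(builder.Output(), 0)
print(table.DictLength())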
@@ -0,0 +1,19 @@
from .calibrate import (  # noqa: F401
    CalibraterBase,
    CalibrationDataReader,
    CalibrationMethod,
    MinMaxCalibrater,
    create_calibrator,
)
from .qdq_quantizer import QDQQuantizer  # noqa: F401
from .quant_utils import QuantFormat, QuantType, write_calibration_table  # noqa: F401
from .quantize import (
    DynamicQuantConfig,  # noqa: F401
    QuantizationMode,  # noqa: F401
    StaticQuantConfig,  # noqa: F401
    get_qdq_config,  # noqa: F401
    quantize,  # noqa: F401
    quantize_dynamic,  # noqa: F401
    quantize_static,  # noqa: F401
)
from .shape_inference import quant_pre_process  # noqa: F401
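# Editor's note: an illustrative use of the public API re-exported above (model paths, the input
# name "input", and the tensor shape are placeholders, not taken from this repository).
import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static


class RandomDataReader(CalibrationDataReader):
    """Feeds a few random batches to the calibrator; a real reader would yield validation data."""

    def __init__(self, n=8):
        self._batches = iter(np.random.rand(n, 1, 3, 224, 224).astype(np.float32))

    def get_next(self):
        batch = next(self._batches, None)
        return None if batch is None else {"input": batch}


quantize_static(
    "model_fp32.onnx",
    "model_int8.onnx",
    RandomDataReader(),
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
)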
@@ -0,0 +1,529 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
from typing import Any

import numpy as np
import onnx
import onnx.numpy_helper

try:
    from onnx.reference.op_run import to_array_extended
except ImportError:
    # old version of onnx.
    to_array_extended = None

from .calibrate import TensorData
from .onnx_model import ONNXModel
from .quant_utils import (
    DEQUANT_OP_NAME,
    ONNX_TYPE_TO_NP_TYPE,
    QUANT_OP_NAME,
    TENSOR_NAME_QUANT_SUFFIX,
    find_by_name,
    get_opset_version,
    model_has_infer_metadata,
    normalize_axis,
    pack_bytes_to_4bit,
    quantize_data,
    quantize_nparray,
    save_and_reload_model_with_shape_infer,
    tensor_proto_to_array,
)
from .tensor_quant_overrides import TensorQuantOverridesHelper


class QuantizationParams:
    def __init__(self, **data: dict[str, Any]):
        self.data = {}
        for k, v in data.items():
            if not isinstance(k, str):
                raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
            if k != "axis" and not isinstance(v, (int, str, np.ndarray, float)):
                raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
            if k == "axis" and not isinstance(v, int) and v is not None:
                raise TypeError(f"Axis value must be an int or None, not {type(v)}.")
            if k == "scale" and v.dtype not in (np.float32, np.float16):
                raise ValueError(f"scale must be a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
            self.data[k] = v

    def get(self, key, default_value=None):
        return self.data.get(key, default_value)

    def __iter__(self):
        yield from self.data

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, value):
        self.data[key] = value

    def __len__(self):
        return len(self.data)
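# Editor's note (not part of the original file): a minimal illustration of the validation the
# class above performs, with invented values:
#
#   params = QuantizationParams(
#       zero_point=np.array([0], dtype=np.uint8),
#       scale=np.array([0.1], dtype=np.float32),
#       axis=None,
#   )
#   params["scale"].dtype                      # -> float32
#   QuantizationParams(scale=np.array([0.1]))  # raises ValueError (float64 scale is rejected)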
class BaseQuantizer:
    def __init__(
        self,
        model,
        per_channel,
        reduce_range,
        weight_qType,
        activation_qType,
        tensors_range,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options=None,
    ):
        if not model_has_infer_metadata(model):
            model = save_and_reload_model_with_shape_infer(model)
        self.value_infos = {vi.name: vi for vi in model.graph.value_info}
        self.value_infos.update({ot.name: ot for ot in model.graph.output})
        self.value_infos.update({it.name: it for it in model.graph.input})

        self.model = ONNXModel(model)
        self.opset_version = get_opset_version(model)
        self.per_channel = per_channel  # weight-pack per channel
        self.reduce_range = reduce_range

        self.extra_options = extra_options if extra_options else {}
        self.enable_subgraph_quantization = (
            "EnableSubgraph" in self.extra_options and self.extra_options["EnableSubgraph"]
        )
        self.parent = None
        self.force_quantize_no_input_check = (
            "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
        )

        # If the user does not explicitly set "WeightSymmetric", then the weight's quantization type determines
        # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
        self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
        self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
        self.min_real_range = self.extra_options.get("MinimumRealRange")

        self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
        self.weight_qType = getattr(weight_qType, "tensor_type", weight_qType)

        """
        Dictionary specifying the min and max values for tensors. It has the following format:
            {
                "param_name": [min, max]
            }
        example:
            {
                'Conv_3:0': [np.float32(0), np.float32(0.5)],
                'Conv_4:0': [np.float32(1), np.float32(3.5)]
            }
        """
        if tensors_range is not None and any(not isinstance(t, TensorData) for t in tensors_range.values()):
            raise TypeError(
                f"tensors_range contains unexpected types { {type(v) for v in tensors_range.values()} }, not TensorData."
            )
        self.tensors_range = tensors_range
        self.nodes_to_quantize = nodes_to_quantize  # specific nodes to quantize
        self.nodes_to_exclude = nodes_to_exclude  # specific nodes to exclude
        self.op_types_to_quantize = op_types_to_quantize

        # Get tensor-level quantization overrides and ensure they are valid.
        self.tensor_quant_overrides = TensorQuantOverridesHelper(self.extra_options.get("TensorQuantOverrides", {}))

        self.initializers = {initzer.name: initzer for initzer in self.model.initializer()}
        overrides_valid, overrides_err = self.tensor_quant_overrides.is_valid(
            self.initializers, self.value_infos.keys(), activation_qType
        )
        if not overrides_valid:
            raise ValueError(overrides_err)

        self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()

    def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool:
        if self._is_weight_symmetric is not None:
            return self._is_weight_symmetric  # Return value explicitly set by user.
        return weight_quant_type in (
            onnx.TensorProto.INT4,
            onnx.TensorProto.INT8,
            onnx.TensorProto.INT16,
            onnx.TensorProto.FLOAT8E4M3FN,
        )

    def quantize_model(self):
        raise NotImplementedError

    def is_input_a_initializer(self, input_name):
        initializer = find_by_name(input_name, self.model.initializer())
        return initializer is not None

    def is_per_channel(self):
        return self.per_channel

    def is_valid_quantize_weight(self, weight_name):
        weight = find_by_name(weight_name, self.model.initializer())
        if weight is not None:
            return weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16)
        if (not self.enable_subgraph_quantization) or (self.parent is None):
            return False
        return self.parent.is_valid_quantize_weight(weight_name)

    def should_quantize_node(self, node):
        if (
            self.nodes_to_quantize is not None
            and len(self.nodes_to_quantize) != 0
            and node.name not in self.nodes_to_quantize
        ):
            return False

        if node.op_type not in self.op_types_to_quantize:
            return False

        if node.op_type in (DEQUANT_OP_NAME, QUANT_OP_NAME):
            return False

        if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
            return False

        return True

    def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1.0):
        """
        Quantize the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
        """

        # get bias
        bias_initializer = find_by_name(bias_name, self.model.initializer())
        bias_data = tensor_proto_to_array(bias_initializer)
        quantized_bias_name = bias_name + TENSOR_NAME_QUANT_SUFFIX

        # quantize bias
        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
            data = np.asarray(bias_data)
            if data.dtype == np.float16:
                node_qtype = onnx.TensorProto.FLOAT16
            elif data.dtype == np.float32:
                node_qtype = onnx.TensorProto.FLOAT
            else:
                raise TypeError(f"Only float16 or float32 are supported with float 8 but bias dtype is {data.dtype}.")
            quantized_data = data.astype(np.float32)
            bias_scale = np.array([1], dtype=quantized_data.dtype)
            bias_scale_data = bias_scale.reshape(-1)
            packed_bias_initializer = onnx.numpy_helper.from_array(quantized_data, quantized_bias_name)
            self.model.initializer_extend([packed_bias_initializer])
            node_type = "Cast"
        else:
            # calculate scale for bias
            # TODO: This formula should be explained including why the scale is not estimated for the bias as well.
            bias_scale = input_scale * weight_scale * beta

            # Quantize by dividing by bias_scale
            quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64)
            quantized_data = quantized_data.round()

            # Clip quantized data to the range of an int32
            int32_min = np.float64(np.iinfo(np.int32).min)
            int32_max = np.float64(np.iinfo(np.int32).max)
            if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max):
                logging.warning(
                    f"Quantized bias `{bias_name}` exceeds the range of an int32. The bias scale is too small."
                )

            quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32)

            # update bias initializer
            bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
            packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
            self.model.initializer_extend([packed_bias_initializer])

            # The bias's scale dtype should match the original bias data's unquantized type (float32 or float16).
            bias_scale_data = np.asarray(bias_scale, dtype=bias_data.dtype).reshape(-1)
            node_type = "DequantizeLinear"
            node_qtype = self.weight_qType

        # update scale initializer
        quantized_bias_scale_name = quantized_bias_name + "_scale"
        packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
        self.model.initializer_extend([packed_bias_scale_initializer])

        # update zero initializer
        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
            tensor_type = self.weight_qType
        else:
            tensor_type = onnx.TensorProto.INT32

        quantized_bias_zp_name = quantized_bias_name + "_zero_point"
        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
            packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, self.weight_qType, [1], [0.0])
        elif bias_scale.size > 1:
            bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
            packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
        else:
            packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, tensor_type, [], [0])
        self.model.initializer_extend([packed_bias_zp_initializer])

        return (
            quantized_bias_name,
            quantized_bias_scale_name,
            quantized_bias_zp_name,
            bias_scale_data,
            node_type,
            node_qtype,
        )
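    # Editor's note: a worked example (made-up numbers) of the int32 bias path above.
    # With input_scale = 0.02, weight_scale = 0.005 and beta = 1.0:
    #     bias_scale = 0.02 * 0.005 * 1.0 = 1e-4
    #     quantize:   round(0.37 / 1e-4) = 3700   (stored as int32)
    #     dequantize: 3700 * 1e-4 = 0.37          (recovered by the DequantizeLinear node)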
    def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_float_weight=False):
        """
        :param weight: TensorProto initializer
        :param qType: type to quantize to
        :param keep_float_weight: Whether to keep the weight in float. In some cases, we only want to quantize
                                  scale and zero point. If keep_float_weight is False, the weight is quantized;
                                  otherwise, it is left as float.
        :return: quantized weight name, zero point name, scale name
        """
        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
        q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
        zp_name = weight.name + "_zero_point"
        scale_name = weight.name + "_scale"

        # Quantize weight data. Use quantization overrides if provided by the user.
        weight_data = tensor_proto_to_array(weight)
        quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(weight.name, default_val={})
        if "quant_type" in quant_overrides:
            qType = quant_overrides["quant_type"].tensor_type  # noqa: N806

        if "scale" in quant_overrides and "zero_point" in quant_overrides:
            zero_point = np.array(quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[qType])
            scale = np.array(quant_overrides["scale"])
            q_weight_data = quantize_nparray(qType, weight_data.flatten(), scale, zero_point)
            assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
            assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                f"Unexpected dtype {zero_point.dtype}"
            )
            assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"

        else:
            symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric
            zero_point, scale, q_weight_data = quantize_data(
                weight_data.flatten(),
                qType,
                quant_overrides.get("symmetric", symmetric),
                reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
                min_real_range=self.min_real_range,
                rmin_override=quant_overrides.get("rmin"),
                rmax_override=quant_overrides.get("rmax"),
            )

            assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
            assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                f"Unexpected dtype {zero_point.dtype}"
            )
            assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"

        scale_dtype = weight.data_type
        scale_initializer = onnx.helper.make_tensor(scale_name, scale_dtype, [], scale.reshape((-1,)).tolist())
        zero_initializer = onnx.helper.make_tensor(zp_name, qType, [], zero_point.reshape((-1,)).tolist())
        self.model.initializer_extend([scale_initializer, zero_initializer])

        if not keep_float_weight:
            if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
                q_weight_initializer = onnx.TensorProto()
                q_weight_initializer.data_type = self.weight_qType
                q_weight_initializer.dims.extend(weight.dims)
                q_weight_initializer.name = q_weight_name
                # Do not remove .flatten().copy(); numpy is not clear about data persistence.
                q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes()
                if to_array_extended is not None:
                    # This test should not be needed but it helped catch some issues
                    # with data persistence and tobytes.
                    check = to_array_extended(q_weight_initializer)
                    if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes():
                        raise RuntimeError(
                            f"The initializer of shape {weight_data.shape} could not be created, expecting "
                            f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.dims}"
                            f"\nraw={str(q_weight_initializer)[:200]}."
                        )
            elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
                if q_weight_data.dtype not in (np.int8, np.uint8):
                    raise RuntimeError(
                        f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
                    )

                # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
                # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
                packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes()))

                # We only use onnx.helper.make_tensor with raw data due to a bug: https://github.com/onnx/onnx/pull/6161
                q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True)
            else:
                q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
                    weight.dims
                )
                q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name)
            self.model.initializer_extend([q_weight_initializer])

        return q_weight_name, zp_name, scale_name
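    # Editor's note: an illustrative symmetric int8 case of what quantize_data can return for a
    # weight tensor (numbers invented; exact rounding and range conventions vary by implementation):
    # for values in [-1.0, 1.0], scale ≈ 1.0 / 127 ≈ 0.00787, zero_point = 0, and a weight value of
    # 0.5 maps to round(0.5 / 0.00787) = 64.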
    def quantize_weight_per_channel_impl(
        self,
        weight_name,
        weight_qType,
        channel_axis,
        reduce_range=True,
        keep_float_weight=False,
    ):
        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
        initializer = find_by_name(weight_name, self.model.initializer())
        if initializer is None:
            raise ValueError(f"{weight_name} is not an initializer")

        weights = tensor_proto_to_array(initializer)
        weights_rank = len(weights.shape)
        is_axis_valid, axis_norm = normalize_axis(channel_axis, weights_rank)
        if not is_axis_valid:
            raise ValueError(
                f"Weight {weight_name} has a per-channel axis with value {channel_axis} that is "
                f"out-of-bounds for rank {weights_rank}"
            )

        channel_axis = axis_norm
        channel_count = weights.shape[channel_axis]
        quant_overrides_for_channels = self.tensor_quant_overrides.get_per_channel_overrides(
            weight_name, default_val=[{"axis": channel_axis}]
        )

        num_channel_overrides = len(quant_overrides_for_channels)
        if num_channel_overrides != 1 and num_channel_overrides != channel_count:
            raise ValueError(
                f"Per-channel tensor quantization overrides for {weight_name} must have "
                f"either 1 or {channel_count} elements in the list of dictionaries."
            )

        is_axis_override_valid, axis_override = normalize_axis(quant_overrides_for_channels[0]["axis"], weights_rank)
        if not is_axis_override_valid or axis_override != channel_axis:
            raise ValueError(
                f"Tensor quantization overrides for {weight_name} specify an unexpected axis. "
                f"Expected {channel_axis}, but got {quant_overrides_for_channels[0]['axis']}."
            )

        # If the user provides per-channel quantization overrides, all channels must use the same quant_type,
        # axis, symmetric, and reduce_range values. So, just use the first channel's values.
        if "quant_type" in quant_overrides_for_channels[0]:
            weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type  # noqa: N806

        symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType))
        reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
        zero_point_list = []
        scale_list = []
        quantized_per_channel_data_list = []
        weights_shape = list(weights.shape)
        reshape_dims = list(weights_shape)  # deep copy
        reshape_dims[channel_axis] = 1  # only one per channel for reshape
        for i in range(channel_count):
            per_channel_data = weights.take(i, channel_axis)
            channel_override_index = i if i < num_channel_overrides else 0
            channel_quant_overrides = quant_overrides_for_channels[channel_override_index]

            if "scale" in channel_quant_overrides and "zero_point" in channel_quant_overrides:
                zero_point = np.array(channel_quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[weight_qType])
                scale = np.array(channel_quant_overrides["scale"])
                quantized_per_channel_data = quantize_nparray(
                    weight_qType, per_channel_data.flatten(), scale, zero_point
                )
                assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
                assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                    f"Unexpected dtype {zero_point.dtype}"
                )
                assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
                assert isinstance(quantized_per_channel_data, np.ndarray), (
                    f"Unexpected type {type(quantized_per_channel_data)}"
                )

            else:
                zero_point, scale, quantized_per_channel_data = quantize_data(
                    per_channel_data.flatten(),
                    weight_qType,
                    symmetric,
                    reduce_range=reduce_range,
                    min_real_range=self.min_real_range,
                    rmin_override=channel_quant_overrides.get("rmin"),
                    rmax_override=channel_quant_overrides.get("rmax"),
                )

                assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
                assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                    f"Unexpected dtype {zero_point.dtype}"
                )
                assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
                assert isinstance(quantized_per_channel_data, np.ndarray), (
                    f"Unexpected type {type(quantized_per_channel_data)}"
                )

            zero_point_list.append(zero_point)
            scale_list.append(scale)
            quantized_per_channel_data_list.append(np.asarray(quantized_per_channel_data).reshape(reshape_dims))

        # combine per_channel_data into one
        quantized_weights = np.concatenate(quantized_per_channel_data_list, channel_axis)
        q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
        zp_name = weight_name + "_zero_point"
        scale_name = weight_name + "_scale"

        # Update packed weight, zero point, and scale initializers
        zero_scale_shape = [initializer.dims[channel_axis]]
        scale_initializer = onnx.helper.make_tensor(
            scale_name, initializer.data_type, zero_scale_shape, np.hstack(scale_list).tolist()
        )
        zero_initializer = onnx.helper.make_tensor(
            zp_name, weight_qType, zero_scale_shape, np.hstack(zero_point_list).tolist()
        )

        self.model.initializer_extend([scale_initializer, zero_initializer])

        if not keep_float_weight:
            if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
                if quantized_weights.dtype not in (np.int8, np.uint8):
                    raise RuntimeError(
                        f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
                    )

                # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
                # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
                packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes()))

                # We only use onnx.helper.make_tensor with raw data due to a bug: https://github.com/onnx/onnx/pull/6161
                q_weight_initializer = onnx.helper.make_tensor(
                    q_weight_name, weight_qType, weights_shape, packed_data, raw=True
                )
                self.model.initializer_extend([q_weight_initializer])
            else:
                quantized_weights = np.asarray(
                    quantized_weights,
                    dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
                ).reshape(initializer.dims)
                q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
                self.model.initializer_extend([q_weight_initializer])

        return q_weight_name, zp_name, scale_name

    def adjust_tensor_ranges(self):
        if self.tensors_range is None:
            return

        for node in self.model.nodes():
            # adjust tensor_ranges for input of Clip and Relu node
            if node.op_type in ["Clip", "Relu"]:
                if not self.should_quantize_node(node):
                    continue
                if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
                    continue
                if node.input[0] not in self.tensors_range or node.output[0] not in self.tensors_range:
                    continue
                td = self.tensors_range[node.output[0]]
                if not isinstance(td, TensorData):
                    raise TypeError(f"Unexpected type {type(td)} for {node.output[0]!r}.")
                self.tensors_range[node.input[0]] = td
            # Adjust Softmax to range from 0.0 to 1.0
            elif node.op_type == "Softmax":
                if not self.should_quantize_node(node):
                    continue
                self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))
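    # Editor's note: an illustrative effect of adjust_tensor_ranges (values invented): if calibration
    # recorded a Relu output range of [0.0, 4.2], the Relu input's range entry is overwritten with that
    # same TensorData, and any quantized Softmax output range is pinned to [0.0, 1.0].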
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
from .preprocess import qnn_preprocess_model  # noqa: F401
from .quant_config import get_qnn_qdq_config  # noqa: F401
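# Editor's note: an illustrative end-to-end flow (model paths and `data_reader` are placeholders)
# combining the two helpers re-exported above with the quantize() API:
from onnxruntime.quantization import QuantType, quantize
from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config, qnn_preprocess_model

changed = qnn_preprocess_model("model_fp32.onnx", "model_prep.onnx")
model_to_quantize = "model_prep.onnx" if changed else "model_fp32.onnx"

qnn_config = get_qnn_qdq_config(
    model_to_quantize,
    data_reader,  # a CalibrationDataReader, defined elsewhere
    activation_type=QuantType.QUInt16,
    weight_type=QuantType.QUInt8,
)
quantize(model_to_quantize, "model_qdq.onnx", qnn_config)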
@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import onnx

from ...fusions import Fusion
from ...onnx_model import ONNXModel


class FusionLpNormalization(Fusion):
    def __init__(self, model: ONNXModel, epsilon: float = 1e-12):
        super().__init__(model, "LpNormalization", "ReduceL2")
        self.epsilon = epsilon

    def fuse(
        self,
        reduce_node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """
        Interface function that tries to fuse a node sequence containing a ReduceL2 node into a single
        LpNormalization node.

        Pattern 1:
            [root] --> ReduceL2 -----> Clip --> Expand ----> Div -->
               |       (axis=-1)   (min=epsilon) (shape=root)  ^
               |       (keepdims=True)                         |
               |                                               |
               +-----------------------------------------------+
        Notes:
          - ReduceL2 must use the last axis, and keepdims == True
          - Clip must only have a min attribute that is ~1e-12
          - Expand must restore the shape to root.shape
          - The output of Expand must be the second input to Div.
        """
        if reduce_node.output[0] not in input_name_to_nodes:
            return

        # ReduceL2 must have one Clip child
        children = input_name_to_nodes[reduce_node.output[0]]
        if len(children) != 1 or children[0].op_type != "Clip":
            return

        # ReduceL2 must have keepdims == True
        keepdims = self.get_node_attribute(reduce_node, "keepdims")
        if not keepdims:
            return

        # ReduceL2 axes must refer only to the last dimension.
        # Axes became an input in opset 18. Before then, axes was an attribute.
        reduce_input_ttype = self.model.get_tensor_type(reduce_node.input[0])
        if not reduce_input_ttype:
            return

        reduce_input_shape = self.tensor_shape_to_list(reduce_input_ttype)
        if not reduce_input_shape:
            return

        axes = self.get_node_attribute(reduce_node, "axes")
        if not axes and len(reduce_node.input) > 1:
            axes = self.model.get_constant_value(reduce_node.input[1])

        if not axes or len(axes) != 1:
            return

        last_dim = len(reduce_input_shape) - 1
        if axes[0] != -1 and axes[0] != last_dim:
            return

        # Clip node must have a min attribute approximately equal to 1e-12
        clip_node = children[0]
        clip_min = self.get_node_attribute(clip_node, "min")
        if clip_min is None and len(clip_node.input) > 1:
            clip_min = self.model.get_constant_value(clip_node.input[1])

        clip_max = self.get_node_attribute(clip_node, "max")  # TODO: clip_max could be FLOAT_MAX
        if clip_max is None and len(clip_node.input) > 2:
            clip_max = self.model.get_constant_value(clip_node.input[2])

        if not (clip_max is None and clip_min is not None and clip_min > 0 and abs(clip_min - self.epsilon) < 1e-13):
            return

        if clip_node.output[0] not in input_name_to_nodes:
            return

        # Clip must have a single Expand child.
        children = input_name_to_nodes[clip_node.output[0]]
        if len(children) != 1 or children[0].op_type != "Expand":
            return

        expand_node = children[0]
        if expand_node.output[0] not in input_name_to_nodes:
            return

        # Expand must have a single Div child
        children = input_name_to_nodes[expand_node.output[0]]
        if len(children) != 1 or children[0].op_type != "Div":
            return

        div_node = children[0]

        # The first input to Div must be the root of the subgraph (i.e., reduce_node.input[0])
        # The second input to Div must be the output of the Expand.
        # As long as these two inputs go to the same Div node, then ONNX validation will ensure that
        # their shapes match.
        if div_node.input[0] != reduce_node.input[0]:
            return
        if div_node.input[1] != expand_node.output[0]:
            return

        subgraph_input = reduce_node.input[0]
        subgraph_output = div_node.output[0]

        subgraph_nodes = [reduce_node, clip_node, expand_node, div_node]
        if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
            return

        self.nodes_to_remove.extend(subgraph_nodes)
        fused_node = onnx.helper.make_node(
            self.fused_op_type,
            name=self.create_unique_node_name(),
            inputs=[subgraph_input],
            outputs=[subgraph_output],
            p=2,
            axis=-1,
        )
        self.nodes_to_add.append(fused_node)
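# Editor's note: a small numpy reference (not from this repository) for what the fused subgraph
# computes; LpNormalization with p=2 and axis=-1 corresponds to dividing by the (epsilon-clipped)
# L2 norm along the last axis:
import numpy as np


def lp_normalize_last_axis(x: np.ndarray, epsilon: float = 1e-12) -> np.ndarray:
    # Equivalent to the ReduceL2(keepdims) -> Clip(min=epsilon) -> Expand -> Div pattern above.
    norm = np.sqrt(np.sum(np.square(x), axis=-1, keepdims=True))
    return x / np.maximum(norm, epsilon)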
@@ -0,0 +1,162 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Define SpaceToDepth fusion."""

import onnx

from ... import fusions, onnx_model


class FusionSpaceToDepth(fusions.Fusion):
    """Fusion for SpaceToDepth."""

    def __init__(self, model: onnx_model.ONNXModel):
        """Initialize.

        Args:
            model: An onnx_model.ONNXModel instance.
        """
        super().__init__(model, "SpaceToDepth", "Reshape")

    def _fuse_yolo(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Fuse for early version of YOLO.

        Pattern:

                 | [N, C, H, W]
              Reshape
                 | [N, C, H/blk, blk, W/blk, blk]
             Transpose
                 | [N, C, H/blk, W/blk, blk, blk]
              Reshape
                 | [N, C, H/blk * W/blk, blk * blk]
             Transpose
                 | [N, C, blk * blk, H/blk * W/blk]
              Reshape
                 | [N, C, blk * blk, H/blk, W/blk]
             Transpose
                 | [N, blk * blk, C, H/blk, W/blk]
              Reshape
                 | [N, blk * blk * C, H/blk, W/blk]

        This sequence can be fused into a single SpaceToDepth with blocksize `blk`. Note that unlike
        DepthToSpace, which supports DCR or CRD mode, SpaceToDepth only supports DCR mode in its latest
        opset version (13), which matches the pattern here.
        """
        reshape_node1 = node

        def get_target_child(parent_node, target_op_type):
            """Get target child of given node."""
            if parent_node.output[0] not in input_name_to_nodes:
                return None

            children = input_name_to_nodes[parent_node.output[0]]
            if len(children) > 1 or children[0].op_type != target_op_type:
                return None

            return children[0]

        if (
            (transpose_node1 := get_target_child(reshape_node1, "Transpose")) is None
            or (reshape_node2 := get_target_child(transpose_node1, "Reshape")) is None
            or (transpose_node2 := get_target_child(reshape_node2, "Transpose")) is None
            or (reshape_node3 := get_target_child(transpose_node2, "Reshape")) is None
            or (transpose_node3 := get_target_child(reshape_node3, "Transpose")) is None
            or (reshape_node4 := get_target_child(transpose_node3, "Reshape")) is None
        ):
            return False

        def get_tensor_shape(tensor_name):
            """Get shape for given tensor name."""
            tensor_type = self.model.get_tensor_type(tensor_name)
            if not tensor_type:
                return None

            tensor_shape = self.tensor_shape_to_list(tensor_type)
            if not tensor_shape:
                return None

            return tensor_shape

        if (
            (input_shape := get_tensor_shape(reshape_node1.input[0])) is None
            or (reshape_shape1 := get_tensor_shape(reshape_node1.output[0])) is None
            or (reshape_shape2 := get_tensor_shape(reshape_node2.output[0])) is None
            or (reshape_shape3 := get_tensor_shape(reshape_node3.output[0])) is None
            or (reshape_shape4 := get_tensor_shape(reshape_node4.output[0])) is None
        ):
            return False

        transpose_perm1 = self.get_node_attribute(transpose_node1, "perm")
        transpose_perm2 = self.get_node_attribute(transpose_node2, "perm")
        transpose_perm3 = self.get_node_attribute(transpose_node3, "perm")

        # Check rank.
        if (
            len(input_shape) != 4
            or len(reshape_shape1) != 6
            or len(reshape_shape2) != 4
            or len(reshape_shape3) != 5
            or len(reshape_shape4) != 4
        ):
            return False

        # Check shape and perm.
        batch, channel, height, width = input_shape
        blocksize = reshape_shape1[3]
        if (
            reshape_shape1 != [batch, channel, height // blocksize, blocksize, width // blocksize, blocksize]
            or transpose_perm1 != [0, 1, 2, 4, 3, 5]
            or reshape_shape2 != [batch, channel, (height // blocksize) * (width // blocksize), blocksize**2]
            or transpose_perm2 != [0, 1, 3, 2]
            or reshape_shape3 != [batch, channel, blocksize**2, height // blocksize, width // blocksize]
            or transpose_perm3 != [0, 2, 1, 3, 4]
            or reshape_shape4 != [batch, blocksize**2 * channel, height // blocksize, width // blocksize]
        ):
            return False

        self.nodes_to_remove.extend(
            [
                reshape_node1,
                transpose_node1,
                reshape_node2,
                transpose_node2,
                reshape_node3,
                transpose_node3,
                reshape_node4,
            ]
        )

        s2d_node = onnx.helper.make_node(
            self.fused_op_type,
            name=self.create_unique_node_name(),
            inputs=[reshape_node1.input[0]],
            outputs=[reshape_node4.output[0]],
            blocksize=blocksize,
        )
        self.nodes_to_add.append(s2d_node)

        return True

    def fuse(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Fuse a sequence of Reshape and Transpose nodes into a single SpaceToDepth node.

        Args:
            node: An onnx.NodeProto matching the specified search type (i.e., Reshape).
            input_name_to_nodes: A dict mapping tensor name to consumed nodes.
            output_name_to_node: A dict mapping tensor name to produced node.
        """
        self._fuse_yolo(node, input_name_to_nodes, output_name_to_node)
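# Editor's note: a numpy reference (illustrative, not from this repository) for the SpaceToDepth
# result that the fused node produces from the Reshape/Transpose chain documented above:
import numpy as np


def space_to_depth(x: np.ndarray, blk: int) -> np.ndarray:
    n, c, h, w = x.shape
    y = x.reshape(n, c, h // blk, blk, w // blk, blk)
    y = y.transpose(0, 3, 5, 1, 2, 4)  # blocks first, matching the [N, blk*blk, C, ...] ordering
    return y.reshape(n, c * blk * blk, h // blk, w // blk)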
@@ -0,0 +1,413 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
|
||||
import onnx
|
||||
|
||||
from ...quant_utils import QuantType
|
||||
from ...tensor_quant_overrides import QuantTypeInfo, TensorQuantOverridesHelper
|
||||
|
||||
|
||||
@dataclass
|
||||
class TensorTypeRequest:
|
||||
"""
|
||||
Bundles desired quantization type requests for a tensor. A distinction is made between the
|
||||
produced type and the consumed type.
|
||||
"""
|
||||
|
||||
# The tensor's quant type at the producer end. If None, assumed to be the default activation quant type.
|
||||
producer: QuantTypeInfo | None
|
||||
|
||||
# The tensor's quant type received by a set of consumer nodes.
|
||||
# If None, assumed to be the default activation quant type for all consumers.
|
||||
# consumers[1] is a set of consumer node names.
|
||||
consumers: tuple[QuantTypeInfo, set[str]] | None
|
||||
|
||||
|
||||
class MixedPrecisionTensorQuantOverridesFixer:
|
||||
"""
|
||||
Helper that generates tensor quantization overrides for mixed-precision QDQ models.
|
||||
|
||||
Specifically, this helper fixes an initial set of quantization overrides that assign a non-default
|
||||
activation quantization type to one or more tensors by doing the following:
|
||||
- Inferring which other tensors need to be overridden to the non-default activation quantization type.
|
||||
- Inserting quantization data type conversions.
|
||||
|
||||
Example:
|
||||
--------
|
||||
|
||||
Float model:
|
||||
|
||||
input_0 --> Op1 --> Op3 --> Op5 --> Op6 --> output_0
|
||||
^
|
||||
|
|
||||
input_1 --> Op2 -+-> Op4 ----+
|
||||
|
|
||||
+-> Op7 --> output_1
|
||||
|
|
||||
+-> Op8 --> output_2
|
||||
|
||||
If we'd like to quantize this model to uint8 precision, but would like to make sure tensor "Op4_out"
|
||||
is quantized to 16-bit, then we would specify the following initial tensor quantization overrides:
|
||||
|
||||
```
|
||||
init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}
|
||||
```
|
||||
|
||||
These initial overrides may not create a valid model because Op4 and Op5 may require both the input and output
|
||||
to be the same type (e.g., uint16). This helper fixes the overrides so that input/output data types
|
||||
are valid:
|
||||
|
||||
```
|
||||
overrides = TensorQuantOverridesHelper(init_overrides)
|
||||
|
||||
fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(overrides, model, QuantType.QUInt8)
|
||||
fixer.apply(
|
||||
default_activation_qtype=QuantType.QUInt8,
|
||||
default_activation_symmetric=False,
|
||||
)
|
||||
```
|
||||
|
||||
The above snippet generates the following "fixed" overrides (get via overrides.get_dict()):
|
||||
|
||||
{
|
||||
"Op2_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op4"}}}],
|
||||
"Op3_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op5"}}}],
|
||||
"Op4_out": [{"quant_type": QUInt16}],
|
||||
"Op5_out": [{"quant_type": QUInt16, "convert": {"quant_type": QUInt8, "recv_nodes": {"Op6"}}}]
|
||||
}
|
||||
|
||||
How to interpret the fixed overrides:
|
||||
- Op2's output is consumed by Op4, Op7, and Op8. Op4 consumes the converted u16 type,
|
||||
but Op7 and Op8 consume the original u8 type.
|
||||
- Op3's output is converted from u8 to u16. Op5 consumes the converted u16 type.
|
||||
- Op4's output is just u16 (not converted). All consumers of Op4_out get the u16 type.
|
||||
- Op5's output is converted from u16 to u8. Op6 consumes the u8 type.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
overrides: TensorQuantOverridesHelper,
|
||||
producers: dict[str, onnx.NodeProto],
|
||||
consumers: dict[str, list[onnx.NodeProto]],
|
||||
value_infos: dict[str, onnx.ValueInfoProto],
|
||||
initializers: dict[str, onnx.TensorProto],
|
||||
):
|
||||
"""
|
||||
Params:
|
||||
overrides: The initial tensor quantization overrides to fix.
|
||||
producers: Dictionary that maps a tensor name to the producer node that generates the tensor.
|
||||
consumers: Dictionary that maps a tensor name to the consumer nodes that take the tensor as input.
|
||||
value_infos: Dictionary that maps a tensor name to its onnx.ValueInfoProto.
|
||||
initializers: Dictionary that maps an initializer name to its onnx.TensorProto.
|
||||
"""
|
||||
self.overrides = overrides
|
||||
self.consumers = consumers
|
||||
self.producers = producers
|
||||
self.value_infos = value_infos
|
||||
self.initializers = initializers
|
||||
|
||||
@staticmethod
|
||||
def create_from_model(
|
||||
overrides: TensorQuantOverridesHelper, model: onnx.ModelProto, default_activation_qtype: QuantType
|
||||
) -> MixedPrecisionTensorQuantOverridesFixer:
|
||||
"""
|
||||
Helper function that creates an instance of this class from a loaded ONNX model.
|
||||
|
||||
Params:
|
||||
overrides: The initial tensor quantization overrides to fix.
|
||||
model: Loaded ONNX model
|
||||
default_activation_qtype: The intended default activation quantization type.
|
||||
Used to validate the initial overrides.
|
||||
|
||||
Returns:
|
||||
Initialized MixedPrecisionTensorQuantOverridesFixer object
|
||||
"""
|
||||
model = onnx.shape_inference.infer_shapes(model) # Need to infer shapes to get value_infos
|
||||
|
||||
# Build dictionaries that enable convenient lookups of initializers and value_infos by name.
|
||||
initializers = {initializer.name: initializer for initializer in model.graph.initializer}
|
||||
value_infos = {vi.name: vi for vi in model.graph.value_info}
|
||||
value_infos.update({ot.name: ot for ot in model.graph.output})
|
||||
value_infos.update({it.name: it for it in model.graph.input})
|
||||
|
||||
# Ensure that the user-provided initial overrides are actually valid.
|
||||
valid, err = overrides.is_valid(initializers, set(value_infos), default_activation_qtype)
|
||||
if not valid:
|
||||
pprint_overrides = overrides.pprint_str(indent=4)
|
||||
logging.error(f"Provided invalid tensor quantization overrides:\n{pprint_overrides}")
|
||||
raise ValueError(err)
|
||||
|
||||
consumers = {}
|
||||
producers = {}
|
||||
|
||||
# Build dictionaries that map a tensor name to the consumer or producer nodes.
|
||||
for node in model.graph.node:
|
||||
for input_name in node.input:
|
||||
if input_name:
|
||||
if input_name not in consumers:
|
||||
consumers[input_name] = []
|
||||
|
||||
consumers[input_name].append(node)
|
||||
|
||||
for output_name in node.output:
|
||||
producers[output_name] = node
|
||||
|
||||
return MixedPrecisionTensorQuantOverridesFixer(overrides, producers, consumers, value_infos, initializers)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
default_activation_qtype: QuantType,
|
||||
default_activation_symmetric: bool,
|
||||
):
|
||||
"""
|
||||
Fixes the initial tensor quantization overrides (in-place) for use in mixed-precision QDQ models.
|
||||
|
||||
Params:
|
||||
default_activation_qtype: The intended default activation quantization type.
|
||||
default_activation_symmetric: The intended default symmetry used to quantize activations.
|
||||
"""
|
||||
type_requests = self.get_desired_tensor_types(default_activation_qtype, default_activation_symmetric)
|
||||
|
||||
# Use type requests to "fix" tensor quantization overrides by adding
|
||||
# quantization type conversions where necessary.
|
||||
for tensor_name, type_req in type_requests.items():
|
||||
all_consumers = {node.name for node in self.consumers.get(tensor_name, [])}
|
||||
has_producer_req = type_req.producer is not None
|
||||
has_consumer_req = bool(type_req.consumers)
|
||||
|
||||
# Only producer type: Add conversion back to default activation type
|
||||
if has_producer_req and not has_consumer_req:
|
||||
self._update_converted_tensor(
|
||||
tensor_name, type_req.producer, QuantTypeInfo(default_activation_qtype), all_consumers
|
||||
)
|
||||
# Only consumers
|
||||
elif not has_producer_req and has_consumer_req:
|
||||
prod_type_info = self.overrides.get_node_output_qtype_info(tensor_name, default_activation_qtype)
|
||||
consumer_type_info = type_req.consumers[0]
|
||||
|
||||
if prod_type_info != consumer_type_info:
|
||||
self._update_converted_tensor(
|
||||
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
|
||||
)
|
||||
else:
|
||||
if not self._check_nodes_are_not_convert_consumers(tensor_name, type_req.consumers[1]):
|
||||
raise ValueError(
|
||||
f"Tensor override for '{tensor_name}' converts the type for consumers that need the original type."
|
||||
)
|
||||
# Both producer and consumers
|
||||
elif has_producer_req and has_consumer_req:
|
||||
prod_type_info = type_req.producer
|
||||
consumer_type_info = type_req.consumers[0]
|
||||
|
||||
if prod_type_info != consumer_type_info:
|
||||
self._update_converted_tensor(
|
||||
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
|
||||
)
|
||||
else:
|
||||
consumers_for_original_type = all_consumers.difference(type_req.consumers[1])
|
||||
|
||||
if len(consumers_for_original_type) == 0:
|
||||
# All consumers want the overridden type, so no need for convert nodes!
|
||||
# Just add the override to the new new if not already present.
|
||||
if tensor_name not in self.overrides:
|
||||
self.overrides[tensor_name] = [{}]
|
||||
prod_type_info.save_to_dict(self.overrides[tensor_name][0])
|
||||
|
||||
assert "convert" not in self.overrides[tensor_name][0]
|
||||
else:
|
||||
# Some consumers don't want the overridden type.
|
||||
self._update_converted_tensor(
|
||||
tensor_name,
|
||||
prod_type_info,
|
||||
QuantTypeInfo(default_activation_qtype),
|
||||
consumers_for_original_type,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"TypeRequest for tensor {tensor_name} has no producer or consumers.")
|
||||
|
||||
# Done. Check if the overrides are valid.
|
||||
valid, err = self.overrides.is_valid(self.initializers, set(self.value_infos), default_activation_qtype)
|
||||
if not valid:
|
||||
pprint_overrides = self.overrides.pprint_str(indent=4)
|
||||
logging.error(
|
||||
f"Generated invalid tensor quantization overrides for mixed-precision QDQ model:\n{pprint_overrides}"
|
||||
)
|
||||
raise ValueError(err)
|
||||
|
||||
def get_desired_tensor_types(
|
||||
self,
|
||||
default_activation_qtype: QuantType,
|
||||
default_activation_symmetric: bool,
|
||||
) -> dict[str, TensorTypeRequest]:
|
||||
"""
|
||||
Iterates through the initial tensor quantization overrides and builds a set of TensorTypeRequests objects
|
||||
that describe the quantization types required at each tensor. These TensorTypeRequests objects are ultimately
|
||||
used to generated the "fixed" overrides.
|
||||
|
||||
Params:
|
||||
default_activation_qtype: The intended default activation quantization type.
|
||||
default_activation_symmetric: The intended default symmetry used to quantize activations.
|
||||
|
||||
Returns:
|
||||
TensorTypeRequest objects as a dict that maps a tensor name to its requested types.
|
||||
"""
|
||||
type_requests = {}
|
||||
default_activation_type_info = QuantTypeInfo(default_activation_qtype, default_activation_symmetric)
|
||||
|
||||
# Scan tensor overrides for type conversion requests.
|
||||
for tensor_name, override_list in self.overrides.items():
|
||||
if not self.__is_tensor_quantizable(tensor_name):
|
||||
continue # Skip non-quantizable tensors (e.g., not a float)
|
||||
|
||||
if tensor_name in self.initializers:
|
||||
continue # Skip initializers
|
||||
|
||||
if not override_list or len(override_list) > 1:
|
||||
continue # Skip per-channel stuff
|
||||
|
||||
override_dict = override_list[0]
|
||||
quant_type_info = QuantTypeInfo.load_from_dict(override_dict, default_activation_type_info.quant_type)
|
||||
producer_node = self.producers.get(tensor_name) # None if this is a model input
|
||||
|
||||
if quant_type_info != default_activation_type_info and "convert" not in override_dict:
|
||||
if producer_node is not None:
|
||||
self._add_type_requests_for_node(type_requests, quant_type_info, producer_node)
|
||||
|
||||
# Find all consumer nodes of `tensor_name` and update their inputs/outputs to the new type.
|
||||
for consumer_node in self.consumers.get(tensor_name, []):
|
||||
self._add_type_requests_for_node(type_requests, quant_type_info, consumer_node)
|
||||
|
||||
return type_requests
|
||||
|
||||
def _add_type_requests_for_node(
|
||||
self,
|
||||
type_requests: dict[str, TensorTypeRequest],
|
||||
quant_type_info: QuantTypeInfo,
|
||||
node: onnx.NodeProto,
|
||||
):
|
||||
"""
|
||||
Adds TensorTypeRequest objects for a given node, assuming that we want all its inputs and outputs
|
||||
to have the same quantization type (as specified by the `quant_type_info` parameter).
|
||||
|
||||
Params:
|
||||
type_requests: Dictionary of type requests to append to for this node.
|
||||
quant_type_info: The quantization type to use for inputs and outputs.
|
||||
node: The node for which the TensorTypeRequest objects are created and added to type_requests.
|
||||
"""
|
||||
# Add output side
|
||||
for output_name in node.output:
|
||||
if not self.__is_tensor_quantizable(output_name):
|
||||
continue
|
||||
|
||||
if output_name not in type_requests:
|
||||
type_requests[output_name] = TensorTypeRequest(quant_type_info, None)
|
||||
else:
|
||||
if (
|
||||
type_requests[output_name].producer is not None
|
||||
and type_requests[output_name].producer != quant_type_info
|
||||
):
|
||||
raise ValueError(f"Tensor {output_name} has multiple types.")
|
||||
|
||||
type_requests[output_name].producer = quant_type_info
|
||||
|
||||
# Add the consumer side
|
||||
for input_name in node.input:
|
||||
if input_name and input_name not in self.initializers and self.__is_tensor_quantizable(input_name):
|
||||
if input_name not in type_requests:
|
||||
type_requests[input_name] = TensorTypeRequest(None, None)
|
||||
|
||||
if type_requests[input_name].consumers is None:
|
||||
type_requests[input_name].consumers = (quant_type_info, set())
|
||||
|
||||
if type_requests[input_name].consumers[0] != quant_type_info:
|
||||
raise ValueError(f"Tensor {input_name} has consumers requesting different types.")
|
||||
|
||||
if not node.name:
|
||||
raise ValueError(
|
||||
f"Node of type {node.op_type} with output 0 {node.output[0]} does not have a name!"
|
||||
)
|
||||
|
||||
type_requests[input_name].consumers[1].add(node.name)
|
||||
|
||||
def _update_converted_tensor(
|
||||
self,
|
||||
tensor_name: str,
|
||||
producer_type_info: QuantTypeInfo,
|
||||
consumer_type_info: QuantTypeInfo,
|
||||
consumer_names: set[str],
|
||||
):
|
||||
"""
|
||||
Updates the tensor quantization overrides for a tensor that is converted from one type to another.
|
||||
|
||||
Params:
|
||||
tensor_name: The name of the tensor for which to update overrides.
|
||||
producer_type_info: Info for the tensor's produced type.
|
||||
consumer_type_info: Info for the tensor's consumed (i.e., converted) type.
|
||||
consumer_names: Names of the consumer nodes that consume the converted type.
|
||||
"""
|
||||
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
|
||||
self.overrides[tensor_name] = [{}]
|
||||
producer_type_info.save_to_dict(self.overrides[tensor_name][0])
|
||||
|
||||
overrides = self.overrides[tensor_name][0]
|
||||
if producer_type_info != QuantTypeInfo.load_from_dict(overrides):
|
||||
raise ValueError(f"Desired producer quant_type for {tensor_name} doesn't match existing type.")
|
||||
|
||||
if consumer_names:
|
||||
if "convert" not in overrides:
|
||||
overrides["convert"] = {}
|
||||
consumer_type_info.save_to_dict(overrides["convert"])
|
||||
|
||||
convert_dict = overrides["convert"]
|
||||
if consumer_type_info != QuantTypeInfo.load_from_dict(convert_dict):
|
||||
raise ValueError(f"Desired consumer quant_type for {tensor_name} doesn't match existing type.")
|
||||
|
||||
if "recv_nodes" not in convert_dict:
|
||||
convert_dict["recv_nodes"] = set()
|
||||
|
||||
convert_dict["recv_nodes"].update(consumer_names)
|
||||
|
||||
def _check_nodes_are_not_convert_consumers(self, tensor_name: str, node_names: set[str]):
|
||||
"""
|
||||
Returns true if the given nodes do not consume/receive a converted quantization type.
|
||||
|
||||
Params:
|
||||
tensor_name: The name of the tensor to check.
|
||||
node_names: Set of node names that should not be consumers of the converted type.
|
||||
"""
|
||||
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
|
||||
return True
|
||||
|
||||
overrides = self.overrides[tensor_name][0]
|
||||
|
||||
if "convert" not in overrides:
|
||||
return True
|
||||
|
||||
convert_dict = overrides["convert"]
|
||||
|
||||
if "recv_nodes" not in convert_dict:
|
||||
return False
|
||||
|
||||
return not convert_dict["recv_nodes"].intersection(node_names)
|
||||
|
||||
def __is_tensor_quantizable(self, tensor_name):
|
||||
weight = self.initializers.get(tensor_name)
|
||||
if weight is not None:
|
||||
if weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16):
|
||||
return True
|
||||
elif tensor_name in self.value_infos:
|
||||
vi = self.value_infos[tensor_name]
|
||||
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
|
||||
onnx.TensorProto.FLOAT,
|
||||
onnx.TensorProto.FLOAT16,
|
||||
):
|
||||
return True
|
||||
|
||||
return False
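# Illustrative sketch (not part of the original file): the override entry that _update_converted_tensor
# builds for an activation whose producer keeps the default 8-bit type while the consumers listed in
# "recv_nodes" receive a converted 16-bit copy. The tensor and node names are hypothetical, and the
# QuantType import path is an assumption made only to keep the example self-contained.
def _example_converted_tensor_override():
    from onnxruntime.quantization import QuantType  # assumed import path, shown for illustration only

    return {
        "layer1_out": [  # hypothetical activation tensor name
            {
                "quant_type": QuantType.QUInt8,  # type produced by the upstream node
                "convert": {
                    "quant_type": QuantType.QUInt16,  # type seen by the converted consumers
                    "recv_nodes": {"Conv_5", "Add_7"},  # hypothetical consumer node names
                },
            }
        ]
    }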
@@ -0,0 +1,335 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

import logging
from pathlib import Path

import onnx

from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed
from ....tools.remove_initializer_from_input import remove_initializer_from_input
from ...fusions import FusionGelu, FusionLayerNormalization
from ...onnx_model import ONNXModel
from ...quant_utils import save_and_reload_model_with_shape_infer
from .fusion_lpnorm import FusionLpNormalization
from .fusion_spacetodepth import FusionSpaceToDepth
|
||||
|
||||
def qnn_preprocess_model(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
exclude_initializer_from_input: bool = False,
|
||||
fuse_layernorm: bool = False,
|
||||
save_as_external_data: bool = False,
|
||||
all_tensors_to_one_file: bool = False,
|
||||
external_data_location: str | None = None,
|
||||
external_data_size_threshold: int = 1024,
|
||||
external_data_convert_attribute: bool = False,
|
||||
inputs_to_make_channel_last: list[str] | None = None,
|
||||
outputs_to_make_channel_last: list[str] | None = None,
|
||||
dynamic_input_shapes: list[tuple[str, str]] | None = None,
|
||||
) -> bool:
|
||||
"""
|
||||
If necessary, this method creates a new "pre-processed" model in preparation for
|
||||
quantization of a model to be used in QNN EP. Returns true if a new model was created.
|
||||
|
||||
This method performs the following operations:
|
||||
- Fuse Erf sequence into a single Gelu node.
|
||||
- Fuse ReduceL2 sequence into a single LpNormalization node (p == 2).
|
||||
- (Optional) Fuse ReduceMean sequence into a single LayerNormalization node.
|
||||
|
||||
Args:
|
||||
model_input: Path to the input model file or ModelProto.
|
||||
model_output: Path to the output model file, which is only created if this method returns True.
|
||||
exclude_initializer_from_input: A bool specifying whether to exclude initializers from the graph inputs.
|
||||
Defaults to False.
|
||||
fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes.
|
||||
Defaults to False.
|
||||
save_as_external_data: True if output model should be saved with external data. Defaults to false.
|
||||
all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false.
|
||||
If true, save all tensors to one external file specified by external_data_location.
|
||||
If false, save each tensor to a file named with the tensor name.
|
||||
external_data_location: Effective only if save_as_external_data is true. Defaults to None.
|
||||
Specify the external file to which all tensors are saved. Path is relative
|
||||
to the model path. If not specified, the model's name is used.
|
||||
external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024.
|
||||
Tensors with a data size >= external_data_size_threshold are converted to external data.
|
||||
To convert every tensor with raw data to external data, set to 0.
|
||||
external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false.
|
||||
If true, convert all tensors to external data.
|
||||
If false, convert only non-attribute tensors to external data.
|
||||
inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example,
|
||||
if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's
|
||||
shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it.
|
||||
|
||||
Original:
|
||||
input0 (N, C, D1, D2, ..., Dn) --> <Nodes>
|
||||
|
||||
Updated:
|
||||
input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> <Nodes>
|
||||
|
||||
This can potentially improve inference latency for QDQ models running on QNN EP because the
|
||||
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
|
||||
to cancel out.
|
||||
outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example,
|
||||
if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's
|
||||
shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it.
|
||||
|
||||
Original:
|
||||
<Nodes> --> output0 (N, C, D1, D2, ..., Dn)
|
||||
|
||||
Updated:
|
||||
<Nodes> --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C)
|
||||
|
||||
This can potentially improve inference latency for QDQ models running on QNN EP because the
|
||||
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
|
||||
to cancel out.
|
||||
dynamic_input_shapes: A list of tuples, each specifying a model input name and its static shape in comma-separated
format, for example: [('input', '1,3,256,256')]. Defaults to None.
|
||||
"""
|
||||
modified = False
|
||||
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
|
||||
model = save_and_reload_model_with_shape_infer(model)
|
||||
onnx_model = ONNXModel(model)
|
||||
|
||||
# Optionally, fix the dynamic input shapes.
|
||||
if dynamic_input_shapes:
|
||||
for input_name, input_shape_str in dynamic_input_shapes:
|
||||
input_shape = [int(i) for i in input_shape_str.split(",")]
|
||||
make_input_shape_fixed(onnx_model.graph(), input_name, input_shape)
|
||||
fix_output_shapes(onnx_model.model)
|
||||
modified = True
|
||||
|
||||
# Exclude initializer from input if model.ir_version >= 4
|
||||
if exclude_initializer_from_input:
|
||||
modified |= remove_initializer_from_input(onnx_model.model)
|
||||
|
||||
# Fuse Erf sequence into a single Gelu
|
||||
fusion_gelu = FusionGelu(onnx_model)
|
||||
if fusion_gelu.apply():
|
||||
modified = True
|
||||
|
||||
# Fuse ReduceL2 sequence into a single LpNormalization node with p == 2.
|
||||
fusion_lpnorm = FusionLpNormalization(onnx_model)
|
||||
if fusion_lpnorm.apply():
|
||||
modified = True
|
||||
|
||||
# Fuse Reshape/Transpose sequence into a single SpaceToDepth.
|
||||
fusion_s2d = FusionSpaceToDepth(onnx_model)
|
||||
if fusion_s2d.apply():
|
||||
modified = True
|
||||
|
||||
# Optionally, fuse ReduceMean sequence into a single LayerNormalization node.
|
||||
if fuse_layernorm:
|
||||
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
|
||||
|
||||
# Need opset >= 17 to use LayerNormalization.
|
||||
if onnx_opset.version < 17:
|
||||
logging.warning(
|
||||
"Unable to fuse ReduceMean sequence into a LayerNormalization node. "
|
||||
"ONNX model must use an opset >= 17 in order to use LayerNormalization, "
|
||||
f"but found version {onnx_opset.version}. Please use onnx.version_converter to update your model."
|
||||
)
|
||||
else:
|
||||
fusion_layernorm = FusionLayerNormalization(onnx_model)
|
||||
if fusion_layernorm.apply():
|
||||
modified = True
|
||||
|
||||
# Optionally, transpose inputs and/or outputs to make them "channel-last".
|
||||
if inputs_to_make_channel_last or outputs_to_make_channel_last:
|
||||
transpose_node_prefix = "Transpose_channel_"
|
||||
transpose_node_suffix: int = onnx_model.get_largest_node_name_suffix(transpose_node_prefix) + 1
|
||||
update_io_to_channel_last(
|
||||
onnx_model.model,
|
||||
inputs_to_make_channel_last,
|
||||
outputs_to_make_channel_last,
|
||||
transpose_node_name_prefix=transpose_node_prefix,
|
||||
transpose_node_name_start_suffix=transpose_node_suffix,
|
||||
)
|
||||
modified = True
|
||||
|
||||
# Make sure all nodes have a name.
|
||||
unnamed_node_prefix = "qnn_preproc_node_"
|
||||
available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1
|
||||
for node in onnx_model.model.graph.node:
|
||||
if node.op_type != "Constant" and not node.name:
|
||||
new_node_name = f"{unnamed_node_prefix}{available_suffix!s}"
|
||||
available_suffix += 1
|
||||
node.name = new_node_name
|
||||
modified = True
|
||||
logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.")
|
||||
|
||||
if modified:
|
||||
onnx_model.topological_sort()
|
||||
onnx.save_model(
|
||||
model,
|
||||
model_output,
|
||||
save_as_external_data=save_as_external_data,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
location=external_data_location,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=external_data_convert_attribute,
|
||||
)
|
||||
|
||||
return modified
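# Illustrative usage sketch (not part of the original file). The file names, input name, and shape
# below are hypothetical; only keyword arguments documented above are used.
def _example_qnn_preprocess():
    changed = qnn_preprocess_model(
        "model.onnx",  # hypothetical input model path
        "model.preproc.onnx",  # hypothetical output path, written only if the model was modified
        fuse_layernorm=True,  # fuse ReduceMean sequences when the model opset is >= 17
        inputs_to_make_channel_last=["input0"],  # hypothetical graph input to expose as channel-last
        dynamic_input_shapes=[("input0", "1,3,256,256")],  # pin a dynamic shape before quantization
    )
    return changed  # True only if the pre-processed model was created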
|
||||
|
||||
|
||||
class InputOutputNameMap:
|
||||
def __init__(
|
||||
self,
|
||||
orig_tensor_names: set[str],
|
||||
orig_graph_inputs: dict[str, onnx.ValueInfoProto],
|
||||
orig_graph_outputs: dict[str, onnx.ValueInfoProto],
|
||||
):
|
||||
self.orig_tensor_names = orig_tensor_names
|
||||
self.orig_graph_inputs = orig_graph_inputs
|
||||
self.orig_graph_outputs = orig_graph_outputs
|
||||
self.updated_io_names = {}
|
||||
self.new_value_infos = []
|
||||
|
||||
def get_new_name(self, orig_name: str):
|
||||
if orig_name in self.updated_io_names:
|
||||
return self.updated_io_names[orig_name]
|
||||
|
||||
# Make a new tensor name that is unique among all tensors in the graph.
|
||||
prefix: str = f"{orig_name}_channel_first_"
|
||||
suffix: int = -1
|
||||
for tensor_name in self.orig_tensor_names:
|
||||
if tensor_name.startswith(prefix) and tensor_name[len(prefix) :].isdigit():
|
||||
index = int(tensor_name[len(prefix) :])
|
||||
suffix = max(suffix, index)
|
||||
|
||||
suffix += 1 # This is the first available suffix.
|
||||
new_name = f"{prefix}{suffix!s}"
|
||||
|
||||
# Add new value_info objects for these new tensors.
|
||||
orig_value_info = self.orig_graph_inputs.get(orig_name) or self.orig_graph_outputs[orig_name]
|
||||
value_info_proto = onnx.ValueInfoProto()
|
||||
value_info_proto.CopyFrom(orig_value_info)
|
||||
value_info_proto.name = new_name
|
||||
self.new_value_infos.append(value_info_proto)
|
||||
|
||||
self.updated_io_names[orig_name] = new_name
|
||||
return self.updated_io_names[orig_name]
|
||||
|
||||
|
||||
def update_io_to_channel_last(
|
||||
model: onnx.ModelProto,
|
||||
inputs_to_update: list[str] | None,
|
||||
outputs_to_update: list[str] | None,
|
||||
transpose_node_name_prefix: str = "Transpose_channel_",
|
||||
transpose_node_name_start_suffix: int = 0,
|
||||
):
|
||||
inputs_to_update = set(inputs_to_update or [])
|
||||
outputs_to_update = set(outputs_to_update or [])
|
||||
|
||||
if not inputs_to_update and not outputs_to_update:
|
||||
return
|
||||
|
||||
graph = model.graph
|
||||
orig_graph_inputs = {ginput.name: ginput for ginput in graph.input}
|
||||
orig_graph_outputs = {goutput.name: goutput for goutput in graph.output}
|
||||
|
||||
# Check that the user passed in actual input and output names.
|
||||
for input_name in inputs_to_update:
|
||||
if input_name not in orig_graph_inputs:
|
||||
raise ValueError(f"{input_name} is not a graph input")
|
||||
|
||||
for output_name in outputs_to_update:
|
||||
if output_name not in orig_graph_outputs:
|
||||
raise ValueError(f"{output_name} is not a graph output")
|
||||
|
||||
orig_tensor_names = set()
|
||||
orig_tensor_names.update(set(orig_graph_inputs))
|
||||
orig_tensor_names.update(set(orig_graph_outputs))
|
||||
orig_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
|
||||
|
||||
# Maps original input (or output) name to its updated name used within the graph.
|
||||
io_map = InputOutputNameMap(orig_tensor_names, orig_graph_inputs, orig_graph_outputs)
|
||||
|
||||
# Update each node's inputs/outputs to use the transposed versions.
|
||||
for node in graph.node:
|
||||
for i in range(len(node.input)):
|
||||
if node.input[i] and node.input[i] in inputs_to_update:
|
||||
node.input[i] = io_map.get_new_name(node.input[i])
|
||||
elif node.input[i] and node.input[i] in outputs_to_update:
|
||||
node.input[i] = io_map.get_new_name(node.input[i])
|
||||
|
||||
for i in range(len(node.output)):
|
||||
if node.output[i] in outputs_to_update:
|
||||
node.output[i] = io_map.get_new_name(node.output[i])
|
||||
|
||||
# Update graph inputs to channel-last and insert a Transpose (to channel-first) after each.
|
||||
for g_input_name in inputs_to_update:
|
||||
g_input = orig_graph_inputs[g_input_name]
|
||||
|
||||
if not g_input.type.HasField("tensor_type") or not g_input.type.tensor_type.HasField("shape"):
|
||||
raise ValueError(f"Expected input {g_input.name} to have a tensor_type with a shape")
|
||||
|
||||
input_shape = g_input.type.tensor_type.shape
|
||||
input_rank = len(input_shape.dim)
|
||||
|
||||
if input_rank < 3:
|
||||
raise ValueError(f"Expected input {g_input.name} to be of rank >= 3")
|
||||
|
||||
channel_dim = onnx.TensorShapeProto.Dimension()
|
||||
channel_dim.CopyFrom(input_shape.dim[1])
|
||||
for i in range(1, input_rank - 1):
|
||||
input_shape.dim[i].CopyFrom(input_shape.dim[i + 1])
|
||||
input_shape.dim[input_rank - 1].CopyFrom(channel_dim)
|
||||
|
||||
transpose_perm = list(range(input_rank))
|
||||
for i in range(input_rank):
|
||||
transpose_perm[i] = i if i < 1 else i - 1
|
||||
transpose_perm[1] = input_rank - 1
|
||||
|
||||
transpose_node = onnx.helper.make_node(
|
||||
"Transpose",
|
||||
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
|
||||
inputs=[g_input.name],
|
||||
outputs=[io_map.get_new_name(g_input.name)],
|
||||
perm=transpose_perm,
|
||||
)
|
||||
transpose_node_name_start_suffix += 1
|
||||
|
||||
graph.node.extend([transpose_node])
|
||||
|
||||
# Update graph outputs to channel-last and insert a Transpose (from channel-first) before each.
|
||||
for g_output_name in outputs_to_update:
|
||||
g_output = orig_graph_outputs[g_output_name]
|
||||
if not g_output.type.HasField("tensor_type") or not g_output.type.tensor_type.HasField("shape"):
|
||||
raise ValueError(f"Expected output {g_output.name} to have a tensor_type with a shape")
|
||||
|
||||
output_shape = g_output.type.tensor_type.shape
|
||||
output_rank = len(output_shape.dim)
|
||||
|
||||
if output_rank < 3:
|
||||
raise ValueError(f"Expected output {g_output.name} to be of rank >= 3")
|
||||
|
||||
channel_dim = onnx.TensorShapeProto.Dimension()
|
||||
channel_dim.CopyFrom(output_shape.dim[1])
|
||||
for i in range(1, output_rank - 1):
|
||||
output_shape.dim[i].CopyFrom(output_shape.dim[i + 1])
|
||||
output_shape.dim[output_rank - 1].CopyFrom(channel_dim)
|
||||
|
||||
transpose_perm = list(range(output_rank))
|
||||
for i in range(output_rank):
|
||||
transpose_perm[i] = i if i == 0 else i + 1
|
||||
transpose_perm[output_rank - 1] = 1
|
||||
|
||||
transpose_node = onnx.helper.make_node(
|
||||
"Transpose",
|
||||
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
|
||||
inputs=[io_map.get_new_name(g_output.name)],
|
||||
outputs=[g_output.name],
|
||||
perm=transpose_perm,
|
||||
)
|
||||
transpose_node_name_start_suffix += 1
|
||||
|
||||
graph.node.extend([transpose_node])
|
||||
|
||||
graph.value_info.extend(io_map.new_value_infos)
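# Worked example (not part of the original file), mirroring the two permutation loops above for a
# rank-4 tensor: a graph input of shape (N, C, H, W) is rewritten to (N, H, W, C) and followed by
# Transpose(perm=[0, 3, 1, 2]) to restore channel-first order for the graph body, while a graph
# output gets Transpose(perm=[0, 2, 3, 1]) inserted before it.
def _example_channel_last_perms(rank: int = 4):
    input_perm = [0, rank - 1, *range(1, rank - 1)]  # e.g. [0, 3, 1, 2] for rank 4
    output_perm = [0, *range(2, rank), 1]  # e.g. [0, 2, 3, 1] for rank 4
    return input_perm, output_perm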
@@ -0,0 +1,406 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

import copy
import logging
from pathlib import Path
from typing import Any

import numpy as np
import onnx

from ...calibrate import CalibrationDataReader, CalibrationMethod
from ...quant_utils import QuantType
from ...quantize import StaticQuantConfig
from ...tensor_quant_overrides import TensorQuantOverridesHelper
from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer

Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
OP_TYPES_TO_EXCLUDE = {"Cast"}
MODEL_SIZE_THRESHOLD = 2147483648  # Quant model should use external data if >= 2GB
|
||||
|
||||
|
||||
def warn_unable_to_override(
|
||||
node: onnx.NodeProto,
|
||||
what_str: str,
|
||||
tensor_name: str,
|
||||
io_kind: str,
|
||||
):
|
||||
logging.warning(
|
||||
f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
|
||||
"because it has already been overridden! Check the initial quantization overrides provided "
|
||||
"to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
|
||||
f"Node name: {node.name}, {io_kind} name: {tensor_name}"
|
||||
)
|
||||
|
||||
|
||||
def get_qnn_qdq_config(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
|
||||
activation_type: QuantType = QuantType.QUInt8,
|
||||
weight_type: QuantType = QuantType.QUInt8,
|
||||
per_channel: bool = False,
|
||||
init_overrides: dict[str, list[dict[str, Any]]] | None = None,
|
||||
add_qtype_converts: bool = True,
|
||||
activation_symmetric: bool = False,
|
||||
weight_symmetric: bool | None = None,
|
||||
keep_removable_activations: bool = False,
|
||||
stride: int | None = None,
|
||||
calibration_providers: list[str] | None = None,
|
||||
op_types_to_quantize: list[str] | None = None,
|
||||
nodes_to_exclude: list[str] | None = None,
|
||||
) -> StaticQuantConfig:
|
||||
"""
|
||||
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
|
||||
This is done primarily by setting tensor-level quantization overrides.
|
||||
|
||||
Params:
|
||||
model_input: Path to the input model file or ModelProto.
|
||||
calibration_data_reader: Calibration data reader.
|
||||
calibrate_method: The calibration method. Defaults to MinMax.
|
||||
activation_type: The default activation quantization type. Defaults to QUInt8.
|
||||
weight_type: The default weight quantization type. Defaults to QUInt8.
|
||||
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
|
||||
Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
|
||||
and their quantization axes.
|
||||
|
||||
If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
|
||||
- Conv:
|
||||
- input[1] on axis 0
|
||||
- input[2] (bias) on axis 0
|
||||
- ConvTranspose:
|
||||
- input[1] on axis 1
|
||||
- input[2] (bias) on axis 0
|
||||
init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates a copy
|
||||
of these overrides with any necessary adjustments and includes them in the returned
|
||||
configuration object (i.e., config.extra_options['TensorQuantOverrides']).
|
||||
|
||||
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
|
||||
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
|
||||
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
|
||||
key must be present in the first dictionary for per-channel quantization.
|
||||
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` are also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` are also set. Only valid for initializers.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
|
||||
'convert' = Dict : A nested dictionary with the same keys for an activation
|
||||
tensor that should be converted to another quantization type.
|
||||
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
|
||||
other nodes get the original type. If not specified,
|
||||
assume all consumer nodes get the converted type.
|
||||
add_qtype_converts: True if this function should automatically add "convert" entries to the provided
|
||||
`init_overrides` to ensure that operators use valid input/output types (activations only).
|
||||
Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
|
||||
of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
|
||||
appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
|
||||
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
|
||||
the zero-point values are 128 and 32,768, respectively.
|
||||
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
|
||||
Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
|
||||
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
|
||||
be removed, and will be explicitly represented in the QDQ model. If false, these activations
|
||||
are automatically removed if activations are asymmetrically quantized. Keeping these activations
|
||||
is necessary if optimizations or EP transformations will later remove
|
||||
QuantizeLinear/DequantizeLinear operators from the model.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ].
|
||||
op_types_to_quantize: If set to None, all operator types will be quantized except for OP_TYPES_TO_EXCLUDE
|
||||
nodes_to_exclude: List of node names to exclude from quantization. The nodes in this list are excluded from
quantization when it is not None.
|
||||
|
||||
Returns:
|
||||
A StaticQuantConfig object
|
||||
"""
|
||||
if weight_symmetric is None:
|
||||
weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
|
||||
|
||||
model = (
|
||||
model_input
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else onnx.load_model(model_input, load_external_data=False)
|
||||
)
|
||||
|
||||
op_types = set()
|
||||
model_has_external_data = False
|
||||
name_to_initializer = {}
|
||||
|
||||
# Build map of initializers (name -> initializer) and
|
||||
# check if the model has external data.
|
||||
for initializer in model.graph.initializer:
|
||||
name_to_initializer[initializer.name] = initializer
|
||||
if onnx.external_data_helper.uses_external_data(initializer):
|
||||
model_has_external_data = True
|
||||
|
||||
overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
|
||||
|
||||
if not overrides_helper.empty() and add_qtype_converts:
|
||||
# Fix mixed-precision overrides.
|
||||
overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
|
||||
overrides_helper, model, activation_type
|
||||
)
|
||||
overrides_fixer.apply(activation_type, activation_symmetric)
|
||||
|
||||
# Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
|
||||
qnn_compat = QnnCompatibilityOverrides(
|
||||
activation_type,
|
||||
weight_type,
|
||||
activation_symmetric,
|
||||
weight_symmetric,
|
||||
per_channel,
|
||||
overrides_helper,
|
||||
name_to_initializer,
|
||||
)
|
||||
|
||||
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
|
||||
nodes_to_exclude_set = set(nodes_to_exclude) if nodes_to_exclude else None
|
||||
|
||||
for node in model.graph.node:
|
||||
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
|
||||
continue
|
||||
if nodes_to_exclude_set and node.name in nodes_to_exclude_set:
|
||||
continue
|
||||
op_types.add(node.op_type)
|
||||
qnn_compat.process_node(node)
|
||||
|
||||
extra_options = {
|
||||
"MinimumRealRange": 0.0001,
|
||||
"DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
|
||||
"QDQKeepRemovableActivations": keep_removable_activations,
|
||||
"TensorQuantOverrides": overrides_helper.get_dict(),
|
||||
"ActivationSymmetric": activation_symmetric,
|
||||
"WeightSymmetric": weight_symmetric,
|
||||
"CalibStridedMinMax": stride,
|
||||
}
|
||||
|
||||
# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
|
||||
# on Q/DQ operators if using 16-bit or 4-bit quantization.
|
||||
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
|
||||
if onnx_opset.version < 21:
|
||||
opset21_types = Q16_TYPES.union(Q4_TYPES)
|
||||
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
|
||||
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
|
||||
extra_options["UseQDQContribOps"] = True
|
||||
|
||||
return StaticQuantConfig(
|
||||
calibration_data_reader,
|
||||
calibrate_method=calibrate_method,
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=(
|
||||
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(OP_TYPES_TO_EXCLUDE))
|
||||
),
|
||||
nodes_to_exclude=nodes_to_exclude,
|
||||
per_channel=per_channel,
|
||||
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
|
||||
calibration_providers=calibration_providers,
|
||||
extra_options=extra_options,
|
||||
)
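# Illustrative usage sketch (not part of the original file). The model path, data reader, and tensor
# name are hypothetical; the override layout follows the docstring above. CalibrationDataReader,
# QuantType, StaticQuantConfig, and get_qnn_qdq_config are names already defined or imported in this module.
def _example_get_qnn_qdq_config(data_reader: CalibrationDataReader) -> StaticQuantConfig:
    init_overrides = {
        "attention_out": [{"quant_type": QuantType.QUInt16}],  # hypothetical tensor promoted to 16 bits
    }
    return get_qnn_qdq_config(
        "model.preproc.onnx",  # hypothetical pre-processed model path
        data_reader,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QUInt8,
        init_overrides=init_overrides,
        add_qtype_converts=True,  # insert "convert" entries so surrounding ops get matching types
    )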
|
||||
|
||||
|
||||
class QnnCompatibilityOverrides:
|
||||
"""
|
||||
Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
|
||||
compatible with QNN EP.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
default_activation_qtype: QuantType,
|
||||
default_weight_qtype: QuantType,
|
||||
activation_symmetric: bool,
|
||||
weight_symmetric: bool,
|
||||
per_channel: bool,
|
||||
overrides: TensorQuantOverridesHelper,
|
||||
initializers: dict[str, onnx.TensorProto],
|
||||
):
|
||||
self.default_activation_qtype = default_activation_qtype
|
||||
self.default_weight_qtype = default_weight_qtype
|
||||
self.activation_symmetric = activation_symmetric
|
||||
self.weight_symmetric = weight_symmetric
|
||||
self.per_channel = per_channel
|
||||
self.overrides = overrides
|
||||
self.initializers = initializers
|
||||
|
||||
self.process_fns = {
|
||||
"MatMul": self._process_matmul,
|
||||
"LayerNormalization": self._process_layernorm,
|
||||
"Sigmoid": self._process_sigmoid,
|
||||
"Tanh": self._process_tanh,
|
||||
}
|
||||
|
||||
def process_node(self, node: onnx.NodeProto):
|
||||
process_fn = self.process_fns.get(node.op_type)
|
||||
|
||||
if process_fn is not None:
|
||||
process_fn(node)
|
||||
|
||||
def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
|
||||
"""
|
||||
Overrides initializer input(s) to use the default weight type if:
|
||||
- The default weight type is 8-bit
|
||||
- One of the inputs is a 16-bit activation
|
||||
- The other input is an initializer (per-tensor quantized)
|
||||
|
||||
This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
|
||||
inputs the default weight type. Instead, it assigns the default activation type.
|
||||
"""
|
||||
if self.default_weight_qtype not in Q8_TYPES:
|
||||
return
|
||||
|
||||
input_16bit_act_name = None
|
||||
input_weight_name = None
|
||||
|
||||
# Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
|
||||
for i in range(2):
|
||||
input_name = node.input[i]
|
||||
if not input_name:
|
||||
continue
|
||||
|
||||
is_weight = input_name in self.initializers
|
||||
qtype_info = self.overrides.get_node_input_qtype_info(
|
||||
input_name,
|
||||
node.name,
|
||||
default_qtype=None if is_weight else self.default_activation_qtype,
|
||||
)
|
||||
|
||||
if qtype_info.axis is not None:
|
||||
return # Don't process MatMul with a per-channel quantized input.
|
||||
|
||||
if (
|
||||
is_weight
|
||||
and qtype_info.quant_type == self.default_weight_qtype
|
||||
and qtype_info.symmetric == self.weight_symmetric
|
||||
):
|
||||
return # Return. Weight is already overridden to use the desired weight type.
|
||||
|
||||
if is_weight:
|
||||
input_weight_name = input_name
|
||||
elif qtype_info.quant_type in Q16_TYPES:
|
||||
input_16bit_act_name = input_name
|
||||
|
||||
# Override initializer input to use the default weight type.
|
||||
if input_16bit_act_name and input_weight_name:
|
||||
did_update = self.overrides.update_tensor_overrides(
|
||||
input_weight_name,
|
||||
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
|
||||
overwrite=False,
|
||||
)
|
||||
|
||||
if not did_update:
|
||||
warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
|
||||
|
||||
def _process_matmul(self, node: onnx.NodeProto):
|
||||
assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
|
||||
|
||||
if not self.per_channel:
|
||||
self._make_static_inputs_use_default_weight_type(node)
|
||||
return
|
||||
|
||||
# QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
|
||||
# quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
|
||||
# provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
|
||||
# the user did not provide any other overrides.
|
||||
for input_name in node.input:
|
||||
is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
|
||||
if is_weight_no_overrides:
|
||||
self.overrides.update_tensor_overrides(
|
||||
input_name,
|
||||
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
|
||||
)
|
||||
|
||||
def _process_layernorm(self, node: onnx.NodeProto):
|
||||
assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
|
||||
|
||||
if not self.per_channel:
|
||||
self._make_static_inputs_use_default_weight_type(node)
|
||||
return
|
||||
|
||||
has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
|
||||
has_bias_no_overrides = (
|
||||
len(node.input) > 2
|
||||
and node.input[2]
|
||||
and node.input[2] in self.initializers
|
||||
and node.input[2] not in self.overrides
|
||||
)
|
||||
|
||||
if has_weight_no_overrides or has_bias_no_overrides:
|
||||
# TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
|
||||
# tries to make it per-channel if the weight is also per-channel.
|
||||
raise ValueError(
|
||||
"get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
|
||||
" Please try using custom overrides that make bias per-tensor quantized."
|
||||
)
|
||||
|
||||
def _process_sigmoid(self, node: onnx.NodeProto):
|
||||
"""
|
||||
Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
|
||||
"""
|
||||
assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
|
||||
output_type = self.overrides.get_node_output_qtype_info(
|
||||
node.output[0], self.default_activation_qtype
|
||||
).quant_type
|
||||
|
||||
if output_type == QuantType.QUInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 65536.0, dtype=np.float32),
|
||||
"zero_point": np.array(0, dtype=np.uint16),
|
||||
},
|
||||
)
|
||||
elif output_type == QuantType.QInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
|
||||
"zero_point": np.array(0, dtype=np.int16),
|
||||
},
|
||||
)
|
||||
|
||||
def _process_tanh(self, node: onnx.NodeProto):
|
||||
"""
|
||||
Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
|
||||
"""
|
||||
assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
|
||||
output_type = self.overrides.get_node_output_qtype_info(
|
||||
node.output[0], self.default_activation_qtype
|
||||
).quant_type
|
||||
|
||||
if output_type == QuantType.QUInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
|
||||
"zero_point": np.array(32768, dtype=np.uint16),
|
||||
},
|
||||
)
|
||||
elif output_type == QuantType.QInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
|
||||
"zero_point": np.array(0, dtype=np.int16),
|
||||
},
|
||||
)
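# Worked example (not part of the original file): with the fixed 16-bit parameters set above, a
# quantized value q dequantizes as real = (q - zero_point) * scale. For QUInt16 Tanh this is
# (q - 32768) / 32768, which covers Tanh's (-1, 1) output range; for QUInt16 Sigmoid it is
# q / 65536, which covers Sigmoid's (0, 1) range.
def _example_tanh_uint16_dequant(q: int) -> float:
    return (q - 32768) * (1.0 / 32768.0)  # matches the QUInt16 Tanh scale/zero_point overrides above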
@@ -0,0 +1,4 @@
from .fusion import Fusion  # noqa: F401
from .fusion_gelu import FusionGelu  # noqa: F401
from .fusion_layernorm import FusionLayerNormalization  # noqa: F401
from .replace_upsample_with_resize import ReplaceUpsampleWithResize  # noqa: F401
@@ -0,0 +1,311 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

from collections import deque

import onnx

from ..onnx_model import ONNXModel
|
||||
|
||||
|
||||
class Fusion:
|
||||
"""
|
||||
Base class for fusions.
|
||||
"""
|
||||
|
||||
def __init__(self, model: ONNXModel, fused_op_type: str, search_op_type: str):
|
||||
self.search_op_type: str = search_op_type
|
||||
self.fused_op_type: str = fused_op_type
|
||||
self.model: ONNXModel = model
|
||||
self.nodes_to_remove: list = []
|
||||
self.nodes_to_add: list = []
|
||||
|
||||
self._new_node_name_prefix = self.fused_op_type + "_fused_" + self.search_op_type + "_"
|
||||
self._new_node_name_suffix = None # int|None used to create unique node names for the fused ops.
|
||||
|
||||
def fuse(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
):
|
||||
"""
|
||||
Interface function for derived fusion classes. Tries to fuse a node sequence containing
|
||||
the specified node.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def apply(self) -> bool:
|
||||
"""
|
||||
Apply graph fusion on the entire model graph.
|
||||
"""
|
||||
input_name_to_nodes = self.model.input_name_to_nodes()
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
for node in self.model.nodes():
|
||||
if node.op_type == self.search_op_type:
|
||||
self.fuse(node, input_name_to_nodes, output_name_to_node)
|
||||
|
||||
self.model.remove_nodes(self.nodes_to_remove)
|
||||
self.model.add_nodes(self.nodes_to_add)
|
||||
|
||||
graph_updated = bool(self.nodes_to_remove or self.nodes_to_add)
|
||||
|
||||
if graph_updated:
|
||||
self.model.remove_unused_constant()
|
||||
|
||||
return graph_updated
|
||||
|
||||
def create_unique_node_name(self):
|
||||
prefix = self._new_node_name_prefix
|
||||
|
||||
if self._new_node_name_suffix is None:
|
||||
largest_suffix: int = self.model.get_largest_node_name_suffix(prefix)
|
||||
self._new_node_name_suffix = largest_suffix + 1
|
||||
|
||||
new_name = f"{prefix}{self._new_node_name_suffix!s}"
|
||||
self._new_node_name_suffix += 1
|
||||
|
||||
return new_name
|
||||
|
||||
@staticmethod
|
||||
def is_safe_to_fuse_nodes(
|
||||
nodes_to_remove: list[onnx.NodeProto],
|
||||
keep_outputs: list[str],
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
for node_to_remove in nodes_to_remove:
|
||||
for output_to_remove in node_to_remove.output:
|
||||
if output_to_remove in keep_outputs:
|
||||
continue
|
||||
|
||||
if output_to_remove in input_name_to_nodes:
|
||||
for impacted_node in input_name_to_nodes[output_to_remove]:
|
||||
if impacted_node not in nodes_to_remove:
|
||||
# Not safe to remove nodes since output is used by impacted_node
|
||||
return False
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def get_node_attribute(node: onnx.NodeProto, attribute_name: str):
|
||||
for attr in node.attribute:
|
||||
if attr.name == attribute_name:
|
||||
value = onnx.helper.get_attribute_value(attr)
|
||||
return value
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def input_index(node_output: str, child_node: onnx.NodeProto) -> int:
|
||||
for index, input_name in enumerate(child_node.input):
|
||||
if input_name == node_output:
|
||||
return index
|
||||
return -1
|
||||
|
||||
@staticmethod
|
||||
def tensor_shape_to_list(tensor_type) -> list[int]:
|
||||
shape_list = []
|
||||
for d in tensor_type.shape.dim:
|
||||
if d.HasField("dim_value"):
|
||||
shape_list.append(d.dim_value) # known dimension
|
||||
elif d.HasField("dim_param"):
|
||||
shape_list.append(d.dim_param) # unknown dimension with symbolic name
|
||||
else:
|
||||
shape_list.append("?") # shall not happen
|
||||
return shape_list
|
||||
|
||||
def get_constant_input(self, node: onnx.NodeProto):
|
||||
for i, inp in enumerate(node.input):
|
||||
value = self.model.get_constant_value(inp)
|
||||
if value is not None:
|
||||
return i, value
|
||||
|
||||
return None, None
|
||||
|
||||
def find_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> int:
|
||||
i, value = self.get_constant_input(node)
|
||||
if value is not None and value.size == 1 and abs(value - expected_value) < delta:
|
||||
return i
|
||||
|
||||
return -1
|
||||
|
||||
def has_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> bool:
|
||||
return self.find_constant_input(node, expected_value, delta) >= 0
|
||||
|
||||
def is_constant_with_specified_rank(self, output_name: str, rank: int) -> bool:
|
||||
value = self.model.get_constant_value(output_name)
|
||||
if value is None:
|
||||
return False # Not an initializer
|
||||
|
||||
if len(value.shape) != rank:
|
||||
return False # Wrong dimensions
|
||||
|
||||
return True
|
||||
|
||||
def match_first_parent(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
parent_op_type: str,
|
||||
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
|
||||
exclude: list[onnx.NodeProto] = [], # noqa: B006
|
||||
) -> tuple[onnx.NodeProto | None, int | None]:
|
||||
"""
|
||||
Find parent node based on constraints on op_type.
|
||||
|
||||
Args:
|
||||
node: current node.
|
||||
parent_op_type (str): constraint of parent node op_type.
|
||||
output_name_to_node (dict): dictionary with output name as key, and node as value.
|
||||
exclude (list): list of nodes that are excluded (not allowed to match as parent).
|
||||
|
||||
Returns:
|
||||
parent: The matched parent node. None if not found.
|
||||
index: The input index of matched parent node. None if not found.
|
||||
"""
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
for i, inp in enumerate(node.input):
|
||||
if inp in output_name_to_node:
|
||||
parent = output_name_to_node[inp]
|
||||
if parent.op_type == parent_op_type and parent not in exclude:
|
||||
return parent, i
|
||||
|
||||
return None, None
|
||||
|
||||
def match_parent(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
parent_op_type: str,
|
||||
input_index: int | None = None,
|
||||
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
|
||||
exclude: list[onnx.NodeProto] = [], # noqa: B006
|
||||
return_indice: list[int] | None = None,
|
||||
) -> onnx.NodeProto | None:
|
||||
"""
|
||||
Find parent node based on constraints on op_type and index.
|
||||
When input_index is None, we will find the first parent node based on constraints,
|
||||
and the corresponding input index will be appended to return_indice.
|
||||
|
||||
Args:
|
||||
node (onnx.NodeProto): current node.
|
||||
parent_op_type (str): constraint of parent node op_type.
|
||||
input_index (int or None): only check the parent given input index of current node.
|
||||
output_name_to_node (dict): dictionary with output name as key, and node as value.
|
||||
exclude (list): list of nodes that are excluded (not allowed to match as parent).
|
||||
return_indice (list): a list to append the input index when input_index is None.
|
||||
|
||||
Returns:
|
||||
parent: The matched parent node.
|
||||
"""
|
||||
assert node is not None
|
||||
assert input_index is None or input_index >= 0
|
||||
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
if input_index is None:
|
||||
parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node, exclude)
|
||||
if return_indice is not None:
|
||||
return_indice.append(index)
|
||||
return parent
|
||||
|
||||
if input_index >= len(node.input):
|
||||
# Input index out of bounds.
|
||||
return None
|
||||
|
||||
parent = self.model.get_parent(node, input_index, output_name_to_node)
|
||||
if parent is not None and parent.op_type == parent_op_type and parent not in exclude:
|
||||
return parent
|
||||
|
||||
return None
|
||||
|
||||
def match_parent_path(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
parent_op_types: list[str],
|
||||
parent_input_index: list[int] | None = None,
|
||||
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
|
||||
return_indice: list[int] | None = None,
|
||||
) -> list[onnx.NodeProto] | None:
|
||||
"""
|
||||
Find a sequence of input edges based on constraints on parent op_type and index.
|
||||
When input_index is None, we will find the first parent node based on constraints,
|
||||
and the corresponding input index will be appended to return_indice.
|
||||
|
||||
Args:
|
||||
node (onnx.NodeProto): current node.
|
||||
parent_op_types (list): constraint of parent node op_type of each input edge.
|
||||
parent_input_index (list): constraint of input index of each input edge. None means no constraint.
|
||||
output_name_to_node (dict): dictionary with output name as key, and node as value.
|
||||
return_indice (list): a list to which the matched input index is appended
when there is no constraint on the input index of an edge.
|
||||
|
||||
Returns:
|
||||
parents: a list of matched parent nodes.
|
||||
"""
|
||||
if parent_input_index is not None:
|
||||
assert len(parent_input_index) == len(parent_op_types)
|
||||
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
current_node = node
|
||||
matched_parents = []
|
||||
for i, op_type in enumerate(parent_op_types):
|
||||
matched_parent = self.match_parent(
|
||||
current_node,
|
||||
op_type,
|
||||
parent_input_index[i] if parent_input_index is not None else None,
|
||||
output_name_to_node,
|
||||
exclude=[],
|
||||
return_indice=return_indice,
|
||||
)
|
||||
if matched_parent is None:
|
||||
return None
|
||||
|
||||
matched_parents.append(matched_parent)
|
||||
current_node = matched_parent
|
||||
|
||||
return matched_parents
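# Illustrative sketch (not part of the original file): walking three input edges upward from `node`,
# requiring the producers to be Mul <- Add <- ReduceMean in that order. The op types and indices are
# hypothetical; a None entry falls back to matching the first parent of that type, and the input
# index actually matched is then appended to return_indice.
#
#   parents = self.match_parent_path(node, ["Mul", "Add", "ReduceMean"], [None, 0, 0])
#   # parents is [mul_node, add_node, reduce_mean_node], or None if any edge fails to match.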
|
||||
|
||||
def match_parent_paths(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
paths: list[tuple[list[str], list[int]]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> tuple[int, list[onnx.NodeProto] | None, list[int] | None]:
|
||||
"""
|
||||
Find a matching parent path to the given node.
|
||||
"""
|
||||
for i, path in enumerate(paths):
|
||||
return_indice = []
|
||||
matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice)
|
||||
if matched:
|
||||
return i, matched, return_indice
|
||||
return -1, None, None
|
||||
|
||||
def find_first_child_by_type(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
child_type: str,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]] | None = None,
|
||||
recursive: bool = True,
|
||||
) -> onnx.NodeProto | None:
|
||||
children = self.model.get_children(node, input_name_to_nodes)
|
||||
dq = deque(children)
|
||||
while len(dq) > 0:
|
||||
current_node = dq.pop()
|
||||
if current_node.op_type == child_type:
|
||||
return current_node
|
||||
|
||||
if recursive:
|
||||
children = self.model.get_children(current_node, input_name_to_nodes)
|
||||
for child in children:
|
||||
dq.appendleft(child)
|
||||
|
||||
return None
|
||||
@@ -0,0 +1,272 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

import onnx

from ..onnx_model import ONNXModel
from .fusion import Fusion
|
||||
|
||||
|
||||
class FusionGelu(Fusion):
|
||||
def __init__(self, model: ONNXModel):
|
||||
super().__init__(model, "Gelu", "Erf")
|
||||
|
||||
def fuse(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
):
|
||||
"""
|
||||
Interface function that tries to fuse a node sequence containing an Erf node into a single
|
||||
Gelu node.
|
||||
"""
|
||||
if (
|
||||
self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node)
|
||||
or self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node)
|
||||
or self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node)
|
||||
):
|
||||
self.model.set_opset_import("com.microsoft", 1)
|
||||
|
||||
def fuse_1(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
"""
|
||||
This pattern is from a PyTorch model.
|
||||
Fuse Gelu with Erf into one node:
|
||||
Pattern 1:
|
||||
+-------Mul(0.5)---------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Div -----> Erf --> Add --> Mul -->
|
||||
(B=1.4142...) (1)
|
||||
|
||||
Pattern 2:
|
||||
+------------------------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Div -----> Erf --> Add --> Mul -->Mul -->
|
||||
(B=1.4142...) (1) (0.5)
|
||||
|
||||
Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
|
||||
"""
|
||||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return False
|
||||
add_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(add_after_erf, 1):
|
||||
return False
|
||||
|
||||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
|
||||
mul_after_erf = children[0]
|
||||
|
||||
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
|
||||
if div is None:
|
||||
return False
|
||||
|
||||
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
|
||||
return False
|
||||
|
||||
subgraph_input = div.input[0]
|
||||
|
||||
another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
|
||||
if subgraph_input == mul_after_erf.input[another]: # pattern 2
|
||||
children = input_name_to_nodes[mul_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul_half = children[0]
|
||||
if not self.has_constant_input(mul_half, 0.5):
|
||||
return False
|
||||
subgraph_output = mul_half.output[0]
|
||||
else: # pattern 1
|
||||
mul_half = self.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
|
||||
if mul_half is None:
|
||||
return False
|
||||
|
||||
if not self.has_constant_input(mul_half, 0.5):
|
||||
return False
|
||||
|
||||
if subgraph_input not in mul_half.input:
|
||||
return False
|
||||
|
||||
subgraph_output = mul_after_erf.output[0]
|
||||
|
||||
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
|
||||
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
|
||||
return False
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = onnx.helper.make_node(
|
||||
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[subgraph_output]
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
return True
|
||||
|
||||
def fuse_2(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
"""
|
||||
This pattern is from a Keras model.
|
||||
Fuse Gelu with Erf into one node:
|
||||
+------------------------------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Div -----> Erf --> Add --> Mul -->Mul
|
||||
(B=1.4142...) (A=1) (A=0.5)
|
||||
|
||||
Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
|
||||
"""
|
||||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return False
|
||||
add_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(add_after_erf, 1):
|
||||
return False
|
||||
|
||||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(mul_after_erf, 0.5):
|
||||
return False
|
||||
|
||||
if mul_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[mul_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul = children[0]
|
||||
|
||||
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
|
||||
if div is None:
|
||||
return False
|
||||
|
||||
sqrt_node = None
|
||||
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
|
||||
sqrt_node = self.match_parent(div, "Sqrt", 1, output_name_to_node)
|
||||
if sqrt_node is None:
|
||||
return False
|
||||
if not self.has_constant_input(sqrt_node, 2.0):
|
||||
return False
|
||||
|
||||
subgraph_input = div.input[0]
|
||||
|
||||
if subgraph_input not in mul.input:
|
||||
return False
|
||||
|
||||
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul]
|
||||
if sqrt_node:
|
||||
subgraph_nodes.append(sqrt_node)
|
||||
|
||||
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node):
|
||||
return False
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = onnx.helper.make_node(
|
||||
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[mul.output[0]]
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
return True
|
||||
|
||||
def fuse_3(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
"""
|
||||
This pattern is from a TensorFlow model.
|
||||
Fuse Gelu with Erf into one node:
|
||||
+----------------------------------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Mul -----> Erf --> Add --> Mul -->Mul
|
||||
(A=0.7071067690849304) (B=1) (B=0.5)
|
||||
|
||||
Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
|
||||
"""
|
||||
|
||||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return False
|
||||
add_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(add_after_erf, 1):
|
||||
return False
|
||||
|
||||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul_half = children[0]
|
||||
|
||||
if not self.has_constant_input(mul_half, 0.5):
|
||||
return False
|
||||
|
||||
first_mul = self.match_parent(erf_node, "Mul", 0, output_name_to_node)
|
||||
if first_mul is None:
|
||||
return False
|
||||
|
||||
i = self.find_constant_input(first_mul, 0.7071067690849304, delta=0.001)
|
||||
if i < 0:
|
||||
return False
|
||||
|
||||
root_input_index = 1 - i
|
||||
subgraph_input = first_mul.input[root_input_index]
|
||||
|
||||
if mul_half.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[mul_half.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
last_mul = children[0]
|
||||
|
||||
if not (last_mul.input[0] == subgraph_input or last_mul.input[1] == subgraph_input):
|
||||
return False
|
||||
|
||||
subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
|
||||
if not self.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
[last_mul.output[0]],
|
||||
input_name_to_nodes,
|
||||
output_name_to_node,
|
||||
):
|
||||
return False
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = onnx.helper.make_node(
|
||||
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[last_mul.output[0]]
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
return True
|
||||
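Both fusion patterns above are algebraic spellings of the same function, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))): fuse_2 divides the input by sqrt(2) before Erf, while fuse_3 multiplies by 1/sqrt(2) ≈ 0.7071067690849304. A quick standalone check of that equivalence (illustrative NumPy, not part of the committed file):

import math

import numpy as np

x = np.linspace(-4.0, 4.0, 9, dtype=np.float32)
erf = np.vectorize(math.erf)

# fuse_2 shape of the pattern: x / sqrt(2) feeds Erf
gelu_div = 0.5 * x * (1.0 + erf(x / math.sqrt(2.0)))
# fuse_3 shape of the pattern: x * 0.7071067690849304 feeds Erf
gelu_mul = 0.5 * x * (1.0 + erf(x * 0.7071067690849304))

assert np.allclose(gelu_div, gelu_mul, atol=1e-6)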
@@ -0,0 +1,135 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import onnx

from ..onnx_model import ONNXModel
from .fusion import Fusion


class FusionLayerNormalization(Fusion):
    def __init__(self, model: ONNXModel):
        super().__init__(model, "LayerNormalization", "ReduceMean")

    def fuse(
        self,
        reduce_mean_node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """
        Interface function that tries to fuse a node sequence containing a ReduceMean node into a single
        LayerNormalization node.

              +----------------------+
              |                      |
              |                      v
          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
                     (axis=2 or -1)  |      (Y=2)   (axis=2 or -1)  (E-6 or E-12 or 0)         ^
                                     |                                                         |
                                     +---------------------------------------------------------+

        It also handles cases of duplicated sub nodes exported from older version of PyTorch:

              +----------------------+
              |                      v
              |           +-------> Sub-----------------------------------------------+
              |           |                                                           |
              |           |                                                           v
          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
              |                      ^
              |                      |
              +----------------------+
        """
        children = self.model.get_children(reduce_mean_node, input_name_to_nodes)
        if len(children) == 0 or len(children) > 2:
            return

        root_input = reduce_mean_node.input[0]

        if children[0].op_type != "Sub" or children[0].input[0] != root_input:
            return

        if len(children) == 2:
            if children[1].op_type != "Sub" or children[1].input[0] != root_input:
                return

        div_node = None
        for child in children:
            div_node = self.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
            if div_node is not None:
                break
        if div_node is None:
            return

        path_id, parent_nodes, _ = self.match_parent_paths(
            div_node,
            [
                (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
                (
                    ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
                    [1, 0, 0, 0, 0, 0],
                ),
            ],
            output_name_to_node,
        )
        if path_id < 0:
            return

        sub_node = parent_nodes[-1]
        if sub_node not in children:
            return

        second_add_node = parent_nodes[1]
        i, add_weight = self.get_constant_input(second_add_node)
        if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
            # Skip fusion since epsilon value is not expected.
            return

        pow_node = parent_nodes[3]
        if self.find_constant_input(pow_node, 2.0) != 1:
            return

        mul_node = input_name_to_nodes[div_node.output[0]][0]
        if mul_node.op_type != "Mul":
            return

        last_add_node = input_name_to_nodes[mul_node.output[0]][0]
        if last_add_node.op_type != "Add":
            return

        subgraph_nodes = [reduce_mean_node]
        subgraph_nodes.extend(children)
        subgraph_nodes.extend(parent_nodes[:-1])

        subgraph_nodes.extend([last_add_node, mul_node, div_node])
        if not self.is_safe_to_fuse_nodes(
            subgraph_nodes,
            last_add_node.output,
            input_name_to_nodes,
            output_name_to_node,
        ):
            return

        weight_input = mul_node.input[1 - self.input_index(div_node.output[0], mul_node)]
        if not self.is_constant_with_specified_rank(weight_input, 1):
            return

        bias_input = last_add_node.input[1 - self.input_index(mul_node.output[0], last_add_node)]
        if not self.is_constant_with_specified_rank(bias_input, 1):
            return

        self.nodes_to_remove.extend(subgraph_nodes)

        normalize_node = onnx.helper.make_node(
            "LayerNormalization",
            name=self.create_unique_node_name(),
            inputs=[reduce_mean_node.input[0], weight_input, bias_input],
            outputs=[last_add_node.output[0]],
        )
        normalize_node.attribute.extend([onnx.helper.make_attribute("epsilon", float(add_weight))])
        self.nodes_to_add.append(normalize_node)
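The subgraph matched above is the standard layer normalization over the last axis; a minimal NumPy reference of what the fused LayerNormalization node computes (illustrative sketch, names are not from the file):

import numpy as np

def layer_norm_reference(x: np.ndarray, weight: np.ndarray, bias: np.ndarray, epsilon: float = 1e-6) -> np.ndarray:
    # ReduceMean -> Sub -> Pow(2) -> ReduceMean -> Add(eps) -> Sqrt -> Div -> Mul(weight) -> Add(bias)
    mean = x.mean(axis=-1, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(variance + epsilon) * weight + bias

x = np.random.rand(2, 4, 8).astype(np.float32)
out = layer_norm_reference(x, np.ones(8, np.float32), np.zeros(8, np.float32))
print(out.shape)  # (2, 4, 8)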
@@ -0,0 +1,96 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import numpy as np
import onnx

from ..onnx_model import ONNXModel
from .fusion import Fusion


class ReplaceUpsampleWithResize(Fusion):
    """Replace Upsample with Resize."""

    def __init__(self, model: ONNXModel, opset):
        """Initialize."""
        super().__init__(model, "Resize", "Upsample")
        self.opset = opset

    def fuse(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Replace Upsample with Resize."""
        mode = None
        for attr in node.attribute:
            if attr.name == "mode":
                mode = attr.s.decode("utf-8")
                break

        scales_input = None
        if self.opset > 7:
            scales_input = node.input[1] if len(node.input) > 1 else ""
            resize_inputs = [node.input[0], node.name + "_roi", scales_input]
        else:
            if self.opset == 7:
                for attr in node.attribute:
                    if attr.name == "scales":
                        scales_input = attr.floats
                        break

                scales_input = np.array(list(scales_input), np.float32)
            else:
                h_scale = 1
                w_scale = 1
                for attr in node.attribute:
                    if attr.name == "height_scale":
                        h_scale = attr.float
                    elif attr.name == "width_scale":
                        w_scale = attr.float

                scales_input = np.array([1, 1, h_scale, w_scale], np.float32)

            scales_tensor = onnx.helper.make_tensor(
                name=node.name + "_scales",
                data_type=onnx.TensorProto.FLOAT,
                dims=scales_input.shape,
                vals=scales_input.flatten().tolist(),
            )

            scales_node = onnx.helper.make_node(
                "Constant", inputs=[], outputs=[node.name + "_scales"], value=scales_tensor
            )

            self.nodes_to_add.append(scales_node)

            resize_inputs = [node.input[0], node.name + "_roi", node.name + "_scales"]

        roi_tensor = onnx.helper.make_tensor(
            name=node.name + "_roi",
            data_type=onnx.TensorProto.FLOAT,
            dims=(len(scales_input) * 2,),
            vals=[0] * len(scales_input) + [1] * len(scales_input),
        )

        roi_node = onnx.helper.make_node("Constant", inputs=[], outputs=[node.name + "_roi"], value=roi_tensor)

        resize_node = onnx.helper.make_node(
            op_type="Resize", inputs=resize_inputs, outputs=node.output, mode=mode, nearest_mode="floor"
        )

        self.nodes_to_remove.append(node)
        self.nodes_to_add.append(roi_node)
        self.nodes_to_add.append(resize_node)

    def apply(self) -> bool:
        """Apply."""
        if super().apply():
            self.model.topological_sort()
            return True
        return False
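For reference, the replacement node built above takes three inputs: the data, an roi made of concatenated per-axis starts and ends, and per-axis scales. A minimal standalone sketch of the equivalent Resize for a 2x nearest-neighbor upsample of an NCHW tensor (tensor and node names here are placeholders, not taken from the file):

import numpy as np
import onnx

scales = np.array([1.0, 1.0, 2.0, 2.0], dtype=np.float32)                        # N, C, H, W
roi = np.concatenate([np.zeros(4, np.float32), np.ones(4, np.float32)])          # starts then ends, unused by "nearest"

scales_init = onnx.numpy_helper.from_array(scales, name="X_scales")
roi_init = onnx.numpy_helper.from_array(roi, name="X_roi")
resize_node = onnx.helper.make_node(
    "Resize", inputs=["X", "X_roi", "X_scales"], outputs=["Y"], mode="nearest", nearest_mode="floor"
)
print(onnx.helper.printable_node(resize_node))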
@@ -0,0 +1,239 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

import argparse
import logging
import os

import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto

from onnxruntime.capi._pybind_state import quantize_matmul_bnb4

from .onnx_model import ONNXModel
from .quant_utils import attribute_to_kwarg

logger = logging.getLogger(__name__)


class MatMulBnb4Quantizer:
    """Perform 4b quantization of constant MatMul weights using FP4 or NF4 data type"""

    ##################
    # quantization types, must be consistent with native code type
    # Bnb_DataType_t defined in blockwise_quant_block_bnb4.h

    # 4b floating point with bias of 3
    FP4 = 0

    # 4b NormalFloat
    NF4 = 1

    def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to_exclude=None):
        nodes_to_exclude = nodes_to_exclude or []
        assert quant_type in [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4]
        self.model = ONNXModel(model)
        self.quant_type = quant_type
        self.block_size = block_size
        self.nodes_to_exclude = set(nodes_to_exclude)

    @staticmethod
    def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
        for gid in range(len(graph_path) - 1, -1, -1):
            graph = graph_path[gid]
            for tensor in graph.initializer:
                if tensor.name == name:
                    return tensor, graph
        return None, None

    def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray:
        """4b quantize fp32/fp16 weight"""

        if len(fpweight.shape) != 2:
            raise ValueError("Current bnb4 block quantization only supports 2D tensors!")
        # need to copy since the transposed weight still has the original memory layout
        # Linear4bit quantizes its weight data which is the transposed weight
        fpweight_t = fpweight.transpose().copy()

        rows, cols = fpweight.shape
        numel = rows * cols
        block_size = self.block_size
        num_blocks = (numel + block_size - 1) // block_size
        quantized_numel = (numel + 1) // 2

        packed = np.zeros(quantized_numel, dtype="uint8")
        absmax = np.zeros(num_blocks, dtype=fpweight.dtype)
        # block wise quantization, fpweight_t is flattened and divided into blocks
        quantize_matmul_bnb4(packed, fpweight_t, absmax, block_size, self.quant_type, cols, rows)

        return (packed, absmax)

    def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto:
        """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""

        if node.op_type != "MatMul":
            return node  # only care about MatMul for now

        logger.debug(f"start to quantize {node.name} ...")
        if node.name in self.nodes_to_exclude:
            logger.debug(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
            return node

        inputB = node.input[1]  # noqa: N806
        B, Bs_graph = MatMulBnb4Quantizer.__get_initializer(inputB, graph_stack)  # noqa: N806
        if B is None:
            logger.debug("MatMul doesn't have const weight. Skip to quantize")
            return node  # only care about constant weight

        B_array = onnx.numpy_helper.to_array(B)  # noqa: N806
        if len(B_array.shape) != 2:
            logger.debug("MatMul weight is not 2D. Skip to quantize")
            return node  # can only process 2-D matrix

        packed, absmax = self.bnb4_block_quant(B_array)
        B_quant = onnx.numpy_helper.from_array(packed)  # noqa: N806
        B_quant.name = B.name + "_Bnb4"
        for input in Bs_graph.input:
            if input.name == inputB:
                Bs_graph.input.remove(input)
                break

        absmax_tensor = onnx.numpy_helper.from_array(absmax)
        absmax_tensor.name = B.name + "_absmax"

        Bs_graph.initializer.extend([B_quant, absmax_tensor])

        kwargs = {}
        rows, cols = B_array.shape
        kwargs["K"] = rows
        kwargs["N"] = cols
        kwargs["block_size"] = self.block_size
        kwargs["quant_type"] = self.quant_type

        matmul_bnb4_node = onnx.helper.make_node(
            "MatMulBnb4",
            inputs=[node.input[0], B_quant.name, absmax_tensor.name],
            outputs=[node.output[0]],
            name=node.name + "_Bnb4" if node.name else "",
            domain="com.microsoft",
            **kwargs,
        )

        logger.debug(f"complete quantization of {node.name} ...")

        return matmul_bnb4_node

    def _process_subgraph(self, graph_stack: list[GraphProto]):
        new_nodes = []
        graph = graph_stack[-1]

        for node in graph.node:
            graph_attrs = [
                attr
                for attr in node.attribute
                if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
            ]
            if graph_attrs:
                kwargs = {}
                for attr in node.attribute:
                    if attr.type == onnx.AttributeProto.GRAPH:
                        # recursive call to take care of sub-graph
                        graph_stack.append(attr.g)
                        kv = {attr.name: self._process_subgraph(graph_stack)}
                    elif attr.type == onnx.AttributeProto.GRAPHS:
                        value = []
                        for subgraph in attr.graphs:
                            # recursive call to take care of sub-graph
                            graph_stack.append(subgraph)
                            value.extend([self._process_subgraph(graph_stack)])
                        kv = {attr.name: value}
                    else:
                        kv = attribute_to_kwarg(attr)
                    kwargs.update(kv)
                node = onnx.helper.make_node(  # noqa: PLW2901
                    node.op_type, node.input, node.output, name=node.name, **kwargs
                )

            new_nodes.append(self._bnb4_matmul_node_weight(node, graph_stack))

        graph.ClearField("node")
        graph.node.extend(new_nodes)
        graph_stack.pop()
        return graph

    def process(self):
        # use a stack to keep track of sub-graphs
        graph_stack = [self.model.graph()]
        opset_import = self.model.opset_import()

        has_ms_domain = False
        for opset in opset_import:
            if opset.domain == "com.microsoft":
                has_ms_domain = True
        if not has_ms_domain:
            opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])

        self._process_subgraph(graph_stack)
        self.model.clean_initializers()


def parse_args():
    parser = argparse.ArgumentParser(
        description="""Blockwise FP4/NF4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into blocks, where each block is a contiguous
subset inside the flattened transposed weight matrix. Each block is quantized
into a set of 4b integers with an absolute value scaling factor.
"""
    )

    parser.add_argument("--input_model", required=True, help="Path to the input model file")
    parser.add_argument("--output_model", required=True, help="Path to the output model file")
    parser.add_argument(
        "--quant_type",
        required=False,
        default=1,
        choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4],
        help="Quantization data type. 0: FP4, 1: NF4",
    )
    parser.add_argument(
        "--block_size",
        required=False,
        default=64,
        help="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64",
    )
    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)
    parser.add_argument(
        "--nodes_to_exclude",
        nargs="+",
        type=str,
        required=False,
        default=[],
        help="Specify the nodes to be excluded from quantization with node names",
    )

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    input_model_path = args.input_model
    output_model_path = args.output_model

    if os.path.exists(output_model_path):
        logger.error(f"file {output_model_path} already exists")
        raise Exception(f"file {output_model_path} already exists")

    model = onnx.load(input_model_path)
    quant = MatMulBnb4Quantizer(model, args.quant_type, args.block_size, nodes_to_exclude=args.nodes_to_exclude)
    quant.process()
    quant.model.save_model_to_file(output_model_path, True)
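Besides the CLI entry point above, the quantizer can be driven directly from Python. A short sketch that mirrors the __main__ block (the file paths and the excluded node name are placeholders, not values from the commit):

import onnx

model = onnx.load("model_fp32.onnx")
quantizer = MatMulBnb4Quantizer(
    model,
    MatMulBnb4Quantizer.NF4,  # quant_type
    64,                       # block_size, matching bnb.nn.Linear4bit
    nodes_to_exclude=["/lm_head/MatMul"],
)
quantizer.process()
quantizer.model.save_model_to_file("model_bnb4.onnx", True)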
File diff suppressed because it is too large
@@ -0,0 +1 @@
from .weight_only import gptq_quantize, rtn_quantize  # noqa: F401
File diff suppressed because it is too large
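The weight_only module re-exported above (its full diff is suppressed by the viewer) provides rtn_quantize and gptq_quantize. As orientation only, asymmetric round-to-nearest group quantization of a weight reduces to something like the following NumPy sketch; the function and variable names here are illustrative and not the module's actual API:

import numpy as np

def rtn_qdq_sketch(w: np.ndarray, num_bits: int = 4, group_size: int = 32) -> np.ndarray:
    """Asymmetric round-to-nearest quantize/dequantize, one scale/zero-point per group."""
    groups = w.reshape(-1, group_size)
    maxq = 2**num_bits - 1
    rmin = groups.min(axis=1, keepdims=True)
    rmax = groups.max(axis=1, keepdims=True)
    scale = np.where(rmax > rmin, (rmax - rmin) / maxq, 1.0)
    zp = np.round(-rmin / scale)
    q = np.clip(np.round(groups / scale) + zp, 0, maxq)
    return (scale * (q - zp)).reshape(w.shape)

w = np.random.randn(64, 32).astype(np.float32)
print(np.abs(rtn_qdq_sketch(w) - w).max())  # per-group quantization error stays small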
@@ -0,0 +1,80 @@
|
||||
#
|
||||
# The implementation of this file is based on:
|
||||
# https://github.com/intel/neural-compressor/tree/master/neural_compressor
|
||||
#
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Helper classes or functions for onnxrt adaptor."""
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger("neural_compressor")
|
||||
|
||||
|
||||
MAXIMUM_PROTOBUF = 2147483648
|
||||
|
||||
|
||||
def simple_progress_bar(total, i):
|
||||
"""Progress bar for cases where tqdm can't be used."""
|
||||
progress = i / total
|
||||
bar_length = 20
|
||||
bar = "#" * int(bar_length * progress)
|
||||
spaces = " " * (bar_length - len(bar))
|
||||
percentage = progress * 100
|
||||
print(f"\rProgress: [{bar}{spaces}] {percentage:.2f}%", end="")
|
||||
|
||||
|
||||
def find_by_name(name, item_list):
|
||||
"""Helper function to find item by name in a list."""
|
||||
items = []
|
||||
for item in item_list:
|
||||
assert hasattr(item, "name"), f"{item} should have a 'name' attribute defined" # pragma: no cover
|
||||
if item.name == name:
|
||||
items.append(item)
|
||||
if len(items) > 0:
|
||||
return items[0]
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def to_numpy(data):
|
||||
"""Convert to numpy ndarrays."""
|
||||
import torch # noqa: PLC0415
|
||||
|
||||
if not isinstance(data, np.ndarray):
|
||||
if not importlib.util.find_spec("torch"):
|
||||
logger.error(
|
||||
"Please install torch to enable subsequent data type check and conversion, "
|
||||
"or reorganize your data format to numpy array."
|
||||
)
|
||||
exit(0)
|
||||
if isinstance(data, torch.Tensor):
|
||||
if data.dtype is torch.bfloat16: # pragma: no cover
|
||||
return data.detach().cpu().to(torch.float32).numpy()
|
||||
if data.dtype is torch.chalf: # pragma: no cover
|
||||
return data.detach().cpu().to(torch.cfloat).numpy()
|
||||
return data.detach().cpu().numpy()
|
||||
else:
|
||||
try:
|
||||
return np.array(data)
|
||||
except Exception:
|
||||
assert False, ( # noqa: B011
|
||||
f"The input data for onnx model is {type(data)}, which is not supported to convert to numpy ndarrays."
|
||||
)
|
||||
else:
|
||||
return data
|
||||
@@ -0,0 +1,932 @@
|
||||
#
|
||||
# The implementation of this file is based on:
|
||||
# https://github.com/intel/neural-compressor/tree/master/neural_compressor
|
||||
#
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modifications:
|
||||
# Add k-quant quantization method.
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""WeightOnly for onnxrt adaptor."""
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import numpy_helper
|
||||
from onnx.helper import np_dtype_to_tensor_dtype
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
from .onnx_model import ONNXModel
|
||||
from .util import simple_progress_bar
|
||||
|
||||
logger = logging.getLogger("neural_compressor")
|
||||
|
||||
|
||||
def make_matmul_weight_only_node(
|
||||
node,
|
||||
weight_shape,
|
||||
num_bits,
|
||||
group_size,
|
||||
k_blocks,
|
||||
q_weight,
|
||||
scale,
|
||||
zero_point,
|
||||
accuracy_level=0,
|
||||
): # pragma: no cover
|
||||
"""Build MatMulNBits node.
|
||||
|
||||
Args:
|
||||
node: original matmul node
|
||||
weight_shape: original weight shape
|
||||
num_bits (int): num_bits
|
||||
group_size (int): how many elements share one scale/zp
|
||||
k_blocks (int): block number
|
||||
q_weight (array): quantized weight
|
||||
scale (array): scale
|
||||
zero_point (array): zero point
|
||||
accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
|
||||
|
||||
Returns:
|
||||
matmul_weight_only_node: MatMulNBits node
|
||||
new_inits: initializers of the new node
|
||||
"""
|
||||
blob_size = group_size * num_bits // 8
|
||||
packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8")
|
||||
q_weight_name = node.input[1] + f"_Q{num_bits!s}G{group_size!s}"
|
||||
input_names = [node.input[0], q_weight_name]
|
||||
new_inits = []
|
||||
kwargs = {}
|
||||
|
||||
op_type = "MatMulNBits"
|
||||
|
||||
# pack quantized weight
|
||||
if num_bits == 4:
|
||||
q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4
|
||||
packed[:, :] = q_weight_pairs[:, :blob_size]
|
||||
elif num_bits == 8:
|
||||
packed = q_weight
|
||||
else:
|
||||
logger.error(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.")
|
||||
|
||||
packed = np.reshape(packed, (-1, k_blocks, blob_size))
|
||||
|
||||
# build scale tensor
|
||||
scale = np.reshape(scale, (-1, k_blocks))
|
||||
assert scale.dtype == np.float32 or scale.dtype == np.float16
|
||||
scale_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + "_scale",
|
||||
data_type=np_dtype_to_tensor_dtype(scale.dtype),
|
||||
dims=scale.shape,
|
||||
vals=scale.tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
input_names.append(scale_tensor.name)
|
||||
new_inits.append(scale_tensor)
|
||||
|
||||
# build zero_point tensor
|
||||
if zero_point is not None:
|
||||
if num_bits == 8:
|
||||
packed_zp = zero_point.astype("uint8")
|
||||
elif num_bits == 4:
|
||||
# For 4-bit case, the default zeros is 0x8. So it is 0x88 = 136 if we fill lower/higher 4 bits with 0x8.
|
||||
packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
|
||||
# create an index array
|
||||
idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
|
||||
# separate odd and even indices
|
||||
even_idx = idx[::2]
|
||||
odd_idx = idx[1::2]
|
||||
# vectorized operation for even and odd indices
|
||||
packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
|
||||
packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)
|
||||
else:
|
||||
raise ValueError(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.")
|
||||
|
||||
packed_zp = np.reshape(packed_zp, (weight_shape[1], -1))
|
||||
zp_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
|
||||
)
|
||||
input_names.append(zp_tensor.name)
|
||||
new_inits.append(zp_tensor)
|
||||
|
||||
# set kwargs
|
||||
kwargs["K"] = weight_shape[0]
|
||||
kwargs["N"] = weight_shape[1]
|
||||
kwargs["bits"] = num_bits
|
||||
kwargs["block_size"] = group_size
|
||||
if accuracy_level > 0:
|
||||
# require onnxruntime > 1.16.3
|
||||
kwargs["accuracy_level"] = accuracy_level
|
||||
|
||||
q_weight_tensor = onnx.helper.make_tensor(
|
||||
name=q_weight_name,
|
||||
data_type=2,
|
||||
dims=packed.shape,
|
||||
vals=packed.tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
new_inits.append(q_weight_tensor)
|
||||
|
||||
matmul_weight_only_node = onnx.helper.make_node(
|
||||
op_type,
|
||||
inputs=input_names,
|
||||
outputs=node.output,
|
||||
name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits),
|
||||
domain="com.microsoft",
|
||||
**kwargs,
|
||||
)
|
||||
return matmul_weight_only_node, new_inits
|
||||
|
||||
|
||||
def quant_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
|
||||
"""Quantize tensor per group.
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
|
||||
scheme (str, optional): quantization scheme. Defaults to "asym".
|
||||
dtype (str, optional): data type. Defaults to "int".
|
||||
ratio (float, optional): percentile of clip. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
output: quantized weight
|
||||
scale: scale
|
||||
zero_point: zero point
|
||||
"""
|
||||
data = np.reshape(data, (-1, group_size))
|
||||
if scheme == "asym" or dtype == "uint":
|
||||
maxq = 2**num_bits - 1
|
||||
minq = 0
|
||||
elif scheme == "sym":
|
||||
maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0
|
||||
minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1
|
||||
|
||||
rmin = np.min(data, axis=1, keepdims=True) * ratio
|
||||
rmax = np.max(data, axis=1, keepdims=True) * ratio
|
||||
if scheme == "sym":
|
||||
max_range = np.maximum(np.abs(rmin), np.abs(rmax))
|
||||
scale = np.ones(rmax.shape)
|
||||
mask = max_range > 0
|
||||
scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq)
|
||||
zero_point = (
|
||||
np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1))
|
||||
)
|
||||
else:
|
||||
scale = np.ones(rmax.shape)
|
||||
scale[rmin != rmax] = np.array(
|
||||
[float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()]
|
||||
)
|
||||
zero_point = (
|
||||
((np.zeros(scale.shape) - rmin) / scale).round()
|
||||
if dtype == "int"
|
||||
else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8")
|
||||
)
|
||||
|
||||
q_weight = np.empty_like(data, dtype=scale.dtype)
|
||||
np.divide(data, scale, out=q_weight)
|
||||
np.add(q_weight, zero_point, out=q_weight)
|
||||
np.round(q_weight, out=q_weight)
|
||||
np.clip(q_weight, minq, maxq, out=q_weight)
|
||||
|
||||
return q_weight, scale, zero_point
|
||||
|
||||
|
||||
def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
|
||||
"""Quantize tensor per group based on k quant.
|
||||
|
||||
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
|
||||
|
||||
Returns:
|
||||
output: quantized weight
|
||||
scale: scale
|
||||
zero_point: zero point
|
||||
"""
|
||||
data = np.reshape(data, (-1, group_size)).astype(np.float32) # nb = data.shape[0], (nb, group_size)
|
||||
maxq = 2**num_bits - 1
|
||||
minq = 0
|
||||
sum_x2 = np.sum(data**2, axis=1, keepdims=True) # (nb, 1)
|
||||
av_x = np.sqrt(sum_x2 / group_size) # (nb, 1)
|
||||
weights = np.add(av_x, np.abs(data)) # (nb, group_size)
|
||||
rmin = np.min(data, axis=1, keepdims=True) # (nb, 1)
|
||||
rmax = np.max(data, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_w = np.sum(weights, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_x = np.sum(weights * data, axis=1, keepdims=True) # (nb, group_size)
|
||||
iscale = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
mask = rmin != rmax
|
||||
iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
|
||||
scale = 1 / iscale
|
||||
quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
diff = scale * quant_data + rmin - data # (nb, group_size)
|
||||
best_mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
nstep = 20
|
||||
rdelta = 0.1
|
||||
# nstep * rdelta = -2 * rrmin, maxq - minq = 2**num_bits - 1
|
||||
rrmin = -1
|
||||
for is_ in range(nstep):
|
||||
iscale_new = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
factor = np.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
|
||||
mask = rmin != rmax
|
||||
iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
|
||||
quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
mul_weights_quant_data_new = weights * quant_data_new
|
||||
sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
|
||||
D = np.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806
|
||||
|
||||
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
|
||||
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
|
||||
|
||||
diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
|
||||
mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
|
||||
mad_1 = np.array(mad)
|
||||
best_mad_1 = np.array(best_mad)
|
||||
idx_to_replace = np.where(mad_1 < best_mad_1)[0]
|
||||
quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :]
|
||||
best_mad[idx_to_replace] = mad[idx_to_replace]
|
||||
scale[idx_to_replace] = this_scale[idx_to_replace]
|
||||
rmin[idx_to_replace] = this_min[idx_to_replace]
|
||||
|
||||
zero_point = np.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
|
||||
scale = scale.astype(np.float64)
|
||||
q_weight = np.empty_like(data, dtype=scale.dtype)
|
||||
np.divide(data, scale, out=q_weight)
|
||||
np.add(q_weight, zero_point, out=q_weight)
|
||||
np.round(q_weight, out=q_weight)
|
||||
np.clip(q_weight, minq, maxq, out=q_weight)
|
||||
|
||||
return q_weight, scale, zero_point
|
||||
|
||||
|
||||
def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
|
||||
"""Quantize tensor per group based on k quant.
|
||||
|
||||
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
|
||||
|
||||
Returns:
|
||||
output: quantized weight
|
||||
scale: scale
|
||||
zero_point: zero point
|
||||
"""
|
||||
try:
|
||||
import cupy as cp # noqa: PLC0415
|
||||
import torch # noqa: PLC0415
|
||||
|
||||
if torch.cuda.is_available():
|
||||
data = cp.asarray(data)
|
||||
data = data.reshape((-1, group_size)).astype(cp.float32) # nb = data.shape[0], (nb, group_size)
|
||||
maxq = 2**num_bits - 1
|
||||
minq = 0
|
||||
sum_x2 = cp.sum(data**2, axis=1, keepdims=True) # (nb, 1)
|
||||
av_x = cp.sqrt(sum_x2 / group_size) # (nb, 1)
|
||||
weights = cp.add(av_x, cp.abs(data)) # (nb, group_size)
|
||||
rmin = cp.min(data, axis=1, keepdims=True) # (nb, 1)
|
||||
rmax = cp.max(data, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_w = cp.sum(weights, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_x = cp.sum(weights * data, axis=1, keepdims=True) # (nb, group_size)
|
||||
iscale = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
mask = rmin != rmax
|
||||
iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
|
||||
scale = 1 / iscale
|
||||
quant_data = cp.clip(cp.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
diff = scale * quant_data + rmin - data # (nb, group_size)
|
||||
best_mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
nstep = 20
|
||||
rdelta = 0.1
|
||||
rrmin = -1
|
||||
for is_ in range(nstep):
|
||||
iscale_new = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
factor = cp.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
|
||||
mask = rmin != rmax
|
||||
iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
|
||||
quant_data_new = cp.clip(cp.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
mul_weights_quant_data_new = weights * quant_data_new
|
||||
sum_l = cp.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_l2 = cp.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_xl = cp.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
|
||||
D = cp.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806
|
||||
|
||||
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
|
||||
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
|
||||
|
||||
diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
|
||||
mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
|
||||
mad_1 = cp.array(mad)
|
||||
best_mad_1 = cp.array(best_mad)
|
||||
idx_to_replace = cp.where(mad_1 < best_mad_1)[0]
|
||||
quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :]
|
||||
best_mad[idx_to_replace] = mad[idx_to_replace]
|
||||
scale[idx_to_replace] = this_scale[idx_to_replace]
|
||||
rmin[idx_to_replace] = this_min[idx_to_replace]
|
||||
|
||||
zero_point = cp.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
|
||||
scale = scale.astype(cp.float64)
|
||||
q_weight = cp.empty_like(data, dtype=scale.dtype)
|
||||
cp.divide(data, scale, out=q_weight)
|
||||
cp.add(q_weight, zero_point, out=q_weight)
|
||||
cp.round(q_weight, out=q_weight)
|
||||
cp.clip(q_weight, minq, maxq, out=q_weight)
|
||||
|
||||
return q_weight.get(), scale.get(), zero_point.get()
|
||||
else:
|
||||
logger.warning(
|
||||
"Try to use k-quant quantization on CUDA. However, CUDA is not available."
|
||||
"Fall back to k-quant quantization on CPU."
|
||||
)
|
||||
return quant_tensor_k_quant_cpu(data, num_bits, group_size)
|
||||
except ImportError:
|
||||
logger.info(
|
||||
"Now we are using k-quant quantization on cpu, which is time consuming."
|
||||
"Please consider install cupy to speed up on CUDA. See https://cupy.dev/"
|
||||
"Please also install torch to check CUDA availability."
|
||||
)
|
||||
return quant_tensor_k_quant_cpu(data, num_bits, group_size)
|
||||
|
||||
|
||||
def qdq_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
|
||||
"""Quant dequant tensor per group.
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
|
||||
scheme (str, optional): quantization scheme. Defaults to "asym".
|
||||
dtype (str, optional): data type. Defaults to "int".
|
||||
ratio (float, optional): percentile of clip. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
output: quant-dequant weight
|
||||
"""
|
||||
org_shape = data.shape
|
||||
weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio)
|
||||
return np.reshape(scale * (weight - zp), org_shape)
|
||||
|
||||
|
||||
def pad_tensor(weight, group_size, k_blocks):
|
||||
"""Pad tensor rowi so that it can be is divisible by group_size.
|
||||
|
||||
Args:
|
||||
weight (array): weight
|
||||
group_size (int): how many elements share one scale/zp
|
||||
k_blocks (int): the number of block
|
||||
|
||||
Returns:
|
||||
weight: paded weight
|
||||
"""
|
||||
if group_size == -1:
|
||||
return weight
|
||||
|
||||
org_w_shape = weight.shape
|
||||
padded_rows = k_blocks * group_size
|
||||
pad_len = padded_rows - org_w_shape[0]
|
||||
|
||||
if pad_len > 0:
|
||||
weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant")
|
||||
|
||||
return weight
|
||||
|
||||
|
||||
def rtn_quantize(
|
||||
model,
|
||||
weight_config={}, # noqa: B006
|
||||
num_bits=4,
|
||||
group_size=32,
|
||||
scheme="asym",
|
||||
ratios={}, # noqa: B006
|
||||
accuracy_level=0,
|
||||
providers=["CPUExecutionProvider"], # noqa: B006
|
||||
algorithm="k_quant",
|
||||
):
|
||||
"""Quant the model with round to nearst method.
|
||||
|
||||
Args:
|
||||
model (ModelProto or ONNXModel): onnx model
|
||||
weight_config (dict): quantization config
|
||||
For example,
|
||||
weight_config = {
|
||||
'fc2':
|
||||
{
|
||||
'bits': 4,
|
||||
'group_size': 32,
|
||||
'scheme': 'sym',
|
||||
'algorithm': 'RTN'
|
||||
}
|
||||
}
|
||||
num_bits (int, optional): num_bits. Default is 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Default is 32.
|
||||
scheme (str, optional): sym or asym. Defaults to "asym".
|
||||
ratios (dict, optional): percentile of clip. Defaults to {}.
|
||||
accuracy_level (int): accuracy level. Support 0 (unset),1(fp32), 2(fp16), 3(bf16), or 4(int8).
|
||||
providers (list): providers to use
|
||||
|
||||
Returns:
|
||||
model: fake quantized ONNXModel
|
||||
"""
|
||||
model = ONNXModel(model)
|
||||
base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
|
||||
new_nodes = []
|
||||
remove_nodes = []
|
||||
total_num = len([i for i in model.nodes() if i.op_type in ["MatMul"]])
|
||||
curr_id = 0
|
||||
for node in model.nodes():
|
||||
if node.op_type in ["MatMul"]:
|
||||
curr_id += 1
|
||||
simple_progress_bar(total_num, curr_id)
|
||||
if (
|
||||
node.op_type in ["MatMul"]
|
||||
and model.get_initializer(node.input[1]) is not None
|
||||
and weight_config.get(node.name, {}) != "fp32"
|
||||
):
|
||||
weight_tensor = model.get_initializer(node.input[1])
|
||||
weight = numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy()
|
||||
if len(weight.shape) != 2:
|
||||
continue
|
||||
|
||||
dtype = weight.dtype
|
||||
|
||||
if node.name in weight_config:
|
||||
num_bits = weight_config[node.name]["bits"]
|
||||
group_size = weight_config[node.name]["group_size"]
|
||||
scheme = weight_config[node.name]["scheme"]
|
||||
|
||||
org_w_shape = weight.shape # ic, oc
|
||||
group_size = group_size if group_size != -1 else org_w_shape[0]
|
||||
|
||||
k_blocks = (org_w_shape[0] - 1) // group_size + 1
|
||||
init_share_num = model.get_initializer_share_num(node.input[1])
|
||||
|
||||
weight = pad_tensor(weight, group_size, k_blocks)
|
||||
|
||||
satisfy_MatMulNBits_condition = num_bits == 4 or num_bits == 8 # noqa: N806
|
||||
|
||||
if satisfy_MatMulNBits_condition: # pragma: no cover
|
||||
if algorithm == "k_quant":
|
||||
q_weight, scale, zp = quant_tensor_k_quant_cuda(weight.T, num_bits, group_size)
|
||||
else:
|
||||
q_weight, scale, zp = quant_tensor(
|
||||
weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)
|
||||
)
|
||||
|
||||
q_matmul_node, new_inits = make_matmul_weight_only_node(
|
||||
node=node,
|
||||
weight_shape=org_w_shape,
|
||||
num_bits=num_bits,
|
||||
group_size=group_size,
|
||||
k_blocks=k_blocks,
|
||||
q_weight=q_weight.astype("uint8"),
|
||||
scale=scale.astype(dtype),
|
||||
zero_point=zp if scheme == "asym" or algorithm == "k_quant" else None,
|
||||
accuracy_level=accuracy_level,
|
||||
)
|
||||
|
||||
model.add_initializers(new_inits)
|
||||
remove_nodes.append(node)
|
||||
new_nodes.append(q_matmul_node)
|
||||
else:
|
||||
q_weight = qdq_tensor(weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1))
|
||||
q_weight = np.reshape(q_weight, (org_w_shape[1], -1))
|
||||
q_weight = np.transpose(q_weight)
|
||||
q_weight = q_weight[: org_w_shape[0], :].astype(dtype)
|
||||
q_weight_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}",
|
||||
data_type=np_dtype_to_tensor_dtype(dtype),
|
||||
dims=weight.shape,
|
||||
vals=q_weight.tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
model.add_initializer(q_weight_tensor)
|
||||
node.input[1] = q_weight_tensor.name
|
||||
if init_share_num == 1:
|
||||
model.remove_initializer(weight_tensor)
|
||||
|
||||
model.add_nodes(new_nodes)
|
||||
model.remove_nodes(remove_nodes)
|
||||
model.topological_sort()
|
||||
return model
|
||||
|
||||
|
||||
def get_weight_scale(weight, group_size):
|
||||
"""Get the scale of weight."""
|
||||
org_shape = weight.shape
|
||||
weight = np.reshape(weight, (-1, group_size)) if group_size != -1 else weight
|
||||
scale = np.mean(np.reshape(np.abs(weight) / np.max(np.abs(weight), axis=1, keepdims=True), org_shape), axis=0)
|
||||
return scale
|
||||
|
||||
|
||||
def prepare_inputs(model, n_samples, dataloader, providers):
|
||||
"""Prepare inputs for weight only quantization.
|
||||
|
||||
Args:
|
||||
model (ModelProto or ONNXModel): onnx model
|
||||
n_samples (int, optional): calibration sample number. -1 means all samples.
|
||||
dataloader (object): dataloader for calibration.
|
||||
providers (list): providers to use
|
||||
|
||||
Returns:
|
||||
inputs: prepared inputs.
|
||||
so: session options
|
||||
"""
|
||||
from importlib.util import find_spec # noqa: PLC0415
|
||||
|
||||
from .util import to_numpy # noqa: PLC0415
|
||||
|
||||
so = ort.SessionOptions()
|
||||
if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover
|
||||
from onnxruntime_extensions import get_library_path # noqa: PLC0415
|
||||
|
||||
so.register_custom_ops_library(get_library_path())
|
||||
if model.is_large_model:
|
||||
onnx.save_model(
|
||||
model.model,
|
||||
model.model_path + "_augment.onnx",
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=True,
|
||||
convert_attribute=False,
|
||||
)
|
||||
|
||||
session = (
|
||||
ort.InferenceSession(model.model.SerializeToString(), so, providers=providers)
|
||||
if not model.is_large_model
|
||||
else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers)
|
||||
)
|
||||
inputs_names = [i.name for i in session.get_inputs()]
|
||||
del session
|
||||
|
||||
inputs = []
|
||||
for i, data in enumerate(dataloader):
|
||||
if n_samples != -1 and ((i + 1) * dataloader.batch_size) > n_samples:
|
||||
break
|
||||
if len(inputs_names) != 1 or isinstance(data[0], dict):
|
||||
assert len(data[0]) == len(inputs_names), (
|
||||
f"Input number mismatch, require {len(inputs_names)} but get {len(data[0])}"
|
||||
)
|
||||
|
||||
if isinstance(data[0], dict):
|
||||
inputs.append(dict([(name, to_numpy(inp_data)) for name, inp_data in data[0].items()])) # noqa: C404
|
||||
elif isinstance(data[0], np.ndarray): # pragma: no cover
|
||||
inputs.append(dict([(name, inp) for name, inp in zip(inputs_names, [data[0]], strict=False)])) # noqa: C404
|
||||
else: # pragma: no cover
|
||||
inputs.append(dict([(name, to_numpy(inp)) for name, inp in zip(inputs_names, data[0], strict=False)])) # noqa: C404
|
||||
return inputs, so
|
||||
|
||||
|
||||
def gptq(
|
||||
W,
|
||||
H,
|
||||
num_bits=4,
|
||||
group_size=32,
|
||||
scheme="asym",
|
||||
blocksize=128,
|
||||
percdamp=0.01,
|
||||
actorder=False,
|
||||
mse=False,
|
||||
perchannel=True,
|
||||
):
|
||||
"""Quant the weight with GPTQ method.
|
||||
|
||||
Args:
|
||||
W (array): weight.
|
||||
H (array): Hessian matrix.
|
||||
num_bits (int, optional): num_bits. Default is 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Default is 32.
|
||||
scheme (str, optional): sym or asym. Defaults to "asym".
|
||||
blocksize (int, optional): blocksize to quantize weight.
|
||||
percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
|
||||
actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
|
||||
mse (bool, optional): whether get scale and zero point with mse error.
|
||||
perchannel (bool, optional): whether quantize weight per-channel.
|
||||
|
||||
Returns:
|
||||
Q: fake quantized weight
|
||||
"""
|
||||
maxq = 2**num_bits - 1
|
||||
grid = 100
|
||||
maxshrink = 0.8
|
||||
norm = 2.4
|
||||
|
||||
def find_params(weight):
|
||||
org_shape = weight.shape
|
||||
# find zp, scale
|
||||
if not perchannel:
|
||||
weight = np.expand_dims(weight.flatten(), axis=1)
|
||||
tmp = np.zeros(weight.shape[1])
|
||||
xmin = np.minimum(np.min(weight, axis=0), tmp)
|
||||
xmax = np.maximum(np.max(weight, axis=0), tmp)
|
||||
if scheme == "sym":
|
||||
xmax = np.maximum(np.abs(xmin), xmax)
|
||||
tmp = xmin < 0
|
||||
if np.any(tmp):
|
||||
xmin[tmp] = -xmax[tmp]
|
||||
tmp = (xmin == 0) & (xmax == 0)
|
||||
xmin[tmp] = -1
|
||||
xmax[tmp] = +1
|
||||
|
||||
scale = (xmax - xmin) / maxq
|
||||
if scheme == "sym":
|
||||
zero = np.ones(scale.shape) * (maxq + 1) / 2
|
||||
else:
|
||||
zero = np.round(-xmin / scale)
|
||||
if mse:
|
||||
best = np.ones([weight.shape[1]]) * float("inf")
|
||||
for i in range(int(maxshrink * grid)):
|
||||
p = 1 - i / grid
|
||||
xmin1 = p * xmin
|
||||
xmax1 = p * xmax
|
||||
scale1 = (xmax1 - xmin1) / maxq
|
||||
zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero
|
||||
q = np.clip(np.round(weight / scale1) + zero1, 0, maxq)
|
||||
q -= weight
|
||||
q = np.power(np.abs(q), norm)
|
||||
err = np.sum(q, 0)
|
||||
tmp = err < best
|
||||
if np.any(tmp):
|
||||
best[tmp] = err[tmp]
|
||||
scale[tmp] = scale1[tmp]
|
||||
zero[tmp] = zero1[tmp]
|
||||
if not perchannel:
|
||||
tmp = org_shape[1]
|
||||
scale = np.repeat(scale, tmp)
|
||||
zero = np.repeat(zero, tmp)
|
||||
shape = [-1] + [1] * (len(org_shape) - 1)
|
||||
scale = np.reshape(scale, shape)
|
||||
zero = np.reshape(zero, shape)
|
||||
return scale, zero
|
||||
|
||||
shape = W.shape
|
||||
scale, zp = find_params(W)
|
||||
dead = np.diag(H) == 0
|
||||
H[dead, dead] = 1
|
||||
W[dead, :] = 0 # such channel makes no contribution to quantization computation
|
||||
|
||||
# rearrange considering the diag's value
|
||||
if actorder:
|
||||
perm = np.argsort(np.diag(H))[::-1]
|
||||
W = W[perm, :] # noqa: N806
|
||||
H = H[perm, :][:, perm] # noqa: N806
|
||||
Losses = np.zeros_like(W) # noqa: N806
|
||||
Q = np.zeros_like(W) # noqa: N806
|
||||
damp = percdamp * np.mean(np.diag(H))
|
||||
diag = np.arange(shape[0])
|
||||
H[diag, diag] += damp # add a average value of
|
||||
H = np.linalg.cholesky(np.linalg.inv(H)).T # noqa: N806
|
||||
Hinv = H # noqa: N806
|
||||
for i1 in range(0, shape[0], blocksize):
|
||||
i2 = min(i1 + blocksize, shape[0])
|
||||
count = i2 - i1
|
||||
|
||||
W1 = copy.deepcopy(W[i1:i2, :]) # noqa: N806
|
||||
Q1 = np.zeros_like(W1) # noqa: N806
|
||||
Err1 = np.zeros_like(W1) # noqa: N806
|
||||
Losses1 = np.zeros_like(W1) # noqa: N806
|
||||
Hinv1 = Hinv[i1:i2, i1:i2] # noqa: N806
|
||||
|
||||
for i in range(count): # within a block, channel wise
|
||||
w = W1[i, :]
|
||||
d = Hinv1[i, i]
|
||||
|
||||
if group_size != -1:
|
||||
if (i1 + i) % group_size == 0:
|
||||
scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :])
|
||||
|
||||
q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten()
|
||||
Q1[i, :] = q
|
||||
Losses1[i, :] = (w - q) ** 2 / d**2
|
||||
|
||||
err1 = (w - q) / d
|
||||
W1[i:, :] -= np.matmul(np.expand_dims(Hinv1[i:, i], axis=1), np.expand_dims(err1, axis=0))
|
||||
Err1[i, :] = err1
|
||||
|
||||
Q[i1:i2, :] = Q1
|
||||
Losses[i1:i2, :] = Losses1 / 2
|
||||
|
||||
W[i2:, :] -= np.matmul(Hinv[i2:, i1:i2], Err1)
|
||||
|
||||
if actorder:
|
||||
invperm = np.argsort(perm)
|
||||
Q = Q[invperm, :] # noqa: N806
|
||||
|
||||
Q = np.reshape(Q, W.shape) # noqa: N806
|
||||
del W
|
||||
return Q
|
||||
|
||||
|
||||
def gptq_quantize(
|
||||
model,
|
||||
dataloader,
|
||||
weight_config={}, # noqa: B006
|
||||
num_bits=4,
|
||||
group_size=32,
|
||||
scheme="asym",
|
||||
n_samples=128,
|
||||
percdamp=0.01,
|
||||
blocksize=128,
|
||||
actorder=False,
|
||||
mse=False,
|
||||
perchannel=True,
|
||||
accuracy_level=0,
|
||||
providers=["CPUExecutionProvider"], # noqa: B006
|
||||
):
|
||||
"""Quant the model with GPTQ method.
|
||||
|
||||
Args:
|
||||
model (ModelProto or ONNXModel): onnx model
|
||||
dataloader (object): dataloader for calibration.
|
||||
weight_config (dict): quantization config
|
||||
For example,
|
||||
weight_config = {
|
||||
'fc2':
|
||||
{
|
||||
'bits': 4,
|
||||
'group_size': 32,
|
||||
'scheme': 'sym',
|
||||
'algorithm': 'GPTQ'
|
||||
}
|
||||
}
|
||||
num_bits (int, optional): num_bits. Default is 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Default is 32.
|
||||
scheme (str, optional): sym or asym. Defaults to "asym".
|
||||
n_samples (int, optional): calibration sample number.
|
||||
percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
|
||||
blocksize (int, optional): blocksize to quantize weight.
|
||||
actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
|
||||
mse (bool, optional): whether get scale and zero point with mse error.
|
||||
perchannel (bool, optional): whether quantize weight per-channel.
|
||||
accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
|
||||
providers (list): providers to use
|
||||
|
||||
Returns:
|
||||
model: fake quantized ONNXModel
|
||||
"""
|
||||
model = ONNXModel(model)
|
||||
base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
|
||||
|
||||
inputs, so = prepare_inputs(model, n_samples, dataloader, providers)
|
||||
del dataloader
|
||||
org_output = copy.deepcopy(model.model.graph.output)
|
||||
model.remove_tensors_from_outputs([i.name for i in org_output])
|
||||
output_names = []
|
||||
for node in model.nodes():
|
||||
if (
|
||||
node.op_type in ["MatMul"]
|
||||
and weight_config.get(node.name, {}) != "fp32"
|
||||
and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ"
|
||||
):
|
||||
output_names.append(node.input[0])
|
||||
output_names = list(set(output_names))
|
||||
model.add_tensors_to_outputs(output_names)
|
||||
if model.is_large_model:
|
||||
onnx.save_model(
|
||||
model.model,
|
||||
model.model_path + "_augment.onnx",
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=True,
|
||||
convert_attribute=False,
|
||||
)
|
||||
|
||||
session = (
|
||||
ort.InferenceSession(model.model.SerializeToString(), so, providers=providers)
|
||||
if not model.is_large_model
|
||||
else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers)
|
||||
)
|
||||
|
||||
for idx, input_name in enumerate(output_names):
|
||||
simple_progress_bar(len(output_names), idx + 1)
|
||||
node_list = []
|
||||
weights = []
|
||||
|
||||
for node in model.input_name_to_nodes[input_name]:
|
||||
if (
|
||||
node.op_type in ["MatMul"]
|
||||
and weight_config.get(node.name, {}) != "fp32"
|
||||
and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ"
|
||||
and model.get_initializer(node.input[1]) is not None
|
||||
):
|
||||
weight = numpy_helper.to_array(
|
||||
model.get_initializer(model.get_node(node.name).input[1]), base_dir
|
||||
).copy()
|
||||
if len(weight.shape) != 2:
|
||||
continue
|
||||
|
||||
weights.append(weight)
|
||||
node_list.append(model.get_node(node.name))
|
||||
|
||||
if len(weights) == 0:
|
||||
continue
|
||||
|
||||
Hs = [np.zeros((i.shape[0], i.shape[0])) for i in weights] # noqa: N806
|
||||
nsamples = 0
|
||||
for data in inputs:
|
||||
inp = session.run([input_name], data)[0]
|
||||
tmp = inp.shape[0]
|
||||
inp = np.reshape(inp, (-1, inp.shape[-1]))
|
||||
Hs = [i * (nsamples / (nsamples + tmp)) for i in Hs] # noqa: N806
|
||||
nsamples += tmp
|
||||
inp = np.sqrt(2 / nsamples) * inp
|
||||
Hs = [i + np.matmul(inp.T, inp) for i in Hs] # noqa: N806
|
||||
|
||||
for (
|
||||
node,
|
||||
weight,
|
||||
H, # noqa: N806
|
||||
) in zip(node_list, weights, Hs, strict=False):
|
||||
if node.name in weight_config:
|
||||
num_bits = weight_config[node.name]["bits"]
|
||||
group_size = weight_config[node.name]["group_size"]
|
||||
scheme = weight_config[node.name]["scheme"]
|
||||
group_size = group_size if group_size != -1 else weight.shape[0]
|
||||
dtype = weight.dtype
|
||||
|
||||
q_weight = gptq(
|
||||
weight,
|
||||
H,
|
||||
num_bits=num_bits,
|
||||
group_size=group_size,
|
||||
scheme=scheme,
|
||||
blocksize=blocksize,
|
||||
percdamp=percdamp,
|
||||
actorder=actorder,
|
||||
mse=mse,
|
||||
perchannel=perchannel,
|
||||
)
|
||||
|
||||
weight_tensor = model.get_initializer(node.input[1])
|
||||
init_share_num = model.get_initializer_share_num(node.input[1])
|
||||
|
||||
satisfy_MatMulNBits_condition = num_bits == 4 # noqa: N806
|
||||
|
||||
if satisfy_MatMulNBits_condition: # pragma: no cover
|
||||
org_shape = weight.shape
|
||||
k_blocks = (org_shape[0] + group_size - 1) // group_size
|
||||
q_weight = pad_tensor(q_weight, group_size, k_blocks)
|
||||
q_weight, scale, zp = quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint")
|
||||
q_matmul_node, new_inits = make_matmul_weight_only_node(
|
||||
node=node,
|
||||
weight_shape=org_shape,
|
||||
num_bits=num_bits,
|
||||
group_size=group_size,
|
||||
k_blocks=k_blocks,
|
||||
q_weight=q_weight.astype("uint8"),
|
||||
scale=scale.astype(dtype),
|
||||
zero_point=zp if scheme == "asym" else None,
|
||||
accuracy_level=accuracy_level,
|
||||
)
|
||||
|
||||
model.add_initializers(new_inits)
|
||||
model.remove_node(node)
|
||||
model.add_node(q_matmul_node)
|
||||
else:
|
||||
q_weight_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}",
|
||||
data_type=np_dtype_to_tensor_dtype(dtype),
|
||||
dims=q_weight.shape,
|
||||
vals=q_weight.astype(dtype).tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
model.add_initializer(q_weight_tensor)
|
||||
node.input[1] = q_weight_tensor.name
|
||||
if init_share_num == 1:
|
||||
model.remove_initializer(weight_tensor)
|
||||
|
||||
model.remove_tensors_from_outputs(output_names)
|
||||
model.model.graph.output.MergeFrom(org_output)
|
||||
|
||||
model.topological_sort()
|
||||
|
||||
# reload external data to prevent external data file path errors
|
||||
if model.is_large_model:
|
||||
from onnx.external_data_helper import load_external_data_for_model # noqa: PLC0415
|
||||
|
||||
load_external_data_for_model(model.model, os.path.split(model.model_path)[0])
|
||||
|
||||
return model
|
||||
@@ -0,0 +1,600 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from pathlib import Path
|
||||
|
||||
import onnx
|
||||
import onnx.helper as onnx_helper
|
||||
import onnx.numpy_helper as onnx_numpy_helper
|
||||
from onnx.onnx_pb import ModelProto
|
||||
|
||||
from .quant_utils import attribute_to_kwarg, find_by_name
|
||||
|
||||
|
||||
def _clean_initializers_helper(graph, model):
|
||||
"""Clean unused initializers from graph.
|
||||
|
||||
Returns:
|
||||
A cleaned graph without unused initializers
|
||||
A list of tensor names that are not produced by this graph or its subgraphs
|
||||
"""
|
||||
requesting_tensor_names = set()
|
||||
requesting_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
|
||||
requesting_tensor_names.update(g_out.name for g_out in graph.output if g_out.name)
|
||||
|
||||
new_nodes = []
|
||||
for node in graph.node:
|
||||
new_node = node
|
||||
graph_attrs = [
|
||||
attr
|
||||
for attr in node.attribute
|
||||
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
|
||||
]
|
||||
if graph_attrs:
|
||||
kwargs = {}
|
||||
for attr in node.attribute:
|
||||
new_attribute = {}
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
(
|
||||
cleaned_sub_graph,
|
||||
sub_requesting_tensor_names,
|
||||
) = _clean_initializers_helper(attr.g, model)
|
||||
new_attribute = {attr.name: cleaned_sub_graph}
|
||||
requesting_tensor_names.update(sub_requesting_tensor_names)
|
||||
elif attr.type == onnx.AttributeProto.GRAPHS:
|
||||
cleaned_graphs = []
|
||||
for subgraph in attr.graphs:
|
||||
(
|
||||
cleaned_sub_graph,
|
||||
sub_requesting_tensor_names,
|
||||
) = _clean_initializers_helper(subgraph, model)
|
||||
cleaned_graphs.append(cleaned_sub_graph)
|
||||
requesting_tensor_names.update(sub_requesting_tensor_names)
|
||||
new_attribute = {attr.name: cleaned_graphs}
|
||||
else:
|
||||
new_attribute = attribute_to_kwarg(attr)
|
||||
kwargs.update(new_attribute)
|
||||
new_node = onnx_helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
|
||||
new_nodes.append(new_node)
|
||||
|
||||
graph.ClearField("node")
|
||||
graph.node.extend(new_nodes)
|
||||
|
||||
requesting_tensor_names.difference_update(output for node in graph.node for output in node.output)
|
||||
|
||||
unused_initializer = []
|
||||
for initializer in graph.initializer:
|
||||
if initializer.name in requesting_tensor_names:
|
||||
requesting_tensor_names.remove(initializer.name)
|
||||
else:
|
||||
# mark it for removal; removing it here directly would cause misbehavior
|
||||
unused_initializer.append(initializer)
|
||||
|
||||
name_to_input = {input.name: input for input in graph.input}
|
||||
for initializer in unused_initializer:
|
||||
graph.initializer.remove(initializer)
|
||||
if initializer.name in name_to_input:
|
||||
try:
|
||||
graph.input.remove(name_to_input[initializer.name])
|
||||
except StopIteration:
|
||||
if model.ir_version < 4:
|
||||
print(f"Warning: invalid weight name {initializer.name} found in the graph (not a graph input)")
|
||||
|
||||
requesting_tensor_names.difference_update(input.name for input in graph.input)
|
||||
|
||||
return graph, requesting_tensor_names
|
||||
|
||||
|
||||
class ONNXModel:
|
||||
def __init__(self, model: ModelProto):
|
||||
self.model = model
|
||||
|
||||
def nodes(self):
|
||||
return self.model.graph.node
|
||||
|
||||
def initializer(self):
|
||||
return self.model.graph.initializer
|
||||
|
||||
def initializer_extend(self, inits):
|
||||
if len(inits) == 0:
|
||||
raise ValueError("Can add an empty list.")
|
||||
for init in self.initializer():
|
||||
self._check_init(init, "gain")
|
||||
for init in inits:
|
||||
self._check_init(init)
|
||||
self.model.graph.initializer.append(init)
|
||||
|
||||
def graph(self):
|
||||
return self.model.graph
|
||||
|
||||
def ir_version(self):
|
||||
return self.model.ir_version
|
||||
|
||||
def opset_import(self):
|
||||
return self.model.opset_import
|
||||
|
||||
def set_opset_import(self, domain, version):
|
||||
for opset in self.model.opset_import:
|
||||
if opset.domain == domain:
|
||||
opset.version = version
|
||||
return
|
||||
|
||||
self.model.opset_import.extend([onnx_helper.make_opsetid(domain, version)])
|
||||
|
||||
def remove_node(self, node):
|
||||
if node in self.model.graph.node:
|
||||
self.model.graph.node.remove(node)
|
||||
|
||||
def remove_nodes(self, nodes_to_remove):
|
||||
for node in nodes_to_remove:
|
||||
self.remove_node(node)
|
||||
|
||||
def add_node(self, node):
|
||||
self.model.graph.node.extend([self._check_node(node)])
|
||||
|
||||
def add_nodes(self, nodes_to_add):
|
||||
for node in nodes_to_add:
|
||||
self.add_node(node)
|
||||
|
||||
def add_initializer(self, tensor):
|
||||
if find_by_name(tensor.name, self.model.graph.initializer) is None:
|
||||
self._check_init(tensor)
|
||||
self.model.graph.initializer.extend([tensor])
|
||||
|
||||
def get_initializer(self, name):
|
||||
for tensor in self.model.graph.initializer:
|
||||
if tensor.name == name:
|
||||
return tensor
|
||||
return None
|
||||
|
||||
def find_graph_input(self, input_name):
|
||||
for input in self.model.graph.input:
|
||||
if input.name == input_name:
|
||||
return input
|
||||
return None
|
||||
|
||||
def find_graph_output(self, output_name):
|
||||
for output in self.model.graph.output:
|
||||
if output.name == output_name:
|
||||
return output
|
||||
return None
|
||||
|
||||
def get_tensor_type(self, tensor_name: str):
|
||||
tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}
|
||||
|
||||
if tensor_name in tensor_type_map:
|
||||
return tensor_type_map[tensor_name].tensor_type
|
||||
|
||||
g_input = self.find_graph_input(tensor_name)
|
||||
if g_input:
|
||||
return g_input.type.tensor_type
|
||||
|
||||
g_output = self.find_graph_output(tensor_name)
|
||||
if g_output:
|
||||
return g_output.type.tensor_type
|
||||
|
||||
return None
|
||||
|
||||
def get_constant_value(self, output_name):
|
||||
for node in self.model.graph.node:
|
||||
if node.op_type == "Constant":
|
||||
if node.output[0] == output_name:
|
||||
for attr in node.attribute:
|
||||
if attr.name == "value":
|
||||
return onnx_numpy_helper.to_array(attr.t)
|
||||
|
||||
# Fallback to initializer since constant folding may have been applied.
|
||||
initializer = self.get_initializer(output_name)
|
||||
if initializer is not None:
|
||||
return onnx_numpy_helper.to_array(initializer)
|
||||
|
||||
return None
|
||||
|
||||
def get_initializer_name_set(self):
|
||||
return {initializer.name for initializer in self.model.graph.initializer}
|
||||
|
||||
def remove_initializer(self, tensor):
|
||||
if tensor in self.model.graph.initializer:
|
||||
self.model.graph.initializer.remove(tensor)
|
||||
for input in self.model.graph.input:
|
||||
if input.name == tensor.name:
|
||||
self.model.graph.input.remove(input)
|
||||
break
|
||||
|
||||
def remove_initializers(self, init_to_remove):
|
||||
for initializer in init_to_remove:
|
||||
self.remove_initializer(initializer)
|
||||
|
||||
def get_non_initializer_inputs(self):
|
||||
initializer_names = self.get_initializer_name_set()
|
||||
non_initializer_inputs = set()
|
||||
for input in self.model.graph.input:
|
||||
if input.name not in initializer_names:
|
||||
non_initializer_inputs.add(input.name)
|
||||
return non_initializer_inputs
|
||||
|
||||
def input_name_to_nodes(self):
|
||||
input_name_to_nodes = {}
|
||||
for node in self.model.graph.node:
|
||||
for input_name in node.input:
|
||||
if input_name: # Could be empty when it is optional
|
||||
if input_name not in input_name_to_nodes:
|
||||
input_name_to_nodes[input_name] = [node]
|
||||
else:
|
||||
input_name_to_nodes[input_name].append(node)
|
||||
return input_name_to_nodes
|
||||
|
||||
def output_name_to_node(self):
|
||||
output_name_to_node = {}
|
||||
for node in self.model.graph.node:
|
||||
for output_name in node.output:
|
||||
if output_name: # Could be empty when it is optional
|
||||
output_name_to_node[output_name] = node
|
||||
return output_name_to_node
|
||||
|
||||
def get_children(self, node, input_name_to_nodes=None):
|
||||
if input_name_to_nodes is None:
|
||||
input_name_to_nodes = self.input_name_to_nodes()
|
||||
|
||||
children = []
|
||||
for output in node.output:
|
||||
if output in input_name_to_nodes:
|
||||
for node in input_name_to_nodes[output]:
|
||||
children.append(node) # noqa: PERF402
|
||||
return children
|
||||
|
||||
def get_parents(self, node, output_name_to_node=None):
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.output_name_to_node()
|
||||
|
||||
parents = []
|
||||
for input in node.input:
|
||||
if input in output_name_to_node:
|
||||
parents.append(output_name_to_node[input])
|
||||
return parents
|
||||
|
||||
def get_parent(self, node, idx, output_name_to_node=None):
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.output_name_to_node()
|
||||
|
||||
if len(node.input) <= idx:
|
||||
return None
|
||||
|
||||
input = node.input[idx]
|
||||
if input not in output_name_to_node:
|
||||
return None
|
||||
|
||||
return output_name_to_node[input]
|
||||
|
||||
def find_node_by_name(self, node_name, new_nodes_list, graph):
|
||||
"""Find out if a node exists in a graph or a node is in the
|
||||
new set of nodes created during quantization.
|
||||
|
||||
Returns:
|
||||
The node found or None.
|
||||
"""
|
||||
graph_nodes_list = list(graph.node)  # copy of the node list (the nodes themselves are not copied)
|
||||
graph_nodes_list.extend(new_nodes_list)
|
||||
node = find_by_name(node_name, graph_nodes_list)
|
||||
return node
|
||||
|
||||
def get_largest_node_name_suffix(self, node_name_prefix):
|
||||
"""
|
||||
Gets the largest node name (int) suffix for all node names that begin with `node_name_prefix`.
|
||||
Example: for nodes my_prefix_0 and my_prefix_3, this method returns 3.
|
||||
"""
|
||||
suffix = -1
|
||||
|
||||
for node in self.model.graph.node:
|
||||
if node.name and node.name.startswith(node_name_prefix):
|
||||
try:
|
||||
index = int(node.name[len(node_name_prefix) :])
|
||||
suffix = max(index, suffix)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return suffix
|
||||
|
||||
def get_largest_initializer_name_suffix(self, initializer_name_prefix):
|
||||
"""
|
||||
Gets the largest initializer name integer suffix for all initializer names that begin
|
||||
with `initializer_name_prefix`. This can be used to create unique initializer names.
|
||||
|
||||
Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if
|
||||
`initializer_name_prefix` is 'my_weight_'.
|
||||
"""
|
||||
suffix = -1
|
||||
|
||||
for initializer in self.model.graph.initializer:
|
||||
if initializer.name.startswith(initializer_name_prefix):
|
||||
try:
|
||||
index = int(initializer.name[len(initializer_name_prefix) :])
|
||||
suffix = max(index, suffix)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return suffix
|
||||
|
||||
def find_nodes_by_initializer(self, graph, initializer):
|
||||
"""
|
||||
Find all nodes with given initializer as an input.
|
||||
"""
|
||||
nodes = []
|
||||
for node in graph.node:
|
||||
for node_input in node.input:
|
||||
if node_input == initializer.name:
|
||||
nodes.append(node)
|
||||
return nodes
|
||||
|
||||
@staticmethod
|
||||
def __get_initializer(name, graph_path):
|
||||
for gid in range(len(graph_path) - 1, -1, -1):
|
||||
graph = graph_path[gid]
|
||||
for tensor in graph.initializer:
|
||||
if tensor.name == name:
|
||||
return tensor, graph
|
||||
return None, None
|
||||
|
||||
@staticmethod
|
||||
def __replace_gemm_with_matmul(graph_path):
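# Recursively rewrites Gemm nodes with alpha == 1.0, beta == 1.0 and transA == 0 into
# MatMul (plus an Add for the bias when present). If transB == 1, the B initializer is
# transposed in place when possible; otherwise a Transpose node is inserted. Attribute
# types 5 and 10 are GRAPH and GRAPHS subgraph attributes, which are processed recursively.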
|
||||
new_nodes = []
|
||||
graph = graph_path[-1]
|
||||
for node in graph.node:
|
||||
graph_attrs = [attr for attr in node.attribute if attr.type == 5 or attr.type == 10]
|
||||
if graph_attrs:
|
||||
kwargs = {}
|
||||
for attr in node.attribute:
|
||||
if attr.type == 5:
|
||||
graph_path.append(attr.g)
|
||||
kv = {attr.name: ONNXModel.__replace_gemm_with_matmul(graph_path)}
|
||||
elif attr.type == 10:
|
||||
value = []
|
||||
for subgraph in attr.graphs:
|
||||
graph_path.append(subgraph)
|
||||
value.extend([ONNXModel.__replace_gemm_with_matmul(graph_path)])
|
||||
kv = {attr.name: value}
|
||||
else:
|
||||
kv = attribute_to_kwarg(attr)
|
||||
kwargs.update(kv)
|
||||
node = onnx_helper.make_node( # noqa: PLW2901
|
||||
node.op_type, node.input, node.output, name=node.name, **kwargs
|
||||
)
|
||||
|
||||
if node.op_type == "Gemm":
|
||||
alpha = 1.0
|
||||
beta = 1.0
|
||||
transA = 0 # noqa: N806
|
||||
transB = 0 # noqa: N806
|
||||
for attr in node.attribute:
|
||||
if attr.name == "alpha":
|
||||
alpha = onnx_helper.get_attribute_value(attr)
|
||||
elif attr.name == "beta":
|
||||
beta = onnx_helper.get_attribute_value(attr)
|
||||
elif attr.name == "transA":
|
||||
transA = onnx_helper.get_attribute_value(attr) # noqa: N806
|
||||
elif attr.name == "transB":
|
||||
transB = onnx_helper.get_attribute_value(attr) # noqa: N806
|
||||
if alpha == 1.0 and beta == 1.0 and transA == 0:
|
||||
inputB = node.input[1] # noqa: N806
|
||||
if transB == 1:
|
||||
B, Bs_graph = ONNXModel.__get_initializer(node.input[1], graph_path) # noqa: N806
|
||||
if B:
|
||||
# assume B is not used by any other node
|
||||
B_array = onnx_numpy_helper.to_array(B) # noqa: N806
|
||||
B_trans = onnx_numpy_helper.from_array(B_array.T) # noqa: N806
|
||||
B_trans.name = B.name
|
||||
Bs_graph.initializer.remove(B)
|
||||
for input in Bs_graph.input:
|
||||
if input.name == inputB:
|
||||
Bs_graph.input.remove(input)
|
||||
break
|
||||
Bs_graph.initializer.extend([B_trans])
|
||||
else:
|
||||
inputB += "_Transposed" # noqa: N806
|
||||
transpose_node = onnx_helper.make_node(
|
||||
"Transpose",
|
||||
inputs=[node.input[1]],
|
||||
outputs=[inputB],
|
||||
name=node.name + "_Transpose" if node.name else "",
|
||||
)
|
||||
new_nodes.append(transpose_node)
|
||||
|
||||
matmul_node = onnx_helper.make_node(
|
||||
"MatMul",
|
||||
inputs=[node.input[0], inputB],
|
||||
outputs=[node.output[0] + ("_MatMul" if len(node.input) > 2 else "")],
|
||||
name=node.name + "_MatMul" if node.name else "",
|
||||
)
|
||||
new_nodes.append(matmul_node)
|
||||
|
||||
if len(node.input) > 2:
|
||||
add_node = onnx_helper.make_node(
|
||||
"Add",
|
||||
inputs=[node.output[0] + "_MatMul", node.input[2]],
|
||||
outputs=node.output,
|
||||
name=node.name + "_Add" if node.name else "",
|
||||
)
|
||||
new_nodes.append(add_node)
|
||||
|
||||
# unsupported
|
||||
else:
|
||||
new_nodes.append(node)
|
||||
|
||||
# not GEMM
|
||||
else:
|
||||
new_nodes.append(node)
|
||||
|
||||
graph.ClearField("node")
|
||||
graph.node.extend(new_nodes)
|
||||
graph_path.pop()
|
||||
return graph
|
||||
|
||||
def replace_gemm_with_matmul(self):
|
||||
graph_path = [self.graph()]
|
||||
ONNXModel.__replace_gemm_with_matmul(graph_path)
|
||||
|
||||
def save_model_to_file(self, output_path, use_external_data_format=False):
|
||||
"""
|
||||
Save the model to a file, optionally using the external data format, which is needed when the model size exceeds 2GB.
|
||||
"""
|
||||
self.topological_sort()
|
||||
if use_external_data_format:
|
||||
onnx.external_data_helper.convert_model_to_external_data(
|
||||
self.model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=Path(output_path).name + ".data",
|
||||
convert_attribute=True,
|
||||
)
|
||||
for init in self.model.graph.initializer:
|
||||
self._check_init(init, "end")
|
||||
onnx.save_model(self.model, output_path)
|
||||
|
||||
@staticmethod
|
||||
def replace_node_input(node, old_input_name, new_input_name):
|
||||
assert isinstance(old_input_name, str) and isinstance(new_input_name, str)
|
||||
for j in range(len(node.input)):
|
||||
if node.input[j] == old_input_name:
|
||||
node.input[j] = new_input_name
|
||||
|
||||
def replace_input_of_all_nodes(self, old_input_name, new_input_name):
|
||||
for node in self.model.graph.node:
|
||||
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
|
||||
|
||||
def replace_input_of_nodes(self, old_input_name, new_input_name, node_names_set):
|
||||
for node in self.model.graph.node:
|
||||
if node.name in node_names_set:
|
||||
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
|
||||
|
||||
@staticmethod
|
||||
def replace_node_output(node, old_output_name, new_output_name):
|
||||
assert isinstance(old_output_name, str) and isinstance(new_output_name, str)
|
||||
for j in range(len(node.output)):
|
||||
if node.output[j] == old_output_name:
|
||||
node.output[j] = new_output_name
|
||||
|
||||
def replace_output_of_all_nodes(self, old_output_name, new_output_name):
|
||||
for node in self.model.graph.node:
|
||||
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
|
||||
|
||||
def replace_output_of_nodes(self, old_output_name, new_output_name, node_names_set):
|
||||
for node in self.model.graph.node:
|
||||
if node.name in node_names_set:
|
||||
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
|
||||
|
||||
def remove_unused_constant(self):
|
||||
input_name_to_nodes = self.input_name_to_nodes()
|
||||
|
||||
# remove unused constant
|
||||
unused_nodes = []
|
||||
nodes = self.nodes()
|
||||
for node in nodes:
|
||||
if (
|
||||
node.op_type == "Constant"
|
||||
and not self.is_graph_output(node.output[0])
|
||||
and node.output[0] not in input_name_to_nodes
|
||||
):
|
||||
unused_nodes.append(node)
|
||||
|
||||
self.remove_nodes(unused_nodes)
|
||||
|
||||
unused_weights = []
|
||||
for w in self.initializer():
|
||||
if w.name not in input_name_to_nodes and not self.is_graph_output(w.name):
|
||||
unused_weights.append(w)
|
||||
# Remove from graph.input
|
||||
for graph_input in self.graph().input:
|
||||
if graph_input.name == w.name:
|
||||
self.graph().input.remove(graph_input)
|
||||
|
||||
self.remove_initializers(unused_weights)
|
||||
|
||||
def is_graph_output(self, output_name):
|
||||
return any(output.name == output_name for output in self.model.graph.output)
|
||||
|
||||
def is_graph_input(self, tensor_name: str) -> bool:
|
||||
return any(input.name == tensor_name for input in self.model.graph.input)
|
||||
|
||||
# TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model
|
||||
# Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release
|
||||
def topological_sort(self):
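# Kahn-style topological sort: count unresolved inputs per node, resolve initializers
# and graph inputs first, then repeatedly emit nodes whose dependency count drops to zero.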
|
||||
deps_count = [0] * len(self.nodes()) # dependency count of each node
|
||||
deps_to_nodes = {} # input name to node indices
|
||||
sorted_nodes = [] # initialize sorted_nodes
|
||||
for node_idx, node in enumerate(self.nodes()):
|
||||
# CANNOT use len(node.input) directly because input can be optional
|
||||
deps_count[node_idx] = sum(1 for _ in node.input if _)
|
||||
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
|
||||
sorted_nodes.append(self.nodes()[node_idx])
|
||||
continue
|
||||
|
||||
for input_name in node.input:
|
||||
if not input_name:
|
||||
continue
|
||||
if input_name not in deps_to_nodes:
|
||||
deps_to_nodes[input_name] = [node_idx]
|
||||
else:
|
||||
deps_to_nodes[input_name].append(node_idx)
|
||||
|
||||
initializer_names = [init.name for init in self.initializer()]
|
||||
graph_input_names = [input.name for input in self.model.graph.input]
|
||||
input_names = initializer_names + graph_input_names
|
||||
input_names.sort()
|
||||
prev_input_name = None
|
||||
for input_name in input_names:
|
||||
if prev_input_name == input_name:
|
||||
continue
|
||||
|
||||
prev_input_name = input_name
|
||||
if input_name in deps_to_nodes:
|
||||
for node_idx in deps_to_nodes[input_name]:
|
||||
deps_count[node_idx] = deps_count[node_idx] - 1
|
||||
if deps_count[node_idx] == 0:
|
||||
sorted_nodes.append(self.nodes()[node_idx])
|
||||
|
||||
start = 0
|
||||
end = len(sorted_nodes)
|
||||
|
||||
while start < end:
|
||||
for output in sorted_nodes[start].output:
|
||||
if output in deps_to_nodes:
|
||||
for node_idx in deps_to_nodes[output]:
|
||||
deps_count[node_idx] = deps_count[node_idx] - 1
|
||||
if deps_count[node_idx] == 0:
|
||||
sorted_nodes.append(self.nodes()[node_idx])
|
||||
end = end + 1
|
||||
start = start + 1
|
||||
|
||||
assert end == len(self.graph().node), "Graph is not a DAG"
|
||||
self.graph().ClearField("node")
|
||||
self.graph().node.extend(sorted_nodes)
|
||||
|
||||
def clean_initializers(self):
|
||||
return _clean_initializers_helper(self.graph(), self.model)
|
||||
|
||||
def _check_init(self, init, test=None):
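# In FLOAT8E4M3FN a byte whose low 7 bits are all set (0x7F) encodes NaN, so reject
# initializers whose raw data contains such values.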
|
||||
if init.data_type == onnx.TensorProto.FLOAT8E4M3FN:
|
||||
if init.HasField("raw_data"):
|
||||
b = list(init.raw_data)
|
||||
if any((i & 127) == 127 for i in b):
|
||||
raise ValueError(f"Initializer {init.name!r} has nan.")
|
||||
return init
|
||||
|
||||
def _check_node(self, node):
|
||||
"""
|
||||
Quantization to float 8 does not use a quantized bias but a float 16 bias.
This function checks that DequantizeLinear is not used to dequantize
from a floating-point type (float 16/32/64 or bfloat 16).
|
||||
"""
|
||||
if node.op_type == "DequantizeLinear":
|
||||
zero_point = node.input[2]
|
||||
init = self.get_initializer(zero_point)
|
||||
dtype = init.data_type
|
||||
if dtype in {
|
||||
onnx.TensorProto.FLOAT16,
|
||||
onnx.TensorProto.FLOAT,
|
||||
onnx.TensorProto.DOUBLE,
|
||||
onnx.TensorProto.BFLOAT16,
|
||||
}:
|
||||
raise RuntimeError(f"Unsupported DequantizeLinear operator, dequantization from {dtype}.")
|
||||
return node
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
|
||||
# from .base_operator import QuantOperatorBase
|
||||
# from .matmul import MatMulInteger
|
||||
@@ -0,0 +1,119 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QLinearActivation(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def QuantizeClipRelu(self): # noqa: N802
|
||||
node = self.node
|
||||
assert node.op_type == "Relu" or node.op_type == "Clip"
|
||||
|
||||
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
|
||||
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
|
||||
# If input to this node is not quantized then keep this node
|
||||
# If the activation is symmetric, do not quantize the op and simply return
|
||||
if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
|
||||
return super().quantize()
|
||||
|
||||
quantized_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_value
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
if node.op_type == "Relu" or node.op_type == "Clip":
|
||||
self.QuantizeClipRelu()
|
||||
return
|
||||
|
||||
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
|
||||
sigmoid_nnapi_mode = (
|
||||
node.op_type == "Sigmoid"
|
||||
and nnapi_sigmoid_option in self.quantizer.extra_options
|
||||
and self.quantizer.extra_options[nnapi_sigmoid_option]
|
||||
)
|
||||
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
|
||||
use_zeropoint = 0 if sigmoid_nnapi_mode else None
|
||||
|
||||
# No assert on op_type as it is controlled by registry
|
||||
# only try to quantize when given quantization parameters for it
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_activation_name = ""
|
||||
if node.name:
|
||||
qlinear_activation_name = node.name + "_quant"
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qlinear_activation_inputs = [
|
||||
quantized_input_names[0],
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
]
|
||||
|
||||
qlinear_activation_node = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
qlinear_activation_inputs,
|
||||
[qlinear_activation_output],
|
||||
qlinear_activation_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_activation_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
nodes.append(qlinear_activation_node)
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQRemovableActivation(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
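# In QDQ mode a removable activation (e.g. Relu/Clip) whose input is already quantized
# can be bypassed entirely by merging its output with the upstream producer; otherwise
# its input (and output) tensors are quantized as usual.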
|
||||
|
||||
# If input to this node is not quantized then keep this node
|
||||
if not self.quantizer.is_tensor_quantized(node.input[0]):
|
||||
return
|
||||
|
||||
if (
|
||||
not self.quantizer.is_activation_symmetric
|
||||
and not self.quantizer.qdq_keep_removable_activations
|
||||
and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
|
||||
):
|
||||
self.quantizer.remove_node(self.node)
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_activation_tensor(node.output[0])
|
||||
@@ -0,0 +1,18 @@
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
# Use the quantized tensor as input without DQ.
|
||||
class QArgMax(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
|
||||
if quantized_input_value is None:
|
||||
self.quantizer.new_nodes += [node]
|
||||
return
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
self.quantizer.new_nodes += [node]
|
||||
@@ -0,0 +1,73 @@
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto # noqa: F401
|
||||
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantize Attention
|
||||
"""
|
||||
|
||||
|
||||
class AttentionQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
return self.quantizer.should_quantize_node(self.node)
|
||||
|
||||
def quantize(self):
|
||||
"""
|
||||
Converts this Attention node into a QAttention node (with quantized activation and
weight inputs) and appends the new nodes to the quantizer's new node list.
|
||||
"""
|
||||
node = self.node
|
||||
assert node.op_type == "Attention"
|
||||
|
||||
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
|
||||
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
|
||||
# is implemented
|
||||
for attr in node.attribute:
|
||||
if attr.name == "qkv_hidden_sizes":
|
||||
return super().quantize()
|
||||
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qattention_name = "" if not node.name else node.name + "_quant"
|
||||
|
||||
inputs = []
|
||||
inputs.extend(quantized_input_names)
|
||||
inputs.extend([node.input[2]])
|
||||
inputs.extend(scale_names)
|
||||
inputs.extend([node.input[3] if len(node.input) > 3 else ""])
|
||||
inputs.extend(zero_point_names)
|
||||
inputs.extend([node.input[4] if len(node.input) > 4 else ""])
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
|
||||
nodes.append(qattention_node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,26 @@
|
||||
class QuantOperatorBase:
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
self.quantizer = onnx_quantizer
|
||||
self.node = onnx_node
|
||||
|
||||
def should_quantize(self):
|
||||
if not self.quantizer.should_quantize_node(self.node):
|
||||
return False
|
||||
|
||||
return self.quantizer.is_float_tensor(self.node.input[0])
|
||||
|
||||
def quantize(self):
|
||||
"""
|
||||
Given a node which does not support quantization, this method checks whether any of its
inputs is quantized and, if so, adds a DequantizeLinear node to bring that input back to FP32.
The dequantize nodes (if any) and the original node are appended to the quantizer's new node list.
|
||||
"""
|
||||
for _, node_input in enumerate(self.node.input):
|
||||
dequantize_node = self.quantizer._dequantize_value(node_input)
|
||||
if dequantize_node is not None:
|
||||
self.quantizer.new_nodes.append(dequantize_node)
|
||||
|
||||
# Append the original node
|
||||
self.quantizer.new_nodes.append(self.node)
|
||||
@@ -0,0 +1,72 @@
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto # noqa: F401
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QLinearBinaryOp(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0, 1])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_binary_math_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qlinear_binary_math_inputs = []
|
||||
# Input 0
|
||||
qlinear_binary_math_inputs.append(quantized_input_names[0])
|
||||
qlinear_binary_math_inputs.append(scale_names[0])
|
||||
qlinear_binary_math_inputs.append(zero_point_names[0])
|
||||
# Input 1
|
||||
qlinear_binary_math_inputs.append(quantized_input_names[1])
|
||||
qlinear_binary_math_inputs.append(scale_names[1])
|
||||
qlinear_binary_math_inputs.append(zero_point_names[1])
|
||||
|
||||
# Output
|
||||
qlinear_binary_math_inputs.append(output_scale_name)
|
||||
qlinear_binary_math_inputs.append(output_zp_name)
|
||||
|
||||
qlinear_binary_math_node = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
qlinear_binary_math_inputs,
|
||||
[qlinear_binary_math_output],
|
||||
qlinear_binary_math_name,
|
||||
**kwargs,
|
||||
)
|
||||
nodes.append(qlinear_binary_math_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_binary_math_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,62 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import ( # noqa: F401
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
ms_domain,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase # noqa: F401
|
||||
|
||||
|
||||
class QLinearConcat(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
q_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
|
||||
if not data_found or q_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
quantized_input_value.value_type,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
qnode_name = node.name + "_quant" if node.name else ""
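# QLinearConcat inputs: the output scale and zero point come first, followed by a
# (quantized input, scale, zero point) triplet for every original input.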
|
||||
|
||||
qlconcat_inputs = [output_scale_name, output_zp_name]
|
||||
for i in range(len(q_input_names)):
|
||||
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
|
||||
qlconcat_node = onnx.helper.make_node(
|
||||
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
|
||||
)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
self.quantizer.new_nodes += [qlconcat_node]
|
||||
@@ -0,0 +1,260 @@
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import (
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
find_by_name,
|
||||
get_mul_node,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class ConvInteger(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def add_bias(self, nodes, scaled_output):
|
||||
"""
|
||||
Handles the bias add for the current Conv node by appending a "Reshape" node on the bias
and an "Add" node that writes to the original Conv output.
parameter nodes: list that the new Reshape/Add nodes are appended to
parameter scaled_output: output of the quantized conv without bias
|
||||
"""
|
||||
node = self.node
|
||||
model = self.quantizer.model
|
||||
# Add tensors for the shape to be reshaped to
|
||||
weight = find_by_name(node.input[1], model.initializer())
|
||||
if weight is None:
|
||||
raise ValueError(f"Expected {node.input[1]} to be an initializer")
|
||||
|
||||
# Add reshape for correct broadcast
|
||||
output = node.output[0]
|
||||
reshape_input_data = node.input[2] # bias of Conv
|
||||
reshape_input_shape = output + "_bias_reshape_shape"
|
||||
reshape_output = output + "_bias_reshape_output"
|
||||
|
||||
shape = np.ones((len(weight.dims)), dtype=np.int64)
|
||||
shape[1] = -1
|
||||
init_shape = onnx.helper.make_tensor(
|
||||
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
|
||||
)
|
||||
model.add_initializer(init_shape)
|
||||
|
||||
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
|
||||
nodes.append(reshape_node)
|
||||
|
||||
# Add an Add operation for bias
|
||||
add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
|
||||
nodes.append(add_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Conv"
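# ConvInteger path: run the integer convolution, cast the int32 result to float, multiply
# by (input_scale * weight_scale) to dequantize, then add the float bias if present.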
|
||||
# Get quantized tensors for both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
conv_integer_output = node.output[0] + "_output_quantized"
|
||||
conv_integer_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
conv_integer_node = onnx.helper.make_node(
|
||||
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
|
||||
)
|
||||
nodes.append(conv_integer_node)
|
||||
|
||||
# Add cast operation to cast convInteger output to float.
|
||||
onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
|
||||
cast_op_output = conv_integer_output + "_cast_output"
|
||||
cast_node = onnx.helper.make_node(
|
||||
"Cast",
|
||||
[conv_integer_output],
|
||||
[cast_op_output],
|
||||
conv_integer_output + "_cast",
|
||||
to=onnx_type, # TODO: FLOAT or FLOAT16
|
||||
)
|
||||
nodes.append(cast_node)
|
||||
|
||||
# Add mul operation to multiply scales of two inputs.
|
||||
assert len(scale_names) == 2
|
||||
if conv_integer_name:
|
||||
scales_mul_op = conv_integer_name + "_scales_mul"
|
||||
else:
|
||||
scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
|
||||
|
||||
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
|
||||
if scales_mul_node is None:
|
||||
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
|
||||
nodes.append(scales_mul_node)
|
||||
|
||||
scales_mul_op_output = scales_mul_node.output[0]
|
||||
|
||||
has_bias = len(node.input) == 3
|
||||
scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"
|
||||
|
||||
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
|
||||
# and make the output of this node the same as output of original conv node.
|
||||
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
|
||||
nodes.append(
|
||||
get_mul_node(
|
||||
[cast_op_output, scales_mul_op_output],
|
||||
scaled_output_name,
|
||||
output_scale_mul_op,
|
||||
)
|
||||
)
|
||||
|
||||
if has_bias:
|
||||
self.add_bias(nodes, scaled_output_name)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QLinearConv(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Conv"
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1],
|
||||
onnx_proto.TensorProto.INT8,
|
||||
0, # self.quantizer.weight_qType?
|
||||
)
|
||||
quantized_input_names.append(quant_weight_tuple[0])
|
||||
zero_point_names.append(quant_weight_tuple[1])
|
||||
scale_names.append(quant_weight_tuple[2])
|
||||
else:
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
quantized_bias_name = ""
|
||||
bias_present = False
|
||||
if len(node.input) == 3:
|
||||
if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
|
||||
raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
|
||||
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
|
||||
bias_present = True
|
||||
|
||||
qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_conv_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
qlinear_conv_inputs = []
|
||||
# Input 0
|
||||
qlinear_conv_inputs.append(quantized_input_names[0])
|
||||
qlinear_conv_inputs.append(scale_names[0])
|
||||
qlinear_conv_inputs.append(zero_point_names[0])
|
||||
# Input 1
|
||||
qlinear_conv_inputs.append(quantized_input_names[1])
|
||||
qlinear_conv_inputs.append(scale_names[1])
|
||||
qlinear_conv_inputs.append(zero_point_names[1])
|
||||
|
||||
# Output
|
||||
qlinear_conv_inputs.append(output_scale_name)
|
||||
qlinear_conv_inputs.append(output_zp_name)
|
||||
|
||||
if bias_present:
|
||||
qlinear_conv_inputs.append(quantized_bias_name)
|
||||
|
||||
qlinear_conv_node = onnx.helper.make_node(
|
||||
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
|
||||
)
|
||||
nodes.append(qlinear_conv_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_conv_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQConv(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Conv" or node.op_type == "ConvTranspose"
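# QDQ flow: insert Q/DQ pairs around the activation input/output, quantize the weight
# per-channel when supported (axis 0 for Conv, 1 for ConvTranspose), and quantize the bias.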
|
||||
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_activation_tensor(node.output[0])
|
||||
|
||||
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
|
||||
node.input[1], default_axis=0 if node.op_type == "Conv" else 1
|
||||
)
|
||||
if is_weight_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
|
||||
else:
|
||||
self.quantizer.quantize_weight_tensor(node.input[1])
|
||||
|
||||
if len(node.input) == 3:
|
||||
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
|
||||
@@ -0,0 +1,78 @@
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
# For operators that support 8-bit operations directly and whose output can reuse
# input[0]'s type, zero point and scale; for example, Transpose, Reshape, etc.
|
||||
class Direct8BitOp(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
if not self.quantizer.force_quantize_no_input_check:
|
||||
# Keep backward compatibility
|
||||
# Quantize when input[0] is quantized already. Otherwise keep it.
|
||||
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
|
||||
if quantized_input_value is None:
|
||||
self.quantizer.new_nodes += [node]
|
||||
return
|
||||
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
quantized_input_value.value_type,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
node.output[0] = quantized_output_value.q_name
|
||||
self.quantizer.new_nodes += [node]
|
||||
|
||||
else:
|
||||
# Force-quantize these ops if possible; use the exclude-node list if this is not what you want
|
||||
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_names[0]
|
||||
node.output[0] = quantized_output_value.q_name
|
||||
nodes.append(node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQDirect8BitOp(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
if self.quantizer.force_quantize_no_input_check:
|
||||
self.quantizer.quantize_activation_tensor(self.node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
|
||||
elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
|
||||
@@ -0,0 +1,121 @@
|
||||
import logging
|
||||
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto # noqa: F401
|
||||
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
|
||||
|
||||
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
|
||||
weight inputs associated with the node to uint8.
|
||||
"""
|
||||
|
||||
|
||||
class EmbedLayerNormalizationQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
return self.quantizer.should_quantize_node(self.node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "EmbedLayerNormalization"
|
||||
|
||||
if len(node.output) > 2:
|
||||
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
|
||||
return super().quantize()
|
||||
|
||||
"""
|
||||
Pre-quantization EmbedLayerNorm inputs:
|
||||
[0] input_ids (int32)
|
||||
[1] segment_ids (int32)
|
||||
[2] word_embedding (float32)
|
||||
[3] position_embedding (float32)
|
||||
[4] segment_embedding (float32)
|
||||
[5] gamma (float32)
|
||||
[6] beta (float32)
|
||||
[7] mask (int32) (optional)
|
||||
"""
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qembed_layer_norm_name = "" if not node.name else node.name + "_quant"
|
||||
|
||||
"""
|
||||
Quantized Input Tensor List
|
||||
[0] input_ids (int32)
|
||||
[1] segment_ids (int32)
|
||||
[2] word_embedding (uint8)
|
||||
[3] position_embedding (uint8)
|
||||
[4] segment_embedding (uint8)
|
||||
[5] gamma (uint8)
|
||||
[6] beta (uint8)
|
||||
[7] mask (int32) (optional)
|
||||
[8] word_embedding_scale (float)
|
||||
[9] position_embedding_scale (float)
|
||||
[10] segment_embedding_scale (float)
|
||||
[11] gamma_scale (float)
|
||||
[12] beta_scale (float)
|
||||
[13] word_embedding_zero_point (uint8)
|
||||
[14] position_embedding_zero_point (uint8)
|
||||
[15] segment_embedding_zero_point (uint8)
|
||||
[16] gamma_zero_point (uint8)
|
||||
[17] beta_zero_point (uint8)
|
||||
"""
|
||||
inputs = []
|
||||
# 'input_ids'
|
||||
inputs.extend([node.input[0]])
|
||||
# 'segment_ids'
|
||||
inputs.extend([node.input[1]])
|
||||
# 'word_embedding_quant'
|
||||
inputs.extend([quantized_input_names[0]])
|
||||
# 'position_embedding_quant'
|
||||
inputs.extend([quantized_input_names[1]])
|
||||
# 'segment_embedding_quant'
|
||||
inputs.extend([quantized_input_names[2]])
|
||||
# 'gamma_quant'
|
||||
inputs.extend([quantized_input_names[3]])
|
||||
# 'beta_quant'
|
||||
inputs.extend([quantized_input_names[4]])
|
||||
# 'mask' (optional)
|
||||
inputs.extend([node.input[7] if len(node.input) > 7 else ""])
|
||||
|
||||
# Add all scales:
|
||||
inputs.extend([scale_names[0]])
|
||||
inputs.extend([scale_names[1]])
|
||||
inputs.extend([scale_names[2]])
|
||||
inputs.extend([scale_names[3]])
|
||||
inputs.extend([scale_names[4]])
|
||||
|
||||
# Add all zero points:
|
||||
inputs.extend([zero_point_names[0]])
|
||||
inputs.extend([zero_point_names[1]])
|
||||
inputs.extend([zero_point_names[2]])
|
||||
inputs.extend([zero_point_names[3]])
|
||||
inputs.extend([zero_point_names[4]])
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qembed_layer_norm_node = onnx.helper.make_node(
|
||||
"QEmbedLayerNormalization",
|
||||
inputs,
|
||||
node.output,
|
||||
qembed_layer_norm_name,
|
||||
**kwargs,
|
||||
)
|
||||
nodes.append(qembed_layer_norm_node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,64 @@
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
"""
|
||||
Quantize Gather
|
||||
"""
|
||||
|
||||
|
||||
class GatherQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
if not self.quantizer.should_quantize_node(self.node):
|
||||
return False
|
||||
|
||||
return self.quantizer.is_valid_quantize_weight(self.node.input[0])
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gather"
|
||||
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
gather_new_output,
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
node.output[0] = gather_new_output
|
||||
node.input[0] = quantized_input_names[0]
|
||||
nodes.append(node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQGather(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gather" or node.op_type == "GatherElements"
|
||||
|
||||
if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
|
||||
elif self.quantizer.is_tensor_quantized(node.input[0]):
|
||||
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
|
||||
@@ -0,0 +1,62 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QGlobalAveragePool(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "GlobalAveragePool"
|
||||
|
||||
# If input to this node is not quantized then keep this node.
|
||||
if node.input[0] not in self.quantizer.quantized_value_map:
|
||||
return super().quantize()
|
||||
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
|
||||
# Create an entry for output quantized value.
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
(
|
||||
data_found,
|
||||
output_scale_name_from_parameter,
|
||||
output_zp_name_from_parameter,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
# Just use the input scale and zero point if parameters for the output are not specified.
|
||||
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
|
||||
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
kwargs["channels_last"] = 0
|
||||
qnode_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[
|
||||
quantized_input_value.q_name,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[quantized_output_value.q_name],
|
||||
qnode_name,
|
||||
**kwargs,
|
||||
)
|
||||
self.quantizer.new_nodes += [qnode]
|
||||
@@ -0,0 +1,172 @@
|
||||
import logging
|
||||
|
||||
import numpy as np # noqa: F401
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import (
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
find_by_name, # noqa: F401
|
||||
get_mul_node, # noqa: F401
|
||||
ms_domain,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase # noqa: F401
|
||||
from .matmul import QOpMatMul
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
def is_B_transposed(gemm_node): # noqa: N802
|
||||
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] # noqa: N806
|
||||
if transB_attribute:
|
||||
return onnx.helper.get_attribute_value(transB_attribute[0]) > 0
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def get_beta(gemm_node):
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
|
||||
if beta_attribute:
|
||||
return onnx.helper.get_attribute_value(beta_attribute[0])
|
||||
|
||||
return 1.0
|
||||
|
||||
|
||||
def set_default_beta(gemm_node):
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
|
||||
if beta_attribute:
|
||||
beta_attribute[0].f = 1.0
|
||||
|
||||
return 1.0
|
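# Worked note on beta handling (a hedged reading, assuming quantize_bias_tensor applies the beta
# value it receives): QDQGemm below passes get_beta(node) into bias quantization and then calls
# set_default_beta(node) to reset the attribute to 1.0, so that a factor such as beta = 0.5 ends up
# baked into the quantized bias once instead of being applied again by the Gemm kernel.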
||||
|
||||
|
||||
class QLinearGemm(QOpMatMul):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gemm"
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1],
|
||||
self.quantizer.weight_qType,
|
||||
0 if is_B_transposed(node) else 1,
|
||||
)
|
||||
quantized_input_names.append(quant_weight_tuple[0])
|
||||
zero_point_names.append(quant_weight_tuple[1])
|
||||
scale_names.append(quant_weight_tuple[2])
|
||||
else:
|
||||
# Get quantized values from both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
quantized_bias_name = ""
|
||||
if len(node.input) == 3:
|
||||
if not self.quantizer.is_input_a_initializer(node.input[2]):
|
||||
return super().quantize()
|
||||
|
||||
# Note: if the quantized type is float 8, the bias is converted into float 16.
|
||||
# cublasLtMatMul only supports (b)float16 or float32 bias.
|
||||
quantized_bias_name = self.quantizer.quantize_bias_static(
|
||||
node.input[2], node.input[0], node.input[1], get_beta(self.node)
|
||||
)
|
||||
|
||||
qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qgemm_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
if attribute.name != "beta":
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
# generate input
|
||||
qgemm_inputs = []
|
||||
for i in range(2):
|
||||
qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])
|
||||
|
||||
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
|
||||
|
||||
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
|
||||
nodes.append(qgemm_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qgemm_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
node_type=node.op_type,
|
||||
node_qtype=self.quantizer.weight_qType,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQGemm(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gemm"
|
||||
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_activation_tensor(node.output[0])
|
||||
|
||||
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
|
||||
node.input[1], default_axis=0 if is_B_transposed(node) else 1
|
||||
)
|
||||
if is_weight_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
|
||||
else:
|
||||
self.quantizer.quantize_weight_tensor(node.input[1])
|
||||
|
||||
if len(node.input) == 3:
|
||||
if self.quantizer.is_input_a_initializer(node.input[2]):
|
||||
self.quantizer.quantize_bias_tensor(
|
||||
node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
|
||||
)
|
||||
set_default_beta(self.node)
|
||||
else:
|
||||
logging.warning(
|
||||
f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
|
||||
)
|
||||
@@ -0,0 +1,121 @@
|
||||
import numpy
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain # noqa: F401
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantize LSTM
|
||||
"""
|
||||
|
||||
|
||||
class LSTMQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
"""
|
||||
parameter node: LSTM node.
|
||||
parameter new_nodes_list: List of new nodes created before processing this node.
|
||||
return: a list of nodes in topological order that represents the quantized LSTM node.
|
||||
"""
|
||||
node = self.node
|
||||
assert node.op_type == "LSTM"
|
||||
|
||||
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
|
||||
node.input[2]
|
||||
):
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
model = self.quantizer.model
|
||||
W = model.get_initializer(node.input[1]) # noqa: N806
|
||||
R = model.get_initializer(node.input[2]) # noqa: N806
|
||||
|
||||
if len(W.dims) != 3 or len(R.dims) != 3:
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
[W_num_dir, W_4_hidden_size, W_input_size] = W.dims # noqa: N806
|
||||
[R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims # noqa: N806
|
||||
|
||||
if self.quantizer.is_per_channel():
|
||||
del W.dims[0]
|
||||
del R.dims[0]
|
||||
W.dims[0] = W_num_dir * W_4_hidden_size
|
||||
R.dims[0] = R_num_dir * R_4_hidden_size
|
||||
|
||||
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1],
|
||||
onnx_proto.TensorProto.INT8,
|
||||
0, # self.quantizer.weight_qType?
|
||||
)
|
||||
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[2],
|
||||
onnx_proto.TensorProto.INT8,
|
||||
0, # self.quantizer.weight_qType?
|
||||
)
|
||||
|
||||
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806
|
||||
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) # noqa: N806
|
||||
|
||||
W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) # noqa: N806
|
||||
R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) # noqa: N806
|
||||
|
||||
W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) # noqa: N806
|
||||
R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) # noqa: N806
|
||||
|
||||
W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) # noqa: N806
|
||||
R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) # noqa: N806
|
||||
|
||||
W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
|
||||
R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806
|
||||
|
||||
model.remove_initializers([W_quant_weight, R_quant_weight])
|
||||
model.add_initializer(W_quant_transposed)
|
||||
model.add_initializer(R_quant_transposed)
|
||||
|
||||
W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) # noqa: N806
|
||||
R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) # noqa: N806
|
||||
W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) # noqa: N806
|
||||
R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) # noqa: N806
|
||||
|
||||
if self.quantizer.is_per_channel():
|
||||
W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
|
||||
R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
|
||||
W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
|
||||
R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]
|
||||
|
||||
inputs = []
|
||||
input_len = len(node.input)
|
||||
inputs.extend([node.input[0]])
|
||||
inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
|
||||
inputs.extend([node.input[3] if input_len > 3 else ""])
|
||||
inputs.extend([node.input[4] if input_len > 4 else ""])
|
||||
inputs.extend([node.input[5] if input_len > 5 else ""])
|
||||
inputs.extend([node.input[6] if input_len > 6 else ""])
|
||||
inputs.extend([node.input[7] if input_len > 7 else ""])
|
||||
inputs.extend(
|
||||
[
|
||||
quant_input_weight_tuple[2],
|
||||
quant_input_weight_tuple[1],
|
||||
quant_recurrent_weight_tuple[2],
|
||||
quant_recurrent_weight_tuple[1],
|
||||
]
|
||||
)
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
if attribute.name == "layout":
|
||||
continue
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
quant_lstm_name = "" if not node.name else node.name + "_quant"
|
||||
quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
|
||||
self.quantizer.new_nodes.append(quant_lstm_node)
|
||||
|
||||
dequantize_node = self.quantizer._dequantize_value(node.input[0])
|
||||
if dequantize_node is not None:
|
||||
self.quantizer.new_nodes.append(dequantize_node)
|
||||
@@ -0,0 +1,231 @@
|
||||
import itertools
|
||||
import logging
|
||||
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QOpMatMul(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
if not self.quantizer.should_quantize_node(self.node):
|
||||
logging.debug(f"Ignore MatMul {self.node.name}]")
|
||||
return False
|
||||
|
||||
if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
|
||||
not self.quantizer.is_float_tensor(self.node.input[0])
|
||||
):
|
||||
logging.info(f"Ignore MatMul due to non float inputs {self.node.name}]")
|
||||
return False
|
||||
|
||||
# do not quantize non-constant B matrices for matmul
|
||||
if self.quantizer.q_matmul_const_b_only:
|
||||
if not self.quantizer.find_initializer_in_path(self.node.input[1]):
|
||||
logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
"""
|
||||
Used when quantize mode is QuantizationMode.IntegerOps.
|
||||
"""
|
||||
|
||||
|
||||
class MatMulInteger(QOpMatMul):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MatMul"
|
||||
# Get quantized values from both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
matmul_integer_output = node.output[0] + "_output_quantized"
|
||||
matmul_integer_name = node.name + "_quant" if node.name else ""
|
||||
matmul_integer_node = onnx.helper.make_node(
|
||||
"MatMulInteger",
|
||||
quantized_input_names + zero_point_names,
|
||||
[matmul_integer_output],
|
||||
matmul_integer_name,
|
||||
)
|
||||
nodes.append(matmul_integer_node)
|
||||
|
||||
# Add cast operation to cast matmulInteger output to float.
|
||||
cast_op_output = matmul_integer_output + "_cast_output"
|
||||
otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
|
||||
cast_node = onnx.helper.make_node(
|
||||
"Cast",
|
||||
[matmul_integer_output],
|
||||
[cast_op_output],
|
||||
matmul_integer_output + "_cast",
|
||||
to=otype,
|
||||
)
|
||||
nodes.append(cast_node)
|
||||
|
||||
# Add mul operation to multiply scales of two inputs.
|
||||
assert len(scale_names) == 2
|
||||
scales_mul_op = (
|
||||
matmul_integer_name + "_scales_mul"
|
||||
if matmul_integer_name
|
||||
else scale_names[0] + "_" + scale_names[1] + "_mul"
|
||||
)
|
||||
|
||||
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
|
||||
if scales_mul_node is None:
|
||||
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
|
||||
nodes.append(scales_mul_node)
|
||||
|
||||
scales_mul_op_output = scales_mul_node.output[0]
|
||||
|
||||
# Add mul operation to multiply mul_scales_op result with output of MatMulInteger
|
||||
# and make the output of this node the same as output of original matmul node.
|
||||
output_scale_mul_op = ""
|
||||
if matmul_integer_name:
|
||||
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
|
||||
nodes.append(
|
||||
get_mul_node(
|
||||
[cast_op_output, scales_mul_op_output],
|
||||
node.output[0],
|
||||
output_scale_mul_op,
|
||||
)
|
||||
)
|
||||
self.quantizer.new_nodes += nodes
|
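# Sketch of the arithmetic the MatMulInteger path above rebuilds (derived from the code, not a new claim):
# MatMulInteger subtracts the zero points internally, so its int32 output is (A_q - a_zp) @ (B_q - b_zp).
# Since A ~ a_scale * (A_q - a_zp) and B ~ b_scale * (B_q - b_zp), the float product is recovered by
# Cast(int32 -> float) followed by Mul with (a_scale * b_scale), which is exactly the Cast/Mul chain built above.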
||||
|
||||
|
||||
"""
|
||||
Used when quantize mode is QuantizationMode.QLinearOps
|
||||
"""
|
||||
|
||||
|
||||
class QLinearMatMul(QOpMatMul):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MatMul"
|
||||
# Get quantized values from both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
|
||||
nodes.extend(nodes_weight)
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_matmul_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
qlinear_matmul_inputs = []
|
||||
# Input 0
|
||||
qlinear_matmul_inputs.append(quantized_input_names[0])
|
||||
qlinear_matmul_inputs.append(scale_names[0])
|
||||
qlinear_matmul_inputs.append(zero_point_names[0])
|
||||
# Input 1
|
||||
qlinear_matmul_inputs.append(quantized_input_names[1])
|
||||
qlinear_matmul_inputs.append(scale_names[1])
|
||||
qlinear_matmul_inputs.append(zero_point_names[1])
|
||||
# Output quantization parameter
|
||||
qlinear_matmul_inputs.append(output_scale_name)
|
||||
qlinear_matmul_inputs.append(output_zp_name)
|
||||
|
||||
domain = (
|
||||
"com.microsoft"
|
||||
if self.quantizer.weight_qType
|
||||
in {
|
||||
onnx_proto.TensorProto.FLOAT8E4M3FN,
|
||||
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
|
||||
onnx_proto.TensorProto.FLOAT8E5M2,
|
||||
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
|
||||
}
|
||||
else ""
|
||||
)
|
||||
qlinear_matmul_node = onnx.helper.make_node(
|
||||
"QLinearMatMul",
|
||||
qlinear_matmul_inputs,
|
||||
[qlinear_matmul_output],
|
||||
qlinear_matmul_name,
|
||||
domain=domain,
|
||||
)
|
||||
nodes.append(qlinear_matmul_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_matmul_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQMatMul(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MatMul"
|
||||
|
||||
if self.disable_qdq_for_node_output:
|
||||
nodes_to_iterate = node.input
|
||||
else:
|
||||
nodes_to_iterate = itertools.chain(node.input, node.output)
|
||||
|
||||
for tensor_name in nodes_to_iterate:
|
||||
if find_by_name(tensor_name, self.quantizer.model.initializer()):
|
||||
is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
|
||||
tensor_name, default_axis=1, op_type=node.op_type
|
||||
)
|
||||
if is_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
|
||||
else:
|
||||
self.quantizer.quantize_weight_tensor(tensor_name)
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(tensor_name)
|
||||
@@ -0,0 +1,34 @@
|
||||
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
|
||||
|
||||
class QMaxPool(Direct8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MaxPool"
|
||||
|
||||
# if version is less than 12, go to normal quantize.
|
||||
if self.quantizer.opset_version < 12:
|
||||
super(Direct8BitOp, self).quantize()
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
|
||||
|
||||
class QDQMaxPool(QDQDirect8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MaxPool"
|
||||
|
||||
# if version is less than 12, just no change
|
||||
if self.quantizer.opset_version < 12:
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
@@ -0,0 +1,40 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QDQNormalization(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}
|
||||
|
||||
# Input
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
|
||||
# Scale
|
||||
scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
|
||||
scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
|
||||
node.input[1], default_axis=1, op_type=node.op_type
|
||||
)
|
||||
|
||||
if scale_is_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
|
||||
elif scale_is_initializer:
|
||||
self.quantizer.quantize_weight_tensor(node.input[1])
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(node.input[1])
|
||||
|
||||
# Bias
|
||||
if len(node.input) > 2 and node.input[2]:
|
||||
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
|
||||
|
||||
# Output
|
||||
if not self.disable_qdq_for_node_output:
|
||||
for output_name in node.output:
|
||||
self.quantizer.quantize_activation_tensor(output_name)
|
||||
@@ -0,0 +1,172 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import (
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
quantize_nparray,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QPad(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Pad"
|
||||
|
||||
# Only after version 11, it has the optional constant_value
|
||||
# If input[0] is not quantized, do not quantize this node
|
||||
if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
|
||||
super().quantize()
|
||||
return
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kv = attribute_to_kwarg(attribute)
|
||||
kwargs.update(kv)
|
||||
|
||||
if "mode" not in kwargs or kwargs["mode"] == b"constant":
|
||||
if len(node.input) > 2 and node.input[2] != "": # There is 3rd input 'constant_value'
|
||||
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
|
||||
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
|
||||
if zp_tensor is None or scale_tensor is None:
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
|
||||
if padding_constant_initializer is not None:
|
||||
zp_array = onnx.numpy_helper.to_array(zp_tensor)
|
||||
zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
|
||||
scale_array = onnx.numpy_helper.to_array(scale_tensor)
|
||||
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
|
||||
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
|
||||
quantized_padding_constant_array = quantize_nparray(
|
||||
self.quantizer.activation_qType,
|
||||
padding_constant_array,
|
||||
scale_value,
|
||||
zp_value,
|
||||
)
|
||||
quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
|
||||
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
|
||||
quantized_padding_constant_array,
|
||||
quantized_padding_constant_name,
|
||||
)
|
||||
# Assume this padding constant initializer is only used by this node
|
||||
self.quantizer.model.remove_initializer(padding_constant_initializer)
|
||||
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
|
||||
node.input[2] = quantized_padding_constant_name
|
||||
else:
|
||||
# TODO: check quantize_inputs after sub graph is supported
|
||||
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
|
||||
node,
|
||||
2,
|
||||
self.quantizer.activation_qType,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
initial_type=scale_tensor.data_type,
|
||||
)
|
||||
self.quantizer.new_nodes.extend(pad_value_qnodes)
|
||||
node.input[2] = pad_value_qnodes[0].output[0]
|
||||
else:
|
||||
# In quantized format, the `zero` before quantization is mapped
|
||||
# to quantized_input_value.zp_name. Thus, padding 0 in the
|
||||
# original tensor should become padding the zero point in the quantized
|
||||
# tensor.
|
||||
if len(node.input) == 2:
|
||||
# Feed quantization's zero point to padding node.
|
||||
node.input.append(quantized_input_value.zp_name)
|
||||
else:
|
||||
# Assign quantization's zero point to padding node.
|
||||
assert node.input[2] == ""
|
||||
node.input[2] = quantized_input_value.zp_name
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
node.output[0] = quantized_output_value.q_name
|
||||
self.quantizer.new_nodes += [node]
|
||||
|
||||
|
||||
class QDQPad(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
|
||||
"""
|
||||
Returns the Pad's constant padding value. Returns `None` if the padding value is
|
||||
not constant (i.e., comes from a dynamic input).
|
||||
"""
|
||||
const_val = None
|
||||
onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
|
||||
if onnx_tensor_type is None:
|
||||
return None
|
||||
|
||||
np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
|
||||
if self.quantizer.opset_version < 11:
|
||||
const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
|
||||
elif len(self.node.input) >= 3 and self.node.input[2]:
|
||||
const_val = self.quantizer.model.get_constant_value(self.node.input[2])
|
||||
else:
|
||||
const_val = np.array(0, dtype=np_dtype)
|
||||
|
||||
return const_val
|
||||
|
||||
def _should_quantize_output_same_as_input(self) -> bool:
|
||||
"""
|
||||
Returns true if Pad's output should use the same quantization parameters as input[0]
|
||||
"""
|
||||
attrs_dict = {}
|
||||
for attribute in self.node.attribute:
|
||||
kv = attribute_to_kwarg(attribute)
|
||||
attrs_dict.update(kv)
|
||||
|
||||
pad_mode = attrs_dict.get("mode", b"constant")
|
||||
if pad_mode in (b"reflect", b"edge", b"wrap"):
|
||||
# These modes pad the output with a value that already exists in the input.
|
||||
# So, we can quantize the output the same as the input.
|
||||
return True
|
||||
|
||||
# For 'constant' mode, if padding with 0, we can also quantize the output the same as the input
|
||||
# because our quantization floating-point range always includes 0.
|
||||
if pad_mode == b"constant":
|
||||
pad_val = self._get_pad_const_val(attrs_dict)
|
||||
if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
|
||||
return float(pad_val.item()) == 0
|
||||
|
||||
return False
|
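# Worked check of the constant-0 case above (hypothetical numbers): with affine quantization
# q = round(x / scale) + zero_point, e.g. scale = 0.02 and zero_point = 10, a pad value of
# x = 0.0 maps to q = round(0.0 / 0.02) + 10 = 10 == zero_point for any scale, because the
# calibrated range always includes 0. Padding zeros in float space is therefore the same as
# padding the zero point in quantized space, so the output can reuse input[0]'s scale/zp.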
||||
|
||||
def quantize(self):
|
||||
assert self.node.op_type == "Pad"
|
||||
|
||||
for input_name in self.node.input:
|
||||
if input_name:
|
||||
self.quantizer.quantize_activation_tensor(input_name)
|
||||
|
||||
if not self.disable_qdq_for_node_output:
|
||||
if self._should_quantize_output_same_as_input():
|
||||
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(self.node.output[0])
|
||||
@@ -0,0 +1,67 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QLinearPool(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
# only try to quantize when given quantization parameters for it
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
# get quantized input tensor names, quantize input if needed
|
||||
(
|
||||
quantized_input_names,
|
||||
input_zero_point_names,
|
||||
input_scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value.
|
||||
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_output_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
# Create qlinear pool node for given type (AveragePool, etc)
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
qlinear_node_name = node.name + "_quant" if node.name else ""
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[
|
||||
quantized_input_names[0],
|
||||
input_scale_names[0],
|
||||
input_zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[qlinear_output_name],
|
||||
qlinear_node_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# add all newly created nodes
|
||||
nodes.append(qnode)
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,22 @@
|
||||
import itertools
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray # noqa: F401
|
||||
from .base_operator import QuantOperatorBase # noqa: F401
|
||||
|
||||
|
||||
class QDQOperatorBase:
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
self.quantizer = onnx_quantizer
|
||||
self.node = onnx_node
|
||||
self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
if self.disable_qdq_for_node_output:
|
||||
tensors_to_quantize = node.input
|
||||
else:
|
||||
tensors_to_quantize = itertools.chain(node.input, node.output)
|
||||
|
||||
for tensor_name in tensors_to_quantize:
|
||||
self.quantizer.quantize_activation_tensor(tensor_name)
|
||||
@@ -0,0 +1,34 @@
|
||||
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
|
||||
|
||||
class QResize(Direct8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Resize"
|
||||
|
||||
# if version is less than 11, go to normal quantize.
|
||||
if self.quantizer.opset_version < 11:
|
||||
super(Direct8BitOp, self).quantize()
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
|
||||
|
||||
class QDQResize(QDQDirect8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Resize"
|
||||
|
||||
# if version is less than 11, just keep this node
|
||||
if self.quantizer.opset_version < 11:
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
@@ -0,0 +1,74 @@
|
||||
import onnx
|
||||
import onnx.helper
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QLinearSoftmax(QuantOperatorBase):
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
# Set fixed limits for the softmax output scale and zero point, because the softmax output is always in [0, 1]
|
||||
if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
|
||||
out_scale = 1 / 256.0
|
||||
out_zero_point = 0
|
||||
else:
|
||||
out_scale = 1 / 256.0
|
||||
out_zero_point = -128
|
||||
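# Worked check of the fixed parameters above (a sketch): with scale = 1/256, a uint8 value
# q in [0, 255] dequantizes to q / 256, i.e. [0, 255/256], which covers the softmax range;
# for int8, zero_point = -128 gives (q - (-128)) / 256 for q in [-128, 127], the same interval.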
# only try to quantize when given quantization parameters for it
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)
|
||||
|
||||
# get quantized input tensor names, quantize input if needed
|
||||
(
|
||||
quantized_input_names,
|
||||
input_zero_point_names,
|
||||
input_scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value.
|
||||
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_output_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
# Create qlinear softmax node for given type
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
# make QLinearSoftmax use the real opset_version; its default SinceVersion would be 1
|
||||
kwargs["opset"] = self.quantizer.opset_version
|
||||
qlinear_node_name = node.name + "_quant" if node.name else ""
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[
|
||||
quantized_input_names[0],
|
||||
input_scale_names[0],
|
||||
input_zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[qlinear_output_name],
|
||||
qlinear_node_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# add all newly created nodes
|
||||
nodes.append(qnode)
|
||||
self.quantizer.new_nodes += nodes
|
||||
return None
|
||||
@@ -0,0 +1,63 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QSplit(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
quantized_node_name = ""
|
||||
if node.name:
|
||||
quantized_node_name = node.name + "_quant"
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
|
||||
# Output just derive the scale/zero from input
|
||||
quantized_output_names = []
|
||||
for output_name in node.output:
|
||||
quantized_output_name = output_name + "quantized"
|
||||
quantized_output_names.append(quantized_output_name)
|
||||
q_output = QuantizedValue(
|
||||
output_name,
|
||||
quantized_output_name,
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[output_name] = q_output
|
||||
|
||||
if len(node.input) > 1:
|
||||
quantized_input_names.extend(node.input[1:])
|
||||
quantized_node = onnx.helper.make_node(
|
||||
node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
|
||||
)
|
||||
|
||||
nodes.append(quantized_node)
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQSplit(QDQOperatorBase):
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Split"
|
||||
|
||||
if not self.quantizer.is_tensor_quantized(node.input[0]):
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
for output in node.output:
|
||||
self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)
|
||||
@@ -0,0 +1,87 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QLinearWhere(QuantOperatorBase):
|
||||
def should_quantize(self):
|
||||
return True
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Where"
|
||||
if not self.quantizer.force_quantize_no_input_check:
|
||||
self.quantizer.new_nodes += [node]
|
||||
return
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
q_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [1, 2])
|
||||
if not data_found or q_input_names is None:
|
||||
return super().quantize()
|
||||
qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_output_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qlwhere_inputs = [
|
||||
node.input[0],
|
||||
q_input_names[0],
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
q_input_names[1],
|
||||
scale_names[1],
|
||||
zero_point_names[1],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
]
|
||||
qlwhere_node = onnx.helper.make_node(
|
||||
"QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_output_name, **kwargs
|
||||
)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
self.quantizer.new_nodes += [qlwhere_node]
|
||||
|
||||
|
||||
class QDQWhere(QDQOperatorBase):
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Where"
|
||||
if self.quantizer.force_quantize_no_input_check:
|
||||
if not self.quantizer.is_tensor_quantized(node.input[1]):
|
||||
self.quantizer.quantize_activation_tensor(node.input[1])
|
||||
if not self.quantizer.is_tensor_quantized(node.input[2]):
|
||||
self.quantizer.quantize_activation_tensor(node.input[2])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
for output in node.output:
|
||||
self.quantizer.quantize_activation_tensor(output)
|
||||
elif (
|
||||
self.quantizer.is_tensor_quantized(node.input[1])
|
||||
and self.quantizer.is_tensor_quantized(node.input[2])
|
||||
and not self.disable_qdq_for_node_output
|
||||
):
|
||||
for output in node.output:
|
||||
self.quantizer.quantize_activation_tensor(output)
|
||||
@@ -0,0 +1,141 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from .shape_inference import quant_pre_process
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="""Model optimizer and shape inferencer, in preparation for quantization,
|
||||
Consists of three optional steps:
|
||||
1. Symbolic shape inference (best for transformer models).
|
||||
2. Model optimization.
|
||||
3. ONNX shape inference.
|
||||
|
||||
Model quantization with QDQ format, i.e. inserting QuantizeLinear/DeQuantizeLinear on
|
||||
the tensor, requires tensor shape information to perform its best. Currently, shape inferencing
|
||||
works best with optimized model. As a result, it is highly recommended to run quantization
|
||||
on optimized model with shape information. This is the tool for optimization and shape
|
||||
inferencing.
|
||||
|
||||
Essentially this tool performs the following three (skippable) steps:
|
||||
|
||||
1. Symbolic shape inference.
|
||||
2. Model optimization
|
||||
3. ONNX shape inference"""
|
||||
)
|
||||
|
||||
parser.add_argument("--input", required=True, help="Path to the input model file")
|
||||
parser.add_argument("--output", required=True, help="Path to the output model file")
|
||||
parser.add_argument(
|
||||
"--skip_optimization",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Skip model optimization step if true. It's a known issue that ORT"
|
||||
" optimization has difficulty with model size greater than 2GB, rerun with"
|
||||
" this option to get around this issue.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip_onnx_shape",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Skip ONNX shape inference. Symbolic shape inference is most effective"
|
||||
" with transformer based models. Skipping all shape inferences may"
|
||||
" reduce the effectiveness of quantization, as a tensor with unknown"
|
||||
" shape can not be quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip_symbolic_shape",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Skip symbolic shape inference. Symbolic shape inference is most"
|
||||
" effective with transformer based models. Skipping all shape"
|
||||
" inferences may reduce the effectiveness of quantization, as a tensor"
|
||||
" with unknown shape can not be quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--auto_merge",
|
||||
help="Automatically merge symbolic dims when confliction happens",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--int_max",
|
||||
help="maximum value for integer to be treated as boundless for ops like slice",
|
||||
type=int,
|
||||
default=2**31 - 1,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--guess_output_rank",
|
||||
help="guess output rank to be the same as input 0 for unknown ops",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed",
|
||||
type=int,
|
||||
default=0,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_as_external_data",
|
||||
help="Saving an ONNX model to external data",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--all_tensors_to_one_file",
|
||||
help="Saving all the external data to one file",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--external_data_location",
|
||||
help="The file location to save the external file",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--external_data_size_threshold",
|
||||
help="The size threshold for external data",
|
||||
type=int,
|
||||
default=1024,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
if args.skip_optimization and args.skip_onnx_shape and args.skip_symbolic_shape:
|
||||
logger.error("Skipping all three steps, nothing to be done. Quitting...")
|
||||
sys.exit()
|
||||
|
||||
if (not args.skip_optimization) and args.save_as_external_data:
|
||||
logger.error("ORT model optimization does not support external data yet!")
|
||||
sys.exit()
|
||||
|
||||
logger.info("input model: %s", args.input)
|
||||
logger.info("output model: %s", args.output)
|
||||
quant_pre_process(
|
||||
args.input,
|
||||
args.output,
|
||||
args.skip_optimization,
|
||||
args.skip_onnx_shape,
|
||||
args.skip_symbolic_shape,
|
||||
args.auto_merge,
|
||||
args.int_max,
|
||||
args.guess_output_rank,
|
||||
args.verbose,
|
||||
args.save_as_external_data,
|
||||
args.all_tensors_to_one_file,
|
||||
args.external_data_location,
|
||||
args.external_data_size_threshold,
|
||||
)
|
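# A minimal programmatic sketch of the same preprocessing flow (paths are hypothetical, and it
# assumes this module is importable as onnxruntime.quantization.shape_inference):
#
#     from onnxruntime.quantization.shape_inference import quant_pre_process
#
#     quant_pre_process("model.onnx", "model_preprocessed.onnx")  # defaults leave all three steps enabled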
||||
@@ -0,0 +1,389 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
"""Utilities to run a given ONNX model, while saving input/output tensors of
|
||||
eligible operator nodes.
|
||||
|
||||
A use case is to debug quantization induced accuracy drop. An AI engineer can
|
||||
run the original float32 model and the quantized model with the same inputs,
|
||||
then compare the corresponding activations between the two models to find
|
||||
where the divergence is.
|
||||
|
||||
Example Usage:
|
||||
|
||||
```python
|
||||
class ExampleDataReader(CalibrationDataReader):
|
||||
def __init__(self):
|
||||
...
|
||||
def get_next(self):
|
||||
...
|
||||
|
||||
input_data_reader = ExampleDataReader()
|
||||
|
||||
augmented_model_path = str(Path(self._tmp_model_dir.name).joinpath("augmented_model.onnx"))
|
||||
modify_model_output_intermediate_tensors(path_to_onnx_model, augmented_model_path)
|
||||
|
||||
tensor_dict = collect_activations(augmented_model_path, input_data_reader)
|
||||
```
|
||||
|
||||
`tensor_dict` points to a dictionary where the keys are tensor names and each value
|
||||
is a list of tensors, one from each model run
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
from collections.abc import Callable, Sequence
|
||||
from pathlib import Path
|
||||
|
||||
import numpy
|
||||
import onnx
|
||||
from onnx import helper, numpy_helper
|
||||
|
||||
import onnxruntime
|
||||
|
||||
from .calibrate import CalibraterBase, CalibrationDataReader
|
||||
from .onnx_model import ONNXModel
|
||||
from .quant_utils import (
|
||||
DEQUANT_OP_NAME,
|
||||
DEQUANT_OUTPUT_SUFFIX,
|
||||
QUANT_INPUT_SUFFIX,
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
find_by_name,
|
||||
load_model_with_shape_infer,
|
||||
)
|
||||
|
||||
_TENSOR_SAVE_POSTFIX = "_ReshapedSavedOutput"
|
||||
_TENSOR_SAVE_POSTFIX_LEN = len(_TENSOR_SAVE_POSTFIX)
|
||||
|
||||
|
||||
def modify_model_output_intermediate_tensors(
|
||||
input_model_path: str | Path,
|
||||
output_model_path: str | Path,
|
||||
op_types_for_saving: Sequence[str] | None = None,
|
||||
save_as_external_data: bool = False,
|
||||
) -> None:
|
||||
"""Augment a given ONNX model to save node input/output tensors.
|
||||
|
||||
Add all input/output tensors of operator nodes to model outputs
|
||||
so that their values can be retrieved for debugging purposes.
|
||||
|
||||
Args:
|
||||
input_model_path: the path to load the model.
|
||||
output_model_path: the path to save the augmented model.
|
||||
op_types_for_saving: Operator types for which the
|
||||
input/output should be saved. By default, all
|
||||
float32/float16 tensors are saved.
|
||||
|
||||
Returns:
|
||||
None. The augmented model is saved to `output_model_path`.
|
||||
"""
|
||||
|
||||
if op_types_for_saving is None:
|
||||
op_types_for_saving = []
|
||||
saver = CalibraterBase(input_model_path, op_types_to_calibrate=op_types_for_saving)
|
||||
model_to_augment = saver.model
|
||||
tensors, value_infos = saver.select_tensors_to_calibrate(model_to_augment)
|
||||
reshape_shape_name = "LinearReshape_" + str(time.time())
|
||||
reshape_shape = numpy_helper.from_array(numpy.array([-1], dtype=numpy.int64), reshape_shape_name)
|
||||
model_to_augment.graph.initializer.append(reshape_shape)
|
||||
|
||||
for tensor_name in tensors:
|
||||
reshape_output = tensor_name + _TENSOR_SAVE_POSTFIX
|
||||
reshape_node = onnx.helper.make_node(
|
||||
"Reshape",
|
||||
inputs=[tensor_name, reshape_shape_name],
|
||||
outputs=[reshape_output],
|
||||
name=reshape_output,
|
||||
)
|
||||
model_to_augment.graph.node.append(reshape_node)
|
||||
reshape_output_value_info = helper.make_tensor_value_info(
|
||||
reshape_output, value_infos[tensor_name].type.tensor_type.elem_type, [-1]
|
||||
)
|
||||
model_to_augment.graph.output.append(reshape_output_value_info)
|
||||
|
||||
onnx.save(
|
||||
model_to_augment,
|
||||
output_model_path,
|
||||
save_as_external_data=save_as_external_data,
|
||||
)
|
||||
|
||||
|
||||
def collect_activations(
|
||||
augmented_model: str,
|
||||
input_reader: CalibrationDataReader,
|
||||
session_options=None,
|
||||
execution_providers: Sequence[str] | None = None,
|
||||
) -> dict[str, list[numpy.ndarray]]:
|
||||
"""Run augmented model and collect activations tensors.
|
||||
|
||||
Args:
|
||||
augmented_model: Path to the augmented model created by modify_model_output_intermediate_tensors().
|
||||
input_reader: Logic for reading input for the model; the augmented model has the same
|
||||
inputs as the original model.
|
||||
session_options: Optional OnnxRuntime session options for controlling model run.
|
||||
By default, graph optimization is turned off.
|
||||
execution_providers: Collection of execution providers for running the model.
|
||||
Only CPU EP is used by default.
|
||||
|
||||
Returns:
|
||||
A dictionary where the key is the tensor name and the value is a list of tensors, one from each batch.
|
||||
"""
|
||||
|
||||
if session_options is None:
|
||||
session_options = onnxruntime.SessionOptions()
|
||||
session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
|
||||
if execution_providers is None:
|
||||
execution_providers = ["CPUExecutionProvider"]
|
||||
|
||||
inference_session = onnxruntime.InferenceSession(
|
||||
augmented_model,
|
||||
sess_options=session_options,
|
||||
providers=execution_providers,
|
||||
)
|
||||
|
||||
intermediate_outputs = []
|
||||
for input_d in input_reader:
|
||||
intermediate_outputs.append(inference_session.run(None, input_d))
|
||||
if not intermediate_outputs:
|
||||
raise RuntimeError("No data is collected while running augmented model!")
|
||||
|
||||
output_dict = {}
|
||||
output_info = inference_session.get_outputs()
|
||||
for batch in intermediate_outputs:
|
||||
for output, output_data in zip(output_info, batch, strict=False):
|
||||
if output.name.endswith(_TENSOR_SAVE_POSTFIX):
|
||||
output_name = output.name[:-_TENSOR_SAVE_POSTFIX_LEN]
|
||||
output_dict.setdefault(output_name, []).append(output_data)
|
||||
|
||||
return output_dict
|
||||
|
||||
|
||||
_POST_QDQ_POSTFIX1 = DEQUANT_OUTPUT_SUFFIX + "_1"
|
||||
|
||||
|
||||
def _add_pre_post_qdq_pair(
|
||||
qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]],
|
||||
activation_name: str,
|
||||
pre_qdq_tensors: Sequence[numpy.ndarray] | None,
|
||||
post_qdq_tensors: Sequence[numpy.ndarray] | None,
|
||||
) -> None:
|
||||
if post_qdq_tensors is not None and pre_qdq_tensors is not None:
|
||||
qdq_cmp[activation_name] = {}
|
||||
qdq_cmp[activation_name]["pre_qdq"] = pre_qdq_tensors
|
||||
qdq_cmp[activation_name]["post_qdq"] = post_qdq_tensors
|
||||
|
||||
|
||||
def create_activation_matching(
|
||||
qdq_activations: dict[str, Sequence[numpy.ndarray]],
|
||||
float_activations: dict[str, Sequence[numpy.ndarray]] | None = None,
|
||||
) -> dict[str, dict[str, Sequence[numpy.ndarray]]]:
|
||||
"""Comparing activation values to help debugging accuracy loss due to quantization.
|
||||
|
||||
This function takes saved activations from the QDQ model and (optionally) the
|
||||
floating-point model, and provides a data structure for comparing:
|
||||
* from the qdq model, activation values before and after QDQ operation
|
||||
* across both models, activations from the original model vs the corresponding
|
||||
activations in the QDQ model
|
||||
|
||||
Args:
|
||||
qdq_activations: Output of `collect_activations`. This must be from a quantized
|
||||
model with QDQ format.
|
||||
float_activations: Output of `collect_activations`. This must be from the
|
||||
floating-point model.
|
||||
|
||||
Returns:
|
||||
Dict for comparing pre and post quantized activation tensors. E.g.
|
||||
```
|
||||
qdq_cmp = cmp_qdq_input_output(qdq_activations)
|
||||
print(qdq_cmp['activation1']['pre_qdq'][0])
|
||||
print(qdq_cmp['activation1']['post_qdq'][0])
|
||||
|
||||
|
||||
qdq_cmp = cmp_qdq_input_output(qdq_activations, float_activations)
|
||||
print(qdq_cmp['activation1']['float'][0])
|
||||
print(qdq_cmp['activation1']['pre_qdq'][0])
|
||||
print(qdq_cmp['activation1']['post_qdq'][0])
|
||||
```
|
||||
"""
|
||||
|
||||
qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]] = {}
|
||||
for tensor_name, tensors in qdq_activations.items():
|
||||
if tensor_name.endswith(QUANT_INPUT_SUFFIX):
|
||||
pre_name = tensor_name[: -len(QUANT_INPUT_SUFFIX)]
|
||||
post_qdq_tensors = qdq_activations.get(pre_name)
|
||||
pre_qdq_tensors = tensors
|
||||
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
|
||||
elif tensor_name.endswith(DEQUANT_OUTPUT_SUFFIX):
|
||||
pre_name = tensor_name[: -len(DEQUANT_OUTPUT_SUFFIX)]
|
||||
pre_qdq_tensors = qdq_activations.get(pre_name)
|
||||
post_qdq_tensors = tensors
|
||||
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
|
||||
elif tensor_name.endswith(_POST_QDQ_POSTFIX1):
|
||||
pre_name = tensor_name[: -len(_POST_QDQ_POSTFIX1)]
|
||||
pre_qdq_tensors = qdq_activations.get(pre_name)
|
||||
post_qdq_tensors = tensors
|
||||
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
|
||||
|
||||
if not float_activations:
|
||||
return qdq_cmp
|
||||
|
||||
for act_name, act_values in qdq_cmp.items():
|
||||
float_acts = float_activations.get(act_name)
|
||||
if float_acts is not None:
|
||||
act_values["float"] = float_acts
|
||||
|
||||
return qdq_cmp
|
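# A hedged usage sketch of the matching dict returned above (variable names are hypothetical):
#
#     cmp = create_activation_matching(qdq_activations, float_activations)
#     for name, tensors in cmp.items():
#         snr = compute_signal_to_quantization_noice_ratio(tensors["float"], tensors["post_qdq"])
#         print(name, snr)
#
# compute_signal_to_quantization_noice_ratio is defined later in this module; the "float" key is
# only present when float_activations was passed in.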
||||
|
||||
|
||||
def _run_dequantize_linear(
|
||||
weight_tensor: numpy.ndarray, weight_scale: numpy.ndarray, weight_zp: numpy.ndarray, channel_axis: int
|
||||
) -> numpy.ndarray | None:
|
||||
assert weight_scale.shape == weight_zp.shape
|
||||
if weight_zp.size == 1:
|
||||
return (weight_tensor - weight_zp) * weight_scale
|
||||
|
||||
assert weight_zp.ndim == 1
|
||||
reshape_dims = list(weight_tensor.shape)  # copy, so the original shape is not modified
|
||||
reshape_dims[channel_axis] = 1 # only one per channel for reshape
|
||||
channel_count = weight_tensor.shape[channel_axis]
|
||||
dequantized_weights = None
|
||||
for i in range(channel_count):
|
||||
per_channel_data = weight_tensor.take(i, channel_axis)
|
||||
dequantized_per_channel_data = (per_channel_data - weight_zp[i]) * weight_scale[i]
|
||||
if i == 0:
|
||||
dequantized_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
|
||||
else:
|
||||
channel_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
|
||||
dequantized_weights = numpy.concatenate((dequantized_weights, channel_weights), channel_axis)
|
||||
|
||||
if dequantized_weights is None:
|
||||
return None
|
||||
|
||||
dequantized_weights = dequantized_weights.reshape(weight_tensor.shape)
|
||||
return dequantized_weights
|
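# Worked example of the per-channel path above (hypothetical numbers): for a (2, 3) weight with
# channel_axis=0, scale=[0.1, 0.2] and zp=[0, 1], row 0 dequantizes as (row0 - 0) * 0.1 and
# row 1 as (row1 - 1) * 0.2; each row is reshaped to (1, 3) and the rows are concatenated back
# along axis 0, matching per-channel DequantizeLinear semantics y = (x - zero_point) * scale.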
||||
|
||||
|
||||
def create_weight_matching(float_model_path: str, qdq_model_path: str) -> dict[str, dict[str, numpy.ndarray]]:
|
||||
"""Comparing weight values to help debugging accuracy loss due to quantization.
|
||||
|
||||
This function takes the float model and the QDQ model, and provides a data structure for comparing
|
||||
their corresponding weights to locate quantization errors.
|
||||
|
||||
Args:
|
||||
float_model_path: Path to the floating-point model.
|
||||
qdq_model_path: Path to the QDQ model.
|
||||
|
||||
Returns:
|
||||
Dict for comparing weight tensors. E.g.
|
||||
```
|
||||
qdq_weight_cmp = create_weight_matching(float_model, qdq_model)
|
||||
print(qdq_weight_cmp['activation1']['float'])
|
||||
print(qdq_weight_cmp['activation1']['dequantized'])
|
||||
```
|
||||
"""
|
||||
float_onnx_model = ONNXModel(load_model_with_shape_infer(Path(float_model_path)))
|
||||
qdq_onnx_model = ONNXModel(load_model_with_shape_infer(Path(qdq_model_path)))
|
||||
|
||||
matched_weights: dict[str, dict[str, numpy.ndarray]] = {}
|
||||
initializers = qdq_onnx_model.initializer()
|
||||
for node in qdq_onnx_model.nodes():
|
||||
if node.op_type != DEQUANT_OP_NAME:
|
||||
continue # Only care about DQ node
|
||||
weight_name: str = node.input[0]
|
||||
weight_values = find_by_name(weight_name, initializers)
|
||||
if not weight_values:
|
||||
continue # Only care about DQ node with const inputs
|
||||
if not weight_name.endswith(TENSOR_NAME_QUANT_SUFFIX):
|
||||
logging.error(f"Model Error in '{qdq_model_path}': Dequantized tensor name '{weight_name}' not recognized!")
|
||||
continue
|
||||
|
||||
axis = -1
|
||||
for attr in node.attribute:
|
||||
if attr.name == "axis":
|
||||
axis = attr.i
|
||||
|
||||
weight_tensor = numpy_helper.to_array(weight_values)
|
||||
weight_scale = numpy_helper.to_array(find_by_name(node.input[1], initializers))
|
||||
if len(node.input) > 2:
|
||||
weight_zp = numpy_helper.to_array(find_by_name(node.input[2], initializers))
|
||||
else:
|
||||
weight_zp = numpy.zeros(weight_scale.shape, dtype=numpy.int32)
|
||||
|
||||
# Perform dequantization:
|
||||
if weight_scale.size == weight_zp.size == 1:
|
||||
# Avoids confusion between a scalar and a tensor of one element.
|
||||
weight_scale = weight_scale.reshape(())
|
||||
weight_zp = weight_zp.reshape(())
|
||||
if weight_scale.shape != weight_zp.shape:
|
||||
raise RuntimeError(
|
||||
f"scale and zero_point must have the same shape but {weight_scale.shape} != {weight_zp.shape}"
|
||||
)
|
||||
weight_quant = _run_dequantize_linear(weight_tensor, weight_scale, weight_zp, channel_axis=axis)
|
||||
weight_name = weight_name[: -len(TENSOR_NAME_QUANT_SUFFIX)]
|
||||
if weight_quant is None:
|
||||
logging.error(f"Model Error in '{qdq_model_path}': '{weight_name}' per-channel quantization on 0 channel")
|
||||
continue
|
||||
|
||||
float_values = find_by_name(weight_name, float_onnx_model.initializer())
|
||||
if not float_values:
|
||||
logging.error(f"Model Error in '{float_model_path}': weight tensor '{weight_name}' not found!")
|
||||
continue
|
||||
weight_float = numpy_helper.to_array(float_values)
|
||||
matched_weights[weight_name] = {"float": weight_float, "dequantized": weight_quant}
|
||||
|
||||
return matched_weights
|
||||
|
||||
|
||||
def compute_signal_to_quantization_noice_ratio(
|
||||
x: Sequence[numpy.ndarray] | numpy.ndarray, y: Sequence[numpy.ndarray] | numpy.ndarray
|
||||
) -> float:
|
||||
if isinstance(x, numpy.ndarray):
|
||||
xlist = [x]
|
||||
else:
|
||||
xlist = x
|
||||
if isinstance(y, numpy.ndarray):
|
||||
ylist = [y]
|
||||
else:
|
||||
ylist = y
|
||||
if len(xlist) != len(ylist):
|
||||
raise RuntimeError("Unequal number of tensors to compare!")
|
||||
|
||||
left = numpy.concatenate(xlist).flatten()
|
||||
right = numpy.concatenate(ylist).flatten()
|
||||
|
||||
epsilon = numpy.finfo("float").eps
|
||||
tensor_norm = max(numpy.linalg.norm(left), epsilon)
|
||||
diff_norm = max(numpy.linalg.norm(left - right), epsilon)
|
||||
res = tensor_norm / diff_norm
|
||||
return 20 * math.log10(res)
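# Worked example (illustrative, not part of the original source): for x = [1.0, 2.0, 3.0] and
# y = [1.1, 1.9, 3.05], ||x|| ~= 3.742 and ||x - y|| = 0.15, so the SQNR is
# 20 * log10(3.742 / 0.15) ~= 27.9 dB. Larger values mean y is closer to the reference x.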
|
||||
|
||||
|
||||
def compute_weight_error(
|
||||
weights_match: dict[str, dict[str, numpy.ndarray]],
|
||||
err_func: Callable[[numpy.ndarray, numpy.ndarray], float] = compute_signal_to_quantization_noice_ratio,
|
||||
) -> dict[str, float]:
|
||||
result: dict[str, float] = {}
|
||||
for weight_name, weight_match in weights_match.items():
|
||||
result[weight_name] = err_func(weight_match["float"], weight_match["dequantized"])
|
||||
return result
|
||||
|
||||
|
||||
def compute_activation_error(
|
||||
activations_match: dict[str, dict[str, Sequence[numpy.ndarray]]],
|
||||
err_func: Callable[
|
||||
[Sequence[numpy.ndarray], Sequence[numpy.ndarray]], float
|
||||
] = compute_signal_to_quantization_noice_ratio,
|
||||
) -> dict[str, dict[str, float]]:
|
||||
result: dict[str, dict[str, float]] = {}
|
||||
for name, match in activations_match.items():
|
||||
err_result: dict[str, float] = {}
|
||||
err_result["qdq_err"] = err_func(match["pre_qdq"], match["post_qdq"])
|
||||
float_activation = match.get("float")  # 'float' is only present when float activations were collected
|
||||
if float_activation:
|
||||
err_result["xmodel_err"] = err_func(float_activation, match["post_qdq"])
|
||||
result[name] = err_result
|
||||
return result
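# Illustrative note, not part of the original source: for each activation name the result holds
# 'qdq_err' (SQNR between the pre-QDQ and post-QDQ tensors) and, when float-model activations
# were collected, 'xmodel_err' (SQNR between the float activation and the post-QDQ tensor).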
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,953 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import tempfile
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import onnx
|
||||
|
||||
from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
|
||||
from .onnx_quantizer import ONNXQuantizer
|
||||
from .qdq_quantizer import QDQQuantizer
|
||||
from .quant_utils import (
|
||||
MODEL_SIZE_THRESHOLD,
|
||||
QuantFormat,
|
||||
QuantizationMode,
|
||||
QuantType,
|
||||
load_model_with_shape_infer,
|
||||
model_has_pre_process_metadata,
|
||||
save_and_reload_model_with_shape_infer,
|
||||
update_opset_version,
|
||||
)
|
||||
from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry
|
||||
from .tensor_quant_overrides import TensorQuantOverridesHelper
|
||||
|
||||
|
||||
class QuantConfig:
|
||||
def __init__(
|
||||
self,
|
||||
activation_type=QuantType.QUInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
):
|
||||
"""
|
||||
This is the base class for both static and dynamic quantization configuration.
|
||||
Args:
|
||||
activation_type:
|
||||
quantization data type of activation. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
weight_type:
|
||||
quantization data type of weight. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
op_types_to_quantize:
|
||||
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
|
||||
It quantizes all supported operators by default.
|
||||
nodes_to_quantize:
|
||||
List of node names to quantize. When this list is not None, only the nodes in this list
|
||||
are quantized.
|
||||
example:
|
||||
[
|
||||
'Conv__224',
|
||||
'Conv__252'
|
||||
]
|
||||
nodes_to_exclude:
|
||||
List of node names to exclude. The nodes in this list will be excluded from quantization
|
||||
when it is not None.
|
||||
per_channel: quantize weights per channel
|
||||
reduce_range:
|
||||
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
|
||||
especially for per-channel mode
|
||||
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
"""
|
||||
|
||||
nodes_to_exclude = nodes_to_exclude or []
|
||||
nodes_to_quantize = nodes_to_quantize or []
|
||||
op_types_to_quantize = op_types_to_quantize or []
|
||||
self.op_types_to_quantize = op_types_to_quantize
|
||||
self.per_channel = per_channel
|
||||
self.reduce_range = reduce_range
|
||||
self.weight_type = weight_type
|
||||
self.activation_type = activation_type
|
||||
self.nodes_to_quantize = nodes_to_quantize
|
||||
self.nodes_to_exclude = nodes_to_exclude
|
||||
self.use_external_data_format = use_external_data_format
|
||||
|
||||
|
||||
class StaticQuantConfig(QuantConfig):
|
||||
def __init__(
|
||||
self,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
activation_type=QuantType.QInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
calibration_providers=None,
|
||||
extra_options=None,
|
||||
):
|
||||
"""
|
||||
This is the derived class for static quantization configuration.
|
||||
|
||||
Args:
|
||||
calibration_data_reader:
|
||||
a calibration data reader. It enumerates calibration data and generates inputs for the original model.
|
||||
calibrate_method:
|
||||
Current calibration methods supported are MinMax, Entropy and Percentile.
|
||||
quant_format: QuantFormat{QOperator, QDQ}.
|
||||
QOperator format quantizes the model with quantized operators directly.
|
||||
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ].
|
||||
extra_options:
|
||||
key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
|
||||
Dynamic mode is currently supported. Will support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
|
||||
AddQDQPairToWeight = True/False :
|
||||
Default is False which quantizes floating-point weight and feeds it to solely inserted
|
||||
DeQuantizeLinear node. If True, it remains floating-point weight and inserts both
|
||||
QuantizeLinear/DeQuantizeLinear nodes to weight.
|
||||
OpTypesToExcludeOutputQuantization = list of op type :
|
||||
Default is []. If any op type is specified, it won't quantize the output of ops with this
|
||||
specific op types.
|
||||
DedicatedQDQPair = True/False :
|
||||
Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
|
||||
inputs. If True, it will create identical and dedicated QDQ pair for each node.
|
||||
QDQOpTypePerChannelSupportToAxis = dictionary :
|
||||
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
|
||||
effective only when per channel quantization is supported and per_channel is True. If specific
|
||||
op type supports per channel quantization but not explicitly specified with channel axis,
|
||||
default channel axis will be used.
|
||||
CalibTensorRangeSymmetric = True/False :
|
||||
Default is False. If enabled, the final range of tensor during calibration will be explicitly
|
||||
set to symmetric to central point "0".
|
||||
CalibMovingAverage = True/False :
|
||||
Default is False. If enabled, the moving average of the minimum and maximum values will be
|
||||
computed when the calibration method selected is MinMax.
|
||||
CalibMovingAverageConstant = float :
|
||||
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
|
||||
minimum and maximum values. Effective only when the calibration method selected is MinMax and
|
||||
when CalibMovingAverage is set to True.
|
||||
QuantizeBias = True/False :
|
||||
Default is True which quantizes floating-point biases and it solely inserts
|
||||
a DeQuantizeLinear node. If False, it remains floating-point bias and does not insert
|
||||
any quantization nodes associated with biases.
|
||||
This extra option is only effective when quant_format is QuantFormat.QDQ.
|
||||
SmoothQuant = True/False :
|
||||
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
|
||||
fake input channel quantization.
|
||||
SmoothQuantAlpha = float :
|
||||
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
|
||||
and activation quantization. A larger alpha value could be used on models with more significant
|
||||
activation outliers to migrate more quantization difficulty to weights.
|
||||
SmoothQuantFolding = True/False :
|
||||
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
|
||||
SmoothQuant will be folded into the previous op if the previous op is foldable.
|
||||
UseQDQContribOps = True/False :
|
||||
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
|
||||
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
|
||||
contrib op implementations. The contrib op implementations may support features not standardized
|
||||
into the ONNX specification (e.g., 16-bit quantization types).
|
||||
MinimumRealRange = float|None :
|
||||
Default is None. If set to a floating-point value, the calculation of the quantization parameters
|
||||
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)
|
||||
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
|
||||
necessary for EPs like QNN that require a minimum floating-point range when determining
|
||||
quantization parameters.
|
||||
TensorQuantOverrides = dictionary :
|
||||
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
|
||||
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
|
||||
per-channel quantization, the list contains a dictionary for each channel in the tensor.
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'rmax' = Float : Override the maximum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'rmin' = Float : Override the minimum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
QDQKeepRemovableActivations = True/False:
|
||||
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
|
||||
will be explicitly represented in the QDQ model. If false, these activations are automatically
|
||||
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
|
||||
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
|
||||
operators from the model.
|
||||
QDQDisableWeightAdjustForInt32Bias = True/False:
|
||||
Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
|
||||
has a scale (input_scale * weight_scale) that is too small.
|
||||
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
|
||||
Raises:
|
||||
ValueError: Raise ValueError if execution provider is unknown
|
||||
"""
|
||||
|
||||
super().__init__(
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=op_types_to_quantize,
|
||||
nodes_to_quantize=nodes_to_quantize,
|
||||
nodes_to_exclude=nodes_to_exclude,
|
||||
per_channel=per_channel,
|
||||
reduce_range=reduce_range,
|
||||
use_external_data_format=use_external_data_format,
|
||||
)
|
||||
self.calibration_data_reader = calibration_data_reader
|
||||
self.calibrate_method = calibrate_method
|
||||
self.quant_format = quant_format
|
||||
self.calibration_providers = calibration_providers
|
||||
self.extra_options = extra_options or {}
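# Illustrative usage sketch, not part of the original source (MyDataReader is a hypothetical
# CalibrationDataReader implementation):
#
#   config = StaticQuantConfig(
#       calibration_data_reader=MyDataReader(),
#       quant_format=QuantFormat.QDQ,
#       activation_type=QuantType.QUInt8,
#       weight_type=QuantType.QInt8,
#       per_channel=True,
#       extra_options={"ActivationSymmetric": False, "WeightSymmetric": True},
#   )
#   quantize("model_fp32.onnx", "model_int8.onnx", config)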
|
||||
|
||||
|
||||
def get_qdq_config(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
calibrate_args: dict[str, Any] | None = None,
|
||||
activation_type=QuantType.QUInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
activation_symmetric: bool = False,
|
||||
weight_symmetric: bool | None = None,
|
||||
per_channel: bool = False,
|
||||
reduce_range: bool = False,
|
||||
keep_removable_activations: bool = False,
|
||||
min_real_range: float | None = None,
|
||||
tensor_quant_overrides: dict[str, list[dict[str, Any]]] | None = None,
|
||||
calibration_providers: list[str] | None = None,
|
||||
op_types_to_quantize: list[str] | None = None,
|
||||
nodes_to_exclude: list[str] | Callable[[onnx.ModelProto, onnx.NodeProto], bool] | None = None,
|
||||
extra_options: dict | None = None,
|
||||
) -> StaticQuantConfig:
|
||||
"""
|
||||
Returns a configuration suitable for quantizing the entire model to integer precision.
|
||||
|
||||
Params:
|
||||
model_input: Path to the input model file or ModelProto.
|
||||
calibration_data_reader: Calibration data reader.
|
||||
calibrate_method: The calibration method. Defaults to MinMax.
|
||||
activation_type: The default activation quantization type. Defaults to QUInt8.
|
||||
weight_type: The default weight quantization type. Defaults to QInt8.
|
||||
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
|
||||
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
|
||||
the zero-point values are 127 and 32,767, respectively.
|
||||
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
|
||||
Defaults to None. If set to None, weight_symmetric is assumed true if a weight's quant type is a signed int.
|
||||
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
|
||||
Defaults to false. Alternatively, use the tensor-level `tensor_quant_overrides` to select individual operators
|
||||
and their quantization axes.
|
||||
reduce_range: quantize weights with 1 less bit of precision (e.g., 7 bits for QInt8). Defaults to false.
|
||||
May improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode.
|
||||
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
|
||||
be removed, and will be explicitly represented in the QDQ model. If false, these activations
|
||||
are automatically removed if activations are asymmetrically quantized. Keeping these activations
|
||||
is necessary if optimizations or EP transformations will later remove
|
||||
QuantizeLinear/DequantizeLinear operators from the model.
|
||||
min_real_range: Default is None. If set to a floating-point value, the calculation of the quantization parameters
|
||||
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
|
||||
is less than the specified minimum range, rmax will be set to rmin + min_real_range.
|
||||
tensor_quant_overrides: tensor-level quantization overrides. Defaults to None.
|
||||
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
|
||||
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
|
||||
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
|
||||
key must be present in the first dictionary for per-channel quantization.
|
||||
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
|
||||
set `scale` or `zero_point`. Only valid for initializers.
|
||||
'rmax' = Float : Override the maximum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'rmin' = Float : Override the minimum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'convert' = Dict : A nested dictionary with the same keys for an activation
|
||||
tensor that should be converted to another quantization type.
|
||||
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
|
||||
other nodes get the original type. If not specified,
|
||||
assume all consumer nodes get the converted type.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ].
|
||||
op_types_to_quantize: List of operator types to quantize. If None, all operators other than Cast, DequantizeLinear,
|
||||
and QuantizeLinear are quantized.
|
||||
nodes_to_exclude: List of node names to exclude from quantization. Alternatively, can provide a function that
|
||||
accepts an onnx.ModelProto and onnx.NodeProto as arguments and returns true if the given onnx.NodeProto
|
||||
should be excluded from quantization.
|
||||
extra_options: Additional options specified as string key/value pairs. Refer to the documentation for
|
||||
`quantize_static` for valid keys and values.
|
||||
|
||||
Returns:
|
||||
A StaticQuantConfig object
|
||||
"""
|
||||
q16_types = {QuantType.QInt16, QuantType.QUInt16}
|
||||
q4_types = {QuantType.QInt4, QuantType.QUInt4}
|
||||
op_types_to_exclude = {"Cast", "DequantizeLinear", "QuantizeLinear"}
|
||||
|
||||
model = (
|
||||
model_input
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else onnx.load_model(model_input, load_external_data=False)
|
||||
)
|
||||
|
||||
op_types = set()
|
||||
model_has_external_data = False
|
||||
overrides_helper = TensorQuantOverridesHelper(
|
||||
copy.deepcopy(tensor_quant_overrides) if tensor_quant_overrides else {}
|
||||
)
|
||||
|
||||
# check if the model has external data.
|
||||
for initializer in model.graph.initializer:
|
||||
if onnx.external_data_helper.uses_external_data(initializer):
|
||||
model_has_external_data = True
|
||||
|
||||
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
|
||||
nodes_to_exclude_set = set(nodes_to_exclude) if isinstance(nodes_to_exclude, list) else set()
|
||||
|
||||
# Iterate through nodes to get all operator types in the model and
|
||||
# call user's function to filter out nodes from quantization.
|
||||
for node in model.graph.node:
|
||||
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
|
||||
continue
|
||||
if node.name in nodes_to_exclude_set:
|
||||
continue
|
||||
if callable(nodes_to_exclude) and nodes_to_exclude(model, node):
|
||||
nodes_to_exclude_set.add(node.name)
|
||||
else:
|
||||
op_types.add(node.op_type)
|
||||
|
||||
final_extra_options = {
|
||||
"MinimumRealRange": min_real_range,
|
||||
"QDQKeepRemovableActivations": keep_removable_activations,
|
||||
"ActivationSymmetric": activation_symmetric,
|
||||
"WeightSymmetric": weight_symmetric,
|
||||
"ForceQuantizeNoInputCheck": True,
|
||||
"TensorQuantOverrides": overrides_helper.get_dict(),
|
||||
}
|
||||
|
||||
# Pass along known calibration options
|
||||
if calibrate_args:
|
||||
calib_extra_options_keys = [
|
||||
("symmetric", "CalibTensorRangeSymmetric"),
|
||||
("moving_average", "CalibMovingAverage"),
|
||||
("averaging_constant", "CalibMovingAverageConstant"),
|
||||
("max_intermediate_outputs", "CalibMaxIntermediateOutputs"),
|
||||
("percentile", "CalibPercentile"),
|
||||
]
|
||||
calib_extra_options = {
|
||||
key: calibrate_args.get(name) for (name, key) in calib_extra_options_keys if name in calibrate_args
|
||||
}
|
||||
final_extra_options.update(calib_extra_options)
|
||||
|
||||
# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
|
||||
# on Q/DQ operators if using 16-bit or 4-bit quantization.
|
||||
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
|
||||
if onnx_opset.version < 21:
|
||||
opset21_types = q16_types.union(q4_types)
|
||||
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
|
||||
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
|
||||
final_extra_options["UseQDQContribOps"] = True
|
||||
|
||||
# Allow user's extra_options to override our final_extra_options.
|
||||
if extra_options:
|
||||
final_extra_options.update(extra_options)
|
||||
|
||||
return StaticQuantConfig(
|
||||
calibration_data_reader,
|
||||
calibrate_method=calibrate_method,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=(
|
||||
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(op_types_to_exclude))
|
||||
),
|
||||
nodes_to_exclude=list(nodes_to_exclude_set),
|
||||
per_channel=per_channel,
|
||||
reduce_range=reduce_range,
|
||||
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
|
||||
calibration_providers=calibration_providers,
|
||||
extra_options=final_extra_options,
|
||||
)
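# Illustrative usage sketch, not part of the original source (MyDataReader is a hypothetical
# CalibrationDataReader implementation):
#
#   qdq_config = get_qdq_config(
#       "model_fp32.onnx",
#       MyDataReader(),
#       activation_type=QuantType.QUInt16,
#       weight_type=QuantType.QInt8,
#       per_channel=True,
#   )
#   quantize("model_fp32.onnx", "model_qdq.onnx", qdq_config)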
|
||||
|
||||
|
||||
class DynamicQuantConfig(QuantConfig):
|
||||
def __init__(
|
||||
self,
|
||||
weight_type=QuantType.QInt8,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
extra_options=None,
|
||||
):
|
||||
"""
|
||||
This is the class for dynamic quantization configuration.
|
||||
|
||||
Args:
|
||||
extra_options: key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False :
|
||||
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
|
||||
support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
|
||||
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
|
||||
|
||||
Raises:
|
||||
ValueError: Raise ValueError if execution provider is unknown
|
||||
"""
|
||||
super().__init__(
|
||||
op_types_to_quantize=op_types_to_quantize,
|
||||
per_channel=per_channel,
|
||||
reduce_range=reduce_range,
|
||||
weight_type=weight_type,
|
||||
nodes_to_quantize=nodes_to_quantize,
|
||||
nodes_to_exclude=nodes_to_exclude,
|
||||
use_external_data_format=use_external_data_format,
|
||||
)
|
||||
self.extra_options = extra_options or {}
|
||||
|
||||
|
||||
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
|
||||
if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
|
||||
raise ValueError(
|
||||
"ONNXRuntime quantization doesn't support data format:"
|
||||
"activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8"
|
||||
)
|
||||
if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
|
||||
raise ValueError(
|
||||
f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
|
||||
"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
|
||||
)
|
||||
|
||||
if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:
|
||||
raise ValueError(
|
||||
"ONNXRuntime quantization doesn't support data format: activation_type=QuantType.QFLOAT8E4M3FN, "
|
||||
f"weight_type={weight_type}!=QuantType.QFLOAT8E4M3FN"
|
||||
)
|
||||
|
||||
q16_types = [QuantType.QInt16, QuantType.QUInt16]
|
||||
|
||||
if (activation_type in q16_types or weight_type in q16_types) and quant_format != QuantFormat.QDQ:
|
||||
raise ValueError("Only QuantFormat.QDQ supports 16-bit quantization types.")
|
||||
|
||||
if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
|
||||
logging.warning(
|
||||
"Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
|
||||
"Or it will lead to bad performance on x64."
|
||||
)
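# Illustrative note, not part of the original source: for example,
# check_static_quant_arguments(QuantFormat.QDQ, QuantType.QInt16, QuantType.QInt8) is accepted,
# while the same types with QuantFormat.QOperator raise a ValueError, because 16-bit
# quantization types are only supported in the QDQ format.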
|
||||
|
||||
|
||||
def quantize_static(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
op_types_to_quantize=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
activation_type=QuantType.QInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
use_external_data_format=False,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
calibration_providers=None,
|
||||
extra_options=None,
|
||||
):
|
||||
"""
|
||||
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
|
||||
It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and weight_type
|
||||
= QuantType.QInt8. If model is targeted to GPU/TRT, symmetric activation and weight are required. If model is
|
||||
targeted to CPU, asymmetric activation and symmetric weight are recommended for balance of performance and
|
||||
accuracy.
|
||||
|
||||
Args:
|
||||
|
||||
model_input: file path of model or ModelProto to quantize
|
||||
model_output: file path of quantized model
|
||||
calibration_data_reader: a calibration data reader. It
|
||||
enumerates calibration data and generates inputs for the
|
||||
original model.
|
||||
quant_format: QuantFormat{QOperator, QDQ}.
|
||||
QOperator format quantizes the model with quantized operators directly.
|
||||
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
|
||||
activation_type:
|
||||
quantization data type of activation. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
calibrate_method:
|
||||
Current calibration methods supported are MinMax and Entropy.
|
||||
Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
|
||||
op_types_to_quantize:
|
||||
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
|
||||
It quantizes all supported operators by default.
|
||||
per_channel: quantize weights per channel
|
||||
reduce_range:
|
||||
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
|
||||
especially for per-channel mode
|
||||
weight_type:
|
||||
quantization data type of weight. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
nodes_to_quantize:
|
||||
List of node names to quantize. When this list is not None, only the nodes in this list
|
||||
are quantized.
|
||||
example:
|
||||
[
|
||||
'Conv__224',
|
||||
'Conv__252'
|
||||
]
|
||||
nodes_to_exclude:
|
||||
List of node names to exclude. The nodes in this list will be excluded from quantization
|
||||
when it is not None.
|
||||
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ]
|
||||
extra_options:
|
||||
key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
|
||||
Dynamic mode is currently supported. Will support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also, the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
|
||||
AddQDQPairToWeight = True/False :
|
||||
Default is False which quantizes floating-point weight and feeds it to solely inserted
|
||||
DeQuantizeLinear node. If True, it remains floating-point weight and inserts both
|
||||
QuantizeLinear/DeQuantizeLinear nodes to weight.
|
||||
OpTypesToExcludeOutputQuantization = list of op type :
|
||||
Default is []. If any op type is specified, it won't quantize the output of ops with this
|
||||
specific op types.
|
||||
DedicatedQDQPair = True/False :
|
||||
Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
|
||||
inputs. If True, it will create identical and dedicated QDQ pair for each node.
|
||||
QDQOpTypePerChannelSupportToAxis = dictionary :
|
||||
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
|
||||
effective only when per channel quantization is supported and per_channel is True. If specific
|
||||
op type supports per channel quantization but not explicitly specified with channel axis,
|
||||
default channel axis will be used.
|
||||
CalibTensorRangeSymmetric = True/False :
|
||||
Default is False. If enabled, the final range of tensor during calibration will be explicitly
|
||||
set to symmetric to central point "0".
|
||||
CalibStridedMinMax = Optional[int] :
|
||||
Default is None. If set to an integer, during calculation of the min-max, only stride amount of
|
||||
data will be used and then all results will be merged in the end.
|
||||
CalibMovingAverage = True/False :
|
||||
Default is False. If enabled, the moving average of the minimum and maximum values will be
|
||||
computed when the calibration method selected is MinMax.
|
||||
CalibMovingAverageConstant = float :
|
||||
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
|
||||
minimum and maximum values. Effective only when the calibration method selected is MinMax and
|
||||
when CalibMovingAverage is set to True.
|
||||
CalibMaxIntermediateOutputs = Optional[int] :
|
||||
Default is None. If set to an integer, during calculation of the min-max range of the tensors
|
||||
it will load at most that number of outputs before computing and merging the range. This will
|
||||
produce the same result as computing with None, but is more memory efficient.
|
||||
SmoothQuant = True/False :
|
||||
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
|
||||
fake input channel quantization.
|
||||
SmoothQuantAlpha = float :
|
||||
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
|
||||
and activation quantization. A larger alpha value could be used on models with more significant
|
||||
activation outliers to migrate more quantization difficulty to weights.
|
||||
SmoothQuantFolding = True/False :
|
||||
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
|
||||
SmoothQuant will be folded into the previous op if the previous op is foldable.
|
||||
UseQDQContribOps = True/False :
|
||||
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
|
||||
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
|
||||
contrib op implementations. The contrib op implementations may support features not standardized
|
||||
into the ONNX specification (e.g., 16-bit quantization types).
|
||||
MinimumRealRange = float|None :
|
||||
Default is None. If set to a floating-point value, the calculation of the quantization parameters
|
||||
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
|
||||
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
|
||||
necessary for EPs like QNN that require a minimum floating-point range when determining
|
||||
quantization parameters.
|
||||
TensorQuantOverrides = dictionary :
|
||||
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
|
||||
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
|
||||
per-channel quantization, the list contains a dictionary for each channel in the tensor.
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'rmax' = Float : Override the maximum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'rmin' = Float : Override the minimum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
QDQKeepRemovableActivations = True/False:
|
||||
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
|
||||
will be explicitly represented in the QDQ model. If false, these activations are automatically
|
||||
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
|
||||
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
|
||||
operators from the model.
|
||||
QDQDisableWeightAdjustForInt32Bias = True/False:
|
||||
Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
|
||||
has a scale (input_scale * weight_scale) that is too small.
|
||||
"""
|
||||
if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
|
||||
if calibrate_method != CalibrationMethod.Distribution:
|
||||
raise ValueError("Only Distribution calibration method is supported for float quantization.")
|
||||
|
||||
extra_options = extra_options or {}
|
||||
nodes_to_exclude = nodes_to_exclude or []
|
||||
nodes_to_quantize = nodes_to_quantize or []
|
||||
op_types_to_quantize = op_types_to_quantize or []
|
||||
mode = QuantizationMode.QLinearOps
|
||||
|
||||
if not op_types_to_quantize:
|
||||
q_linear_ops = list(QLinearOpsRegistry.keys())
|
||||
qdq_ops = list(QDQRegistry.keys())
|
||||
op_types_to_quantize = list(set(q_linear_ops + qdq_ops))
|
||||
|
||||
model = (
|
||||
save_and_reload_model_with_shape_infer(model_input)
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else load_model_with_shape_infer(Path(model_input))
|
||||
)
|
||||
|
||||
pre_processed: bool = model_has_pre_process_metadata(model)
|
||||
if not pre_processed:
|
||||
logging.warning(
|
||||
"Please consider to run pre-processing before quantization. Refer to example: "
|
||||
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
|
||||
"/cpu/ReadMe.md "
|
||||
)
|
||||
|
||||
calib_extra_options_keys = [
|
||||
("CalibTensorRangeSymmetric", "symmetric"),
|
||||
("CalibMovingAverage", "moving_average"),
|
||||
("CalibMovingAverageConstant", "averaging_constant"),
|
||||
("CalibMaxIntermediateOutputs", "max_intermediate_outputs"),
|
||||
("CalibPercentile", "percentile"),
|
||||
]
|
||||
calib_extra_options = {
|
||||
key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
|
||||
}
|
||||
|
||||
if extra_options.get("SmoothQuant", False):
|
||||
import importlib # noqa: PLC0415
|
||||
|
||||
try:
|
||||
importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
|
||||
except Exception as e:
|
||||
logging.error(f"{e}.")
|
||||
raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e
|
||||
|
||||
from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant # noqa: PLC0415
|
||||
|
||||
def inc_dataloader():
|
||||
data_reader = copy.deepcopy(calibration_data_reader)
|
||||
for data in data_reader:
|
||||
yield data, None
|
||||
|
||||
orig_nodes = [i.name for i in model.graph.node]
|
||||
dataloader = inc_dataloader()
|
||||
sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
|
||||
del dataloader
|
||||
model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
|
||||
sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
|
||||
model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
|
||||
model.save(model_input)
|
||||
nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
|
||||
model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration
|
||||
|
||||
updated_model = update_opset_version(model, weight_type)
|
||||
is_model_updated = updated_model is not model
|
||||
if is_model_updated:
|
||||
model = updated_model
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
|
||||
if is_model_updated:
|
||||
# Update model_input and avoid using the original one
|
||||
model_input = copy.deepcopy(model)
|
||||
|
||||
if isinstance(model_input, onnx.ModelProto):
|
||||
output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
|
||||
onnx.save_model(
|
||||
model_input,
|
||||
output_path,
|
||||
save_as_external_data=True,
|
||||
)
|
||||
model_input = output_path
|
||||
|
||||
calibrator = create_calibrator(
|
||||
Path(model_input),
|
||||
op_types_to_quantize,
|
||||
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
|
||||
calibrate_method=calibrate_method,
|
||||
use_external_data_format=use_external_data_format,
|
||||
providers=calibration_providers,
|
||||
extra_options=calib_extra_options,
|
||||
)
|
||||
|
||||
stride = extra_options.get("CalibStridedMinMax", None)
|
||||
if stride:
|
||||
total_data_size = len(calibration_data_reader)
|
||||
if total_data_size % stride != 0:
|
||||
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")
|
||||
|
||||
for start in range(0, total_data_size, stride):
|
||||
end_index = start + stride
|
||||
calibration_data_reader.set_range(start_index=start, end_index=end_index)
|
||||
calibrator.collect_data(calibration_data_reader)
|
||||
else:
|
||||
calibrator.collect_data(calibration_data_reader)
|
||||
tensors_range = calibrator.compute_data()
|
||||
if not isinstance(tensors_range, TensorsData):
|
||||
raise TypeError(
|
||||
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
|
||||
)
|
||||
del calibrator
|
||||
|
||||
check_static_quant_arguments(quant_format, activation_type, weight_type)
|
||||
|
||||
if quant_format is QuantFormat.QOperator:
|
||||
quantizer = ONNXQuantizer(
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
mode,
|
||||
True, # static
|
||||
weight_type,
|
||||
activation_type,
|
||||
tensors_range,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options,
|
||||
)
|
||||
else:
|
||||
quantizer = QDQQuantizer(
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
weight_type,
|
||||
activation_type,
|
||||
tensors_range,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options,
|
||||
)
|
||||
|
||||
quantizer.quantize_model()
|
||||
quantizer.model.save_model_to_file(model_output, use_external_data_format)
|
||||
if not pre_processed:
|
||||
logging.warning(
|
||||
"Please consider pre-processing before quantization. See "
|
||||
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
|
||||
"/cpu/ReadMe.md "
|
||||
)
|
||||
|
||||
if extra_options.get("SmoothQuant", False):
|
||||
sq_path.cleanup()
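# Illustrative usage sketch, not part of the original source (MyDataReader is a hypothetical
# CalibrationDataReader implementation):
#
#   quantize_static(
#       "model_fp32.onnx",
#       "model_qdq.onnx",
#       MyDataReader(),
#       quant_format=QuantFormat.QDQ,
#       activation_type=QuantType.QUInt8,
#       weight_type=QuantType.QInt8,
#   )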
|
||||
|
||||
|
||||
def quantize_dynamic(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
op_types_to_quantize=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
use_external_data_format=False,
|
||||
extra_options=None,
|
||||
):
|
||||
"""Given an onnx model, create a quantized onnx model and save it into a file
|
||||
|
||||
Args:
|
||||
model_input: file path of model or ModelProto to quantize
|
||||
model_output: file path of quantized model
|
||||
op_types_to_quantize:
|
||||
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
|
||||
It quantizes all supported operators by default.
|
||||
per_channel: quantize weights per channel
|
||||
reduce_range:
|
||||
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
|
||||
especially for per-channel mode
|
||||
weight_type:
|
||||
quantization data type of weight. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
nodes_to_quantize:
|
||||
List of node names to quantize. When this list is not None, only the nodes in this list
|
||||
are quantized.
|
||||
example:
|
||||
[
|
||||
'Conv__224',
|
||||
'Conv__252'
|
||||
]
|
||||
nodes_to_exclude:
|
||||
List of node names to exclude. The nodes in this list will be excluded from quantization
|
||||
when it is not None.
|
||||
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
extra_options:
|
||||
key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False :
|
||||
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
|
||||
support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
|
||||
"""
|
||||
extra_options = extra_options or {}
|
||||
nodes_to_exclude = nodes_to_exclude or []
|
||||
nodes_to_quantize = nodes_to_quantize or []
|
||||
op_types_to_quantize = op_types_to_quantize or []
|
||||
|
||||
mode = QuantizationMode.IntegerOps
|
||||
|
||||
if not op_types_to_quantize:
|
||||
op_types_to_quantize = list(IntegerOpsRegistry.keys())
|
||||
|
||||
model = (
|
||||
save_and_reload_model_with_shape_infer(model_input)
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else load_model_with_shape_infer(Path(model_input))
|
||||
)
|
||||
|
||||
pre_processed: bool = model_has_pre_process_metadata(model)
|
||||
if not pre_processed:
|
||||
logging.warning(
|
||||
"Please consider to run pre-processing before quantization. Refer to example: "
|
||||
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
|
||||
"/cpu/ReadMe.md "
|
||||
)
|
||||
|
||||
if "MatMulConstBOnly" not in extra_options:
|
||||
extra_options["MatMulConstBOnly"] = True
|
||||
|
||||
model = update_opset_version(model, weight_type)
|
||||
|
||||
quantizer = ONNXQuantizer(
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
mode,
|
||||
False, # static
|
||||
weight_type,
|
||||
QuantType.QUInt8, # dynamic activation only supports uint8
|
||||
None,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options,
|
||||
)
|
||||
|
||||
quantizer.quantize_model()
|
||||
quantizer.model.save_model_to_file(model_output, use_external_data_format)
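# Illustrative usage sketch, not part of the original source:
#
#   quantize_dynamic("model_fp32.onnx", "model_int8.onnx", weight_type=QuantType.QInt8,
#                    per_channel=False, extra_options={"MatMulConstBOnly": True})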
|
||||
|
||||
|
||||
def quantize(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
quant_config: QuantConfig,
|
||||
):
|
||||
"""Quantize a model with QuantConfig.
|
||||
|
||||
Args:
|
||||
model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize.
|
||||
model_output (str | Path): Path to save the quantized model.
|
||||
quant_config (QuantConfig | WeightOnlyQuantConfig): Quantization Configuration.
|
||||
"""
|
||||
if isinstance(quant_config, StaticQuantConfig):
|
||||
quantize_static(
|
||||
model_input,
|
||||
model_output,
|
||||
quant_config.calibration_data_reader,
|
||||
calibrate_method=quant_config.calibrate_method,
|
||||
quant_format=quant_config.quant_format,
|
||||
activation_type=quant_config.activation_type,
|
||||
weight_type=quant_config.weight_type,
|
||||
op_types_to_quantize=quant_config.op_types_to_quantize,
|
||||
nodes_to_quantize=quant_config.nodes_to_quantize,
|
||||
nodes_to_exclude=quant_config.nodes_to_exclude,
|
||||
per_channel=quant_config.per_channel,
|
||||
reduce_range=quant_config.reduce_range,
|
||||
use_external_data_format=quant_config.use_external_data_format,
|
||||
calibration_providers=quant_config.calibration_providers,
|
||||
extra_options=quant_config.extra_options,
|
||||
)
|
||||
|
||||
elif isinstance(quant_config, DynamicQuantConfig):
|
||||
quantize_dynamic(
|
||||
model_input,
|
||||
model_output,
|
||||
weight_type=quant_config.weight_type,
|
||||
op_types_to_quantize=quant_config.op_types_to_quantize,
|
||||
nodes_to_quantize=quant_config.nodes_to_quantize,
|
||||
nodes_to_exclude=quant_config.nodes_to_exclude,
|
||||
per_channel=quant_config.per_channel,
|
||||
reduce_range=quant_config.reduce_range,
|
||||
use_external_data_format=quant_config.use_external_data_format,
|
||||
extra_options=quant_config.extra_options,
|
||||
)
|
||||
else:
|
||||
# training package doesn't have quantize_matmul_4bits, avoid global import
|
||||
from .matmul_nbits_quantizer import MatMulNBitsQuantizer, WeightOnlyQuantConfig # noqa: PLC0415
|
||||
|
||||
if isinstance(quant_config, WeightOnlyQuantConfig):
|
||||
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load(model_input)
|
||||
quant = MatMulNBitsQuantizer(model, algo_config=quant_config)
|
||||
quant.process()
|
||||
quant.model.save_model_to_file(model_output, True)
|
||||
else:
|
||||
raise TypeError(
|
||||
"Invalid quantization config type, it must be either StaticQuantConfig, "
|
||||
"DynamicQuantConfig, or WeightOnlyQuantConfig."
|
||||
)
|
||||
@@ -0,0 +1,109 @@
|
||||
from .operators.activation import QDQRemovableActivation, QLinearActivation
|
||||
from .operators.argmax import QArgMax
|
||||
from .operators.attention import AttentionQuant
|
||||
from .operators.base_operator import QuantOperatorBase
|
||||
from .operators.binary_op import QLinearBinaryOp
|
||||
from .operators.concat import QLinearConcat
|
||||
from .operators.conv import ConvInteger, QDQConv, QLinearConv
|
||||
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
from .operators.embed_layernorm import EmbedLayerNormalizationQuant
|
||||
from .operators.gather import GatherQuant, QDQGather
|
||||
from .operators.gavgpool import QGlobalAveragePool
|
||||
from .operators.gemm import QDQGemm, QLinearGemm
|
||||
from .operators.lstm import LSTMQuant
|
||||
from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul
|
||||
from .operators.maxpool import QDQMaxPool, QMaxPool
|
||||
from .operators.norm import QDQNormalization
|
||||
from .operators.pad import QDQPad, QPad
|
||||
from .operators.pooling import QLinearPool
|
||||
from .operators.qdq_base_operator import QDQOperatorBase
|
||||
from .operators.resize import QDQResize, QResize
|
||||
from .operators.softmax import QLinearSoftmax
|
||||
from .operators.split import QDQSplit, QSplit
|
||||
from .operators.where import QDQWhere, QLinearWhere
|
||||
from .quant_utils import QuantizationMode
|
||||
|
||||
CommonOpsRegistry = {
|
||||
"Gather": GatherQuant,
|
||||
"Transpose": Direct8BitOp,
|
||||
"EmbedLayerNormalization": EmbedLayerNormalizationQuant,
|
||||
}
|
||||
|
||||
IntegerOpsRegistry = {
|
||||
"Conv": ConvInteger,
|
||||
"MatMul": MatMulInteger,
|
||||
"Attention": AttentionQuant,
|
||||
"LSTM": LSTMQuant,
|
||||
}
|
||||
IntegerOpsRegistry.update(CommonOpsRegistry)
|
||||
|
||||
QLinearOpsRegistry = {
|
||||
"ArgMax": QArgMax,
|
||||
"Conv": QLinearConv,
|
||||
"Gemm": QLinearGemm,
|
||||
"MatMul": QLinearMatMul,
|
||||
"Add": QLinearBinaryOp,
|
||||
"Mul": QLinearBinaryOp,
|
||||
"Relu": QLinearActivation,
|
||||
"Clip": QLinearActivation,
|
||||
"LeakyRelu": QLinearActivation,
|
||||
"Sigmoid": QLinearActivation,
|
||||
"MaxPool": QMaxPool,
|
||||
"GlobalAveragePool": QGlobalAveragePool,
|
||||
"Split": QSplit,
|
||||
"Pad": QPad,
|
||||
"Reshape": Direct8BitOp,
|
||||
"Squeeze": Direct8BitOp,
|
||||
"Unsqueeze": Direct8BitOp,
|
||||
"Resize": QResize,
|
||||
"AveragePool": QLinearPool,
|
||||
"Concat": QLinearConcat,
|
||||
"Softmax": QLinearSoftmax,
|
||||
"Where": QLinearWhere,
|
||||
}
|
||||
QLinearOpsRegistry.update(CommonOpsRegistry)
|
||||
|
||||
QDQRegistry = {
|
||||
"Conv": QDQConv,
|
||||
"ConvTranspose": QDQConv,
|
||||
"Gemm": QDQGemm,
|
||||
"Clip": QDQRemovableActivation,
|
||||
"Relu": QDQRemovableActivation,
|
||||
"Reshape": QDQDirect8BitOp,
|
||||
"Transpose": QDQDirect8BitOp,
|
||||
"Squeeze": QDQDirect8BitOp,
|
||||
"Unsqueeze": QDQDirect8BitOp,
|
||||
"Resize": QDQResize,
|
||||
"MaxPool": QDQMaxPool,
|
||||
"AveragePool": QDQDirect8BitOp,
|
||||
"Slice": QDQDirect8BitOp,
|
||||
"Pad": QDQPad,
|
||||
"MatMul": QDQMatMul,
|
||||
"Split": QDQSplit,
|
||||
"Gather": QDQGather,
|
||||
"GatherElements": QDQGather,
|
||||
"Where": QDQWhere,
|
||||
"InstanceNormalization": QDQNormalization,
|
||||
"LayerNormalization": QDQNormalization,
|
||||
"BatchNormalization": QDQNormalization,
|
||||
"TopK": QDQDirect8BitOp,
|
||||
}
|
||||
|
||||
|
||||
def CreateDefaultOpQuantizer(onnx_quantizer, node): # noqa: N802
|
||||
return QuantOperatorBase(onnx_quantizer, node)
|
||||
|
||||
|
||||
def CreateOpQuantizer(onnx_quantizer, node): # noqa: N802
|
||||
registry = IntegerOpsRegistry if onnx_quantizer.mode == QuantizationMode.IntegerOps else QLinearOpsRegistry
|
||||
if node.op_type in registry:
|
||||
op_quantizer = registry[node.op_type](onnx_quantizer, node)
|
||||
if op_quantizer.should_quantize():
|
||||
return op_quantizer
|
||||
return QuantOperatorBase(onnx_quantizer, node)
|
||||
|
||||
|
||||
def CreateQDQQuantizer(onnx_quantizer, node): # noqa: N802
|
||||
if node.op_type in QDQRegistry:
|
||||
return QDQRegistry[node.op_type](onnx_quantizer, node)
|
||||
return QDQOperatorBase(onnx_quantizer, node)
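# Illustrative check (not part of the original module), using the registries defined
# above: they are plain dicts keyed by ONNX op type, so CreateOpQuantizer and
# CreateQDQQuantizer reduce to a dict lookup with a base-class fallback.
print("Conv" in QLinearOpsRegistry)   # True  -> QLinearConv handles QOperator-format Conv
print("Conv" in QDQRegistry)          # True  -> QDQConv handles QDQ-format Conv
print("Conv" in IntegerOpsRegistry)   # True  -> ConvInteger handles dynamic (IntegerOps) Conv
print("Softmax" in QDQRegistry)       # False -> CreateQDQQuantizer falls back to QDQOperatorBase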
|
||||
@@ -0,0 +1,209 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
import onnx
|
||||
|
||||
import onnxruntime
|
||||
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference
|
||||
from onnxruntime.transformers.onnx_utils import extract_raw_data_from_model, has_external_data
|
||||
|
||||
from .fusions import ReplaceUpsampleWithResize
|
||||
from .onnx_model import ONNXModel
|
||||
from .quant_utils import add_pre_process_metadata, save_and_reload_model_with_shape_infer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def quant_pre_process(
|
||||
input_model: str | Path | onnx.ModelProto | None = None,
|
||||
output_model_path: str | Path | None = None,
|
||||
skip_optimization: bool = False,
|
||||
skip_onnx_shape: bool = False,
|
||||
skip_symbolic_shape: bool = False,
|
||||
auto_merge: bool = False,
|
||||
int_max: int = 2**31 - 1,
|
||||
guess_output_rank: bool = False,
|
||||
verbose: int = 0,
|
||||
save_as_external_data: bool = False,
|
||||
all_tensors_to_one_file: bool = False,
|
||||
external_data_location: str | None = None,
|
||||
external_data_size_threshold: int = 1024,
|
||||
**deprecated_kwargs,
|
||||
) -> None:
|
||||
"""Shape inference and model optimization, in preparation for quantization.
|
||||
|
||||
Args:
|
||||
input_model: Path to the input model file or ModelProto
|
||||
output_model_path: Path to the output model file
|
||||
skip_optimization: Skip model optimization step if true. This may result in ONNX shape
|
||||
inference failure for some models.
|
||||
skip_onnx_shape: Skip ONNX shape inference. Symbolic shape inference is most effective
|
||||
with transformer based models. Skipping all shape inferences may
|
||||
reduce the effectiveness of quantization, as a tensor with unknown
|
||||
shape can not be quantized.
|
||||
skip_symbolic_shape: Skip symbolic shape inference. Symbolic shape inference is most
|
||||
effective with transformer based models. Skipping all shape
|
||||
inferences may reduce the effectiveness of quantization, as a tensor
|
||||
with unknown shape can not be quantized.
|
||||
auto_merge: For symbolic shape inference, automatically merge symbolic dims when
|
||||
conflict happens.
|
||||
int_max: For symbolic shape inference, specify the maximum value for integer to be
|
||||
treated as boundless for ops like slice
|
||||
guess_output_rank: Guess output rank to be the same as input 0 for unknown ops
|
||||
verbose: Logs detailed info of inference, 0: turn off, 1: warnings, 3: detailed
|
||||
save_as_external_data: Saving an ONNX model to external data
|
||||
all_tensors_to_one_file: Saving all the external data to one file
|
||||
external_data_location: The file location to save the external file
|
||||
external_data_size_threshold: The size threshold for external data
|
||||
"""
|
||||
|
||||
if input_model is None:
|
||||
input_model = deprecated_kwargs.pop("input_model_path", None)
|
||||
assert input_model is not None
|
||||
|
||||
assert output_model_path is not None, "output_model_path is required."
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="pre.quant.") as quant_tmp_dir:
|
||||
temp_path = Path(quant_tmp_dir)
|
||||
model = None
|
||||
|
||||
if not skip_symbolic_shape:
|
||||
logger.info("Performing symbolic shape inference...")
|
||||
loaded_model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model)
|
||||
model = SymbolicShapeInference.infer_shapes(
|
||||
loaded_model,
|
||||
int_max,
|
||||
auto_merge,
|
||||
guess_output_rank,
|
||||
verbose,
|
||||
)
|
||||
|
||||
# Since Upsample is deprecated after opset v10, and the model's opset will
|
||||
# be upgraded to at least v11 during quantization, we need to replace Upsample
|
||||
# with Resize first to avoid generating an invalid model.
|
||||
if model:
|
||||
ai_onnx_domain = [opset for opset in model.opset_import if not opset.domain or opset.domain == "ai.onnx"]
|
||||
if len(ai_onnx_domain) == 1:
|
||||
opset_version = ai_onnx_domain[0].version
|
||||
if opset_version < 10:
|
||||
ReplaceUpsampleWithResize(ONNXModel(model), opset_version).apply()
|
||||
model.opset_import.remove(ai_onnx_domain[0])
|
||||
opset_version = 11
|
||||
model.opset_import.extend([onnx.helper.make_opsetid("", opset_version)])
|
||||
model = onnx.version_converter.convert_version(model, opset_version)
|
||||
model = save_and_reload_model_with_shape_infer(model)
|
||||
|
||||
if not skip_optimization:
|
||||
# Use ORT optimizers (native code) to optimize model
|
||||
if not skip_symbolic_shape:
|
||||
# Need to save the inferenced model to file so as to run the optimizer
|
||||
input_model = str(temp_path / "symbolic_shape_inferred.onnx")
|
||||
if save_as_external_data:
|
||||
onnx.save_model(
|
||||
model,
|
||||
input_model,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
else:
|
||||
onnx.save(model, input_model)
|
||||
model = None
|
||||
|
||||
opt_model_path = str(temp_path / "optimized.onnx")
|
||||
try:
|
||||
sess_option = onnxruntime.SessionOptions()
|
||||
sess_option.optimized_model_filepath = opt_model_path
|
||||
sess_option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
|
||||
# For large model, extract external data from model and add to session options
|
||||
if isinstance(input_model, onnx.ModelProto):
|
||||
if has_external_data(input_model):
|
||||
raise ValueError(
|
||||
"ModelProto has external data not loaded into memory, ORT cannot create session. "
|
||||
"Please load external data before calling this function. "
|
||||
"See https://onnx.ai/onnx/repo-docs/ExternalData.html for more information."
|
||||
)
|
||||
external_names, external_values = extract_raw_data_from_model(input_model)
|
||||
sess_option.add_external_initializers(list(external_names), list(external_values))
|
||||
input_model = input_model.SerializeToString()
|
||||
# the saved optimized model otherwise points to the original external data file name
|
||||
# which is not available relative to the optimized model file
|
||||
elif skip_symbolic_shape and save_as_external_data:
|
||||
sess_option.add_session_config_entry(
|
||||
"session.optimized_model_external_initializers_file_name", "optimized.onnx.data"
|
||||
)
|
||||
|
||||
sess = onnxruntime.InferenceSession(input_model, sess_option, providers=["CPUExecutionProvider"])
|
||||
# Close the session to avoid the cleanup error on Windows for temp folders
|
||||
# https://github.com/microsoft/onnxruntime/issues/17627
|
||||
del sess
|
||||
except Exception:
|
||||
logger.error(
|
||||
"ONNX Runtime Model Optimization Failed! Consider rerun with option `--skip_optimization'."
|
||||
)
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
input_model = opt_model_path
|
||||
|
||||
if not skip_onnx_shape:
|
||||
# ONNX shape inference.
|
||||
# According to docs, infer_shapes_path should be used for 2G+ models.
|
||||
# If skip_optimization is specified, we could be dealing with a
# large model, so to be on the safe side, save the model to a file first.
|
||||
if model is not None:
|
||||
input_model = str(temp_path / "symbolic_shape_inferred.onnx")
|
||||
if save_as_external_data:
|
||||
onnx.save_model(
|
||||
model,
|
||||
input_model,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
else:
|
||||
onnx.save(model, input_model)
|
||||
model = None
|
||||
|
||||
if isinstance(input_model, onnx.ModelProto):
|
||||
input_model = str(Path(quant_tmp_dir) / "model_input.onnx")
|
||||
onnx.save_model(
|
||||
model,
|
||||
input_model,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
|
||||
inferred_model_path = str(temp_path / "onnx_shape_inferred.onnx")
|
||||
onnx.shape_inference.infer_shapes_path(input_model, inferred_model_path)
|
||||
model = onnx.load(inferred_model_path)
|
||||
|
||||
if model is None:
|
||||
model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model)
|
||||
|
||||
add_pre_process_metadata(model)
|
||||
|
||||
if save_as_external_data:
|
||||
onnx.save_model(
|
||||
model,
|
||||
output_model_path,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
location=external_data_location,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
else:
|
||||
onnx.save(model, output_model_path)
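# Illustrative usage sketch (not part of the original module; file names are
# placeholders): the typical pre-processing call made before static quantization,
# running symbolic shape inference, ORT graph optimization, and ONNX shape inference
# with the defaults documented above.
if __name__ == "__main__":
    quant_pre_process(
        input_model="model.onnx",
        output_model_path="model.preprocessed.onnx",
    )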
|
||||
@@ -0,0 +1,256 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
|
||||
import onnxruntime
|
||||
from onnxruntime.quantization import QuantFormat, QuantType, StaticQuantConfig, quantize
|
||||
from onnxruntime.quantization.calibrate import CalibrationDataReader, CalibrationMethod
|
||||
|
||||
|
||||
class OnnxModelCalibrationDataReader(CalibrationDataReader):
|
||||
def __init__(self, model_path):
|
||||
self.model_dir = os.path.dirname(model_path)
|
||||
data_dirs = [
|
||||
os.path.join(self.model_dir, a) for a in os.listdir(self.model_dir) if a.startswith("test_data_set_")
|
||||
]
|
||||
model_inputs = onnxruntime.InferenceSession(model_path).get_inputs()
|
||||
name2tensors = []
|
||||
for data_dir in data_dirs:
|
||||
name2tensor = {}
|
||||
data_paths = [os.path.join(data_dir, a) for a in sorted(os.listdir(data_dir))]
|
||||
data_ndarrays = [self.read_onnx_pb_data(data_path) for data_path in data_paths]
|
||||
for model_input, data_ndarray in zip(model_inputs, data_ndarrays, strict=False):
|
||||
name2tensor[model_input.name] = data_ndarray
|
||||
name2tensors.append(name2tensor)
|
||||
assert len(name2tensors) == len(data_dirs)
|
||||
assert len(name2tensors[0]) == len(model_inputs)
|
||||
|
||||
self.calibration_data = iter(name2tensors)
|
||||
|
||||
def get_next(self) -> dict:
|
||||
"""generate the input data dict for ONNXinferenceSession run"""
|
||||
return next(self.calibration_data, None)
|
||||
|
||||
def read_onnx_pb_data(self, file_pb):
|
||||
tensor = onnx.TensorProto()
|
||||
with open(file_pb, "rb") as f:
|
||||
tensor.ParseFromString(f.read())
|
||||
ret = onnx.numpy_helper.to_array(tensor)
|
||||
return ret
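# Expected on-disk layout consumed by OnnxModelCalibrationDataReader above (names are
# illustrative; each .pb file holds one serialized onnx.TensorProto, and files are
# matched to the model inputs in sorted order):
#
#   <model_dir>/
#       model.onnx
#       test_data_set_0/
#           input_0.pb
#           input_1.pb
#       test_data_set_1/
#           ...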
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description="The arguments for static quantization")
|
||||
parser.add_argument("-i", "--input_model_path", required=True, help="Path to the input onnx model")
|
||||
parser.add_argument(
|
||||
"-o", "--output_quantized_model_path", required=True, help="Path to the output quantized onnx model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--activation_type",
|
||||
choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
|
||||
default="quint8",
|
||||
help="Activation quantization type used",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--weight_type",
|
||||
choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
|
||||
default="qint8",
|
||||
help="Weight quantization type used",
|
||||
)
|
||||
parser.add_argument("--enable_subgraph", action="store_true", help="If set, subgraph will be quantized.")
|
||||
parser.add_argument(
|
||||
"--force_quantize_no_input_check",
|
||||
action="store_true",
|
||||
help="By default, some latent operators like maxpool, transpose, do not quantize if their input is not"
|
||||
" quantized already. Setting to True to force such operator always quantize input and so generate"
|
||||
" quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--matmul_const_b_only",
|
||||
action="store_true",
|
||||
help="If set, only MatMul with const B will be quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--add_qdq_pair_to_weight",
|
||||
action="store_true",
|
||||
help="If set, it remains floating-point weight and inserts both QuantizeLinear/DeQuantizeLinear"
|
||||
" nodes to weight.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dedicated_qdq_pair",
|
||||
action="store_true",
|
||||
help="If set, it will create identical and dedicated QDQ pair for each node.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--op_types_to_exclude_output_quantization",
|
||||
nargs="+",
|
||||
default=[],
|
||||
help="If any op type is specified, it won't quantize the output of ops with this specific op types.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--calibration_method",
|
||||
default="minmax",
|
||||
choices=["minmax", "entropy", "percentile", "distribution"],
|
||||
help="Calibration method used",
|
||||
)
|
||||
parser.add_argument("--quant_format", default="qdq", choices=["qdq", "qoperator"], help="Quantization format used")
|
||||
parser.add_argument(
|
||||
"--calib_tensor_range_symmetric",
|
||||
action="store_true",
|
||||
help="If enabled, the final range of tensor during calibration will be explicitly"
|
||||
" set to symmetric to central point 0",
|
||||
)
|
||||
# TODO: --calib_strided_minmax"
|
||||
# TODO: --calib_moving_average_constant"
|
||||
# TODO: --calib_max_intermediate_outputs"
|
||||
parser.add_argument(
|
||||
"--calib_moving_average",
|
||||
action="store_true",
|
||||
help="If enabled, the moving average of"
|
||||
" the minimum and maximum values will be computed when the calibration method selected is MinMax.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable_quantize_bias",
|
||||
action="store_true",
|
||||
help="Whether to quantize floating-point biases by solely inserting a DeQuantizeLinear node"
|
||||
" If not set, it remains floating-point bias and does not insert any quantization nodes"
|
||||
" associated with biases.",
|
||||
)
|
||||
|
||||
# TODO: Add arguments related to Smooth Quant
|
||||
|
||||
parser.add_argument(
|
||||
"--use_qdq_contrib_ops",
|
||||
action="store_true",
|
||||
help="If set, the inserted QuantizeLinear and DequantizeLinear ops will have the com.microsoft domain,"
|
||||
" which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear contrib op implementations.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--minimum_real_range",
|
||||
type=float,
|
||||
default=0.0001,
|
||||
help="If set to a floating-point value, the calculation of the quantization parameters"
|
||||
" (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)"
|
||||
" is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is"
|
||||
" necessary for EPs like QNN that require a minimum floating-point range when determining "
|
||||
" quantization parameters.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--qdq_keep_removable_activations",
|
||||
action="store_true",
|
||||
help="If set, removable activations (e.g., Clip or Relu) will not be removed,"
|
||||
" and will be explicitly represented in the QDQ model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--qdq_disable_weight_adjust_for_int32_bias",
|
||||
action="store_true",
|
||||
help="If set, QDQ quantizer will not adjust the weight's scale when the bias"
|
||||
" has a scale (input_scale * weight_scale) that is too small.",
|
||||
)
|
||||
parser.add_argument("--per_channel", action="store_true", help="Whether using per-channel quantization")
|
||||
parser.add_argument(
|
||||
"--nodes_to_quantize",
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="List of nodes names to quantize. When this list is not None only the nodes in this list are quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--nodes_to_exclude",
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="List of nodes names to exclude. The nodes in this list will be excluded from quantization when it is not None.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--op_per_channel_axis",
|
||||
nargs=2,
|
||||
action="append",
|
||||
metavar=("OP_TYPE", "PER_CHANNEL_AXIS"),
|
||||
default=[],
|
||||
help="Set channel axis for specific op type, for example: --op_per_channel_axis MatMul 1, and it's"
|
||||
" effective only when per channel quantization is supported and per_channel is True. If specific"
|
||||
" op type supports per channel quantization but not explicitly specified with channel axis,"
|
||||
" default channel axis will be used.",
|
||||
)
|
||||
parser.add_argument("--tensor_quant_overrides", help="Set the json file for tensor quantization overrides.")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def get_tensor_quant_overrides(file):
|
||||
# TODO: Enhance the function to handle more real cases of json file
|
||||
if not file:
|
||||
return {}
|
||||
with open(file) as f:
|
||||
quant_override_dict = json.load(f)
|
||||
for tensor in quant_override_dict:
|
||||
for enc_dict in quant_override_dict[tensor]:
|
||||
enc_dict["scale"] = np.array(enc_dict["scale"], dtype=np.float32)
|
||||
enc_dict["zero_point"] = np.array(enc_dict["zero_point"])
|
||||
return quant_override_dict
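# Illustrative override file content accepted by get_tensor_quant_overrides above
# (the tensor name and values are placeholders): a JSON object mapping each tensor
# name to a list of override dicts; 'scale' is converted to a float32 ndarray and
# 'zero_point' to an integer ndarray.
#
#   {
#       "conv1_weight": [
#           {"scale": 0.0123, "zero_point": 0}
#       ]
#   }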
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
data_reader = OnnxModelCalibrationDataReader(model_path=args.input_model_path)
|
||||
arg2quant_type = {
|
||||
"qint8": QuantType.QInt8,
|
||||
"quint8": QuantType.QUInt8,
|
||||
"qint16": QuantType.QInt16,
|
||||
"quint16": QuantType.QUInt16,
|
||||
"qint4": QuantType.QInt4,
|
||||
"quint4": QuantType.QUInt4,
|
||||
"qfloat8e4m3fn": QuantType.QFLOAT8E4M3FN,
|
||||
}
|
||||
activation_type = arg2quant_type[args.activation_type]
|
||||
weight_type = arg2quant_type[args.weight_type]
|
||||
qdq_op_type_per_channel_support_to_axis = dict(args.op_per_channel_axis)
|
||||
extra_options = {
|
||||
"EnableSubgraph": args.enable_subgraph,
|
||||
"ForceQuantizeNoInputCheck": args.force_quantize_no_input_check,
|
||||
"MatMulConstBOnly": args.matmul_const_b_only,
|
||||
"AddQDQPairToWeight": args.add_qdq_pair_to_weight,
|
||||
"OpTypesToExcludeOutputQuantization": args.op_types_to_exclude_output_quantization,
|
||||
"DedicatedQDQPair": args.dedicated_qdq_pair,
|
||||
"QDQOpTypePerChannelSupportToAxis": qdq_op_type_per_channel_support_to_axis,
|
||||
"CalibTensorRangeSymmetric": args.calib_tensor_range_symmetric,
|
||||
"CalibMovingAverage": args.calib_moving_average,
|
||||
"QuantizeBias": not args.disable_quantize_bias,
|
||||
"UseQDQContribOps": args.use_qdq_contrib_ops,
|
||||
"MinimumRealRange": args.minimum_real_range,
|
||||
"QDQKeepRemovableActivations": args.qdq_keep_removable_activations,
|
||||
"QDQDisableWeightAdjustForInt32Bias": args.qdq_disable_weight_adjust_for_int32_bias,
|
||||
# Load json file for encoding override
|
||||
"TensorQuantOverrides": get_tensor_quant_overrides(args.tensor_quant_overrides),
|
||||
}
|
||||
arg2calib_method = {
|
||||
"minmax": CalibrationMethod.MinMax,
|
||||
"entropy": CalibrationMethod.Entropy,
|
||||
"percentile": CalibrationMethod.Percentile,
|
||||
"distribution": CalibrationMethod.Distribution,
|
||||
}
|
||||
arg2quant_format = {
|
||||
"qdq": QuantFormat.QDQ,
|
||||
"qoperator": QuantFormat.QOperator,
|
||||
}
|
||||
sqc = StaticQuantConfig(
|
||||
calibration_data_reader=data_reader,
|
||||
calibrate_method=arg2calib_method[args.calibration_method],
|
||||
quant_format=arg2quant_format[args.quant_format],
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=args.nodes_to_quantize,
|
||||
nodes_to_exclude=args.nodes_to_exclude,
|
||||
per_channel=args.per_channel,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
calibration_providers=None, # Use CPUExecutionProvider
|
||||
extra_options=extra_options,
|
||||
)
|
||||
quantize(model_input=args.input_model_path, model_output=args.output_quantized_model_path, quant_config=sqc)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
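# Illustrative invocation of this runner (the script and model paths are placeholders;
# the model's directory must contain test_data_set_* folders for calibration, as
# described above):
#
#   python static_quantize_runner.py -i model.onnx -o model.quant.onnx \
#       --quant_format qdq --activation_type quint8 --weight_type qint8 --per_channel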
|
||||
@@ -0,0 +1,520 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections.abc import MutableMapping
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import onnx
|
||||
|
||||
from .quant_utils import QuantType
|
||||
|
||||
|
||||
@dataclass
|
||||
class QuantTypeInfo: # noqa: PLW1641
|
||||
"""
|
||||
The quantization type information for a tensor override.
|
||||
"""
|
||||
|
||||
quant_type: QuantType
|
||||
symmetric: bool | None = None # If None, assumes default is used.
|
||||
reduce_range: bool | None = None # If None, assumes default is used.
|
||||
axis: int | None = None # If None, assumes per-tensor quantization
|
||||
|
||||
def __eq__(self, other: object):
|
||||
if isinstance(other, QuantTypeInfo):
|
||||
return (
|
||||
self.quant_type == other.quant_type
|
||||
and (self.symmetric is None or other.symmetric is None or self.symmetric == other.symmetric)
|
||||
and (self.reduce_range is None or other.reduce_range is None or self.reduce_range == other.reduce_range)
|
||||
and (self.axis == other.axis)
|
||||
)
|
||||
return NotImplemented
|
||||
|
||||
@staticmethod
|
||||
def load_from_dict(
|
||||
raw_dict: dict[str, Any],
|
||||
default_qtype: QuantType | None = None,
|
||||
default_symmetric: bool | None = None,
|
||||
default_reduce_range: bool | None = None,
|
||||
) -> QuantTypeInfo:
|
||||
return QuantTypeInfo(
|
||||
raw_dict.get("quant_type", default_qtype),
|
||||
raw_dict.get("symmetric", default_symmetric),
|
||||
raw_dict.get("reduce_range", default_reduce_range),
|
||||
raw_dict.get("axis"),
|
||||
)
|
||||
|
||||
def save_to_dict(self, raw_dict: dict[str, Any]):
|
||||
raw_dict["quant_type"] = self.quant_type
|
||||
if self.symmetric is not None:
|
||||
raw_dict["symmetric"] = self.symmetric
|
||||
if self.reduce_range is not None:
|
||||
raw_dict["reduce_range"] = self.reduce_range
|
||||
if self.axis is not None:
|
||||
raw_dict["axis"] = self.axis
|
||||
|
||||
|
||||
class TensorQuantOverridesHelper(MutableMapping):
|
||||
"""
|
||||
Utility wrapper over the tensor quantization overrides passed via extra_options.
|
||||
"""
|
||||
|
||||
def __init__(self, raw_overrides: dict[str, list[dict[str, Any]]]):
|
||||
self.overrides = raw_overrides
|
||||
self.quant_types = None
|
||||
self.keys_unsupported_with_scale_zp = {"symmetric", "reduce_range", "rmax", "rmin"}
|
||||
|
||||
def has_per_tensor_overrides(self, tensor_name: str) -> bool:
|
||||
overrides_list = self.overrides.get(tensor_name)
|
||||
return overrides_list and "axis" not in overrides_list[0]
|
||||
|
||||
def has_per_channel_overrides(self, tensor_name: str) -> bool:
|
||||
overrides_list = self.overrides.get(tensor_name)
|
||||
return overrides_list and "axis" in overrides_list[0]
|
||||
|
||||
def overrides_scale_zp(self, tensor_name: str) -> bool:
|
||||
overrides_list = self.overrides.get(tensor_name)
|
||||
return overrides_list and ("scale" in overrides_list[0]) and ("zero_point" in overrides_list[0])
|
||||
|
||||
def get_per_tensor_overrides(
|
||||
self,
|
||||
tensor_name: str,
|
||||
default_val: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any] | None:
|
||||
default_list_val = [default_val] if default_val is not None else None
|
||||
overrides_list = self.overrides.get(tensor_name, default_list_val)
|
||||
if overrides_list and "axis" in overrides_list[0]:
|
||||
raise ValueError(
|
||||
f"Expected tensor '{tensor_name}' to use per-tensor quantization overrides, "
|
||||
f"but found per-channel overrides."
|
||||
)
|
||||
|
||||
return overrides_list[0] if overrides_list else None
|
||||
|
||||
def get_per_channel_overrides(
|
||||
self,
|
||||
tensor_name: str,
|
||||
default_val: list[dict[str, Any]] | None = None,
|
||||
) -> list[dict[str, Any]] | None:
|
||||
overrides_list = self.overrides.get(tensor_name, default_val)
|
||||
|
||||
if not overrides_list:
|
||||
return None
|
||||
|
||||
if "axis" not in overrides_list[0]:
|
||||
raise ValueError(
|
||||
f"Expected tensor '{tensor_name}' to have per-channel quantization overrides (axis value is missing).",
|
||||
)
|
||||
|
||||
return overrides_list
|
||||
|
||||
def get_quant_types(self) -> set[QuantType]:
|
||||
if self.quant_types is not None:
|
||||
return self.quant_types
|
||||
|
||||
self.quant_types = set()
|
||||
|
||||
if self.overrides:
|
||||
for quant_overrides_list in self.overrides.values():
|
||||
for quant_overrides in quant_overrides_list:
|
||||
if "quant_type" in quant_overrides:
|
||||
self.quant_types.add(quant_overrides["quant_type"])
|
||||
|
||||
if "convert" in quant_overrides and "quant_type" in quant_overrides["convert"]:
|
||||
self.quant_types.add(quant_overrides["convert"]["quant_type"])
|
||||
|
||||
return self.quant_types
|
||||
|
||||
def _is_valid_per_tensor(
|
||||
self,
|
||||
initializers,
|
||||
default_activation_qtype,
|
||||
tensor_name: str,
|
||||
quant_overrides: dict[str, Any],
|
||||
) -> tuple[bool, str | None]:
|
||||
if not isinstance(quant_overrides, dict):
|
||||
return (
|
||||
False,
|
||||
f"Tensor quantization overrides for '{tensor_name}' are not in a dict",
|
||||
)
|
||||
|
||||
is_initializer = tensor_name in initializers
|
||||
|
||||
quant_type = quant_overrides.get("quant_type")
|
||||
if quant_type:
|
||||
self.quant_types.add(quant_type)
|
||||
|
||||
has_scale = "scale" in quant_overrides
|
||||
has_zero_point = "zero_point" in quant_overrides
|
||||
|
||||
if (has_scale and not has_zero_point) or (has_zero_point and not has_scale):
|
||||
return (
|
||||
False,
|
||||
"Must provide both 'scale' and 'zero_point' if one of the overrides is provided",
|
||||
)
|
||||
|
||||
if has_scale:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
|
||||
)
|
||||
|
||||
if "reduce_range" in quant_overrides and not is_initializer:
|
||||
return (
|
||||
False,
|
||||
f"Option 'reduce_range' is only supported for initializers, not for activation {tensor_name}",
|
||||
)
|
||||
|
||||
if "convert" in quant_overrides:
|
||||
if is_initializer:
|
||||
return False, "Cannot use 'convert' override for initializers"
|
||||
|
||||
if "quant_type" not in quant_overrides["convert"]:
|
||||
return False, f"'convert' options (tensor '{tensor_name}') must specify a 'quant_type'"
|
||||
|
||||
if "reduce_range" in quant_overrides["convert"]:
|
||||
return (
|
||||
False,
|
||||
f"Option 'reduce_range' is only supported for initializers, not for activation {tensor_name}",
|
||||
)
|
||||
|
||||
convert_quant_type = quant_overrides["convert"]["quant_type"]
|
||||
original_quant_type = quant_type if quant_type is not None else default_activation_qtype
|
||||
if convert_quant_type == original_quant_type:
|
||||
return (
|
||||
False,
|
||||
f"'convert' quant_type must differ from original quant_type (tensor '{tensor_name}')",
|
||||
)
|
||||
|
||||
convert_has_scale = "scale" in quant_overrides["convert"]
|
||||
convert_has_zero_point = "zero_point" in quant_overrides["convert"]
|
||||
|
||||
if (convert_has_scale and not convert_has_zero_point) or (convert_has_zero_point and not convert_has_scale):
|
||||
return (
|
||||
False,
|
||||
f"Must provide both 'scale' and 'zero_point' if one of the overrides is provided (tensor '{tensor_name}')",
|
||||
)
|
||||
|
||||
if convert_has_scale:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides["convert"]))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point' "
|
||||
f"(tensor '{tensor_name}')",
|
||||
)
|
||||
|
||||
self.quant_types.add(convert_quant_type)
|
||||
|
||||
return True, None
|
||||
|
||||
def _is_valid_per_channel(
|
||||
self,
|
||||
initializers,
|
||||
tensor_name: str,
|
||||
quant_overrides_list: list[dict[str, Any]],
|
||||
) -> tuple[bool, str | None]:
|
||||
is_initializer = tensor_name in initializers
|
||||
|
||||
if not is_initializer:
|
||||
return (
|
||||
False,
|
||||
f"Tensor '{tensor_name}' has per-channel overrides, but is not an initializer",
|
||||
)
|
||||
|
||||
axis = quant_overrides_list[0].get("axis")
|
||||
|
||||
if axis is None:
|
||||
return (
|
||||
False,
|
||||
f"Per-channel overrides for tensor {tensor_name} is missing an 'axis' value in "
|
||||
"the first channel dictionary.",
|
||||
)
|
||||
|
||||
weight_shape = list(initializers[tensor_name].dims)
|
||||
weight_rank = len(weight_shape)
|
||||
norm_axis = axis
|
||||
if norm_axis < 0:
|
||||
norm_axis += weight_rank
|
||||
|
||||
if norm_axis < 0 or norm_axis >= len(weight_shape):
|
||||
return (
|
||||
False,
|
||||
f"Axis override value is out-of-bounds for tensor {tensor_name} (rank {len(weight_shape)})",
|
||||
)
|
||||
|
||||
if len(quant_overrides_list) > 1 and len(quant_overrides_list) != weight_shape[norm_axis]:
|
||||
return (
|
||||
False,
|
||||
f"Incorrect number of channel overrides for tensor {tensor_name} (axis {axis}), "
|
||||
f"expected {weight_shape[axis]}, but found {len(quant_overrides_list)}.",
|
||||
)
|
||||
|
||||
if "convert" in quant_overrides_list[0]:
|
||||
return False, f"Cannot use 'convert' override for initializers, such as {tensor_name}."
|
||||
|
||||
quant_type = quant_overrides_list[0].get("quant_type")
|
||||
if quant_type:
|
||||
self.quant_types.add(quant_type)
|
||||
|
||||
symmetric = quant_overrides_list[0].get("symmetric")
|
||||
reduce_range = quant_overrides_list[0].get("reduce_range")
|
||||
|
||||
has_scale = "scale" in quant_overrides_list[0]
|
||||
has_zero_point = "zero_point" in quant_overrides_list[0]
|
||||
has_scale_zp = has_scale and has_zero_point
|
||||
|
||||
if (has_scale and not has_zero_point) or (has_zero_point and not has_scale):
|
||||
return (
|
||||
False,
|
||||
"Must provide both 'scale' and 'zero_point' if one of the overrides is provided",
|
||||
)
|
||||
|
||||
if has_scale_zp:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides_list[0]))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
|
||||
)
|
||||
|
||||
has_rmin = "rmin" in quant_overrides_list[0]
|
||||
has_rmax = "rmax" in quant_overrides_list[0]
|
||||
has_rmin_rmax = has_rmin and has_rmax
|
||||
if (has_rmin and not has_rmax) or (not has_rmin and has_rmax):
|
||||
return (
|
||||
False,
|
||||
"Must provide both 'rmin' and 'rmax' if one is provided",
|
||||
)
|
||||
|
||||
for index, quant_overrides in enumerate(quant_overrides_list[1:]):
|
||||
if not isinstance(quant_overrides, dict):
|
||||
return (
|
||||
False,
|
||||
f"Tensor quantization overrides at index {index} for '{tensor_name}' are not in a dict",
|
||||
)
|
||||
|
||||
if "convert" in quant_overrides:
|
||||
return False, f"Cannot use 'convert' override for initializers, such as {tensor_name}."
|
||||
|
||||
# For per-channel quantization, all channels must use the same quantization type, axis, symmetric
|
||||
# and reduce_range values. And, if specified, they must be present in the first channel dict
|
||||
# (i.e., quant_overrides_list[0]).
|
||||
if "quant_type" in quant_overrides and quant_type != quant_overrides["quant_type"]:
|
||||
return (
|
||||
False,
|
||||
"Channel quantization types for tensor '{tensor_name}' do not match at index {index}.",
|
||||
)
|
||||
if "axis" in quant_overrides and axis != quant_overrides["axis"] and norm_axis != quant_overrides["axis"]:
|
||||
return (
|
||||
False,
|
||||
"Channel axis for tensor '{tensor_name}' does not match at index {index}.",
|
||||
)
|
||||
if "symmetric" in quant_overrides and symmetric != quant_overrides["symmetric"]:
|
||||
return (
|
||||
False,
|
||||
"Channel symmetric value for tensor '{tensor_name}' does not match at index {index}.",
|
||||
)
|
||||
if "reduce_range" in quant_overrides and reduce_range != quant_overrides["reduce_range"]:
|
||||
return (
|
||||
False,
|
||||
"Channel reduce_range value for tensor '{tensor_name}' does not match at index {index}.",
|
||||
)
|
||||
|
||||
# If override scale/zp, must do so for all channels.
|
||||
chan_has_scale_zp = "scale" in quant_overrides and "zero_point" in quant_overrides
|
||||
|
||||
if has_scale_zp and not chan_has_scale_zp:
|
||||
return (
|
||||
False,
|
||||
"Per-channel overrides that specify scale/zero_point must do so for all channels, "
|
||||
f"but tensor '{tensor_name}' is missing them at index {index}.",
|
||||
)
|
||||
|
||||
if chan_has_scale_zp:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
|
||||
)
|
||||
|
||||
# If override rmin/rmax, must do so for all channels.
|
||||
chan_has_rmin_rmax = "rmin" in quant_overrides and "rmax" in quant_overrides
|
||||
if has_rmin_rmax and not chan_has_rmin_rmax:
|
||||
return (
|
||||
False,
|
||||
"Per-channel overrides that specify rmin/rmax must do so for all channels, "
|
||||
f"but tensor '{tensor_name}' is missing them at index {index}.",
|
||||
)
|
||||
|
||||
return True, None
|
||||
|
||||
def is_valid(
|
||||
self,
|
||||
initializers: dict[str, onnx.TensorProto],
|
||||
activation_names: set[str],
|
||||
default_activation_qtype,
|
||||
) -> tuple[bool, str | None]:
|
||||
self.quant_types = set()
|
||||
|
||||
# Validate that compatible/valid overrides are provided.
|
||||
if self.overrides:
|
||||
for tensor_name, quant_overrides_list in self.overrides.items():
|
||||
if tensor_name not in initializers and tensor_name not in activation_names:
|
||||
return False, f"Tensor '{tensor_name}' in TensorQuantOverrides is not present in the model"
|
||||
|
||||
if not isinstance(quant_overrides_list, list):
|
||||
return False, f"Tensor quantization overrides for '{tensor_name}' are not in a list"
|
||||
|
||||
if not quant_overrides_list:
|
||||
continue
|
||||
|
||||
if not isinstance(quant_overrides_list[0], dict):
|
||||
return False, f"Tensor quantization overrides at index 0 for '{tensor_name}' are not in a dict"
|
||||
|
||||
if not quant_overrides_list[0]:
|
||||
continue
|
||||
|
||||
axis = quant_overrides_list[0].get("axis")
|
||||
is_per_channel = len(quant_overrides_list) > 1 or axis is not None
|
||||
|
||||
if is_per_channel:
|
||||
return self._is_valid_per_channel(initializers, tensor_name, quant_overrides_list)
|
||||
|
||||
return self._is_valid_per_tensor(
|
||||
initializers, default_activation_qtype, tensor_name, quant_overrides_list[0]
|
||||
)
|
||||
|
||||
return True, None
|
||||
|
||||
def update_tensor_overrides(
|
||||
self,
|
||||
tensor_name: str,
|
||||
new_vals: dict[str, Any],
|
||||
channels: list[int] | None = None,
|
||||
overwrite: bool = True,
|
||||
) -> bool:
|
||||
if not new_vals:
|
||||
return False
|
||||
|
||||
channels = set(channels) if channels is not None else None
|
||||
have_overrides = self.overrides.get(tensor_name)
|
||||
|
||||
# If `overwrite` is False, check if we would overwrite anything.
|
||||
do_update = True
|
||||
if not overwrite and have_overrides:
|
||||
for channel, overrides in enumerate(self.overrides[tensor_name]):
|
||||
if channels is not None and channel not in channels:
|
||||
continue
|
||||
if set(new_vals).intersection(set(overrides)):
|
||||
do_update = False
|
||||
break
|
||||
|
||||
# Do the update if `overwrite` is True or if nothing is overwritten (do not want partial overwrites).
|
||||
if do_update:
|
||||
if not have_overrides:
|
||||
self.overrides[tensor_name] = [{}]
|
||||
|
||||
for channel, overrides in enumerate(self.overrides[tensor_name]):
|
||||
if channels is not None and channel not in channels:
|
||||
continue
|
||||
overrides.update(new_vals)
|
||||
|
||||
return do_update
|
||||
|
||||
def get_node_output_qtype_info(
|
||||
self,
|
||||
output_name: str,
|
||||
default_qtype: QuantType | None,
|
||||
default_symmetric: bool | None = None,
|
||||
) -> QuantTypeInfo:
|
||||
# Outputs are activations, which do not support 'reduce_range' or 'axis'
|
||||
if output_name not in self.overrides:
|
||||
return QuantTypeInfo(default_qtype, default_symmetric)
|
||||
|
||||
tensor_overrides = self.overrides[output_name][0]
|
||||
|
||||
return QuantTypeInfo(
|
||||
tensor_overrides.get("quant_type", default_qtype),
|
||||
tensor_overrides.get("symmetric", default_symmetric),
|
||||
)
|
||||
|
||||
def get_node_input_qtype_info(
|
||||
self,
|
||||
input_name: str,
|
||||
node_name: str,
|
||||
default_qtype: QuantType | None,
|
||||
default_symmetric: bool | None = None,
|
||||
default_reduce_range: bool | None = None,
|
||||
) -> QuantTypeInfo:
|
||||
if input_name not in self.overrides or not self.overrides[input_name]:
|
||||
return QuantTypeInfo(default_qtype, default_symmetric, default_reduce_range)
|
||||
|
||||
# Get the first overrides dict in the list. This works for both per-tensor and per-channel
|
||||
# quantization because all channels must use the same quant type.
|
||||
tensor_overrides = self.overrides[input_name][0]
|
||||
producer_type = tensor_overrides.get("quant_type", default_qtype)
|
||||
|
||||
if "convert" not in tensor_overrides:
|
||||
return QuantTypeInfo(
|
||||
producer_type,
|
||||
tensor_overrides.get("symmetric", default_symmetric),
|
||||
tensor_overrides.get("reduce_range", default_reduce_range),
|
||||
tensor_overrides.get("axis"),
|
||||
)
|
||||
|
||||
# This tensor is converted. Check if the node gets the original qtype or the converted qtype.
|
||||
convert_dict = tensor_overrides["convert"]
|
||||
qtype_info = QuantTypeInfo(
|
||||
producer_type,
|
||||
convert_dict.get("symmetric", default_symmetric),
|
||||
# Converted tensors are not initializers, so do not have 'axis' or 'reduce_range'.
|
||||
)
|
||||
|
||||
# Check if all nodes receive the converted type (i.e., recv_nodes is None) or this node
|
||||
# is in the list of consumers (recv_nodes).
|
||||
if ("recv_nodes" not in convert_dict) or (node_name in convert_dict["recv_nodes"]):
|
||||
qtype_info.quant_type = convert_dict["quant_type"]
|
||||
|
||||
return qtype_info
|
||||
|
||||
def pprint_str(self, indent=None) -> str:
|
||||
return json.dumps(self.overrides, default=str, indent=indent)
|
||||
|
||||
def empty(self) -> bool:
|
||||
return not self.overrides
|
||||
|
||||
def get_dict(self) -> dict[str, list[dict[str, Any]]]:
|
||||
return self.overrides
|
||||
|
||||
# Required implementations of abstract methods in collections.abc.MutableMapping
|
||||
# so that this class can be used like a dict.
|
||||
def __setitem__(self, key: str, value: list[dict]):
|
||||
self.overrides[key] = value
|
||||
|
||||
def __getitem__(self, key: str) -> list[dict]:
|
||||
return self.overrides[key]
|
||||
|
||||
def __delitem__(self, key: str):
|
||||
del self.overrides[key]
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.overrides)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.overrides)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return str(self.overrides)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"{super().__repr__()}, TensorQuantOverridesHelper({self.overrides})"