chore: add virtual environment to the repository

- Add the backend_service/venv virtual environment
- Include all Python dependency packages
- Note: the virtual environment is about 393 MB and contains 12,655 files
This commit is contained in:
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,21 @@
# Privacy
## Data Collection
The software may collect information about you and your use of the software and send it to Microsoft. Microsoft may use this information to provide services and improve our products and services. You may turn off the telemetry as described in the repository. There are also some features in the software that may enable you and Microsoft to collect data from users of your applications. If you use these features, you must comply with applicable law, including providing appropriate notices to users of your applications together with a copy of Microsoft's privacy statement. Our privacy statement is located at https://go.microsoft.com/fwlink/?LinkID=824704. You can learn more about data collection and use in the help documentation and our privacy statement. Your use of the software operates as your consent to these practices.
***
### Private Builds
No data collection is performed when using your private builds built from source code.
### Official Builds
ONNX Runtime does not maintain any independent telemetry collection mechanisms outside of what is provided by the platforms it supports. However, where applicable, ONNX Runtime will take advantage of platform-supported telemetry systems to collect trace events with the goal of improving product quality.
Currently telemetry is only implemented for Windows builds and is turned **ON** by default in the official builds distributed in their respective package management repositories ([see here](../README.md#binaries)). This may be expanded to cover other platforms in the future. Data collection is implemented via 'Platform Telemetry' per vendor platform providers (see [telemetry.h](../onnxruntime/core/platform/telemetry.h)).
#### Technical Details
The Windows provider uses the [TraceLogging](https://docs.microsoft.com/en-us/windows/win32/tracelogging/trace-logging-about) API for its implementation. This enables ONNX Runtime trace events to be collected by the operating system, and based on user consent, this data may be periodically sent to Microsoft servers following GDPR and privacy regulations for anonymity and data access controls.
Windows ML and onnxruntime C APIs allow Trace Logging to be turned on/off (see [API pages](../README.md#api-documentation) for details).
For information on how to enable and disable telemetry, see [C API: Telemetry](./C_API.md#telemetry).
There are equivalent APIs in the C#, Python, and Java language bindings as well.
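As a small illustration of the Python binding mentioned above, the sketch below toggles telemetry for the current process; it assumes an official wheel (telemetry only applies on Windows builds) and uses a placeholder model path.

```python
import onnxruntime as ort

# Telemetry is ON by default in official Windows builds; turn it off for this process.
ort.disable_telemetry_events()

# Placeholder model path; sessions created afterwards run with telemetry disabled.
session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

# Opt back in later if desired.
ort.enable_telemetry_events()
```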

View File

@@ -0,0 +1,360 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""
ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exchange (ONNX) models.
For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
"""
__version__ = "1.23.2"
__author__ = "Microsoft"
# we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
# in order to know whether the onnxruntime package is for training it needs
# to do import onnxruntime.training.ortmodule first.
# onnxruntime.capi._pybind_state is required before import onnxruntime.training.ortmodule.
# however, import onnxruntime.capi._pybind_state will already raise an exception if a required Cuda version
# is not found.
# here we need to save the exception and continue with Cuda version validation in order to post
# meaningful messages to the user.
# the saved exception is raised after device version validation.
try:
from onnxruntime.capi._pybind_state import (
ExecutionMode, # noqa: F401
ExecutionOrder, # noqa: F401
GraphOptimizationLevel, # noqa: F401
LoraAdapter, # noqa: F401
ModelMetadata, # noqa: F401
NodeArg, # noqa: F401
OrtAllocatorType, # noqa: F401
OrtArenaCfg, # noqa: F401
OrtCompileApiFlags, # noqa: F401
OrtDeviceMemoryType, # noqa: F401
OrtEpDevice, # noqa: F401
OrtExecutionProviderDevicePolicy, # noqa: F401
OrtExternalInitializerInfo, # noqa: F401
OrtHardwareDevice, # noqa: F401
OrtHardwareDeviceType, # noqa: F401
OrtMemoryInfo, # noqa: F401
OrtMemoryInfoDeviceType, # noqa: F401
OrtMemType, # noqa: F401
OrtSparseFormat, # noqa: F401
OrtSyncStream, # noqa: F401
RunOptions, # noqa: F401
SessionIOBinding, # noqa: F401
SessionOptions, # noqa: F401
create_and_register_allocator, # noqa: F401
create_and_register_allocator_v2, # noqa: F401
disable_telemetry_events, # noqa: F401
enable_telemetry_events, # noqa: F401
get_all_providers, # noqa: F401
get_available_providers, # noqa: F401
get_build_info, # noqa: F401
get_device, # noqa: F401
get_ep_devices, # noqa: F401
get_version_string, # noqa: F401
has_collective_ops, # noqa: F401
register_execution_provider_library, # noqa: F401
set_default_logger_severity, # noqa: F401
set_default_logger_verbosity, # noqa: F401
set_global_thread_pool_sizes, # noqa: F401
set_seed, # noqa: F401
unregister_execution_provider_library, # noqa: F401
)
import_capi_exception = None
except Exception as e:
import_capi_exception = e
from onnxruntime.capi import onnxruntime_validation
if import_capi_exception:
raise import_capi_exception
from onnxruntime.capi.onnxruntime_inference_collection import (
AdapterFormat, # noqa: F401
InferenceSession, # noqa: F401
IOBinding, # noqa: F401
ModelCompiler, # noqa: F401
OrtDevice, # noqa: F401
OrtValue, # noqa: F401
SparseTensor, # noqa: F401
copy_tensors, # noqa: F401
)
# TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end
try: # noqa: SIM105
from . import experimental # noqa: F401
except ImportError:
pass
package_name, version, cuda_version = onnxruntime_validation.get_package_name_and_version_info()
if version:
__version__ = version
onnxruntime_validation.check_distro_info()
def _get_package_version(package_name: str):
from importlib.metadata import PackageNotFoundError, version # noqa: PLC0415
try:
package_version = version(package_name)
except PackageNotFoundError:
package_version = None
return package_version
def _get_package_root(package_name: str, directory_name: str | None = None):
from importlib.metadata import PackageNotFoundError, distribution # noqa: PLC0415
root_directory_name = directory_name or package_name
try:
dist = distribution(package_name)
files = dist.files or []
for file in files:
if file.name.endswith("__init__.py") and root_directory_name in file.parts:
return file.locate().parent
# Fallback to the first __init__.py
if not directory_name:
for file in files:
if file.name.endswith("__init__.py"):
return file.locate().parent
except PackageNotFoundError:
# package not found, do nothing
pass
return None
def _get_nvidia_dll_paths(is_windows: bool, cuda: bool = True, cudnn: bool = True):
if is_windows:
# Path is relative to site-packages directory.
cuda_dll_paths = [
("nvidia", "cublas", "bin", "cublasLt64_12.dll"),
("nvidia", "cublas", "bin", "cublas64_12.dll"),
("nvidia", "cufft", "bin", "cufft64_11.dll"),
("nvidia", "cuda_runtime", "bin", "cudart64_12.dll"),
]
cudnn_dll_paths = [
("nvidia", "cudnn", "bin", "cudnn_engines_runtime_compiled64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_engines_precompiled64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_heuristic64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_ops64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_adv64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn_graph64_9.dll"),
("nvidia", "cudnn", "bin", "cudnn64_9.dll"),
]
else: # Linux
# cublas64 depends on cublasLt64, so cublasLt64 should be loaded first.
cuda_dll_paths = [
("nvidia", "cublas", "lib", "libcublasLt.so.12"),
("nvidia", "cublas", "lib", "libcublas.so.12"),
("nvidia", "cuda_nvrtc", "lib", "libnvrtc.so.12"),
("nvidia", "curand", "lib", "libcurand.so.10"),
("nvidia", "cufft", "lib", "libcufft.so.11"),
("nvidia", "cuda_runtime", "lib", "libcudart.so.12"),
]
# Do not load cudnn sub DLLs (they will be dynamically loaded later) to be consistent with PyTorch in Linux.
cudnn_dll_paths = [
("nvidia", "cudnn", "lib", "libcudnn.so.9"),
]
return (cuda_dll_paths if cuda else []) + (cudnn_dll_paths if cudnn else [])
def print_debug_info():
"""Print information to help debugging."""
import importlib.util # noqa: PLC0415
import os # noqa: PLC0415
import platform # noqa: PLC0415
from importlib.metadata import distributions # noqa: PLC0415
print(f"{package_name} version: {__version__}")
if cuda_version:
print(f"CUDA version used in build: {cuda_version}")
print("platform:", platform.platform())
print("\nPython package, version and location:")
ort_packages = []
for dist in distributions():
package = dist.metadata["Name"]
if package == "onnxruntime" or package.startswith(("onnxruntime-", "ort-")):
# Exclude packages whose root directory name is not onnxruntime.
location = _get_package_root(package, "onnxruntime")
if location and (package not in ort_packages):
ort_packages.append(package)
print(f"{package}=={dist.version} at {location}")
if len(ort_packages) > 1:
print(
"\033[33mWARNING: multiple onnxruntime packages are installed to the same location. "
"Please 'pip uninstall` all above packages, then `pip install` only one of them.\033[0m"
)
if cuda_version:
# Print version of installed packages that is related to CUDA or cuDNN DLLs.
packages = [
"torch",
"nvidia-cuda-runtime-cu12",
"nvidia-cudnn-cu12",
"nvidia-cublas-cu12",
"nvidia-cufft-cu12",
"nvidia-curand-cu12",
"nvidia-cuda-nvrtc-cu12",
"nvidia-nvjitlink-cu12",
]
for package in packages:
directory_name = "nvidia" if package.startswith("nvidia-") else None
version = _get_package_version(package)
if version:
print(f"{package}=={version} at {_get_package_root(package, directory_name)}")
else:
print(f"{package} not installed")
if platform.system() == "Windows":
print(f"\nEnvironment variable:\nPATH={os.environ['PATH']}")
elif platform.system() == "Linux":
print(f"\nEnvironment variable:\nLD_LIBRARY_PATH={os.environ['LD_LIBRARY_PATH']}")
if importlib.util.find_spec("psutil"):
def is_target_dll(path: str):
target_keywords = ["vcruntime140", "msvcp140"]
if cuda_version:
target_keywords = ["cufft", "cublas", "cudart", "nvrtc", "curand", "cudnn", *target_keywords]
return any(keyword in path for keyword in target_keywords)
import psutil # noqa: PLC0415
p = psutil.Process(os.getpid())
print("\nList of loaded DLLs:")
for lib in p.memory_maps():
if is_target_dll(lib.path.lower()):
print(lib.path)
if cuda_version:
if importlib.util.find_spec("cpuinfo") and importlib.util.find_spec("py3nvml"):
from .transformers.machine_info import get_device_info # noqa: PLC0415
print("\nDevice information:")
print(get_device_info())
else:
print("please `pip install py-cpuinfo py3nvml` to show device information.")
else:
print("please `pip install psutil` to show loaded DLLs.")
def preload_dlls(cuda: bool = True, cudnn: bool = True, msvc: bool = True, directory=None):
"""Preload CUDA 12.x and cuDNN 9.x DLLs in Windows or Linux, and MSVC runtime DLLs in Windows.
When the installed PyTorch is compatible (using same major version of CUDA and cuDNN),
there is no need to call this function if `import torch` is done before `import onnxruntime`.
Args:
cuda (bool, optional): enable loading CUDA DLLs. Defaults to True.
cudnn (bool, optional): enable loading cuDNN DLLs. Defaults to True.
msvc (bool, optional): enable loading MSVC DLLs in Windows. Defaults to True.
directory(str, optional): a directory contains CUDA or cuDNN DLLs. It can be an absolute path,
or a path relative to the directory of this file.
If directory is None (default value), the search order: the lib directory of compatible PyTorch in Windows,
nvidia site packages, default DLL loading paths.
If directory is empty string (""), the search order: nvidia site packages, default DLL loading paths.
If directory is a path, the search order: the directory, default DLL loading paths.
"""
import ctypes # noqa: PLC0415
import os # noqa: PLC0415
import platform # noqa: PLC0415
import sys # noqa: PLC0415
if platform.system() not in ["Windows", "Linux"]:
return
is_windows = platform.system() == "Windows"
if is_windows and msvc:
try:
ctypes.CDLL("vcruntime140.dll")
ctypes.CDLL("msvcp140.dll")
if platform.machine() != "ARM64":
ctypes.CDLL("vcruntime140_1.dll")
except OSError:
print("Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.")
print("It can be downloaded at https://aka.ms/vs/17/release/vc_redist.x64.exe.")
if not (cuda_version and cuda_version.startswith("12.")) and (cuda or cudnn):
print(
f"\033[33mWARNING: {package_name} is not built with CUDA 12.x support. "
"Please install a version that supports CUDA 12.x, or call preload_dlls with cuda=False and cudnn=False.\033[0m"
)
return
if not (cuda_version and cuda_version.startswith("12.") and (cuda or cudnn)):
return
is_cuda_cudnn_imported_by_torch = False
if is_windows:
torch_version = _get_package_version("torch")
is_torch_for_cuda_12 = torch_version and "+cu12" in torch_version
if "torch" in sys.modules:
is_cuda_cudnn_imported_by_torch = is_torch_for_cuda_12
if (torch_version and "+cu" in torch_version) and not is_torch_for_cuda_12:
print(
f"\033[33mWARNING: The installed PyTorch {torch_version} does not support CUDA 12.x. "
f"Please install PyTorch for CUDA 12.x to be compatible with {package_name}.\033[0m"
)
if is_torch_for_cuda_12 and directory is None:
torch_root = _get_package_root("torch", "torch")
if torch_root:
directory = os.path.join(torch_root, "lib")
base_directory = directory or ".."
if not os.path.isabs(base_directory):
base_directory = os.path.join(os.path.dirname(__file__), base_directory)
base_directory = os.path.normpath(base_directory)
if not os.path.isdir(base_directory):
raise RuntimeError(f"Invalid parameter of directory={directory}. The directory does not exist!")
if is_cuda_cudnn_imported_by_torch:
# In Windows, PyTorch has loaded CUDA and cuDNN DLLs during `import torch`, no need to load them again.
print("Skip loading CUDA and cuDNN DLLs since torch is imported.")
return
# Try load DLLs from nvidia site packages.
dll_paths = _get_nvidia_dll_paths(is_windows, cuda, cudnn)
loaded_dlls = []
for relative_path in dll_paths:
dll_path = (
os.path.join(base_directory, relative_path[-1])
if directory
else os.path.join(base_directory, *relative_path)
)
if os.path.isfile(dll_path):
try:
_ = ctypes.CDLL(dll_path)
loaded_dlls.append(relative_path[-1])
except Exception as e:
print(f"Failed to load {dll_path}: {e}")
# Try load DLLs with default path settings.
has_failure = False
for relative_path in dll_paths:
dll_filename = relative_path[-1]
if dll_filename not in loaded_dlls:
try:
_ = ctypes.CDLL(dll_filename)
except Exception as e:
has_failure = True
print(f"Failed to load {dll_filename}: {e}")
if has_failure:
print("Please follow https://onnxruntime.ai/docs/install/#cuda-and-cudnn to install CUDA and CuDNN.")

View File

@@ -0,0 +1,6 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from .backend import is_compatible, prepare, run, supports_device # noqa: F401

View File

@@ -0,0 +1,175 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""
Implements ONNX's backend API.
"""
import os
import unittest
import packaging.version
from onnx import ModelProto, helper, version # noqa: F401
from onnx.backend.base import Backend
from onnx.checker import check_model
from onnxruntime import InferenceSession, SessionOptions, get_available_providers, get_device
from onnxruntime.backend.backend_rep import OnnxRuntimeBackendRep
class OnnxRuntimeBackend(Backend):
"""
Implements
`ONNX's backend API <https://github.com/onnx/onnx/blob/main/docs/ImplementingAnOnnxBackend.md>`_
with *ONNX Runtime*.
The backend is mostly used when you need to switch between
multiple runtimes with the same API.
`Importing models from ONNX to Caffe2 <https://github.com/onnx/tutorials/blob/master/tutorials/OnnxCaffe2Import.ipynb>`_
shows how to use *caffe2* as a backend for a converted model.
Note: This is not the official Python API.
"""
allowReleasedOpsetsOnly = bool(os.getenv("ALLOW_RELEASED_ONNX_OPSET_ONLY", "1") == "1") # noqa: N815
@classmethod
def is_compatible(cls, model, device=None, **kwargs):
"""
Return whether the model is compatible with the backend.
:param model: unused
:param device: None to use the default device or a string (ex: `'CPU'`)
:return: boolean
"""
if device is None:
device = get_device()
return cls.supports_device(device)
@classmethod
def is_opset_supported(cls, model):
"""
Return whether the opset for the model is supported by the backend.
By default, only released onnx opsets are allowed by the backend.
To test new opsets, the env variable ALLOW_RELEASED_ONNX_OPSET_ONLY should be set to 0.
:param model: Model whose opsets needed to be verified.
:return: boolean and error message if opset is not supported.
"""
if cls.allowReleasedOpsetsOnly:
for opset in model.opset_import:
domain = opset.domain if opset.domain else "ai.onnx"
try:
key = (domain, opset.version)
if key not in helper.OP_SET_ID_VERSION_MAP:
error_message = (
"Skipping this test as only released onnx opsets are supported."
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
f" Got Domain '{domain}' version '{opset.version}'."
)
return False, error_message
except AttributeError:
# for some CI pipelines accessing helper.OP_SET_ID_VERSION_MAP
# is generating attribute error. TODO investigate the pipelines to
# fix this error. Falling back to a simple version check when this error is encountered
if (domain == "ai.onnx" and opset.version > 12) or (domain == "ai.ommx.ml" and opset.version > 2):
error_message = (
"Skipping this test as only released onnx opsets are supported."
"To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0."
f" Got Domain '{domain}' version '{opset.version}'."
)
return False, error_message
return True, ""
@classmethod
def supports_device(cls, device):
"""
Check whether the backend is compiled with particular device support.
In particular it's used in the testing suite.
"""
if device == "CUDA":
device = "GPU"
return "-" + device in get_device() or device + "-" in get_device() or device == get_device()
@classmethod
def prepare(cls, model, device=None, **kwargs):
"""
Load the model and creates a :class:`onnxruntime.InferenceSession`
ready to be used as a backend.
:param model: ModelProto (returned by `onnx.load`),
string for a filename or bytes for a serialized model
:param device: requested device for the computation,
None means the default one which depends on
the compilation settings
:param kwargs: see :class:`onnxruntime.SessionOptions`
:return: :class:`onnxruntime.InferenceSession`
"""
if isinstance(model, OnnxRuntimeBackendRep):
return model
elif isinstance(model, InferenceSession):
return OnnxRuntimeBackendRep(model)
elif isinstance(model, (str, bytes)):
options = SessionOptions()
for k, v in kwargs.items():
if hasattr(options, k):
setattr(options, k, v)
excluded_providers = os.getenv("ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS", default="").split(",")
providers = [x for x in get_available_providers() if (x not in excluded_providers)]
inf = InferenceSession(model, sess_options=options, providers=providers)
# backend API is primarily used for ONNX test/validation. As such, we should disable session.run() fallback
# which may hide test failures.
inf.disable_fallback()
if device is not None and not cls.supports_device(device):
raise RuntimeError(f"Incompatible device expected '{device}', got '{get_device()}'")
return cls.prepare(inf, device, **kwargs)
else:
# type: ModelProto
# check_model serializes the model anyways, so serialize the model once here
# and reuse it below in the cls.prepare call to avoid an additional serialization
# only works with onnx >= 1.10.0 hence the version check
onnx_version = packaging.version.parse(version.version) or packaging.version.Version("0")
onnx_supports_serialized_model_check = onnx_version.release >= (1, 10, 0)
bin_or_model = model.SerializeToString() if onnx_supports_serialized_model_check else model
check_model(bin_or_model)
opset_supported, error_message = cls.is_opset_supported(model)
if not opset_supported:
raise unittest.SkipTest(error_message)
# Now bin might be serialized, if it's not we need to serialize it otherwise we'll have
# an infinite recursive call
bin = bin_or_model
if not isinstance(bin, (str, bytes)):
bin = bin.SerializeToString()
return cls.prepare(bin, device, **kwargs)
@classmethod
def run_model(cls, model, inputs, device=None, **kwargs):
"""
Compute the prediction.
:param model: :class:`onnxruntime.InferenceSession` returned
by function *prepare*
:param inputs: inputs
:param device: requested device for the computation,
None means the default one which depends on
the compilation settings
:param kwargs: see :class:`onnxruntime.RunOptions`
:return: predictions
"""
rep = cls.prepare(model, device, **kwargs)
return rep.run(inputs, **kwargs)
@classmethod
def run_node(cls, node, inputs, device=None, outputs_info=None, **kwargs):
"""
This method is not implemented as it is much more efficient
to run a whole model than every node independently.
"""
raise NotImplementedError("It is much more efficient to run a whole model than every node independently.")
is_compatible = OnnxRuntimeBackend.is_compatible
prepare = OnnxRuntimeBackend.prepare
run = OnnxRuntimeBackend.run_model
supports_device = OnnxRuntimeBackend.supports_device
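A short usage sketch for the backend API above; the model path is a placeholder and the input shape is assumed to match that model.

```python
import numpy as np
import onnxruntime.backend as backend

# Prepare an InferenceSession-backed representation on the default device.
rep = backend.prepare("model.onnx", device="CPU")  # placeholder model path

# Inputs may be passed as a list ordered like the model's inputs.
outputs = rep.run([np.random.rand(1, 3).astype(np.float32)])  # assumed input shape
print(outputs)
```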

View File

@@ -0,0 +1,52 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""
Implements ONNX's backend API.
"""
from onnx.backend.base import BackendRep
from onnxruntime import RunOptions
class OnnxRuntimeBackendRep(BackendRep):
"""
Computes the prediction for a pipeline converted into
an :class:`onnxruntime.InferenceSession` node.
"""
def __init__(self, session):
"""
:param session: :class:`onnxruntime.InferenceSession`
"""
self._session = session
def run(self, inputs, **kwargs): # type: (Any, **Any) -> Tuple[Any, ...]
"""
Computes the prediction.
See :meth:`onnxruntime.InferenceSession.run`.
"""
options = RunOptions()
for k, v in kwargs.items():
if hasattr(options, k):
setattr(options, k, v)
if isinstance(inputs, list):
inps = {}
for i, inp in enumerate(self._session.get_inputs()):
inps[inp.name] = inputs[i]
outs = self._session.run(None, inps, options)
if isinstance(outs, list):
return outs
else:
output_names = [o.name for o in self._session.get_outputs()]
return [outs[name] for name in output_names]
else:
inp = self._session.get_inputs()
if len(inp) != 1:
raise RuntimeError(f"Model expect {len(inp)} inputs")
inps = {inp[0].name: inputs}
return self._session.run(None, inps, options)

View File

@@ -0,0 +1,4 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

View File

@@ -0,0 +1,7 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
# This file can be modified by setup.py when building a manylinux2010 wheel
# When modified, it will preload some libraries needed for the python C extension

View File

@@ -0,0 +1,33 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""
Ensure that dependencies are available and then load the extension module.
"""
import os
import platform
import warnings
from . import _ld_preload # noqa: F401
if platform.system() == "Windows":
from . import version_info
# If on Windows, check if this import error is caused by the user not installing the 2019 VC Runtime
# The VC Redist installer usually puts the VC Runtime dlls in the System32 folder, but it may also be found
# in some other locations.
# TODO, we may want to try to load the VC Runtime dlls instead of checking if the hardcoded file path
# is valid, and raise ImportError if the load fails
if version_info.vs2019 and platform.architecture()[0] == "64bit":
system_root = os.getenv("SystemRoot") or "C:\\Windows"
if not os.path.isfile(os.path.join(system_root, "System32", "vcruntime140_1.dll")):
warnings.warn("Please install the 2019 Visual C++ runtime and then try again. "
"If you've installed the runtime in a non-standard location "
"(other than %SystemRoot%\\System32), "
"make sure it can be found by setting the correct path.")
from .onnxruntime_pybind11_state import * # noqa

View File

@@ -0,0 +1,2 @@
package_name = 'onnxruntime'
__version__ = '1.23.2'

View File

@@ -0,0 +1,48 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# This script helps converting .npz files to .onnx_adapter files
import argparse
import os
import sys
import numpy as np
import onnxruntime as ort
def get_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--npz_file_path", type=str, required=True)
parser.add_argument("--output_file_path", type=str, required=True)
parser.add_argument("--adapter_version", type=int, required=True)
parser.add_argument("--model_version", type=int, required=True)
return parser.parse_args()
def export_lora_parameters(
npz_file_path: os.PathLike, adapter_version: int, model_version: int, output_file_path: os.PathLike
):
"""The function converts lora parameters in npz to onnx_adapter format"""
adapter_format = ort.AdapterFormat()
adapter_format.set_adapter_version(adapter_version)
adapter_format.set_model_version(model_version)
name_to_ort_value = {}
with np.load(npz_file_path) as data:
for name, np_arr in data.items():
ort_value = ort.OrtValue.ortvalue_from_numpy(np_arr)
name_to_ort_value[name] = ort_value
adapter_format.set_parameters(name_to_ort_value)
adapter_format.export_adapter(output_file_path)
def main() -> int:
args = get_args()
export_lora_parameters(args.npz_file_path, args.adapter_version, args.model_version, args.output_file_path)
return 0
if __name__ == "__main__":
sys.exit(main())
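As a hedged companion to the converter above, this sketch writes a tiny .npz of LoRA parameters and builds the adapter directly with the same AdapterFormat API the script wraps; the parameter names, shapes, versions, and paths are illustrative only.

```python
import numpy as np
import onnxruntime as ort

# Illustrative LoRA tensors; real adapters use the parameter names expected by the model.
np.savez(
    "lora_params.npz",
    layer0_lora_A=np.zeros((16, 4), dtype=np.float32),
    layer0_lora_B=np.zeros((4, 16), dtype=np.float32),
)

adapter = ort.AdapterFormat()
adapter.set_adapter_version(1)
adapter.set_model_version(1)
with np.load("lora_params.npz") as data:
    adapter.set_parameters({name: ort.OrtValue.ortvalue_from_numpy(arr) for name, arr in data.items()})
adapter.export_adapter("lora_params.onnx_adapter")
```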

View File

@@ -0,0 +1,47 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import ctypes
import sys
import warnings
def find_cudart_versions(build_env=False, build_cuda_version=None):
# ctypes.CDLL and ctypes.util.find_library load the latest installed library.
# it may not be the library that would be loaded by onnxruntime.
# for example, in an environment with Cuda 11.1 and subsequently
# conda cudatoolkit 10.2.89 installed, ctypes will find cudart 10.2. however,
# onnxruntime built with Cuda 11.1 will find and load cudart for Cuda 11.1.
# for the above reason, we need to find all versions in the environment and
# only give warnings if the expected cuda version is not found.
# in the onnxruntime build environment, we expect only one Cuda version.
if not sys.platform.startswith("linux"):
warnings.warn("find_cudart_versions only works on Linux")
return None
cudart_possible_versions = {None, build_cuda_version}
def get_cudart_version(find_cudart_version=None):
cudart_lib_filename = "libcudart.so"
if find_cudart_version:
cudart_lib_filename = cudart_lib_filename + "." + find_cudart_version
try:
cudart = ctypes.CDLL(cudart_lib_filename)
cudart.cudaRuntimeGetVersion.restype = int
cudart.cudaRuntimeGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]
version = ctypes.c_int()
status = cudart.cudaRuntimeGetVersion(ctypes.byref(version))
if status != 0:
return None
except Exception:
return None
return version.value
# use set to avoid duplications
cudart_found_versions = {get_cudart_version(cudart_version) for cudart_version in cudart_possible_versions}
# convert to list and remove None
return [ver for ver in cudart_found_versions if ver]
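A brief sketch of calling the helper above on Linux to check which cudart versions ctypes can see; the build CUDA version string is an assumption.

```python
from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions

# "12.2" is a placeholder for the CUDA version the installed wheel was built with.
found = find_cudart_versions(build_env=False, build_cuda_version="12.2")
print("cudart versions found:", found)  # e.g. [12020]; None (with a warning) on non-Linux
```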

View File

@@ -0,0 +1,154 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""
Check OS requirements for ONNX Runtime Python Bindings.
"""
import linecache
import platform
import warnings
def check_distro_info():
__my_distro__ = ""
__my_distro_ver__ = ""
__my_system__ = platform.system().lower()
__OS_RELEASE_FILE__ = "/etc/os-release" # noqa: N806
__LSB_RELEASE_FILE__ = "/etc/lsb-release" # noqa: N806
if __my_system__ == "windows":
__my_distro__ = __my_system__
__my_distro_ver__ = platform.release().lower()
if __my_distro_ver__ not in ["10", "11"]:
warnings.warn(
f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only."
)
elif __my_system__ == "linux":
"""Although the 'platform' python module for getting Distro information works well on standard OS images
running on real hardware, it is not accurate when running on Azure VMs, Git Bash, Cygwin, etc.
The returned values for release and version are unpredictable for virtualized or emulated environments.
/etc/os-release and /etc/lsb-release files, on the other hand, are guaranteed to exist and have standard values
in all OSes supported by onnxruntime. The former is the current standard file to check OS info and the latter
is its predecessor.
"""
# Newer systems have /etc/os-release with relevant distro info
__my_distro__ = linecache.getline(__OS_RELEASE_FILE__, 3)[3:-1]
__my_distro_ver__ = linecache.getline(__OS_RELEASE_FILE__, 6)[12:-2]
# Older systems may have /etc/lsb-release instead
if not __my_distro__:
__my_distro__ = linecache.getline(__LSB_RELEASE_FILE__, 1)[11:-1]
__my_distro_ver__ = linecache.getline(__LSB_RELEASE_FILE__, 2)[16:-1]
# Instead of trying to parse distro specific files,
# warn the user ONNX Runtime may not work out of the box
__my_distro__ = __my_distro__.lower()
__my_distro_ver__ = __my_distro_ver__.lower()
elif __my_system__ == "darwin":
__my_distro__ = __my_system__
__my_distro_ver__ = platform.release().lower()
if int(__my_distro_ver__.split(".")[0]) < 11:
warnings.warn(
f"Unsupported macOS version ({__my_distro_ver__}). ONNX Runtime supports macOS 11.0 or later."
)
elif __my_system__ == "aix":
import subprocess # noqa: PLC0415
returned_output = subprocess.check_output("oslevel")
__my_distro_ver__str = returned_output.decode("utf-8")
__my_distro_ver = __my_distro_ver__str[:3]
else:
warnings.warn(
f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS, AIX and Windows platforms, only."
)
def get_package_name_and_version_info():
package_name = ""
version = ""
cuda_version = ""
try:
from .build_and_package_info import __version__ as version # noqa: PLC0415
from .build_and_package_info import package_name # noqa: PLC0415
try: # noqa: SIM105
from .build_and_package_info import cuda_version # noqa: PLC0415
except ImportError:
# cuda_version is optional. For example, cpu only package does not have the attribute.
pass
except Exception as e:
warnings.warn("WARNING: failed to collect package name and version info")
print(e)
return package_name, version, cuda_version
def check_training_module():
import_ortmodule_exception = None
has_ortmodule = False
try:
from onnxruntime.training.ortmodule import ORTModule # noqa: F401, PLC0415
has_ortmodule = True
except ImportError:
# ORTModule not present
has_ortmodule = False
except Exception as e:
# this may happen if Cuda is not installed, we want to raise it after
# for any exception other than not having ortmodule, we want to continue
# device version validation and raise the exception after.
try:
from onnxruntime.training.ortmodule._fallback import ORTModuleInitException # noqa: PLC0415
if isinstance(e, ORTModuleInitException):
# ORTModule is present but not ready to run yet
has_ortmodule = True
except Exception:
# ORTModule not present
has_ortmodule = False
if not has_ortmodule:
import_ortmodule_exception = e
# collect onnxruntime package name, version, and cuda version
package_name, version, cuda_version = get_package_name_and_version_info()
if has_ortmodule and cuda_version:
try:
# collect cuda library build info. the library info may not be available
# when the build environment has none or multiple libraries installed
try:
from .build_and_package_info import cudart_version # noqa: PLC0415
except ImportError:
warnings.warn("WARNING: failed to get cudart_version from onnxruntime build info.")
cudart_version = None
def print_build_package_info():
warnings.warn(f"onnxruntime training package info: package_name: {package_name}")
warnings.warn(f"onnxruntime training package info: __version__: {version}")
warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}")
warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}")
# collection cuda library info from current environment.
from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions # noqa: PLC0415
local_cudart_versions = find_cudart_versions(build_env=False, build_cuda_version=cuda_version)
if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions:
print_build_package_info()
warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info")
warnings.warn(f"WARNING: found cudart versions: {local_cudart_versions}")
except Exception as e:
warnings.warn("WARNING: failed to collect onnxruntime version and build info")
print(e)
if import_ortmodule_exception:
raise import_ortmodule_exception
return has_ortmodule, package_name, version, cuda_version

View File

@@ -0,0 +1,18 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Short examples used in the documentation.
"""
import os
def get_example(name):
"""
Retrieves the absolute file name of an example.
"""
this = os.path.abspath(os.path.dirname(__file__))
full = os.path.join(this, name)
if not os.path.exists(full):
raise FileNotFoundError(f"Unable to find example '{name}'")
return full
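A usage sketch for the helper above that loads one of the bundled example models (the sigmoid test graph shown in the next file); the input shape is assumed from that graph.

```python
import numpy as np
import onnxruntime as ort
from onnxruntime.datasets import get_example

# Resolve the absolute path of a bundled example model and run it.
model_path = get_example("sigmoid.onnx")
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
x = np.random.rand(3, 4, 5).astype(np.float32)  # assumed shape of input "x"
print(session.run(None, {session.get_inputs()[0].name: x}))
```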

View File

@@ -0,0 +1,13 @@
[binary content: an ONNX test model ("backend-test" graph "test_sigmoid") with a single Sigmoid node mapping input "x" to output "y"]

View File

@@ -0,0 +1,78 @@
# automatically generated by the FlatBuffers compiler, do not modify
# namespace: CalTableFlatBuffers
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class KeyValue:
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0): # noqa: N802
n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
x = KeyValue()
x.Init(buf, n + offset)
return x
@classmethod
def GetRootAsKeyValue(cls, buf, offset=0): # noqa: N802
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# KeyValue
def Init(self, buf, pos): # noqa: N802
self._tab = flatbuffers.table.Table(buf, pos)
# KeyValue
def Key(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
return self._tab.String(o + self._tab.Pos)
return None
# KeyValue
def Value(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
if o != 0:
return self._tab.String(o + self._tab.Pos)
return None
def Start(builder): # noqa: N802
builder.StartObject(2)
def KeyValueStart(builder): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddKey(builder, key): # noqa: N802
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)
def KeyValueAddKey(builder, key): # noqa: N802
"""This method is deprecated. Please switch to AddKey."""
return AddKey(builder, key)
def AddValue(builder, value): # noqa: N802
builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)
def KeyValueAddValue(builder, value): # noqa: N802
"""This method is deprecated. Please switch to AddValue."""
return AddValue(builder, value)
def End(builder): # noqa: N802
return builder.EndObject()
def KeyValueEnd(builder): # noqa: N802
"""This method is deprecated. Please switch to End."""
return End(builder)

View File

@@ -0,0 +1,90 @@
# automatically generated by the FlatBuffers compiler, do not modify
# namespace: CalTableFlatBuffers
import flatbuffers
from flatbuffers.compat import import_numpy
np = import_numpy()
class TrtTable:
__slots__ = ["_tab"]
@classmethod
def GetRootAs(cls, buf, offset=0): # noqa: N802
n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
x = TrtTable()
x.Init(buf, n + offset)
return x
@classmethod
def GetRootAsTrtTable(cls, buf, offset=0): # noqa: N802
"""This method is deprecated. Please switch to GetRootAs."""
return cls.GetRootAs(buf, offset)
# TrtTable
def Init(self, buf, pos): # noqa: N802
self._tab = flatbuffers.table.Table(buf, pos)
# TrtTable
def Dict(self, j): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
x = self._tab.Vector(o)
x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
x = self._tab.Indirect(x)
from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import KeyValue # noqa: PLC0415
obj = KeyValue()
obj.Init(self._tab.Bytes, x)
return obj
return None
# TrtTable
def DictLength(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
if o != 0:
return self._tab.VectorLen(o)
return 0
# TrtTable
def DictIsNone(self): # noqa: N802
o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
return o == 0
def Start(builder): # noqa: N802
builder.StartObject(1)
def TrtTableStart(builder): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return Start(builder)
def AddDict(builder, dict): # noqa: N802
builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)
def TrtTableAddDict(builder, dict): # noqa: N802
"""This method is deprecated. Please switch to AddDict."""
return AddDict(builder, dict)
def StartDictVector(builder, numElems): # noqa: N802
return builder.StartVector(4, numElems, 4)
def TrtTableStartDictVector(builder, numElems): # noqa: N802
"""This method is deprecated. Please switch to Start."""
return StartDictVector(builder, numElems)
def End(builder): # noqa: N802
return builder.EndObject()
def TrtTableEnd(builder): # noqa: N802
"""This method is deprecated. Please switch to End."""
return End(builder)
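A hedged sketch of serializing a tiny calibration table with the generated classes above (the quantization tooling's write_calibration_table does this for real calibration data); it assumes a flatbuffers version where Builder.EndVector takes no arguments, and the key/value strings are placeholders.

```python
import flatbuffers

from onnxruntime.quantization.CalTableFlatBuffers import KeyValue, TrtTable

builder = flatbuffers.Builder(1024)

# One entry: a tensor name mapped to its (serialized) calibration range.
key = builder.CreateString("input_tensor")
value = builder.CreateString("0.0 6.0")
KeyValue.Start(builder)
KeyValue.AddKey(builder, key)
KeyValue.AddValue(builder, value)
kv = KeyValue.End(builder)

# Put the entries into the table's Dict vector and finish the buffer.
TrtTable.StartDictVector(builder, 1)
builder.PrependUOffsetTRelative(kv)
dict_vector = builder.EndVector()
TrtTable.Start(builder)
TrtTable.AddDict(builder, dict_vector)
builder.Finish(TrtTable.End(builder))
flat_table = builder.Output()  # bytes ready to be written to a calibration table file
```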

View File

@@ -0,0 +1,19 @@
from .calibrate import ( # noqa: F401
CalibraterBase,
CalibrationDataReader,
CalibrationMethod,
MinMaxCalibrater,
create_calibrator,
)
from .qdq_quantizer import QDQQuantizer # noqa: F401
from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa: F401
from .quantize import (
DynamicQuantConfig, # noqa: F401
QuantizationMode, # noqa: F401
StaticQuantConfig, # noqa: F401
get_qdq_config, # noqa: F401
quantize, # noqa: F401
quantize_dynamic, # noqa: F401
quantize_static, # noqa: F401
)
from .shape_inference import quant_pre_process # noqa: F401
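A minimal dynamic-quantization sketch using the entry points re-exported above; the model paths are placeholders.

```python
from onnxruntime.quantization import QuantType, quantize_dynamic

# Quantize weights to int8 while computing activation parameters at run time.
quantize_dynamic(
    model_input="model_fp32.onnx",   # placeholder input path
    model_output="model_int8.onnx",  # placeholder output path
    weight_type=QuantType.QInt8,
)
```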

View File

@@ -0,0 +1,529 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
from typing import Any
import numpy as np
import onnx
import onnx.numpy_helper
try:
from onnx.reference.op_run import to_array_extended
except ImportError:
# old version of onnx.
to_array_extended = None
from .calibrate import TensorData
from .onnx_model import ONNXModel
from .quant_utils import (
DEQUANT_OP_NAME,
ONNX_TYPE_TO_NP_TYPE,
QUANT_OP_NAME,
TENSOR_NAME_QUANT_SUFFIX,
find_by_name,
get_opset_version,
model_has_infer_metadata,
normalize_axis,
pack_bytes_to_4bit,
quantize_data,
quantize_nparray,
save_and_reload_model_with_shape_infer,
tensor_proto_to_array,
)
from .tensor_quant_overrides import TensorQuantOverridesHelper
class QuantizationParams:
def __init__(self, **data: dict[str, Any]):
self.data = {}
for k, v in data.items():
if not isinstance(k, str):
raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
if k != "axis" and not isinstance(v, (int, str, np.ndarray, float)):
raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
if k == "axis" and not isinstance(v, int) and v is not None:
raise TypeError(f"Axis value must be an int or None, not {type(v)}.")
if k == "scale" and v.dtype not in (np.float32, np.float16):
raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
self.data[k] = v
def get(self, key, default_value=None):
return self.data.get(key, default_value)
def __iter__(self):
yield from self.data
def __getitem__(self, key):
return self.data[key]
def __setitem__(self, key, value):
self.data[key] = value
def __len__(self):
return len(self.data)
class BaseQuantizer:
def __init__(
self,
model,
per_channel,
reduce_range,
weight_qType,
activation_qType,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options=None,
):
if not model_has_infer_metadata(model):
model = save_and_reload_model_with_shape_infer(model)
self.value_infos = {vi.name: vi for vi in model.graph.value_info}
self.value_infos.update({ot.name: ot for ot in model.graph.output})
self.value_infos.update({it.name: it for it in model.graph.input})
self.model = ONNXModel(model)
self.opset_version = get_opset_version(model)
self.per_channel = per_channel # weight-pack per channel
self.reduce_range = reduce_range
self.extra_options = extra_options if extra_options else {}
self.enable_subgraph_quantization = (
"EnableSubgraph" in self.extra_options and self.extra_options["EnableSubgraph"]
)
self.parent = None
self.force_quantize_no_input_check = (
"ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
)
# If user does not explicitly set "WeightSymmetric", then the weight's quantization type determines
# the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
self.min_real_range = self.extra_options.get("MinimumRealRange")
self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
self.weight_qType = getattr(weight_qType, "tensor_type", weight_qType)
"""
Dictionary specifying the min and max values for tensors. It has following format:
{
"param_name": [min, max]
}
example:
{
'Conv_3:0': [np.float32(0), np.float32(0.5)],
'Conv_4:0': [np.float32(1), np.float32(3.5)]
}
"""
if tensors_range is not None and any(not isinstance(t, TensorData) for t in tensors_range.values()):
raise TypeError(
f"tensors_range contains unexpected types { {type(v) for v in tensors_range.values()} }, not TensorData."
)
self.tensors_range = tensors_range
self.nodes_to_quantize = nodes_to_quantize # specific nodes to quantize
self.nodes_to_exclude = nodes_to_exclude # specific nodes to exclude
self.op_types_to_quantize = op_types_to_quantize
# Get tensor-level quantization overrides and ensure they are valid.
self.tensor_quant_overrides = TensorQuantOverridesHelper(self.extra_options.get("TensorQuantOverrides", {}))
self.initializers = {initzer.name: initzer for initzer in self.model.initializer()}
overrides_valid, overrides_err = self.tensor_quant_overrides.is_valid(
self.initializers, self.value_infos.keys(), activation_qType
)
if not overrides_valid:
raise ValueError(overrides_err)
self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()
def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool:
if self._is_weight_symmetric is not None:
return self._is_weight_symmetric # Return value explicitly set by user.
return weight_quant_type in (
onnx.TensorProto.INT4,
onnx.TensorProto.INT8,
onnx.TensorProto.INT16,
onnx.TensorProto.FLOAT8E4M3FN,
)
def quantize_model(self):
raise NotImplementedError
def is_input_a_initializer(self, input_name):
initializer = find_by_name(input_name, self.model.initializer())
return initializer is not None
def is_per_channel(self):
return self.per_channel
def is_valid_quantize_weight(self, weight_name):
weight = find_by_name(weight_name, self.model.initializer())
if weight is not None:
return weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16)
if (not self.enable_subgraph_quantization) or (self.parent is None):
return False
return self.parent.is_valid_quantize_weight(weight_name)
def should_quantize_node(self, node):
if (
self.nodes_to_quantize is not None
and len(self.nodes_to_quantize) != 0
and node.name not in self.nodes_to_quantize
):
return False
if node.op_type not in self.op_types_to_quantize:
return False
if node.op_type in (DEQUANT_OP_NAME, QUANT_OP_NAME):
return False
if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
return False
return True
def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1.0):
"""
Quantizes the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
"""
# get bias
bias_initializer = find_by_name(bias_name, self.model.initializer())
bias_data = tensor_proto_to_array(bias_initializer)
quantized_bias_name = bias_name + TENSOR_NAME_QUANT_SUFFIX
# quantize bias
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
data = np.asarray(bias_data)
if data.dtype == np.float16:
node_qtype = onnx.TensorProto.FLOAT16
elif data.dtype == np.float32:
node_qtype = onnx.TensorProto.FLOAT
else:
raise TypeError(f"Only float16 or float32 are supported with float 8 but bias dtype is {data.dtype}.")
quantized_data = data.astype(np.float32)
bias_scale = np.array([1], dtype=quantized_data.dtype)
bias_scale_data = bias_scale.reshape(-1)
packed_bias_initializer = onnx.numpy_helper.from_array(quantized_data, quantized_bias_name)
self.model.initializer_extend([packed_bias_initializer])
node_type = "Cast"
else:
# calculate scale for bias
# TODO: This formula should be explained including why the scale is not estimated for the bias as well.
bias_scale = input_scale * weight_scale * beta
# Quantize by dividing by bias_scale
quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64)
quantized_data = quantized_data.round()
# Clip quantized data to the range of an int32
int32_min = np.float64(np.iinfo(np.int32).min)
int32_max = np.float64(np.iinfo(np.int32).max)
if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max):
logging.warning(
f"Quantized bias `{bias_name}` exceeds the range of a int32. The bias scale is too small."
)
quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32)
# update bias initializer
bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
self.model.initializer_extend([packed_bias_initializer])
# Bias's scale dtype should match the original bias data's unquantized type (float32 or float16).
bias_scale_data = np.asarray(bias_scale, dtype=bias_data.dtype).reshape(-1)
node_type = "DequantizeLinear"
node_qtype = self.weight_qType
# update scale initializer
quantized_bias_scale_name = quantized_bias_name + "_scale"
packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
self.model.initializer_extend([packed_bias_scale_initializer])
# update zero initializer
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
tensor_type = self.weight_qType
else:
tensor_type = onnx.TensorProto.INT32
quantized_bias_zp_name = quantized_bias_name + "_zero_point"
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, self.weight_qType, [1], [0.0])
elif bias_scale.size > 1:
bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
else:
packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, tensor_type, [], [0])
self.model.initializer_extend([packed_bias_zp_initializer])
return (
quantized_bias_name,
quantized_bias_scale_name,
quantized_bias_zp_name,
bias_scale_data,
node_type,
node_qtype,
)
def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_float_weight=False):
"""
:param weight: TensorProto initializer
:param qType: type to quantize to
:param keep_float_weight: Whether to keep the weight in float. In some cases, we only want to quantize scale and zero point.
If keep_float_weight is False, the weight is quantized; otherwise it is left in float and only scale and zero point are stored.
:return: quantized weight name, zero point name, scale name
"""
# TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
zp_name = weight.name + "_zero_point"
scale_name = weight.name + "_scale"
# Quantize weight data. Use quantization overrides if provided by the user.
weight_data = tensor_proto_to_array(weight)
quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(weight.name, default_val={})
if "quant_type" in quant_overrides:
qType = quant_overrides["quant_type"].tensor_type # noqa: N806
if "scale" in quant_overrides and "zero_point" in quant_overrides:
zero_point = np.array(quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[qType])
scale = np.array(quant_overrides["scale"])
q_weight_data = quantize_nparray(qType, weight_data.flatten(), scale, zero_point)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
f"Unexpected dtype {zero_point.dtype}"
)
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
else:
symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric
zero_point, scale, q_weight_data = quantize_data(
weight_data.flatten(),
qType,
quant_overrides.get("symmetric", symmetric),
reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
min_real_range=self.min_real_range,
rmin_override=quant_overrides.get("rmin"),
rmax_override=quant_overrides.get("rmax"),
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
f"Unexpected dtype {zero_point.dtype}"
)
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
scale_dtype = weight.data_type
scale_initializer = onnx.helper.make_tensor(scale_name, scale_dtype, [], scale.reshape((-1,)).tolist())
zero_initializer = onnx.helper.make_tensor(zp_name, qType, [], zero_point.reshape((-1,)).tolist())
self.model.initializer_extend([scale_initializer, zero_initializer])
if not keep_float_weight:
if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
q_weight_initializer = onnx.TensorProto()
q_weight_initializer.data_type = self.weight_qType
q_weight_initializer.dims.extend(weight.dims)
q_weight_initializer.name = q_weight_name
# Do not remove .flatten().copy(); numpy is not clear about data persistence.
q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes()
if to_array_extended is not None:
# This test should not be needed but it helped catch some issues
# with data persistence and tobytes.
check = to_array_extended(q_weight_initializer)
if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes():
raise RuntimeError(
f"The initializer of shape {weight_data.shape} could not be created, expecting "
f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}"
f"\nraw={str(q_weight_initializer)[:200]}."
)
elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
if q_weight_data.dtype not in (np.int8, np.uint8):
raise RuntimeError(
f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
)
# We do not use onnx.helper.pack_float32_to_4bit() due to performance.
# This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes()))
# We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True)
else:
q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
weight.dims
)
q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name)
self.model.initializer_extend([q_weight_initializer])
return q_weight_name, zp_name, scale_name
def quantize_weight_per_channel_impl(
self,
weight_name,
weight_qType,
channel_axis,
reduce_range=True,
keep_float_weight=False,
):
# TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
initializer = find_by_name(weight_name, self.model.initializer())
if initializer is None:
raise ValueError("{} is not an initializer", weight_name)
weights = tensor_proto_to_array(initializer)
weights_rank = len(weights.shape)
is_axis_valid, axis_norm = normalize_axis(channel_axis, weights_rank)
if not is_axis_valid:
raise ValueError(
f"Weight {weight_name} has a per-channel axis with value {channel_axis} that is "
f"out-of-bounds for rank {weights_rank}"
)
channel_axis = axis_norm
channel_count = weights.shape[channel_axis]
quant_overrides_for_channels = self.tensor_quant_overrides.get_per_channel_overrides(
weight_name, default_val=[{"axis": channel_axis}]
)
num_channel_overrides = len(quant_overrides_for_channels)
if num_channel_overrides != 1 and num_channel_overrides != channel_count:
raise ValueError(
f"Per-channel tensor quantization overrides for {weight_name} must have "
f"either 1 or {channel_count} elements in the list of dictionaries."
)
is_axis_override_valid, axis_override = normalize_axis(quant_overrides_for_channels[0]["axis"], weights_rank)
if not is_axis_override_valid or axis_override != channel_axis:
raise ValueError(
f"Tensor quantization overrides for {weight_name} specify an unexpected axis. "
f"Expected {channel_axis}, but got {quant_overrides_for_channels[0]['axis']}."
)
# If user provides per-channel quantization overrides, all channels must use the same quant_type,
# axis, symmetric, and reduce_range values. So, just use the first channel's values.
if "quant_type" in quant_overrides_for_channels[0]:
weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type # noqa: N806
symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType))
reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
zero_point_list = []
scale_list = []
quantized_per_channel_data_list = []
weights_shape = list(weights.shape)
reshape_dims = list(weights_shape) # deep copy
reshape_dims[channel_axis] = 1 # only one per channel for reshape
for i in range(channel_count):
per_channel_data = weights.take(i, channel_axis)
channel_override_index = i if i < num_channel_overrides else 0
channel_quant_overrides = quant_overrides_for_channels[channel_override_index]
if "scale" in channel_quant_overrides and "zero_point" in channel_quant_overrides:
zero_point = np.array(channel_quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[weight_qType])
scale = np.array(channel_quant_overrides["scale"])
quantized_per_channel_data = quantize_nparray(
weight_qType, per_channel_data.flatten(), scale, zero_point
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
f"Unexpected dtype {zero_point.dtype}"
)
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
assert isinstance(quantized_per_channel_data, np.ndarray), (
f"Unexpected type {type(quantized_per_channel_data)}"
)
else:
zero_point, scale, quantized_per_channel_data = quantize_data(
per_channel_data.flatten(),
weight_qType,
symmetric,
reduce_range=reduce_range,
min_real_range=self.min_real_range,
rmin_override=channel_quant_overrides.get("rmin"),
rmax_override=channel_quant_overrides.get("rmax"),
)
assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
f"Unexpected dtype {zero_point.dtype}"
)
assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
assert isinstance(quantized_per_channel_data, np.ndarray), (
f"Unexpected type {type(quantized_per_channel_data)}"
)
zero_point_list.append(zero_point)
scale_list.append(scale)
quantized_per_channel_data_list.append(np.asarray(quantized_per_channel_data).reshape(reshape_dims))
# combine per_channel_data into one
quantized_weights = np.concatenate(quantized_per_channel_data_list, channel_axis)
q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
zp_name = weight_name + "_zero_point"
scale_name = weight_name + "_scale"
# Update packed weight, zero point, and scale initializers
zero_scale_shape = [initializer.dims[channel_axis]]
scale_initializer = onnx.helper.make_tensor(
scale_name, initializer.data_type, zero_scale_shape, np.hstack(scale_list).tolist()
)
zero_initializer = onnx.helper.make_tensor(
zp_name, weight_qType, zero_scale_shape, np.hstack(zero_point_list).tolist()
)
self.model.initializer_extend([scale_initializer, zero_initializer])
if not keep_float_weight:
if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
if quantized_weights.dtype not in (np.int8, np.uint8):
raise RuntimeError(
f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
)
# We do not use onnx.helper.pack_float32_to_4bit() due to performance.
# This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes()))
# We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161
q_weight_initializer = onnx.helper.make_tensor(
q_weight_name, weight_qType, weights_shape, packed_data, raw=True
)
self.model.initializer_extend([q_weight_initializer])
else:
quantized_weights = np.asarray(
quantized_weights,
dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
).reshape(initializer.dims)
q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
self.model.initializer_extend([q_weight_initializer])
return q_weight_name, zp_name, scale_name
def adjust_tensor_ranges(self):
if self.tensors_range is None:
return
for node in self.model.nodes():
# adjust tensor_ranges for input of Clip and Relu node
if node.op_type in ["Clip", "Relu"]:
if not self.should_quantize_node(node):
continue
if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
continue
if node.input[0] not in self.tensors_range or node.output[0] not in self.tensors_range:
continue
td = self.tensors_range[node.output[0]]
if not isinstance(td, TensorData):
raise TypeError(f"Unexpected type {type(td)} for {node.output[0]!r}.")
self.tensors_range[node.input[0]] = td
# Adjust Softmax to range from 0.0 to 1.0
elif node.op_type == "Softmax":
if not self.should_quantize_node(node):
continue
self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))

View File

@@ -0,0 +1,2 @@
from .preprocess import qnn_preprocess_model # noqa: F401
from .quant_config import get_qnn_qdq_config # noqa: F401

View File

@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ...fusions import Fusion
from ...onnx_model import ONNXModel
class FusionLpNormalization(Fusion):
def __init__(self, model: ONNXModel, epsilon: float = 1e-12):
super().__init__(model, "LpNormalization", "ReduceL2")
self.epsilon = epsilon
def fuse(
self,
reduce_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing a ReduceL2 node into a single
LpNormalization node.
Pattern 1:
[root] --> ReduceL2 -----> Clip --> Expand ----> Div -->
| (axis=-1) (min=epsilon) (shape=root) ^
| (keepdims=True) |
| |
+-----------------------------------------------+
Notes:
- ReduceL2 must use the last axis, and keepdims == True
- Clip must only have a min attribute that is ~1e-12
- Expand must restore the shape to root.shape
- The output of Expand must be the second input to Div.
"""
if reduce_node.output[0] not in input_name_to_nodes:
return
# ReduceL2 must have one Clip child
children = input_name_to_nodes[reduce_node.output[0]]
if len(children) != 1 or children[0].op_type != "Clip":
return
# ReduceL2 must have keepdims == True
keepdims = self.get_node_attribute(reduce_node, "keepdims")
if not keepdims:
return
# ReduceL2 axes must refer only to the last dimension.
# Axes became an input in opset 18. Before then, axes was an attribute
reduce_input_ttype = self.model.get_tensor_type(reduce_node.input[0])
if not reduce_input_ttype:
return
reduce_input_shape = self.tensor_shape_to_list(reduce_input_ttype)
if not reduce_input_shape:
return
axes = self.get_node_attribute(reduce_node, "axes")
if not axes and len(reduce_node.input) > 1:
axes = self.model.get_constant_value(reduce_node.input[1])
if not axes or len(axes) != 1:
return
last_dim = len(reduce_input_shape) - 1
if axes[0] != -1 and axes[0] != last_dim:
return
# Clip node must have a min attribute approximately equal to 1e-12
clip_node = children[0]
clip_min = self.get_node_attribute(clip_node, "min")
if clip_min is None and len(clip_node.input) > 1:
clip_min = self.model.get_constant_value(clip_node.input[1])
clip_max = self.get_node_attribute(clip_node, "max") # TODO: clip_max could be FLOAT_MAX
if clip_max is None and len(clip_node.input) > 2:
clip_max = self.model.get_constant_value(clip_node.input[2])
if not (clip_max is None and clip_min is not None and clip_min > 0 and abs(clip_min - self.epsilon) < 1e-13):
return
if clip_node.output[0] not in input_name_to_nodes:
return
# Clip must have a single Expand child.
children = input_name_to_nodes[clip_node.output[0]]
if len(children) != 1 or children[0].op_type != "Expand":
return
expand_node = children[0]
if expand_node.output[0] not in input_name_to_nodes:
return
# Expand must have a single Div child
children = input_name_to_nodes[expand_node.output[0]]
if len(children) != 1 or children[0].op_type != "Div":
return
div_node = children[0]
# The first input to Div must be the root of the subgraph (i.e., reduce_node.input[0])
# The second input to Div must be the output of the Expand.
# As long as these two inputs go to the same Div node, ONNX validation will ensure that
# their shapes match.
if div_node.input[0] != reduce_node.input[0]:
return
if div_node.input[1] != expand_node.output[0]:
return
subgraph_input = reduce_node.input[0]
subgraph_output = div_node.output[0]
subgraph_nodes = [reduce_node, clip_node, expand_node, div_node]
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
return
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
self.fused_op_type,
name=self.create_unique_node_name(),
inputs=[subgraph_input],
outputs=[subgraph_output],
p=2,
axis=-1,
)
self.nodes_to_add.append(fused_node)

View File

@@ -0,0 +1,162 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Define SpaceToDepth fusion."""
import onnx
from ... import fusions, onnx_model
class FusionSpaceToDepth(fusions.Fusion):
"""Fusion for SpaceToDepth."""
def __init__(self, model: onnx_model.ONNXModel):
"""Initialize.
Args:
model: An onnx_model.ONNXModel instance.
"""
super().__init__(model, "SpaceToDepth", "Reshape")
def _fuse_yolo(
self,
node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""Fuse for early version of YOLO.
Pattern:
| [N, C, H, W]
Reshape
| [N, C, H/blk, blk, W/blk, blk]
Transpose
| [N, C, H/blk, W/blk, blk, blk]
Reshape
| [N, C, H/blk * W/blk, blk * blk]
Transpose
| [N, C, blk * blk, H/blk * W/blk]
Reshape
| [N, C, blk * blk, H/blk, W/blk]
Transpose
| [N, blk * blk, C, H/blk, W/blk]
Reshape
| [N, blk * blk * C, H/blk, W/blk]
This sequence can be fused into a single SpaceToDepth with blocksize `blk`. Note that unlike DepthToSpace,
which supports DCR or CRD mode, SpaceToDepth only supports DCR mode in its latest opset version (13), which
matches the pattern here.
"""
reshape_node1 = node
def get_target_child(parent_node, target_op_type):
"""Get target child of given node."""
if parent_node.output[0] not in input_name_to_nodes:
return None
children = input_name_to_nodes[parent_node.output[0]]
if len(children) > 1 or children[0].op_type != target_op_type:
return None
return children[0]
if (
(transpose_node1 := get_target_child(reshape_node1, "Transpose")) is None
or (reshape_node2 := get_target_child(transpose_node1, "Reshape")) is None
or (transpose_node2 := get_target_child(reshape_node2, "Transpose")) is None
or (reshape_node3 := get_target_child(transpose_node2, "Reshape")) is None
or (transpose_node3 := get_target_child(reshape_node3, "Transpose")) is None
or (reshape_node4 := get_target_child(transpose_node3, "Reshape")) is None
):
return False
def get_tensor_shape(tensor_name):
"""Get shape for given tensor name."""
tensor_type = self.model.get_tensor_type(tensor_name)
if not tensor_type:
return None
tensor_shape = self.tensor_shape_to_list(tensor_type)
if not tensor_shape:
return None
return tensor_shape
if (
(input_shape := get_tensor_shape(reshape_node1.input[0])) is None
or (reshape_shape1 := get_tensor_shape(reshape_node1.output[0])) is None
or (reshape_shape2 := get_tensor_shape(reshape_node2.output[0])) is None
or (reshape_shape3 := get_tensor_shape(reshape_node3.output[0])) is None
or (reshape_shape4 := get_tensor_shape(reshape_node4.output[0])) is None
):
return False
transpose_perm1 = self.get_node_attribute(transpose_node1, "perm")
transpose_perm2 = self.get_node_attribute(transpose_node2, "perm")
transpose_perm3 = self.get_node_attribute(transpose_node3, "perm")
# Check rank.
if (
len(input_shape) != 4
or len(reshape_shape1) != 6
or len(reshape_shape2) != 4
or len(reshape_shape3) != 5
or len(reshape_shape4) != 4
):
return False
# Check shape and perm.
batch, channel, height, width = input_shape
blocksize = reshape_shape1[3]
if (
reshape_shape1 != [batch, channel, height // blocksize, blocksize, width // blocksize, blocksize]
or transpose_perm1 != [0, 1, 2, 4, 3, 5]
or reshape_shape2 != [batch, channel, (height // blocksize) * (width // blocksize), blocksize**2]
or transpose_perm2 != [0, 1, 3, 2]
or reshape_shape3 != [batch, channel, blocksize**2, height // blocksize, width // blocksize]
or transpose_perm3 != [0, 2, 1, 3, 4]
or reshape_shape4 != [batch, blocksize**2 * channel, height // blocksize, width // blocksize]
):
return False
self.nodes_to_remove.extend(
[
reshape_node1,
transpose_node1,
reshape_node2,
transpose_node2,
reshape_node3,
transpose_node3,
reshape_node4,
]
)
s2d_node = onnx.helper.make_node(
self.fused_op_type,
name=self.create_unique_node_name(),
inputs=[reshape_node1.input[0]],
outputs=[reshape_node4.output[0]],
blocksize=blocksize,
)
self.nodes_to_add.append(s2d_node)
return True
def fuse(
self,
node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""Fuse a sequence of Reshape and Transpose nodes into a single SpaceToDepth node.
Args:
node: An onnx.NodeProto matching the specified search type (i.e., Reshape).
input_name_to_nodes: A dict mapping tensor name to consumed nodes.
output_name_to_node: A dict mapping tensor name to produced node.
"""
self._fuse_yolo(node, input_name_to_nodes, output_name_to_node)

View File

@@ -0,0 +1,413 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from dataclasses import dataclass
import onnx
from ...quant_utils import QuantType
from ...tensor_quant_overrides import QuantTypeInfo, TensorQuantOverridesHelper
@dataclass
class TensorTypeRequest:
"""
Bundles desired quantization type requests for a tensor. A distinction is made between the
produced type and the consumed type.
"""
# The tensor's quant type at the producer end. If None, assumed to be the default activation quant type.
producer: QuantTypeInfo | None
# The tensor's quant type received by a set of consumer nodes.
# If None, assumed to be the default activation quant type for all consumers.
# consumers[1] is a set of consumer node names.
consumers: tuple[QuantTypeInfo, set[str]] | None
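# A minimal sketch of a type request (assumes QuantTypeInfo can be constructed from just a QuantType):
# requesting that a tensor be produced as uint16 and consumed as uint16 by a node named "Op5" would
# look roughly like:
#   TensorTypeRequest(
#       producer=QuantTypeInfo(QuantType.QUInt16),
#       consumers=(QuantTypeInfo(QuantType.QUInt16), {"Op5"}),
#   )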
class MixedPrecisionTensorQuantOverridesFixer:
"""
Helper that generates tensor quantization overrides for mixed-precision QDQ models.
Specifically, this helper fixes an initial set of quantization overrides that assign a non-default
activation quantization type to one or more tensors by doing the following:
- Inferring which other tensors need to be overridden to the non-default activation quantization type.
- Inserting quantization data type conversions.
Example:
--------
Float model:
input_0 --> Op1 --> Op3 --> Op5 --> Op6 --> output_0
^
|
input_1 --> Op2 -+-> Op4 ----+
|
+-> Op7 --> output_1
|
+-> Op8 --> output_2
If we'd like to quantize this model to uint8 precision, but would like to make sure tensor "Op4_out"
is quantized to 16-bit, then we would specify the following initial tensor quantization overrides:
```
init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}
```
These initial overrides may not create a valid model because Op4 and Op5 may require both the input and output
to be the same type (e.g., uint16). This helper fixes the overrides so that input/output data types
are valid:
```
overrides = TensorQuantOverridesHelper(init_overrides)
fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(overrides, model, QuantType.QUInt8)
fixer.apply(
default_activation_qtype=QuantType.QUInt8,
default_activation_symmetric=False,
)
```
The above snippet generates the following "fixed" overrides (get via overrides.get_dict()):
{
"Op2_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op4"}}}],
"Op3_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op5"}}}],
"Op4_out": [{"quant_type": QUInt16}],
"Op5_out": [{"quant_type": QUInt16, "convert": {"quant_type": QUInt8, "recv_nodes": {"Op6"}}}]
}
How to interpret the fixed overrides:
- Op2's output is consumed by Op4, Op7, and Op8. Op4 consumes the converted u16 type,
but Op7 and Op8 consume the original u8 type.
- Op3's output is converted from u8 to u16. Op5 consumes the converted u16 type.
- Op4's output is just u16 (not converted). All consumers of Op4_out get the u16 type.
- Op5's output is converted from u16 to u8. Op6 consumes the u8 type.
"""
def __init__(
self,
overrides: TensorQuantOverridesHelper,
producers: dict[str, onnx.NodeProto],
consumers: dict[str, list[onnx.NodeProto]],
value_infos: dict[str, onnx.ValueInfoProto],
initializers: dict[str, onnx.TensorProto],
):
"""
Params:
overrides: The initial tensor quantization overrides to fix.
producers: Dictionary that maps a tensor name to the producer node that generates the tensor.
consumers: Dictionary that maps a tensor name to the consumer nodes that take the tensor as input.
value_infos: Dictionary that maps a tensor name to its onnx.ValueInfoProto.
initializers: Dictionary that maps an initializer name to its onnx.TensorProto.
"""
self.overrides = overrides
self.consumers = consumers
self.producers = producers
self.value_infos = value_infos
self.initializers = initializers
@staticmethod
def create_from_model(
overrides: TensorQuantOverridesHelper, model: onnx.ModelProto, default_activation_qtype: QuantType
) -> MixedPrecisionTensorQuantOverridesFixer:
"""
Helper function that creates an instance of this class from a loaded ONNX model.
Params:
overrides: The initial tensor quantization overrides to fix.
model: Loaded ONNX model
default_activation_qtype: The intended default activation quantization type.
Used to validate the initial overrides.
Returns:
Initialized MixedPrecisionTensorQuantOverridesFixer object
"""
model = onnx.shape_inference.infer_shapes(model) # Need to infer shapes to get value_infos
# Build dictionaries that enable convenient lookups of initializers and value_infos by name.
initializers = {initializer.name: initializer for initializer in model.graph.initializer}
value_infos = {vi.name: vi for vi in model.graph.value_info}
value_infos.update({ot.name: ot for ot in model.graph.output})
value_infos.update({it.name: it for it in model.graph.input})
# Ensure that the user-provided initial overrides are actually valid.
valid, err = overrides.is_valid(initializers, set(value_infos), default_activation_qtype)
if not valid:
pprint_overrides = overrides.pprint_str(indent=4)
logging.error(f"Provided invalid tensor quantization overrides:\n{pprint_overrides}")
raise ValueError(err)
consumers = {}
producers = {}
# Build dictionaries that map a tensor name to the consumer or producer nodes.
for node in model.graph.node:
for input_name in node.input:
if input_name:
if input_name not in consumers:
consumers[input_name] = []
consumers[input_name].append(node)
for output_name in node.output:
producers[output_name] = node
return MixedPrecisionTensorQuantOverridesFixer(overrides, producers, consumers, value_infos, initializers)
def apply(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
):
"""
Fixes the initial tensor quantization overrides (in-place) for use in mixed-precision QDQ models.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
"""
type_requests = self.get_desired_tensor_types(default_activation_qtype, default_activation_symmetric)
# Use type requests to "fix" tensor quantization overrides by adding
# quantization type conversions where necessary.
for tensor_name, type_req in type_requests.items():
all_consumers = {node.name for node in self.consumers.get(tensor_name, [])}
has_producer_req = type_req.producer is not None
has_consumer_req = bool(type_req.consumers)
# Only producer type: Add conversion back to default activation type
if has_producer_req and not has_consumer_req:
self._update_converted_tensor(
tensor_name, type_req.producer, QuantTypeInfo(default_activation_qtype), all_consumers
)
# Only consumers
elif not has_producer_req and has_consumer_req:
prod_type_info = self.overrides.get_node_output_qtype_info(tensor_name, default_activation_qtype)
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
if not self._check_nodes_are_not_convert_consumers(tensor_name, type_req.consumers[1]):
raise ValueError(
f"Tensor override for '{tensor_name}' converts the type for consumers that need the original type."
)
# Both producer and consumers
elif has_producer_req and has_consumer_req:
prod_type_info = type_req.producer
consumer_type_info = type_req.consumers[0]
if prod_type_info != consumer_type_info:
self._update_converted_tensor(
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
)
else:
consumers_for_original_type = all_consumers.difference(type_req.consumers[1])
if len(consumers_for_original_type) == 0:
# All consumers want the overridden type, so no need for convert nodes!
# Just add the override if not already present.
if tensor_name not in self.overrides:
self.overrides[tensor_name] = [{}]
prod_type_info.save_to_dict(self.overrides[tensor_name][0])
assert "convert" not in self.overrides[tensor_name][0]
else:
# Some consumers don't want the overridden type.
self._update_converted_tensor(
tensor_name,
prod_type_info,
QuantTypeInfo(default_activation_qtype),
consumers_for_original_type,
)
else:
raise ValueError(f"TypeRequest for tensor {tensor_name} has no producer or consumers.")
# Done. Check if the overrides are valid.
valid, err = self.overrides.is_valid(self.initializers, set(self.value_infos), default_activation_qtype)
if not valid:
pprint_overrides = self.overrides.pprint_str(indent=4)
logging.error(
f"Generated invalid tensor quantization overrides for mixed-precision QDQ model:\n{pprint_overrides}"
)
raise ValueError(err)
def get_desired_tensor_types(
self,
default_activation_qtype: QuantType,
default_activation_symmetric: bool,
) -> dict[str, TensorTypeRequest]:
"""
Iterates through the initial tensor quantization overrides and builds a set of TensorTypeRequest objects
that describe the quantization types required at each tensor. These TensorTypeRequest objects are ultimately
used to generate the "fixed" overrides.
Params:
default_activation_qtype: The intended default activation quantization type.
default_activation_symmetric: The intended default symmetry used to quantize activations.
Returns:
TensorTypeRequest objects as a dict that maps a tensor name to its requested types.
"""
type_requests = {}
default_activation_type_info = QuantTypeInfo(default_activation_qtype, default_activation_symmetric)
# Scan tensor overrides for type conversion requests.
for tensor_name, override_list in self.overrides.items():
if not self.__is_tensor_quantizable(tensor_name):
continue # Skip non-quantizable tensors (e.g., not a float)
if tensor_name in self.initializers:
continue # Skip initializers
if not override_list or len(override_list) > 1:
continue # Skip empty override lists and per-channel overrides
override_dict = override_list[0]
quant_type_info = QuantTypeInfo.load_from_dict(override_dict, default_activation_type_info.quant_type)
producer_node = self.producers.get(tensor_name) # None if this is a model input
if quant_type_info != default_activation_type_info and "convert" not in override_dict:
if producer_node is not None:
self._add_type_requests_for_node(type_requests, quant_type_info, producer_node)
# Find all consumer nodes of `tensor_name` and update their inputs/outputs to the new type.
for consumer_node in self.consumers.get(tensor_name, []):
self._add_type_requests_for_node(type_requests, quant_type_info, consumer_node)
return type_requests
def _add_type_requests_for_node(
self,
type_requests: dict[str, TensorTypeRequest],
quant_type_info: QuantTypeInfo,
node: onnx.NodeProto,
):
"""
Adds TensorTypeRequest objects for a given node, assuming that we want all its inputs and outputs
to have the same quantization type (as specified by the `quant_type_info` parameter).
Params:
type_requests: Dictionary of type requests to append to for this node.
quant_type_info: The quantization type to use for inputs and outputs.
node: The node for which the TensorTypeRequest objects are created and added to type_requests.
"""
# Add output side
for output_name in node.output:
if not self.__is_tensor_quantizable(output_name):
continue
if output_name not in type_requests:
type_requests[output_name] = TensorTypeRequest(quant_type_info, None)
else:
if (
type_requests[output_name].producer is not None
and type_requests[output_name].producer != quant_type_info
):
raise ValueError(f"Tensor {output_name} has multiple types.")
type_requests[output_name].producer = quant_type_info
# Add the consumer side
for input_name in node.input:
if input_name and input_name not in self.initializers and self.__is_tensor_quantizable(input_name):
if input_name not in type_requests:
type_requests[input_name] = TensorTypeRequest(None, None)
if type_requests[input_name].consumers is None:
type_requests[input_name].consumers = (quant_type_info, set())
if type_requests[input_name].consumers[0] != quant_type_info:
raise ValueError(f"Tensor {input_name} has consumers requesting different types.")
if not node.name:
raise ValueError(
f"Node of type {node.op_type} with output 0 {node.output[0]} does not have a name!"
)
type_requests[input_name].consumers[1].add(node.name)
def _update_converted_tensor(
self,
tensor_name: str,
producer_type_info: QuantTypeInfo,
consumer_type_info: QuantTypeInfo,
consumer_names: set[str],
):
"""
Updates the tensor quantization overrides for a tensor that is converted from one type to another.
Params:
tensor_name: The name of the tensor for which to update overrides.
producer_type_info: Info for the tensor's produced type.
consumer_type_info: Info for the tensor's consumed (i.e., converted) type.
consumer_names: Node names of consumers that consume the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
self.overrides[tensor_name] = [{}]
producer_type_info.save_to_dict(self.overrides[tensor_name][0])
overrides = self.overrides[tensor_name][0]
if producer_type_info != QuantTypeInfo.load_from_dict(overrides):
raise ValueError(f"Desired producer quant_type for {tensor_name} doesn't match existing type.")
if consumer_names:
if "convert" not in overrides:
overrides["convert"] = {}
consumer_type_info.save_to_dict(overrides["convert"])
convert_dict = overrides["convert"]
if consumer_type_info != QuantTypeInfo.load_from_dict(convert_dict):
raise ValueError(f"Desired consumer quant_type for {tensor_name} doesn't match existing type.")
if "recv_nodes" not in convert_dict:
convert_dict["recv_nodes"] = set()
convert_dict["recv_nodes"].update(consumer_names)
def _check_nodes_are_not_convert_consumers(self, tensor_name: str, node_names: set[str]):
"""
Returns true if the given nodes do not consume/receive a converted quantization type.
Params:
tensor_name: The name of the tensor to check.
node_names: Set of node names that should not be consumers of the converted type.
"""
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
return True
overrides = self.overrides[tensor_name][0]
if "convert" not in overrides:
return True
convert_dict = overrides["convert"]
if "recv_nodes" not in convert_dict:
return False
return not convert_dict["recv_nodes"].intersection(node_names)
def __is_tensor_quantizable(self, tensor_name):
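# A tensor is considered quantizable only if it is a float/float16 initializer or has
# float/float16 type information in the graph's value_info.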
weight = self.initializers.get(tensor_name)
if weight is not None:
if weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16):
return True
elif tensor_name in self.value_infos:
vi = self.value_infos[tensor_name]
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
onnx.TensorProto.FLOAT,
onnx.TensorProto.FLOAT16,
):
return True
return False

View File

@@ -0,0 +1,335 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import logging
from pathlib import Path
import onnx
from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed
from ....tools.remove_initializer_from_input import remove_initializer_from_input
from ...fusions import FusionGelu, FusionLayerNormalization
from ...onnx_model import ONNXModel
from ...quant_utils import save_and_reload_model_with_shape_infer
from .fusion_lpnorm import FusionLpNormalization
from .fusion_spacetodepth import FusionSpaceToDepth
def qnn_preprocess_model(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
exclude_initializer_from_input: bool = False,
fuse_layernorm: bool = False,
save_as_external_data: bool = False,
all_tensors_to_one_file: bool = False,
external_data_location: str | None = None,
external_data_size_threshold: int = 1024,
external_data_convert_attribute: bool = False,
inputs_to_make_channel_last: list[str] | None = None,
outputs_to_make_channel_last: list[str] | None = None,
dynamic_input_shapes: list[tuple[str, str]] | None = None,
) -> bool:
"""
If necessary, this method creates a new "pre-processed" model in preparation for
quantization of a model to be used with QNN EP. Returns true if a new model was created.
This method performs the following operations:
- Fuse Erf sequence into a single Gelu node.
- Fuse ReduceL2 sequence into a single LpNormalization node (p == 2).
- Fuse Reshape/Transpose sequence into a single SpaceToDepth node.
- (Optional) Fuse ReduceMean sequence into a single LayerNormalization node.
Args:
model_input: Path to the input model file or ModelProto.
model_output: Path to the output model file, which is only created if this method returns True.
exclude_initializer_from_input: A bool specifying whether to exclude initializer from input.
Defaults to False.
fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes.
Defaults to False.
save_as_external_data: True if output model should be saved with external data. Defaults to false.
all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false.
If true, save all tensors to one external file specified by external_data_location.
If false, save each tensor to a file named with the tensor name.
external_data_location: Effective only if save_as_external_data is true. Defaults to None.
Specify the external file to which all tensors are saved. Path is relative
to the model path. If not specified, the model's name is used.
external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024.
Tensors with a data size >= external_data_size_threshold are converted to external data.
To convert every tensor with raw data to external data, set to 0.
external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false.
If true, convert all tensors to external data.
If false, convert only non-attribute tensors to external data.
inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example,
if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it.
Original:
input0 (N, C, D1, D2, ..., Dn) --> <Nodes>
Updated:
input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> <Nodes>
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example,
if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's
shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it.
Original:
<Nodes> --> output0 (N, C, D1, D2, ..., Dn)
Updated:
<Nodes> --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C)
This can potentially improve inference latency for QDQ models running on QNN EP because the
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
to cancel out.
dynamic_input_shapes: A list of tuples, each specifying a model input name and its static shape in comma-separated
format, for example: [('input', '1,3,256,256')]. Defaults to None.
"""
modified = False
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
model = save_and_reload_model_with_shape_infer(model)
onnx_model = ONNXModel(model)
# Optionally, fix the dynamic input shapes.
if dynamic_input_shapes:
for input_name, input_shape_str in dynamic_input_shapes:
input_shape = [int(i) for i in input_shape_str.split(",")]
make_input_shape_fixed(onnx_model.graph(), input_name, input_shape)
fix_output_shapes(onnx_model.model)
modified = True
# Exclude initializer from input if model.ir_version >= 4
if exclude_initializer_from_input:
modified |= remove_initializer_from_input(onnx_model.model)
# Fuse Erf sequence into a single Gelu
fusion_gelu = FusionGelu(onnx_model)
if fusion_gelu.apply():
modified = True
# Fuse ReduceL2 sequence into a single LpNormalization node with p == 2.
fusion_lpnorm = FusionLpNormalization(onnx_model)
if fusion_lpnorm.apply():
modified = True
# Fuse Reshape/Transpose sequence into a single SpaceToDepth.
fusion_s2d = FusionSpaceToDepth(onnx_model)
if fusion_s2d.apply():
modified = True
# Optionally, fuse ReduceMean sequence into a single LayerNormalization node.
if fuse_layernorm:
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
# Need opset >= 17 to use LayerNormalization.
if onnx_opset.version < 17:
logging.warning(
"Unable to fuse ReduceMean sequence into a LayerNormalization node. "
"ONNX model must use an opset >= 17 in order to use LayerNormalization, "
f"but found version {onnx_opset.version}. Please use onnx.version_converter to update your model."
)
else:
fusion_layernorm = FusionLayerNormalization(onnx_model)
if fusion_layernorm.apply():
modified = True
# Optionally, transpose inputs and/or outputs to make them "channel-last".
if inputs_to_make_channel_last or outputs_to_make_channel_last:
transpose_node_prefix = "Transpose_channel_"
transpose_node_suffix: int = onnx_model.get_largest_node_name_suffix(transpose_node_prefix) + 1
update_io_to_channel_last(
onnx_model.model,
inputs_to_make_channel_last,
outputs_to_make_channel_last,
transpose_node_name_prefix=transpose_node_prefix,
transpose_node_name_start_suffix=transpose_node_suffix,
)
modified = True
# Make sure all nodes have a name.
unnamed_node_prefix = "qnn_preproc_node_"
available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1
for node in onnx_model.model.graph.node:
if node.op_type != "Constant" and not node.name:
new_node_name = f"{unnamed_node_prefix}{available_suffix!s}"
available_suffix += 1
node.name = new_node_name
modified = True
logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.")
if modified:
onnx_model.topological_sort()
onnx.save_model(
model,
model_output,
save_as_external_data=save_as_external_data,
all_tensors_to_one_file=all_tensors_to_one_file,
location=external_data_location,
size_threshold=external_data_size_threshold,
convert_attribute=external_data_convert_attribute,
)
return modified
class InputOutputNameMap:
def __init__(
self,
orig_tensor_names: set[str],
orig_graph_inputs: dict[str, onnx.ValueInfoProto],
orig_graph_outputs: dict[str, onnx.ValueInfoProto],
):
self.orig_tensor_names = orig_tensor_names
self.orig_graph_inputs = orig_graph_inputs
self.orig_graph_outputs = orig_graph_outputs
self.updated_io_names = {}
self.new_value_infos = []
def get_new_name(self, orig_name: str):
if orig_name in self.updated_io_names:
return self.updated_io_names[orig_name]
# Make a new tensor name that is unique among all tensors in the graph.
prefix: str = f"{orig_name}_channel_first_"
suffix: int = -1
for tensor_name in self.orig_tensor_names:
if tensor_name.startswith(prefix) and tensor_name[len(prefix) :].isdigit():
index = int(tensor_name[len(prefix) :])
suffix = max(suffix, index)
suffix += 1 # This is the first available suffix.
new_name = f"{prefix}{suffix!s}"
# Add new value_info objects for these new tensors.
orig_value_info = self.orig_graph_inputs.get(orig_name) or self.orig_graph_outputs[orig_name]
value_info_proto = onnx.ValueInfoProto()
value_info_proto.CopyFrom(orig_value_info)
value_info_proto.name = new_name
self.new_value_infos.append(value_info_proto)
self.updated_io_names[orig_name] = new_name
return self.updated_io_names[orig_name]
def update_io_to_channel_last(
model: onnx.ModelProto,
inputs_to_update: list[str] | None,
outputs_to_update: list[str] | None,
transpose_node_name_prefix: str = "Transpose_channel_",
transpose_node_name_start_suffix: int = 0,
):
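# Rewrites the selected graph inputs/outputs to a channel-last layout and inserts Transpose
# nodes so that the rest of the graph continues to consume/produce the original
# channel-first tensors.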
inputs_to_update = set(inputs_to_update or [])
outputs_to_update = set(outputs_to_update or [])
if not inputs_to_update and not outputs_to_update:
return
graph = model.graph
orig_graph_inputs = {ginput.name: ginput for ginput in graph.input}
orig_graph_outputs = {goutput.name: goutput for goutput in graph.output}
# Check that the user passed in actual input and output names.
for input_name in inputs_to_update:
if input_name not in orig_graph_inputs:
raise ValueError(f"{input_name} is not a graph input")
for output_name in outputs_to_update:
if output_name not in orig_graph_outputs:
raise ValueError(f"{output_name} is not a graph output")
orig_tensor_names = set()
orig_tensor_names.update(set(orig_graph_inputs))
orig_tensor_names.update(set(orig_graph_outputs))
orig_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
# Maps original input (or output) name to its updated name used within the graph.
io_map = InputOutputNameMap(orig_tensor_names, orig_graph_inputs, orig_graph_outputs)
# Update each node's inputs/outputs to use the transposed versions.
for node in graph.node:
for i in range(len(node.input)):
if node.input[i] and node.input[i] in inputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
elif node.input[i] and node.input[i] in outputs_to_update:
node.input[i] = io_map.get_new_name(node.input[i])
for i in range(len(node.output)):
if node.output[i] in outputs_to_update:
node.output[i] = io_map.get_new_name(node.output[i])
# Update graph inputs to channel-last and a Transpose (to channel-first) after each.
for g_input_name in inputs_to_update:
g_input = orig_graph_inputs[g_input_name]
if not g_input.type.HasField("tensor_type") or not g_input.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected input {g_input.name} to have a tensor_type with a shape")
input_shape = g_input.type.tensor_type.shape
input_rank = len(input_shape.dim)
if input_rank < 3:
raise ValueError(f"Expected input {g_input.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(input_shape.dim[1])
for i in range(1, input_rank - 1):
input_shape.dim[i].CopyFrom(input_shape.dim[i + 1])
input_shape.dim[input_rank - 1].CopyFrom(channel_dim)
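# Build the permutation [0, rank-1, 1, ..., rank-2] so that the inserted Transpose moves the
# (now last) channel dimension back to position 1, restoring the channel-first layout that
# the rest of the graph expects.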
transpose_perm = list(range(input_rank))
for i in range(input_rank):
transpose_perm[i] = i if i < 1 else i - 1
transpose_perm[1] = input_rank - 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[g_input.name],
outputs=[io_map.get_new_name(g_input.name)],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
# Update graph outputs to channel-last and a Transpose (from channel-first) before each.
for g_output_name in outputs_to_update:
g_output = orig_graph_outputs[g_output_name]
if not g_output.type.HasField("tensor_type") or not g_output.type.tensor_type.HasField("shape"):
raise ValueError(f"Expected output {g_output.name} to have a tensor_type with a shape")
output_shape = g_output.type.tensor_type.shape
output_rank = len(output_shape.dim)
if output_rank < 3:
raise ValueError(f"Expected output {g_output.name} to be of rank >= 3")
channel_dim = onnx.TensorShapeProto.Dimension()
channel_dim.CopyFrom(output_shape.dim[1])
for i in range(1, output_rank - 1):
output_shape.dim[i].CopyFrom(output_shape.dim[i + 1])
output_shape.dim[output_rank - 1].CopyFrom(channel_dim)
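# Build the permutation [0, 2, 3, ..., rank-1, 1] so that the inserted Transpose moves the
# channel dimension from position 1 to the end, producing the channel-last graph output.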
transpose_perm = list(range(output_rank))
for i in range(output_rank):
transpose_perm[i] = i if i == 0 else i + 1
transpose_perm[output_rank - 1] = 1
transpose_node = onnx.helper.make_node(
"Transpose",
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
inputs=[io_map.get_new_name(g_output.name)],
outputs=[g_output.name],
perm=transpose_perm,
)
transpose_node_name_start_suffix += 1
graph.node.extend([transpose_node])
graph.value_info.extend(io_map.new_value_infos)

View File

@@ -0,0 +1,406 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import copy
import logging
from pathlib import Path
from typing import Any
import numpy as np
import onnx
from ...calibrate import CalibrationDataReader, CalibrationMethod
from ...quant_utils import QuantType
from ...quantize import StaticQuantConfig
from ...tensor_quant_overrides import TensorQuantOverridesHelper
from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer
Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
OP_TYPES_TO_EXCLUDE = {"Cast"}
MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB
def warn_unable_to_override(
node: onnx.NodeProto,
what_str: str,
tensor_name: str,
io_kind: str,
):
logging.warning(
f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
"because it has already been overridden! Check the initial quantization overrides provided "
"to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
f"Node name: {node.name}, {io_kind} name: {tensor_name}"
)
def get_qnn_qdq_config(
model_input: str | Path | onnx.ModelProto,
calibration_data_reader: CalibrationDataReader,
calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
activation_type: QuantType = QuantType.QUInt8,
weight_type: QuantType = QuantType.QUInt8,
per_channel: bool = False,
init_overrides: dict[str, list[dict[str, Any]]] | None = None,
add_qtype_converts: bool = True,
activation_symmetric: bool = False,
weight_symmetric: bool | None = None,
keep_removable_activations: bool = False,
stride: int | None = None,
calibration_providers: list[str] | None = None,
op_types_to_quantize: list[str] | None = None,
nodes_to_exclude: list[str] | None = None,
) -> StaticQuantConfig:
"""
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
This is done primarily by setting tensor-level quantization overrides.
Params:
model_input: Path to the input model file or ModelProto.
calibration_data_reader: Calibration data reader.
calibrate_method: The calibration method. Defaults to MinMax.
activation_type: The default activation quantization type. Defaults to QUInt8.
weight_type: The default weight quantization type. Defaults to QUInt8.
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
and their quantization axes.
If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
- Conv:
- input[1] on axis 0
- input[2] (bias) on axis 0
- ConvTranspose:
- input[1] on axis 1
- input[2] (bias) on axis 0
init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates a copy
of these overrides with any necessary adjustments and includes them in the returned
configuration object (i.e., config.extra_options['TensorQuantOverrides']).
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
key must be present in the first dictionary for per-channel quantization.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` are also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` are also set. Only valid for initializers.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'convert' = Dict : A nested dictionary with the same keys for an activation
tensor that should be converted to another quantization type.
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
other nodes get the original type. If not specified,
assume all consumer nodes get the converted type.
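For example (illustrative), forcing a single activation tensor named "Op4_out" to 16-bit:
init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}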
add_qtype_converts: True if this function should automatically add "convert" entries to the provided
`init_overrides` to ensure that operators use valid input/output types (activations only).
Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
the zero-point values are 128 and 32,768, respectively.
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
be removed, and will be explicitly represented in the QDQ model. If false, these activations
are automatically removed if activations are asymmetrically quantized. Keeping these activations
is necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
[ "CPUExecutionProvider" ].
op_types_to_quantize: If set to None, all operator types will be quantized except for OP_TYPES_TO_EXCLUDE.
nodes_to_exclude: List of node names to exclude from quantization. The nodes in this list will be excluded from
quantization when it is not None.
Returns:
A StaticQuantConfig object
"""
if weight_symmetric is None:
weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
model = (
model_input
if isinstance(model_input, onnx.ModelProto)
else onnx.load_model(model_input, load_external_data=False)
)
op_types = set()
model_has_external_data = False
name_to_initializer = {}
# Build map of initializers (name -> initializer) and
# check if the model has external data.
for initializer in model.graph.initializer:
name_to_initializer[initializer.name] = initializer
if onnx.external_data_helper.uses_external_data(initializer):
model_has_external_data = True
overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
if not overrides_helper.empty() and add_qtype_converts:
# Fix mixed-precision overrides.
overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
overrides_helper, model, activation_type
)
overrides_fixer.apply(activation_type, activation_symmetric)
# Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
qnn_compat = QnnCompatibilityOverrides(
activation_type,
weight_type,
activation_symmetric,
weight_symmetric,
per_channel,
overrides_helper,
name_to_initializer,
)
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
nodes_to_exclude_set = set(nodes_to_exclude) if nodes_to_exclude else None
for node in model.graph.node:
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
continue
if nodes_to_exclude_set and node.name in nodes_to_exclude_set:
continue
op_types.add(node.op_type)
qnn_compat.process_node(node)
extra_options = {
"MinimumRealRange": 0.0001,
"DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
"QDQKeepRemovableActivations": keep_removable_activations,
"TensorQuantOverrides": overrides_helper.get_dict(),
"ActivationSymmetric": activation_symmetric,
"WeightSymmetric": weight_symmetric,
"CalibStridedMinMax": stride,
}
# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
# on Q/DQ operators if using 16-bit or 4-bit quantization.
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
if onnx_opset.version < 21:
opset21_types = Q16_TYPES.union(Q4_TYPES)
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
extra_options["UseQDQContribOps"] = True
return StaticQuantConfig(
calibration_data_reader,
calibrate_method=calibrate_method,
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=(
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(OP_TYPES_TO_EXCLUDE))
),
nodes_to_exclude=nodes_to_exclude,
per_channel=per_channel,
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
calibration_providers=calibration_providers,
extra_options=extra_options,
)
class QnnCompatibilityOverrides:
"""
Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
compatible with QNN EP.
"""
def __init__(
self,
default_activation_qtype: QuantType,
default_weight_qtype: QuantType,
activation_symmetric: bool,
weight_symmetric: bool,
per_channel: bool,
overrides: TensorQuantOverridesHelper,
initializers: dict[str, onnx.TensorProto],
):
self.default_activation_qtype = default_activation_qtype
self.default_weight_qtype = default_weight_qtype
self.activation_symmetric = activation_symmetric
self.weight_symmetric = weight_symmetric
self.per_channel = per_channel
self.overrides = overrides
self.initializers = initializers
self.process_fns = {
"MatMul": self._process_matmul,
"LayerNormalization": self._process_layernorm,
"Sigmoid": self._process_sigmoid,
"Tanh": self._process_tanh,
}
def process_node(self, node: onnx.NodeProto):
process_fn = self.process_fns.get(node.op_type)
if process_fn is not None:
process_fn(node)
def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
"""
Overrides initializer input(s) to use the default weight type if:
- The default weight type is 8-bit
- One of the inputs is a 16-bit activation
- The other input is an initializer (per-tensor quantized)
This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
inputs the default weight type. Instead, it assigns the default activation type.
"""
if self.default_weight_qtype not in Q8_TYPES:
return
input_16bit_act_name = None
input_weight_name = None
# Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
for i in range(2):
input_name = node.input[i]
if not input_name:
continue
is_weight = input_name in self.initializers
qtype_info = self.overrides.get_node_input_qtype_info(
input_name,
node.name,
default_qtype=None if is_weight else self.default_activation_qtype,
)
if qtype_info.axis is not None:
return # Don't process MatMul with a per-channel quantized input.
if (
is_weight
and qtype_info.quant_type == self.default_weight_qtype
and qtype_info.symmetric == self.weight_symmetric
):
return # Return. Weight is already overridden to use the desired weight type.
if is_weight:
input_weight_name = input_name
elif qtype_info.quant_type in Q16_TYPES:
input_16bit_act_name = input_name
# Override initializer input to use the default weight type.
if input_16bit_act_name and input_weight_name:
did_update = self.overrides.update_tensor_overrides(
input_weight_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
overwrite=False,
)
if not did_update:
warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
def _process_matmul(self, node: onnx.NodeProto):
assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
# QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
# quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
# provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
# the user did not provide any other overrides.
for input_name in node.input:
is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
if is_weight_no_overrides:
self.overrides.update_tensor_overrides(
input_name,
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
)
def _process_layernorm(self, node: onnx.NodeProto):
assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
if not self.per_channel:
self._make_static_inputs_use_default_weight_type(node)
return
has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
has_bias_no_overrides = (
len(node.input) > 2
and node.input[2]
and node.input[2] in self.initializers
and node.input[2] not in self.overrides
)
if has_weight_no_overrides or has_bias_no_overrides:
# TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
# tries to make it per-channel if the weight is also per-channel.
raise ValueError(
"get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
" Please try using custom overrides that make bias per-tensor quantized."
)
def _process_sigmoid(self, node: onnx.NodeProto):
"""
Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 65536.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
def _process_tanh(self, node: onnx.NodeProto):
"""
Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
"""
assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
output_type = self.overrides.get_node_output_qtype_info(
node.output[0], self.default_activation_qtype
).quant_type
if output_type == QuantType.QUInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(32768, dtype=np.uint16),
},
)
elif output_type == QuantType.QInt16:
self.overrides.update_tensor_overrides(
node.output[0],
{
"quant_type": output_type,
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
"zero_point": np.array(0, dtype=np.int16),
},
)
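# --- Editor's note: illustrative sketch, not part of the original file ---
# The fixed Sigmoid/Tanh overrides above pin the dequantized output range that the
# 16-bit QNN kernels expect, using value = scale * (q - zero_point). A minimal helper
# (hypothetical, for illustration only) makes the resulting ranges explicit:
def _dequantized_range(scale: float, zero_point: int, qmin: int, qmax: int) -> tuple[float, float]:
    """Return the (min, max) real values representable with the given quantization params."""
    return (scale * (qmin - zero_point), scale * (qmax - zero_point))
# _dequantized_range(1.0 / 65536.0, 0, 0, 65535)      -> (0.0, ~1.0)   # uint16 Sigmoid output
# _dequantized_range(1.0 / 32768.0, 32768, 0, 65535)  -> (-1.0, ~1.0)  # uint16 Tanh output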

View File

@@ -0,0 +1,4 @@
from .fusion import Fusion # noqa: F401
from .fusion_gelu import FusionGelu # noqa: F401
from .fusion_layernorm import FusionLayerNormalization # noqa: F401
from .replace_upsample_with_resize import ReplaceUpsampleWithResize # noqa: F401

View File

@@ -0,0 +1,311 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
from collections import deque
import onnx
from ..onnx_model import ONNXModel
class Fusion:
"""
Base class for fusions.
"""
def __init__(self, model: ONNXModel, fused_op_type: str, search_op_type: str):
self.search_op_type: str = search_op_type
self.fused_op_type: str = fused_op_type
self.model: ONNXModel = model
self.nodes_to_remove: list = []
self.nodes_to_add: list = []
self._new_node_name_prefix = self.fused_op_type + "_fused_" + self.search_op_type + "_"
self._new_node_name_suffix = None # int|None used to create unique node names for the fused ops.
def fuse(
self,
node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function for derived fusion classes. Tries to fuse a node sequence containing
the specified node.
"""
raise NotImplementedError
def apply(self) -> bool:
"""
Apply graph fusion on the entire model graph.
"""
input_name_to_nodes = self.model.input_name_to_nodes()
output_name_to_node = self.model.output_name_to_node()
for node in self.model.nodes():
if node.op_type == self.search_op_type:
self.fuse(node, input_name_to_nodes, output_name_to_node)
self.model.remove_nodes(self.nodes_to_remove)
self.model.add_nodes(self.nodes_to_add)
graph_updated = bool(self.nodes_to_remove or self.nodes_to_add)
if graph_updated:
self.model.remove_unused_constant()
return graph_updated
def create_unique_node_name(self):
prefix = self._new_node_name_prefix
if self._new_node_name_suffix is None:
largest_suffix: int = self.model.get_largest_node_name_suffix(prefix)
self._new_node_name_suffix = largest_suffix + 1
new_name = f"{prefix}{self._new_node_name_suffix!s}"
self._new_node_name_suffix += 1
return new_name
@staticmethod
def is_safe_to_fuse_nodes(
nodes_to_remove: list[onnx.NodeProto],
keep_outputs: list[str],
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
for node_to_remove in nodes_to_remove:
for output_to_remove in node_to_remove.output:
if output_to_remove in keep_outputs:
continue
if output_to_remove in input_name_to_nodes:
for impacted_node in input_name_to_nodes[output_to_remove]:
if impacted_node not in nodes_to_remove:
# Not safe to remove nodes since output is used by impacted_node
return False
return True
@staticmethod
def get_node_attribute(node: onnx.NodeProto, attribute_name: str):
for attr in node.attribute:
if attr.name == attribute_name:
value = onnx.helper.get_attribute_value(attr)
return value
return None
@staticmethod
def input_index(node_output: str, child_node: onnx.NodeProto) -> int:
for index, input_name in enumerate(child_node.input):
if input_name == node_output:
return index
return -1
@staticmethod
def tensor_shape_to_list(tensor_type) -> list[int]:
shape_list = []
for d in tensor_type.shape.dim:
if d.HasField("dim_value"):
shape_list.append(d.dim_value) # known dimension
elif d.HasField("dim_param"):
shape_list.append(d.dim_param) # unknown dimension with symbolic name
else:
shape_list.append("?") # shall not happen
return shape_list
def get_constant_input(self, node: onnx.NodeProto):
for i, inp in enumerate(node.input):
value = self.model.get_constant_value(inp)
if value is not None:
return i, value
return None, None
def find_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> int:
i, value = self.get_constant_input(node)
if value is not None and value.size == 1 and abs(value - expected_value) < delta:
return i
return -1
def has_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> bool:
return self.find_constant_input(node, expected_value, delta) >= 0
def is_constant_with_specified_rank(self, output_name: str, rank: int) -> bool:
value = self.model.get_constant_value(output_name)
if value is None:
return False # Not an initializer
if len(value.shape) != rank:
return False # Wrong dimensions
return True
def match_first_parent(
self,
node: onnx.NodeProto,
parent_op_type: str,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
exclude: list[onnx.NodeProto] = [], # noqa: B006
) -> tuple[onnx.NodeProto | None, int | None]:
"""
Find parent node based on constraints on op_type.
Args:
node: current node.
parent_op_type (str): constraint of parent node op_type.
output_name_to_node (dict): dictionary with output name as key, and node as value.
exclude (list): list of nodes that are excluded (not allowed to match as parent).
Returns:
parent: The matched parent node. None if not found.
index: The input index of matched parent node. None if not found.
"""
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
for i, inp in enumerate(node.input):
if inp in output_name_to_node:
parent = output_name_to_node[inp]
if parent.op_type == parent_op_type and parent not in exclude:
return parent, i
return None, None
def match_parent(
self,
node: onnx.NodeProto,
parent_op_type: str,
input_index: int | None = None,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
exclude: list[onnx.NodeProto] = [], # noqa: B006
return_indice: list[int] | None = None,
) -> onnx.NodeProto | None:
"""
Find parent node based on constraints on op_type and index.
When input_index is None, we will find the first parent node based on constraints,
and the corresponding input index will be appended to return_indice.
Args:
node (NodeProto): current node.
parent_op_type (str): constraint of parent node op_type.
input_index (int or None): only check the parent given input index of current node.
output_name_to_node (dict): dictionary with output name as key, and node as value.
exclude (list): list of nodes that are excluded (not allowed to match as parent).
return_indice (list): a list to append the input index when input_index is None.
Returns:
parent: The matched parent node.
"""
assert node is not None
assert input_index is None or input_index >= 0
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
if input_index is None:
parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node, exclude)
if return_indice is not None:
return_indice.append(index)
return parent
if input_index >= len(node.input):
# Input index out of bounds.
return None
parent = self.model.get_parent(node, input_index, output_name_to_node)
if parent is not None and parent.op_type == parent_op_type and parent not in exclude:
return parent
return None
def match_parent_path(
self,
node: onnx.NodeProto,
parent_op_types: list[str],
parent_input_index: list[int] | None = None,
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
return_indice: list[int] | None = None,
) -> list[onnx.NodeProto] | None:
"""
Find a sequence of input edges based on constraints on parent op_type and index.
When input_index is None, we will find the first parent node based on constraints,
and the corresponding input index will be appended to return_indice.
Args:
node (NodeProto): current node.
parent_op_types (list): constraint of parent node op_type of each input edge.
parent_input_index (list): constraint of input index of each input edge. None means no constraint.
output_name_to_node (dict): dictionary with output name as key, and node as value.
return_indice (list): a list to which the input index is appended when there is no constraint on the input index of an edge.
Returns:
parents: a list of matched parent nodes.
"""
if parent_input_index is not None:
assert len(parent_input_index) == len(parent_op_types)
if output_name_to_node is None:
output_name_to_node = self.model.output_name_to_node()
current_node = node
matched_parents = []
for i, op_type in enumerate(parent_op_types):
matched_parent = self.match_parent(
current_node,
op_type,
parent_input_index[i] if parent_input_index is not None else None,
output_name_to_node,
exclude=[],
return_indice=return_indice,
)
if matched_parent is None:
return None
matched_parents.append(matched_parent)
current_node = matched_parent
return matched_parents
def match_parent_paths(
self,
node: onnx.NodeProto,
paths: list[tuple[list[str], list[int]]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> tuple[int, list[onnx.NodeProto] | None, list[int] | None]:
"""
Find a matching parent path to the given node.
"""
for i, path in enumerate(paths):
return_indice = []
matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice)
if matched:
return i, matched, return_indice
return -1, None, None
def find_first_child_by_type(
self,
node: onnx.NodeProto,
child_type: str,
input_name_to_nodes: dict[str, list[onnx.NodeProto]] | None = None,
recursive: bool = True,
) -> onnx.NodeProto | None:
children = self.model.get_children(node, input_name_to_nodes)
dq = deque(children)
while len(dq) > 0:
current_node = dq.pop()
if current_node.op_type == child_type:
return current_node
if recursive:
children = self.model.get_children(current_node, input_name_to_nodes)
for child in children:
dq.appendleft(child)
return None
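# --- Editor's note: illustrative sketch, not part of the original file ---
# How a derived fusion typically uses the helpers above: match a parent path from the
# searched node, verify the subgraph is safe to fuse, then queue removals/additions for
# apply() to commit. The op types and class name below are hypothetical placeholders.
class _ExampleFusion(Fusion):
    def __init__(self, model: ONNXModel):
        super().__init__(model, "FusedOp", "Add")  # hypothetical fused op / search op

    def fuse(self, node, input_name_to_nodes, output_name_to_node):
        # Look for the pattern Sub -> Mul -> Add ending at `node`.
        parents = self.match_parent_path(node, ["Mul", "Sub"], [0, 0], output_name_to_node)
        if parents is None:
            return
        subgraph_nodes = [*parents, node]
        if not self.is_safe_to_fuse_nodes(subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node):
            return
        self.nodes_to_remove.extend(subgraph_nodes)
        fused = onnx.helper.make_node(
            "FusedOp",
            name=self.create_unique_node_name(),
            inputs=[parents[-1].input[0]],
            outputs=[node.output[0]],
        )
        self.nodes_to_add.append(fused)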

View File

@@ -0,0 +1,272 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ..onnx_model import ONNXModel
from .fusion import Fusion
class FusionGelu(Fusion):
def __init__(self, model: ONNXModel):
super().__init__(model, "Gelu", "Erf")
def fuse(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing an Erf node into a single
Gelu node.
"""
if (
self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node)
or self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node)
or self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node)
):
self.model.set_opset_import("com.microsoft", 1)
def fuse_1(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a PyTorch model.
Fuse Gelu with Erf into one node:
Pattern 1:
+-------Mul(0.5)---------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->
(B=1.4142...) (1)
Pattern 2:
+------------------------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->Mul -->
(B=1.4142...) (1) (0.5)
Note that the constant input for Add and Mul can be either the first or second input, i.e., either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_after_erf = children[0]
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return False
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
return False
subgraph_input = div.input[0]
another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
if subgraph_input == mul_after_erf.input[another]: # pattern 2
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_half = children[0]
if not self.has_constant_input(mul_half, 0.5):
return False
subgraph_output = mul_half.output[0]
else: # pattern 1
mul_half = self.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
if mul_half is None:
return False
if not self.has_constant_input(mul_half, 0.5):
return False
if subgraph_input not in mul_half.input:
return False
subgraph_output = mul_after_erf.output[0]
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[subgraph_output]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True
def fuse_2(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a Keras model.
Fuse Gelu with Erf into one node:
+------------------------------------------+
| |
| v
[root] --> Div -----> Erf --> Add --> Mul -->Mul
(B=1.4142...) (A=1) (A=0.5)
Note that the constant input for Add and Mul can be either the first or second input, i.e., either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_after_erf = children[0]
if not self.has_constant_input(mul_after_erf, 0.5):
return False
if mul_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[mul_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul = children[0]
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
if div is None:
return False
sqrt_node = None
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
sqrt_node = self.match_parent(div, "Sqrt", 1, output_name_to_node)
if sqrt_node is None:
return False
if not self.has_constant_input(sqrt_node, 2.0):
return False
subgraph_input = div.input[0]
if subgraph_input not in mul.input:
return False
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul]
if sqrt_node:
subgraph_nodes.append(sqrt_node)
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[mul.output[0]]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True
def fuse_3(
self,
erf_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
) -> bool:
"""
This pattern is from a TensorFlow model.
Fuse Gelu with Erf into one node:
+----------------------------------------------+
| |
| v
[root] --> Mul -----> Erf --> Add --> Mul -->Mul
(A=0.7071067690849304) (B=1) (B=0.5)
Note that the constant input for Add and Mul can be either the first or second input, i.e., either A=0.5 or B=0.5 is fine.
"""
if erf_node.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[erf_node.output[0]]
if len(children) != 1 or children[0].op_type != "Add":
return False
add_after_erf = children[0]
if not self.has_constant_input(add_after_erf, 1):
return False
if add_after_erf.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[add_after_erf.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
mul_half = children[0]
if not self.has_constant_input(mul_half, 0.5):
return False
first_mul = self.match_parent(erf_node, "Mul", 0, output_name_to_node)
if first_mul is None:
return False
i = self.find_constant_input(first_mul, 0.7071067690849304, delta=0.001)
if i < 0:
return False
root_input_index = 1 - i
subgraph_input = first_mul.input[root_input_index]
if mul_half.output[0] not in input_name_to_nodes:
return False
children = input_name_to_nodes[mul_half.output[0]]
if len(children) != 1 or children[0].op_type != "Mul":
return False
last_mul = children[0]
if not (last_mul.input[0] == subgraph_input or last_mul.input[1] == subgraph_input):
return False
subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
if not self.is_safe_to_fuse_nodes(
subgraph_nodes,
[last_mul.output[0]],
input_name_to_nodes,
output_name_to_node,
):
return False
self.nodes_to_remove.extend(subgraph_nodes)
fused_node = onnx.helper.make_node(
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[last_mul.output[0]]
)
fused_node.domain = "com.microsoft"
self.nodes_to_add.append(fused_node)
return True
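# --- Editor's note: illustrative usage sketch, not part of the original file ---
# A minimal driver for the fusion above; the model path is a hypothetical placeholder
# and the trailing save_model_to_file defaults are assumed.
def _example_fuse_gelu(model_path: str = "model.onnx") -> None:
    model = ONNXModel(onnx.load(model_path))
    if FusionGelu(model).apply():   # rewrites matched Erf subgraphs into com.microsoft Gelu nodes
        model.topological_sort()    # keep node ordering valid after the rewrite
    model.save_model_to_file(model_path.replace(".onnx", "_fused.onnx"))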

View File

@@ -0,0 +1,135 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import onnx
from ..onnx_model import ONNXModel
from .fusion import Fusion
class FusionLayerNormalization(Fusion):
def __init__(self, model: ONNXModel):
super().__init__(model, "LayerNormalization", "ReduceMean")
def fuse(
self,
reduce_mean_node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""
Interface function that tries to fuse a node sequence containing a ReduceMean node into a single
LayerNormalization node.
+----------------------+
| |
| v
[Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
(axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^
| |
+-------------------------------------------------+
It also handles cases of duplicated Sub nodes exported from older versions of PyTorch:
+----------------------+
| v
| +-------> Sub-----------------------------------------------+
| | |
| | v
[Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
| ^
| |
+----------------------+
"""
children = self.model.get_children(reduce_mean_node, input_name_to_nodes)
if len(children) == 0 or len(children) > 2:
return
root_input = reduce_mean_node.input[0]
if children[0].op_type != "Sub" or children[0].input[0] != root_input:
return
if len(children) == 2:
if children[1].op_type != "Sub" or children[1].input[0] != root_input:
return
div_node = None
for child in children:
div_node = self.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
if div_node is not None:
break
if div_node is None:
return
path_id, parent_nodes, _ = self.match_parent_paths(
div_node,
[
(["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
(
["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
[1, 0, 0, 0, 0, 0],
),
],
output_name_to_node,
)
if path_id < 0:
return
sub_node = parent_nodes[-1]
if sub_node not in children:
return
second_add_node = parent_nodes[1]
i, add_weight = self.get_constant_input(second_add_node)
if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
# Skip fusion since epsilon value is not expected.
return
pow_node = parent_nodes[3]
if self.find_constant_input(pow_node, 2.0) != 1:
return
mul_node = input_name_to_nodes[div_node.output[0]][0]
if mul_node.op_type != "Mul":
return
last_add_node = input_name_to_nodes[mul_node.output[0]][0]
if last_add_node.op_type != "Add":
return
subgraph_nodes = [reduce_mean_node]
subgraph_nodes.extend(children)
subgraph_nodes.extend(parent_nodes[:-1])
subgraph_nodes.extend([last_add_node, mul_node, div_node])
if not self.is_safe_to_fuse_nodes(
subgraph_nodes,
last_add_node.output,
input_name_to_nodes,
output_name_to_node,
):
return
weight_input = mul_node.input[1 - self.input_index(div_node.output[0], mul_node)]
if not self.is_constant_with_specified_rank(weight_input, 1):
return
bias_input = last_add_node.input[1 - self.input_index(mul_node.output[0], last_add_node)]
if not self.is_constant_with_specified_rank(bias_input, 1):
return
self.nodes_to_remove.extend(subgraph_nodes)
normalize_node = onnx.helper.make_node(
"LayerNormalization",
name=self.create_unique_node_name(),
inputs=[reduce_mean_node.input[0], weight_input, bias_input],
outputs=[last_add_node.output[0]],
)
normalize_node.attribute.extend([onnx.helper.make_attribute("epsilon", float(add_weight))])
self.nodes_to_add.append(normalize_node)

View File

@@ -0,0 +1,96 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import numpy as np
import onnx
from ..onnx_model import ONNXModel
from .fusion import Fusion
class ReplaceUpsampleWithResize(Fusion):
"""Replace Upsample with Resize."""
def __init__(self, model: ONNXModel, opset):
"""Initialize."""
super().__init__(model, "Resize", "Upsample")
self.opset = opset
def fuse(
self,
node: onnx.NodeProto,
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
output_name_to_node: dict[str, onnx.NodeProto],
):
"""Replace Upsample with Resize."""
mode = None
for attr in node.attribute:
if attr.name == "mode":
mode = attr.s.decode("utf-8")
break
scales_input = None
if self.opset > 7:
scales_input = node.input[1] if len(node.input) > 1 else ""
resize_inputs = [node.input[0], node.name + "_roi", scales_input]
else:
if self.opset == 7:
for attr in node.attribute:
if attr.name == "scales":
scales_input = attr.floats
break
scales_input = np.array(list(scales_input), np.float32)
else:
h_scale = 1
w_scale = 1
for attr in node.attribute:
if attr.name == "height_scale":
h_scale = attr.float
elif attr.name == "width_scale":
w_scale = attr.float
scales_input = np.array([1, 1, h_scale, w_scale], np.float32)
scales_tensor = onnx.helper.make_tensor(
name=node.name + "_scales",
data_type=onnx.TensorProto.FLOAT,
dims=scales_input.shape,
vals=scales_input.flatten().tolist(),
)
scales_node = onnx.helper.make_node(
"Constant", inputs=[], outputs=[node.name + "_scales"], value=scales_tensor
)
self.nodes_to_add.append(scales_node)
resize_inputs = [node.input[0], node.name + "_roi", node.name + "_scales"]
roi_tensor = onnx.helper.make_tensor(
name=node.name + "_roi",
data_type=onnx.TensorProto.FLOAT,
dims=(len(scales_input) * 2,),
vals=[0] * len(scales_input) + [1] * len(scales_input),
)
roi_node = onnx.helper.make_node("Constant", inputs=[], outputs=[node.name + "_roi"], value=roi_tensor)
resize_node = onnx.helper.make_node(
op_type="Resize", inputs=resize_inputs, outputs=node.output, mode=mode, nearest_mode="floor"
)
self.nodes_to_remove.append(node)
self.nodes_to_add.append(roi_node)
self.nodes_to_add.append(resize_node)
def apply(self) -> bool:
"""Apply."""
if super().apply():
self.model.topological_sort()
return True
return False
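# --- Editor's note: illustrative usage sketch, not part of the original file ---
# Rewriting deprecated Upsample nodes in a loaded model; the path is a hypothetical
# placeholder, and the default-domain opset version is read from the model itself.
def _example_replace_upsample(model_path: str = "model.onnx") -> None:
    proto = onnx.load(model_path)
    opset = next((op.version for op in proto.opset_import if op.domain in ("", "ai.onnx")), 1)
    ReplaceUpsampleWithResize(ONNXModel(proto), opset).apply()  # sorts topologically on success
    onnx.save(proto, model_path.replace(".onnx", "_resize.onnx"))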

View File

@@ -0,0 +1,239 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import logging
import os
import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
from onnxruntime.capi._pybind_state import quantize_matmul_bnb4
from .onnx_model import ONNXModel
from .quant_utils import attribute_to_kwarg
logger = logging.getLogger(__name__)
class MatMulBnb4Quantizer:
"""Perform 4b quantization of constant MatMul weights using FP4 or NF4 data type"""
##################
# quantization types, must be consistent with native code type
# Bnb_DataType_t defined in blockwise_quant_block_bnb4.h
# 4b floating point with bias of 3
FP4 = 0
# 4b NormalFloat
NF4 = 1
def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to_exclude=None):
nodes_to_exclude = nodes_to_exclude or []
assert quant_type in [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4]
self.model = ONNXModel(model)
self.quant_type = quant_type
self.block_size = block_size
self.nodes_to_exclude = set(nodes_to_exclude)
@staticmethod
def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray:
"""4b quantize fp32/fp16 weight"""
if len(fpweight.shape) != 2:
raise ValueError("Current bnb4 block quantization only supports 2D tensors!")
# need to copy since the transposed weight still has the original memory layout
# Linear4bit quantizes its weight data which is the transposed weight
fpweight_t = fpweight.transpose().copy()
rows, cols = fpweight.shape
numel = rows * cols
block_size = self.block_size
num_blocks = (numel + block_size - 1) // block_size
quantized_numel = (numel + 1) // 2
packed = np.zeros(quantized_numel, dtype="uint8")
absmax = np.zeros(num_blocks, dtype=fpweight.dtype)
# block wise quantization, fpweight_t is flattened and divided into blocks
quantize_matmul_bnb4(packed, fpweight_t, absmax, block_size, self.quant_type, cols, rows)
return (packed, absmax)
def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto:
"""If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""
if node.op_type != "MatMul":
return node # only care about MatMul for now
logger.debug(f"start to quantize {node.name} ...")
if node.name in self.nodes_to_exclude:
logger.debug(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
return node
inputB = node.input[1] # noqa: N806
B, Bs_graph = MatMulBnb4Quantizer.__get_initializer(inputB, graph_stack) # noqa: N806
if B is None:
logger.debug("MatMul doesn't have const weight. Skip to quantize")
return node # only care about constant weight
B_array = onnx.numpy_helper.to_array(B) # noqa: N806
if len(B_array.shape) != 2:
logger.debug("MatMul weight is not 2D. Skip to quantize")
return node # can only process 2-D matrix
packed, absmax = self.bnb4_block_quant(B_array)
B_quant = onnx.numpy_helper.from_array(packed) # noqa: N806
B_quant.name = B.name + "_Bnb4"
for input in Bs_graph.input:
if input.name == inputB:
Bs_graph.input.remove(input)
break
absmax_tensor = onnx.numpy_helper.from_array(absmax)
absmax_tensor.name = B.name + "_absmax"
Bs_graph.initializer.extend([B_quant, absmax_tensor])
kwargs = {}
rows, cols = B_array.shape
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["block_size"] = self.block_size
kwargs["quant_type"] = self.quant_type
matmul_bnb4_node = onnx.helper.make_node(
"MatMulBnb4",
inputs=[node.input[0], B_quant.name, absmax_tensor.name],
outputs=[node.output[0]],
name=node.name + "_Bnb4" if node.name else "",
domain="com.microsoft",
**kwargs,
)
logger.debug(f"complete quantization of {node.name} ...")
return matmul_bnb4_node
def _process_subgraph(self, graph_stack: list[GraphProto]):
new_nodes = []
graph = graph_stack[-1]
for node in graph.node:
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if graph_attrs:
kwargs = {}
for attr in node.attribute:
if attr.type == onnx.AttributeProto.GRAPH:
# recursive call to take care of sub-graph
graph_stack.append(attr.g)
kv = {attr.name: self._process_subgraph(graph_stack)}
elif attr.type == onnx.AttributeProto.GRAPHS:
value = []
for subgraph in attr.graphs:
# recursive call to take care of sub-graph
graph_stack.append(subgraph)
value.extend([self._process_subgraph(graph_stack)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx.helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
new_nodes.append(self._bnb4_matmul_node_weight(node, graph_stack))
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_stack.pop()
return graph
def process(self):
# use a stack to keep track of sub-graphs
graph_stack = [self.model.graph()]
opset_import = self.model.opset_import()
has_ms_domain = False
for opset in opset_import:
if opset.domain == "com.microsoft":
has_ms_domain = True
if not has_ms_domain:
opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])
self._process_subgraph(graph_stack)
self.model.clean_initializers()
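# --- Editor's note: illustrative usage sketch, not part of the original file ---
# Programmatic use of the quantizer above, mirroring the __main__ entry point further
# down; the file paths are hypothetical placeholders.
def _example_quantize_bnb4(input_path: str = "model.onnx", output_path: str = "model_bnb4.onnx") -> None:
    fp_model = onnx.load(input_path)
    quantizer = MatMulBnb4Quantizer(fp_model, MatMulBnb4Quantizer.NF4, block_size=64)
    quantizer.process()
    quantizer.model.save_model_to_file(output_path, True)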
def parse_args():
parser = argparse.ArgumentParser(
description="""Blockwise FP4/NF4 quantization for MatMul 2D weight matrices.
A weight matrix is partitioned into blocks, where each block is a contiguous
subset inside the flattened transposed weight matrix. Each block is quantized
into a set of 4b integers with an absolute value scaling factor.
"""
)
parser.add_argument("--input_model", required=True, help="Path to the input model file")
parser.add_argument("--output_model", required=True, help="Path to the output model file")
parser.add_argument(
"--quant_type",
required=False,
default=1,
type=int,
choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4],
help="Quantization data type. 0: FP4, 1: NF4",
)
parser.add_argument(
"--block_size",
required=False,
default=64,
type=int,
help="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64",
)
parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
type=str,
required=False,
default=[],
help="Specify the nodes to be excluded from quantization with node names",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
if args.verbose:
logger.setLevel(logging.DEBUG)
input_model_path = args.input_model
output_model_path = args.output_model
if os.path.exists(output_model_path):
logger.error(f"file {output_model_path} already exists")
raise Exception(f"file {output_model_path} already exists")
model = onnx.load(input_model_path)
quant = MatMulBnb4Quantizer(model, args.quant_type, args.block_size, nodes_to_exclude=args.nodes_to_exclude)
quant.process()
quant.model.save_model_to_file(output_model_path, True)

View File

@@ -0,0 +1 @@
from .weight_only import gptq_quantize, rtn_quantize # noqa: F401

View File

@@ -0,0 +1,80 @@
#
# The implementation of this file is based on:
# https://github.com/intel/neural-compressor/tree/master/neural_compressor
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper classes or functions for onnxrt adaptor."""
import importlib
import logging
import numpy as np
logger = logging.getLogger("neural_compressor")
MAXIMUM_PROTOBUF = 2147483648
def simple_progress_bar(total, i):
"""Progress bar for cases where tqdm can't be used."""
progress = i / total
bar_length = 20
bar = "#" * int(bar_length * progress)
spaces = " " * (bar_length - len(bar))
percentage = progress * 100
print(f"\rProgress: [{bar}{spaces}] {percentage:.2f}%", end="")
def find_by_name(name, item_list):
"""Helper function to find item by name in a list."""
items = []
for item in item_list:
assert hasattr(item, "name"), f"{item} should have a 'name' attribute defined" # pragma: no cover
if item.name == name:
items.append(item)
if len(items) > 0:
return items[0]
else:
return None
def to_numpy(data):
"""Convert to numpy ndarrays."""
import torch # noqa: PLC0415
if not isinstance(data, np.ndarray):
if not importlib.util.find_spec("torch"):
logger.error(
"Please install torch to enable subsequent data type check and conversion, "
"or reorganize your data format to numpy array."
)
exit(0)
if isinstance(data, torch.Tensor):
if data.dtype is torch.bfloat16: # pragma: no cover
return data.detach().cpu().to(torch.float32).numpy()
if data.dtype is torch.chalf: # pragma: no cover
return data.detach().cpu().to(torch.cfloat).numpy()
return data.detach().cpu().numpy()
else:
try:
return np.array(data)
except Exception:
assert False, ( # noqa: B011
f"The input data for onnx model is {type(data)}, which is not supported to convert to numpy ndarrays."
)
else:
return data

View File

@@ -0,0 +1,932 @@
#
# The implementation of this file is based on:
# https://github.com/intel/neural-compressor/tree/master/neural_compressor
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modifications:
# Add k-quant quantization method.
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""WeightOnly for onnxrt adaptor."""
import copy
import logging
import os
import sys
import numpy as np
import onnx
from onnx import numpy_helper
from onnx.helper import np_dtype_to_tensor_dtype
import onnxruntime as ort
from .onnx_model import ONNXModel
from .util import simple_progress_bar
logger = logging.getLogger("neural_compressor")
def make_matmul_weight_only_node(
node,
weight_shape,
num_bits,
group_size,
k_blocks,
q_weight,
scale,
zero_point,
accuracy_level=0,
): # pragma: no cover
"""Build MatMulNBits node.
Args:
node: original matmul node
weight_shape: original weight shape
num_bits (int): num_bits
group_size (int): how many elements share one scale/zp
k_blocks (int): block number
q_weight (array): quantized weight
scale (array): scale
zero_point (array): zero point
accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
Returns:
matmul_weight_only_node: MatMulNBits node
new_inits: initializers of the new node
"""
blob_size = group_size * num_bits // 8
packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8")
q_weight_name = node.input[1] + f"_Q{num_bits!s}G{group_size!s}"
input_names = [node.input[0], q_weight_name]
new_inits = []
kwargs = {}
op_type = "MatMulNBits"
# pack quantized weight
if num_bits == 4:
q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4
packed[:, :] = q_weight_pairs[:, :blob_size]
elif num_bits == 8:
packed = q_weight
else:
logger.error(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.")
packed = np.reshape(packed, (-1, k_blocks, blob_size))
# build scale tensor
scale = np.reshape(scale, (-1, k_blocks))
assert scale.dtype == np.float32 or scale.dtype == np.float16
scale_tensor = onnx.helper.make_tensor(
name=node.input[1] + "_scale",
data_type=np_dtype_to_tensor_dtype(scale.dtype),
dims=scale.shape,
vals=scale.tobytes(),
raw=True,
)
input_names.append(scale_tensor.name)
new_inits.append(scale_tensor)
# build zero_point tensor
if zero_point is not None:
if num_bits == 8:
packed_zp = zero_point.astype("uint8")
elif num_bits == 4:
# For 4-bit case, the default zeros is 0x8. So it is 0x88 = 136 if we fill lower/higher 4 bits with 0x8.
packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
# create an index array
idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
# separate odd and even indices
even_idx = idx[::2]
odd_idx = idx[1::2]
# vectorized operation for even and odd indices
packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)
else:
raise ValueError(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.")
packed_zp = np.reshape(packed_zp, (weight_shape[1], -1))
zp_tensor = onnx.helper.make_tensor(
name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
)
input_names.append(zp_tensor.name)
new_inits.append(zp_tensor)
# set kwargs
kwargs["K"] = weight_shape[0]
kwargs["N"] = weight_shape[1]
kwargs["bits"] = num_bits
kwargs["block_size"] = group_size
if accuracy_level > 0:
# require onnxruntime > 1.16.3
kwargs["accuracy_level"] = accuracy_level
q_weight_tensor = onnx.helper.make_tensor(
name=q_weight_name,
data_type=2,
dims=packed.shape,
vals=packed.tobytes(),
raw=True,
)
new_inits.append(q_weight_tensor)
matmul_weight_only_node = onnx.helper.make_node(
op_type,
inputs=input_names,
outputs=node.output,
name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits),
domain="com.microsoft",
**kwargs,
)
return matmul_weight_only_node, new_inits
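# --- Editor's note: worked example, not part of the original file ---
# The 4-bit branch above packs two quantized values per byte: the even-index value goes
# into the low nibble and the odd-index value into the high nibble. For instance, the
# adjacent pair (3, 12) packs to 3 | (12 << 4) = 0xC3 = 195.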
def quant_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
"""Quantize tensor per group.
Args:
data : input weight
num_bits (int, optional): num_bits. Defaults to 4.
group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
scheme (str, optional): quantization scheme. Defaults to "asym".
dtype (str, optional): data type. Defaults to "int".
ratio (float, optional): percentile of clip. Defaults to 1.0.
Returns:
output: quantized weight
scale: scale
zero_point: zero point
"""
data = np.reshape(data, (-1, group_size))
if scheme == "asym" or dtype == "uint":
maxq = 2**num_bits - 1
minq = 0
elif scheme == "sym":
maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0
minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1
rmin = np.min(data, axis=1, keepdims=True) * ratio
rmax = np.max(data, axis=1, keepdims=True) * ratio
if scheme == "sym":
max_range = np.maximum(np.abs(rmin), np.abs(rmax))
scale = np.ones(rmax.shape)
mask = max_range > 0
scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq)
zero_point = (
np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1))
)
else:
scale = np.ones(rmax.shape)
scale[rmin != rmax] = np.array(
[float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()]
)
zero_point = (
((np.zeros(scale.shape) - rmin) / scale).round()
if dtype == "int"
else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8")
)
q_weight = np.empty_like(data, dtype=scale.dtype)
np.divide(data, scale, out=q_weight)
np.add(q_weight, zero_point, out=q_weight)
np.round(q_weight, out=q_weight)
np.clip(q_weight, minq, maxq, out=q_weight)
return q_weight, scale, zero_point
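# --- Editor's note: worked example, not part of the original file ---
# For one asymmetric 4-bit group whose values span [-1.0, 2.0]:
#   maxq = 15, minq = 0
#   scale = (rmax - rmin) / (maxq - minq) = 3.0 / 15 = 0.2
#   zero_point = round((0 - rmin) / scale) = round(1.0 / 0.2) = 5
#   so 0.6 quantizes to clip(round(0.6 / 0.2) + 5, 0, 15) = 8 and dequantizes
#   back to (8 - 5) * 0.2 = 0.6.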
def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
"""Quantize tensor per group based on k quant.
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
Args:
data : input weight
num_bits (int, optional): num_bits. Defaults to 4.
group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
Returns:
output: quantized weight
scale: scale
zero_point: zero point
"""
data = np.reshape(data, (-1, group_size)).astype(np.float32) # nb = data.shape[0], (nb, group_size)
maxq = 2**num_bits - 1
minq = 0
sum_x2 = np.sum(data**2, axis=1, keepdims=True) # (nb, 1)
av_x = np.sqrt(sum_x2 / group_size) # (nb, 1)
weights = np.add(av_x, np.abs(data)) # (nb, group_size)
rmin = np.min(data, axis=1, keepdims=True) # (nb, 1)
rmax = np.max(data, axis=1, keepdims=True) # (nb, 1)
sum_w = np.sum(weights, axis=1, keepdims=True) # (nb, 1)
sum_x = np.sum(weights * data, axis=1, keepdims=True) # (nb, 1)
iscale = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
mask = rmin != rmax
iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
scale = 1 / iscale
quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
diff = scale * quant_data + rmin - data # (nb, group_size)
best_mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
nstep = 20
rdelta = 0.1
# nstep * rdelta = -2 * rrmin, maxq - minq = 2**num_bits - 1
rrmin = -1
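# Editor's note (descriptive comment, not part of the original file): with nstep=20,
# rdelta=0.1 and rrmin=-1, the factor rrmin + rdelta * is_ + (maxq - minq) sweeps
# [14.0, 15.9] for the 4-bit default, i.e. the effective quantization range is varied
# around the plain min/max fit and the candidate with the smallest weighted error
# (best_mad) is kept.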
for is_ in range(nstep):
iscale_new = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
factor = np.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
mask = rmin != rmax
iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
mul_weights_quant_data_new = weights * quant_data_new
sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
D = np.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
mad_1 = np.array(mad)
best_mad_1 = np.array(best_mad)
idx_to_replace = np.where(mad_1 < best_mad_1)[0]
quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :]
best_mad[idx_to_replace] = mad[idx_to_replace]
scale[idx_to_replace] = this_scale[idx_to_replace]
rmin[idx_to_replace] = this_min[idx_to_replace]
zero_point = np.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
scale = scale.astype(np.float64)
q_weight = np.empty_like(data, dtype=scale.dtype)
np.divide(data, scale, out=q_weight)
np.add(q_weight, zero_point, out=q_weight)
np.round(q_weight, out=q_weight)
np.clip(q_weight, minq, maxq, out=q_weight)
return q_weight, scale, zero_point
def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
"""Quantize tensor per group based on k quant.
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
Args:
data : input weight
num_bits (int, optional): num_bits. Defaults to 4.
group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
Returns:
output: quantized weight
scale: scale
zero_point: zero point
"""
try:
import cupy as cp # noqa: PLC0415
import torch # noqa: PLC0415
if torch.cuda.is_available():
data = cp.asarray(data)
data = data.reshape((-1, group_size)).astype(cp.float32) # nb = data.shape[0], (nb, group_size)
maxq = 2**num_bits - 1
minq = 0
sum_x2 = cp.sum(data**2, axis=1, keepdims=True) # (nb, 1)
av_x = cp.sqrt(sum_x2 / group_size) # (nb, 1)
weights = cp.add(av_x, cp.abs(data)) # (nb, group_size)
rmin = cp.min(data, axis=1, keepdims=True) # (nb, 1)
rmax = cp.max(data, axis=1, keepdims=True) # (nb, 1)
sum_w = cp.sum(weights, axis=1, keepdims=True) # (nb, 1)
sum_x = cp.sum(weights * data, axis=1, keepdims=True) # (nb, 1)
iscale = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
mask = rmin != rmax
iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
scale = 1 / iscale
quant_data = cp.clip(cp.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
diff = scale * quant_data + rmin - data # (nb, group_size)
best_mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
nstep = 20
rdelta = 0.1
rrmin = -1
for is_ in range(nstep):
iscale_new = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
factor = cp.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
mask = rmin != rmax
iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
quant_data_new = cp.clip(cp.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
mul_weights_quant_data_new = weights * quant_data_new
sum_l = cp.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
sum_l2 = cp.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
sum_xl = cp.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
D = cp.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
mad_1 = cp.array(mad)
best_mad_1 = cp.array(best_mad)
idx_to_replace = cp.where(mad_1 < best_mad_1)[0]
quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :]
best_mad[idx_to_replace] = mad[idx_to_replace]
scale[idx_to_replace] = this_scale[idx_to_replace]
rmin[idx_to_replace] = this_min[idx_to_replace]
zero_point = cp.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
scale = scale.astype(cp.float64)
q_weight = cp.empty_like(data, dtype=scale.dtype)
cp.divide(data, scale, out=q_weight)
cp.add(q_weight, zero_point, out=q_weight)
cp.round(q_weight, out=q_weight)
cp.clip(q_weight, minq, maxq, out=q_weight)
return q_weight.get(), scale.get(), zero_point.get()
else:
logger.warning(
"Try to use k-quant quantization on CUDA. However, CUDA is not available."
"Fall back to k-quant quantization on CPU."
)
return quant_tensor_k_quant_cpu(data, num_bits, group_size)
except ImportError:
logger.info(
"Now we are using k-quant quantization on cpu, which is time consuming."
"Please consider install cupy to speed up on CUDA. See https://cupy.dev/"
"Please also install torch to check CUDA availability."
)
return quant_tensor_k_quant_cpu(data, num_bits, group_size)
def qdq_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
"""Quant dequant tensor per group.
Args:
data : input weight
num_bits (int, optional): num_bits. Defaults to 4.
group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
scheme (str, optional): quantization scheme. Defaults to "asym".
dtype (str, optional): data type. Defaults to "int".
ratio (float, optional): percentile of clip. Defaults to 1.0.
Returns:
output: quant-dequant weight
"""
org_shape = data.shape
weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio)
return np.reshape(scale * (weight - zp), org_shape)
def pad_tensor(weight, group_size, k_blocks):
"""Pad tensor rowi so that it can be is divisible by group_size.
Args:
weight (array): weight
group_size (int): how many elements share one scale/zp
k_blocks (int): the number of block
Returns:
weight: paded weight
"""
if group_size == -1:
return weight
org_w_shape = weight.shape
padded_rows = k_blocks * group_size
pad_len = padded_rows - org_w_shape[0]
if pad_len > 0:
weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant")
return weight
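# --- Editor's note: worked example, not part of the original file ---
# With group_size = 32 and a weight of 100 rows, the callers compute
# k_blocks = (100 - 1) // 32 + 1 = 4, so pad_tensor appends 4 * 32 - 100 = 28 zero rows
# to bring the row count to 128.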
def rtn_quantize(
model,
weight_config={}, # noqa: B006
num_bits=4,
group_size=32,
scheme="asym",
ratios={}, # noqa: B006
accuracy_level=0,
providers=["CPUExecutionProvider"], # noqa: B006
algorithm="k_quant",
):
"""Quant the model with round to nearst method.
Args:
model (ModelProto or ONNXModel): onnx model
weight_config (dict): quantization config
For example,
weight_config = {
'fc2':
{
'bits': 4,
'group_size': 32,
'scheme': 'sym',
'algorithm': 'RTN'
}
}
num_bits (int, optional): num_bits. Default is 4.
group_size (int, optional): how many elements share one scale/zp. Default is 32.
scheme (str, optional): sym or asym. Defaults to "asym".
ratios (dict, optional): percentile of clip. Defaults to {}.
accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
providers (list): providers to use
Returns:
model: fake quantized ONNXModel
"""
model = ONNXModel(model)
base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
new_nodes = []
remove_nodes = []
total_num = len([i for i in model.nodes() if i.op_type in ["MatMul"]])
curr_id = 0
for node in model.nodes():
if node.op_type in ["MatMul"]:
curr_id += 1
simple_progress_bar(total_num, curr_id)
if (
node.op_type in ["MatMul"]
and model.get_initializer(node.input[1]) is not None
and weight_config.get(node.name, {}) != "fp32"
):
weight_tensor = model.get_initializer(node.input[1])
weight = numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy()
if len(weight.shape) != 2:
continue
dtype = weight.dtype
if node.name in weight_config:
num_bits = weight_config[node.name]["bits"]
group_size = weight_config[node.name]["group_size"]
scheme = weight_config[node.name]["scheme"]
org_w_shape = weight.shape # ic, oc
group_size = group_size if group_size != -1 else org_w_shape[0]
k_blocks = (org_w_shape[0] - 1) // group_size + 1
init_share_num = model.get_initializer_share_num(node.input[1])
weight = pad_tensor(weight, group_size, k_blocks)
satisfy_MatMulNBits_condition = num_bits == 4 or num_bits == 8 # noqa: N806
if satisfy_MatMulNBits_condition: # pragma: no cover
if algorithm == "k_quant":
q_weight, scale, zp = quant_tensor_k_quant_cuda(weight.T, num_bits, group_size)
else:
q_weight, scale, zp = quant_tensor(
weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)
)
q_matmul_node, new_inits = make_matmul_weight_only_node(
node=node,
weight_shape=org_w_shape,
num_bits=num_bits,
group_size=group_size,
k_blocks=k_blocks,
q_weight=q_weight.astype("uint8"),
scale=scale.astype(dtype),
zero_point=zp if scheme == "asym" or algorithm == "k_quant" else None,
accuracy_level=accuracy_level,
)
model.add_initializers(new_inits)
remove_nodes.append(node)
new_nodes.append(q_matmul_node)
else:
q_weight = qdq_tensor(weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1))
q_weight = np.reshape(q_weight, (org_w_shape[1], -1))
q_weight = np.transpose(q_weight)
q_weight = q_weight[: org_w_shape[0], :].astype(dtype)
q_weight_tensor = onnx.helper.make_tensor(
name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}",
data_type=np_dtype_to_tensor_dtype(dtype),
dims=weight.shape,
vals=q_weight.tobytes(),
raw=True,
)
model.add_initializer(q_weight_tensor)
node.input[1] = q_weight_tensor.name
if init_share_num == 1:
model.remove_initializer(weight_tensor)
model.add_nodes(new_nodes)
model.remove_nodes(remove_nodes)
model.topological_sort()
return model
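# --- Editor's note: illustrative usage sketch, not part of the original file ---
# Minimal round-to-nearest call; the paths are hypothetical placeholders and the
# save_model_to_file defaults are assumed.
def _example_rtn_quantize(input_path: str = "model.onnx", output_path: str = "model_w4.onnx") -> None:
    q_model = rtn_quantize(onnx.load(input_path), num_bits=4, group_size=32, algorithm="k_quant")
    q_model.save_model_to_file(output_path)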
def get_weight_scale(weight, group_size):
"""Get the scale of weight."""
org_shape = weight.shape
weight = np.reshape(weight, (-1, group_size)) if group_size != -1 else weight
scale = np.mean(np.reshape(np.abs(weight) / np.max(np.abs(weight), axis=1, keepdims=True), org_shape), axis=0)
return scale
def prepare_inputs(model, n_samples, dataloader, providers):
"""Prepare inputs for weight only quantization.
Args:
model (ModelProto or ONNXModel): onnx model
n_samples (int, optional): calibration sample number. -1 means all samples.
dataloader (object): dataloader for calibration.
providers (list): providers to use
Returns:
inputs: prepared inputs.
so: session options
"""
from importlib.util import find_spec # noqa: PLC0415
from .util import to_numpy # noqa: PLC0415
so = ort.SessionOptions()
if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover
from onnxruntime_extensions import get_library_path # noqa: PLC0415
so.register_custom_ops_library(get_library_path())
if model.is_large_model:
onnx.save_model(
model.model,
model.model_path + "_augment.onnx",
save_as_external_data=True,
all_tensors_to_one_file=True,
convert_attribute=False,
)
session = (
ort.InferenceSession(model.model.SerializeToString(), so, providers=providers)
if not model.is_large_model
else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers)
)
inputs_names = [i.name for i in session.get_inputs()]
del session
inputs = []
for i, data in enumerate(dataloader):
if n_samples != -1 and ((i + 1) * dataloader.batch_size) > n_samples:
break
if len(inputs_names) != 1 or isinstance(data[0], dict):
assert len(data[0]) == len(inputs_names), (
f"Input number mismatch, require {len(inputs_names)} but get {len(data[0])}"
)
if isinstance(data[0], dict):
inputs.append(dict([(name, to_numpy(inp_data)) for name, inp_data in data[0].items()])) # noqa: C404
elif isinstance(data[0], np.ndarray): # pragma: no cover
inputs.append(dict([(name, inp) for name, inp in zip(inputs_names, [data[0]], strict=False)])) # noqa: C404
else: # pragma: no cover
inputs.append(dict([(name, to_numpy(inp)) for name, inp in zip(inputs_names, data[0], strict=False)])) # noqa: C404
return inputs, so
def gptq(
W,
H,
num_bits=4,
group_size=32,
scheme="asym",
blocksize=128,
percdamp=0.01,
actorder=False,
mse=False,
perchannel=True,
):
"""Quant the weight with GPTQ method.
Args:
W (array): weight.
H (array): Hessian matrix.
num_bits (int, optional): num_bits. Default is 4.
group_size (int, optional): how many elements share one scale/zp. Default is 32.
scheme (str, optional): sym or asym. Defaults to "asym".
blocksize (int, optional): blocksize to quantize weight.
percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
actorder (bool, optional): whether to reorder the Hessian matrix by its diagonal values.
mse (bool, optional): whether to select the scale and zero point by minimizing the MSE error.
perchannel (bool, optional): whether quantize weight per-channel.
Returns:
Q: fake quantized weight
"""
maxq = 2**num_bits - 1
grid = 100
maxshrink = 0.8
norm = 2.4
def find_params(weight):
org_shape = weight.shape
# find zp, scale
if not perchannel:
weight = np.expand_dims(weight.flatten(), axis=1)
tmp = np.zeros(weight.shape[1])
xmin = np.minimum(np.min(weight, axis=0), tmp)
xmax = np.maximum(np.max(weight, axis=0), tmp)
if scheme == "sym":
xmax = np.maximum(np.abs(xmin), xmax)
tmp = xmin < 0
if np.any(tmp):
xmin[tmp] = -xmax[tmp]
tmp = (xmin == 0) & (xmax == 0)
xmin[tmp] = -1
xmax[tmp] = +1
scale = (xmax - xmin) / maxq
if scheme == "sym":
zero = np.ones(scale.shape) * (maxq + 1) / 2
else:
zero = np.round(-xmin / scale)
if mse:
best = np.ones([weight.shape[1]]) * float("inf")
for i in range(int(maxshrink * grid)):
p = 1 - i / grid
xmin1 = p * xmin
xmax1 = p * xmax
scale1 = (xmax1 - xmin1) / maxq
zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero
q = np.clip(np.round(weight / scale1) + zero1, 0, maxq)
q -= weight
q = np.power(np.abs(q), norm)
err = np.sum(q, 0)
tmp = err < best
if np.any(tmp):
best[tmp] = err[tmp]
scale[tmp] = scale1[tmp]
zero[tmp] = zero1[tmp]
if not perchannel:
tmp = org_shape[1]
scale = np.repeat(scale, tmp)
zero = np.repeat(zero, tmp)
shape = [-1] + [1] * (len(org_shape) - 1)
scale = np.reshape(scale, shape)
zero = np.reshape(zero, shape)
return scale, zero
shape = W.shape
scale, zp = find_params(W)
dead = np.diag(H) == 0
H[dead, dead] = 1
W[dead, :] = 0 # such channels make no contribution to the quantization computation
# rearrange considering the diag's value
if actorder:
perm = np.argsort(np.diag(H))[::-1]
W = W[perm, :] # noqa: N806
H = H[perm, :][:, perm] # noqa: N806
Losses = np.zeros_like(W) # noqa: N806
Q = np.zeros_like(W) # noqa: N806
damp = percdamp * np.mean(np.diag(H))
diag = np.arange(shape[0])
H[diag, diag] += damp # dampen the diagonal with a fraction (percdamp) of its average value
H = np.linalg.cholesky(np.linalg.inv(H)).T # noqa: N806
Hinv = H # noqa: N806
for i1 in range(0, shape[0], blocksize):
i2 = min(i1 + blocksize, shape[0])
count = i2 - i1
W1 = copy.deepcopy(W[i1:i2, :]) # noqa: N806
Q1 = np.zeros_like(W1) # noqa: N806
Err1 = np.zeros_like(W1) # noqa: N806
Losses1 = np.zeros_like(W1) # noqa: N806
Hinv1 = Hinv[i1:i2, i1:i2] # noqa: N806
for i in range(count): # within a block, channel wise
w = W1[i, :]
d = Hinv1[i, i]
if group_size != -1:
if (i1 + i) % group_size == 0:
scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :])
q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten()
Q1[i, :] = q
Losses1[i, :] = (w - q) ** 2 / d**2
err1 = (w - q) / d
W1[i:, :] -= np.matmul(np.expand_dims(Hinv1[i:, i], axis=1), np.expand_dims(err1, axis=0))
Err1[i, :] = err1
Q[i1:i2, :] = Q1
Losses[i1:i2, :] = Losses1 / 2
W[i2:, :] -= np.matmul(Hinv[i2:, i1:i2], Err1)
if actorder:
invperm = np.argsort(perm)
Q = Q[invperm, :] # noqa: N806
Q = np.reshape(Q, W.shape) # noqa: N806
del W
return Q
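# Illustrative usage sketch (added for clarity; not part of the original module).
# It assumes numpy is imported as np at module level and uses synthetic shapes:
# a (256, 64) weight and a (256, 256) Hessian accumulated from hypothetical
# calibration activations, mirroring how gptq_quantize() below builds H.
def _gptq_usage_example():
    rng = np.random.default_rng(0)
    weight = rng.standard_normal((256, 64)).astype(np.float32)
    acts = rng.standard_normal((1024, 256)).astype(np.float32)  # fake calibration activations
    hessian = 2.0 / acts.shape[0] * acts.T @ acts  # same accumulation rule as in gptq_quantize
    return gptq(weight, hessian, num_bits=4, group_size=32, scheme="asym")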
def gptq_quantize(
model,
dataloader,
weight_config={}, # noqa: B006
num_bits=4,
group_size=32,
scheme="asym",
n_samples=128,
percdamp=0.01,
blocksize=128,
actorder=False,
mse=False,
perchannel=True,
accuracy_level=0,
providers=["CPUExecutionProvider"], # noqa: B006
):
"""Quant the model with GPTQ method.
Args:
model (ModelProto or ONNXModel): onnx model
dataloader (object): dataloader for calibration.
weight_config (dict): quantization config
For example,
weight_config = {
'fc2':
{
'bits': 4,
'group_size': 32,
'scheme': 'sym',
'algorithm': 'GPTQ'
}
}
num_bits (int, optional): num_bits. Default is 4.
group_size (int, optional): how many elements share one scale/zp. Default is 32.
scheme (str, optional): sym or asym. Defaults to "asym".
n_samples (int, optional): calibration sample number.
percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
blocksize (int, optional): blocksize to quantize weight.
actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
mse (bool, optional): whether get scale and zero point with mse error.
perchannel (bool, optional): whether quantize weight per-channel.
accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
providers (list): providers to use
Returns:
model: fake quantized ONNXModel
"""
model = ONNXModel(model)
base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
inputs, so = prepare_inputs(model, n_samples, dataloader, providers)
del dataloader
org_output = copy.deepcopy(model.model.graph.output)
model.remove_tensors_from_outputs([i.name for i in org_output])
output_names = []
for node in model.nodes():
if (
node.op_type in ["MatMul"]
and weight_config.get(node.name, {}) != "fp32"
and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ"
):
output_names.append(node.input[0])
output_names = list(set(output_names))
model.add_tensors_to_outputs(output_names)
if model.is_large_model:
onnx.save_model(
model.model,
model.model_path + "_augment.onnx",
save_as_external_data=True,
all_tensors_to_one_file=True,
convert_attribute=False,
)
session = (
ort.InferenceSession(model.model.SerializeToString(), so, providers=providers)
if not model.is_large_model
else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers)
)
for idx, input_name in enumerate(output_names):
simple_progress_bar(len(output_names), idx + 1)
node_list = []
weights = []
for node in model.input_name_to_nodes[input_name]:
if (
node.op_type in ["MatMul"]
and weight_config.get(node.name, {}) != "fp32"
and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ"
and model.get_initializer(node.input[1]) is not None
):
weight = numpy_helper.to_array(
model.get_initializer(model.get_node(node.name).input[1]), base_dir
).copy()
if len(weight.shape) != 2:
continue
weights.append(weight)
node_list.append(model.get_node(node.name))
if len(weights) == 0:
continue
Hs = [np.zeros((i.shape[0], i.shape[0])) for i in weights] # noqa: N806
nsamples = 0
for data in inputs:
inp = session.run([input_name], data)[0]
tmp = inp.shape[0]
inp = np.reshape(inp, (-1, inp.shape[-1]))
Hs = [i * (nsamples / (nsamples + tmp)) for i in Hs] # noqa: N806
nsamples += tmp
inp = np.sqrt(2 / nsamples) * inp
Hs = [i + np.matmul(inp.T, inp) for i in Hs] # noqa: N806
for (
node,
weight,
H, # noqa: N806
) in zip(node_list, weights, Hs, strict=False):
if node.name in weight_config:
num_bits = weight_config[node.name]["bits"]
group_size = weight_config[node.name]["group_size"]
scheme = weight_config[node.name]["scheme"]
group_size = group_size if group_size != -1 else weight.shape[0]
dtype = weight.dtype
q_weight = gptq(
weight,
H,
num_bits=num_bits,
group_size=group_size,
scheme=scheme,
blocksize=blocksize,
percdamp=percdamp,
actorder=actorder,
mse=mse,
perchannel=perchannel,
)
weight_tensor = model.get_initializer(node.input[1])
init_share_num = model.get_initializer_share_num(node.input[1])
satisfy_MatMulNBits_condition = num_bits == 4 # noqa: N806
if satisfy_MatMulNBits_condition: # pragma: no cover
org_shape = weight.shape
k_blocks = (org_shape[0] + group_size - 1) // group_size
q_weight = pad_tensor(q_weight, group_size, k_blocks)
q_weight, scale, zp = quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint")
q_matmul_node, new_inits = make_matmul_weight_only_node(
node=node,
weight_shape=org_shape,
num_bits=num_bits,
group_size=group_size,
k_blocks=k_blocks,
q_weight=q_weight.astype("uint8"),
scale=scale.astype(dtype),
zero_point=zp if scheme == "asym" else None,
accuracy_level=accuracy_level,
)
model.add_initializers(new_inits)
model.remove_node(node)
model.add_node(q_matmul_node)
else:
q_weight_tensor = onnx.helper.make_tensor(
name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}",
data_type=np_dtype_to_tensor_dtype(dtype),
dims=q_weight.shape,
vals=q_weight.astype(dtype).tobytes(),
raw=True,
)
model.add_initializer(q_weight_tensor)
node.input[1] = q_weight_tensor.name
if init_share_num == 1:
model.remove_initializer(weight_tensor)
model.remove_tensors_from_outputs(output_names)
model.model.graph.output.MergeFrom(org_output)
model.topological_sort()
# reload external data to prevent external data file path errors
if model.is_large_model:
from onnx.external_data_helper import load_external_data_for_model # noqa: PLC0415
load_external_data_for_model(model.model, os.path.split(model.model_path)[0])
return model
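# Illustrative invocation sketch (added for clarity; not part of the original module).
# The node name "fc2" and the dataloader are hypothetical: any iterable whose items
# look like (inputs, label) and match the model's graph inputs is accepted by
# prepare_inputs() above.
def _gptq_quantize_example(model_path, dataloader):
    weight_config = {
        "fc2": {"bits": 4, "group_size": 32, "scheme": "sym", "algorithm": "GPTQ"},
    }
    return gptq_quantize(
        onnx.load(model_path),
        dataloader,
        weight_config=weight_config,
        n_samples=64,
    )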

View File

@@ -0,0 +1,600 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from pathlib import Path
import onnx
import onnx.helper as onnx_helper
import onnx.numpy_helper as onnx_numpy_helper
from onnx.onnx_pb import ModelProto
from .quant_utils import attribute_to_kwarg, find_by_name
def _clean_initializers_helper(graph, model):
"""Clean unused initializers from graph.
Returns:
A cleaned graph without unused initializers
A list of tensor names, which are not produced by this graph and its subgraphes
"""
requesting_tensor_names = set()
requesting_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
requesting_tensor_names.update(g_out.name for g_out in graph.output if g_out.name)
new_nodes = []
for node in graph.node:
new_node = node
graph_attrs = [
attr
for attr in node.attribute
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
]
if graph_attrs:
kwargs = {}
for attr in node.attribute:
new_attribute = {}
if attr.type == onnx.AttributeProto.GRAPH:
(
cleaned_sub_graph,
sub_requesting_tensor_names,
) = _clean_initializers_helper(attr.g, model)
new_attribute = {attr.name: cleaned_sub_graph}
requesting_tensor_names.update(sub_requesting_tensor_names)
elif attr.type == onnx.AttributeProto.GRAPHS:
cleaned_graphes = []
for subgraph in attr.graphs:
(
cleaned_sub_graph,
sub_requesting_tensor_names,
) = _clean_initializers_helper(subgraph, model)
cleaned_graphes.append(cleaned_sub_graph)
requesting_tensor_names.update(sub_requesting_tensor_names)
new_attribute = {attr.name: cleaned_graphes}
else:
new_attribute = attribute_to_kwarg(attr)
kwargs.update(new_attribute)
new_node = onnx_helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
new_nodes.append(new_node)
graph.ClearField("node")
graph.node.extend(new_nodes)
requesting_tensor_names.difference_update(output for node in graph.node for output in node.output)
unused_initializer = []
for initializer in graph.initializer:
if initializer.name in requesting_tensor_names:
requesting_tensor_names.remove(initializer.name)
else:
            # mark it for removal; removing it here directly would cause misbehavior
unused_initializer.append(initializer)
name_to_input = {input.name: input for input in graph.input}
for initializer in unused_initializer:
graph.initializer.remove(initializer)
if initializer.name in name_to_input:
try:
graph.input.remove(name_to_input[initializer.name])
except StopIteration:
if model.ir_version < 4:
print(f"Warning: invalid weight name {initializer.name} found in the graph (not a graph input)")
requesting_tensor_names.difference_update(input.name for input in graph.input)
return graph, requesting_tensor_names
class ONNXModel:
def __init__(self, model: ModelProto):
self.model = model
def nodes(self):
return self.model.graph.node
def initializer(self):
return self.model.graph.initializer
def initializer_extend(self, inits):
if len(inits) == 0:
raise ValueError("Can add an empty list.")
for init in self.initializer():
self._check_init(init, "gain")
for init in inits:
self._check_init(init)
self.model.graph.initializer.append(init)
def graph(self):
return self.model.graph
def ir_version(self):
return self.model.ir_version
def opset_import(self):
return self.model.opset_import
def set_opset_import(self, domain, version):
for opset in self.model.opset_import:
if opset.domain == domain:
opset.version = version
return
self.model.opset_import.extend([onnx_helper.make_opsetid(domain, version)])
def remove_node(self, node):
if node in self.model.graph.node:
self.model.graph.node.remove(node)
def remove_nodes(self, nodes_to_remove):
for node in nodes_to_remove:
self.remove_node(node)
def add_node(self, node):
self.model.graph.node.extend([self._check_node(node)])
def add_nodes(self, nodes_to_add):
for node in nodes_to_add:
self.add_node(node)
def add_initializer(self, tensor):
if find_by_name(tensor.name, self.model.graph.initializer) is None:
self._check_init(tensor)
self.model.graph.initializer.extend([tensor])
def get_initializer(self, name):
for tensor in self.model.graph.initializer:
if tensor.name == name:
return tensor
return None
def find_graph_input(self, input_name):
for input in self.model.graph.input:
if input.name == input_name:
return input
return None
def find_graph_output(self, output_name):
for output in self.model.graph.output:
if output.name == output_name:
return output
return None
def get_tensor_type(self, tensor_name: str):
tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}
if tensor_name in tensor_type_map:
return tensor_type_map[tensor_name].tensor_type
g_input = self.find_graph_input(tensor_name)
if g_input:
return g_input.type.tensor_type
g_output = self.find_graph_output(tensor_name)
if g_output:
return g_output.type.tensor_type
return None
def get_constant_value(self, output_name):
for node in self.model.graph.node:
if node.op_type == "Constant":
if node.output[0] == output_name:
for attr in node.attribute:
if attr.name == "value":
return onnx_numpy_helper.to_array(attr.t)
# Fallback to initializer since constant folding may have been applied.
initializer = self.get_initializer(output_name)
if initializer is not None:
return onnx_numpy_helper.to_array(initializer)
return None
def get_initializer_name_set(self):
return {initializer.name for initializer in self.model.graph.initializer}
def remove_initializer(self, tensor):
if tensor in self.model.graph.initializer:
self.model.graph.initializer.remove(tensor)
for input in self.model.graph.input:
if input.name == tensor.name:
self.model.graph.input.remove(input)
break
def remove_initializers(self, init_to_remove):
for initializer in init_to_remove:
self.remove_initializer(initializer)
def get_non_initializer_inputs(self):
initializer_names = self.get_initializer_name_set()
non_initializer_inputs = set()
for input in self.model.graph.input:
if input.name not in initializer_names:
non_initializer_inputs.add(input.name)
return non_initializer_inputs
def input_name_to_nodes(self):
input_name_to_nodes = {}
for node in self.model.graph.node:
for input_name in node.input:
if input_name: # Could be empty when it is optional
if input_name not in input_name_to_nodes:
input_name_to_nodes[input_name] = [node]
else:
input_name_to_nodes[input_name].append(node)
return input_name_to_nodes
def output_name_to_node(self):
output_name_to_node = {}
for node in self.model.graph.node:
for output_name in node.output:
if output_name: # Could be empty when it is optional
output_name_to_node[output_name] = node
return output_name_to_node
    def get_children(self, node, input_name_to_nodes=None):
        if input_name_to_nodes is None:
            input_name_to_nodes = self.input_name_to_nodes()
        children = []
        for output in node.output:
            if output in input_name_to_nodes:
                children.extend(input_name_to_nodes[output])
        return children
def get_parents(self, node, output_name_to_node=None):
if output_name_to_node is None:
output_name_to_node = self.output_name_to_node()
parents = []
for input in node.input:
if input in output_name_to_node:
parents.append(output_name_to_node[input])
return parents
def get_parent(self, node, idx, output_name_to_node=None):
if output_name_to_node is None:
output_name_to_node = self.output_name_to_node()
if len(node.input) <= idx:
return None
input = node.input[idx]
if input not in output_name_to_node:
return None
return output_name_to_node[input]
def find_node_by_name(self, node_name, new_nodes_list, graph):
"""Find out if a node exists in a graph or a node is in the
new set of nodes created during quantization.
Returns:
The node found or None.
"""
        graph_nodes_list = list(graph.node)  # shallow copy of the node list
graph_nodes_list.extend(new_nodes_list)
node = find_by_name(node_name, graph_nodes_list)
return node
def get_largest_node_name_suffix(self, node_name_prefix):
"""
Gets the largest node name (int) suffix for all node names that begin with `node_name_prefix`.
Example: for nodes my_prefix_0 and my_prefix_3, this method returns 3.
"""
suffix = -1
for node in self.model.graph.node:
if node.name and node.name.startswith(node_name_prefix):
try:
index = int(node.name[len(node_name_prefix) :])
suffix = max(index, suffix)
except ValueError:
continue
return suffix
def get_largest_initializer_name_suffix(self, initializer_name_prefix):
"""
Gets the largest initializer name integer suffix for all initializer names that begin
with `initializer_name_prefix`. This can be used to create unique initializer names.
Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if
`initializer_name_prefix` is 'my_weight_'.
"""
suffix = -1
for initializer in self.model.graph.initializer:
if initializer.name.startswith(initializer_name_prefix):
try:
index = int(initializer.name[len(initializer_name_prefix) :])
suffix = max(index, suffix)
except ValueError:
continue
return suffix
def find_nodes_by_initializer(self, graph, initializer):
"""
Find all nodes with given initializer as an input.
"""
nodes = []
for node in graph.node:
for node_input in node.input:
if node_input == initializer.name:
nodes.append(node)
return nodes
@staticmethod
def __get_initializer(name, graph_path):
for gid in range(len(graph_path) - 1, -1, -1):
graph = graph_path[gid]
for tensor in graph.initializer:
if tensor.name == name:
return tensor, graph
return None, None
@staticmethod
def __replace_gemm_with_matmul(graph_path):
new_nodes = []
graph = graph_path[-1]
for node in graph.node:
            graph_attrs = [
                attr
                for attr in node.attribute
                if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
            ]
            if graph_attrs:
                kwargs = {}
                for attr in node.attribute:
                    if attr.type == onnx.AttributeProto.GRAPH:
                        graph_path.append(attr.g)
                        kv = {attr.name: ONNXModel.__replace_gemm_with_matmul(graph_path)}
                    elif attr.type == onnx.AttributeProto.GRAPHS:
value = []
for subgraph in attr.graphs:
graph_path.append(subgraph)
value.extend([ONNXModel.__replace_gemm_with_matmul(graph_path)])
kv = {attr.name: value}
else:
kv = attribute_to_kwarg(attr)
kwargs.update(kv)
node = onnx_helper.make_node( # noqa: PLW2901
node.op_type, node.input, node.output, name=node.name, **kwargs
)
if node.op_type == "Gemm":
alpha = 1.0
beta = 1.0
transA = 0 # noqa: N806
transB = 0 # noqa: N806
for attr in node.attribute:
if attr.name == "alpha":
alpha = onnx_helper.get_attribute_value(attr)
elif attr.name == "beta":
beta = onnx_helper.get_attribute_value(attr)
elif attr.name == "transA":
transA = onnx_helper.get_attribute_value(attr) # noqa: N806
elif attr.name == "transB":
transB = onnx_helper.get_attribute_value(attr) # noqa: N806
if alpha == 1.0 and beta == 1.0 and transA == 0:
inputB = node.input[1] # noqa: N806
if transB == 1:
B, Bs_graph = ONNXModel.__get_initializer(node.input[1], graph_path) # noqa: N806
if B:
# assume B is not used by any other node
B_array = onnx_numpy_helper.to_array(B) # noqa: N806
B_trans = onnx_numpy_helper.from_array(B_array.T) # noqa: N806
B_trans.name = B.name
Bs_graph.initializer.remove(B)
for input in Bs_graph.input:
if input.name == inputB:
Bs_graph.input.remove(input)
break
Bs_graph.initializer.extend([B_trans])
else:
inputB += "_Transposed" # noqa: N806
transpose_node = onnx_helper.make_node(
"Transpose",
inputs=[node.input[1]],
outputs=[inputB],
name=node.name + "_Transpose" if node.name else "",
)
new_nodes.append(transpose_node)
matmul_node = onnx_helper.make_node(
"MatMul",
inputs=[node.input[0], inputB],
outputs=[node.output[0] + ("_MatMul" if len(node.input) > 2 else "")],
name=node.name + "_MatMul" if node.name else "",
)
new_nodes.append(matmul_node)
if len(node.input) > 2:
add_node = onnx_helper.make_node(
"Add",
inputs=[node.output[0] + "_MatMul", node.input[2]],
outputs=node.output,
name=node.name + "_Add" if node.name else "",
)
new_nodes.append(add_node)
# unsupported
else:
new_nodes.append(node)
# not GEMM
else:
new_nodes.append(node)
graph.ClearField("node")
graph.node.extend(new_nodes)
graph_path.pop()
return graph
def replace_gemm_with_matmul(self):
graph_path = [self.graph()]
ONNXModel.__replace_gemm_with_matmul(graph_path)
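    # Sketch of the rewrite performed above (comment added for clarity):
    #   Gemm(A, B, C) with alpha == 1.0, beta == 1.0 and transA == 0
    # is replaced by
    #   MatMul(A, B')            where B' is B transposed in place if B is an initializer
    #                            with transB == 1, or via an inserted Transpose node otherwise
    #   Add(matmul_output, C)    only when the optional bias C is present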
def save_model_to_file(self, output_path, use_external_data_format=False):
"""
Save model to external data, which is needed for model size > 2GB
"""
self.topological_sort()
if use_external_data_format:
onnx.external_data_helper.convert_model_to_external_data(
self.model,
all_tensors_to_one_file=True,
location=Path(output_path).name + ".data",
convert_attribute=True,
)
for init in self.model.graph.initializer:
self._check_init(init, "end")
onnx.save_model(self.model, output_path)
@staticmethod
def replace_node_input(node, old_input_name, new_input_name):
assert isinstance(old_input_name, str) and isinstance(new_input_name, str)
for j in range(len(node.input)):
if node.input[j] == old_input_name:
node.input[j] = new_input_name
def replace_input_of_all_nodes(self, old_input_name, new_input_name):
for node in self.model.graph.node:
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
def replace_input_of_nodes(self, old_input_name, new_input_name, node_names_set):
for node in self.model.graph.node:
if node.name in node_names_set:
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
@staticmethod
def replace_node_output(node, old_output_name, new_output_name):
assert isinstance(old_output_name, str) and isinstance(new_output_name, str)
for j in range(len(node.output)):
if node.output[j] == old_output_name:
node.output[j] = new_output_name
def replace_output_of_all_nodes(self, old_output_name, new_output_name):
for node in self.model.graph.node:
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
def replace_output_of_nodes(self, old_output_name, new_output_name, node_names_set):
for node in self.model.graph.node:
if node.name in node_names_set:
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
def remove_unused_constant(self):
input_name_to_nodes = self.input_name_to_nodes()
# remove unused constant
unused_nodes = []
nodes = self.nodes()
for node in nodes:
if (
node.op_type == "Constant"
and not self.is_graph_output(node.output[0])
and node.output[0] not in input_name_to_nodes
):
unused_nodes.append(node)
self.remove_nodes(unused_nodes)
ununsed_weights = []
for w in self.initializer():
if w.name not in input_name_to_nodes and not self.is_graph_output(w.name):
ununsed_weights.append(w)
# Remove from graph.input
for graph_input in self.graph().input:
if graph_input.name == w.name:
self.graph().input.remove(graph_input)
self.remove_initializers(ununsed_weights)
def is_graph_output(self, output_name):
return any(output.name == output_name for output in self.model.graph.output)
def is_graph_input(self, tensor_name: str) -> bool:
return any(input.name == tensor_name for input in self.model.graph.input)
# TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model
# Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release
def topological_sort(self):
deps_count = [0] * len(self.nodes()) # dependency count of each node
        deps_to_nodes = {}  # map from input name to indices of dependent nodes
sorted_nodes = [] # initialize sorted_nodes
for node_idx, node in enumerate(self.nodes()):
# CANNOT use len(node.input) directly because input can be optional
deps_count[node_idx] = sum(1 for _ in node.input if _)
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
sorted_nodes.append(self.nodes()[node_idx])
continue
for input_name in node.input:
if not input_name:
continue
if input_name not in deps_to_nodes:
deps_to_nodes[input_name] = [node_idx]
else:
deps_to_nodes[input_name].append(node_idx)
initializer_names = [init.name for init in self.initializer()]
graph_input_names = [input.name for input in self.model.graph.input]
input_names = initializer_names + graph_input_names
input_names.sort()
prev_input_name = None
for input_name in input_names:
if prev_input_name == input_name:
continue
prev_input_name = input_name
if input_name in deps_to_nodes:
for node_idx in deps_to_nodes[input_name]:
deps_count[node_idx] = deps_count[node_idx] - 1
if deps_count[node_idx] == 0:
sorted_nodes.append(self.nodes()[node_idx])
start = 0
end = len(sorted_nodes)
while start < end:
for output in sorted_nodes[start].output:
if output in deps_to_nodes:
for node_idx in deps_to_nodes[output]:
deps_count[node_idx] = deps_count[node_idx] - 1
if deps_count[node_idx] == 0:
sorted_nodes.append(self.nodes()[node_idx])
end = end + 1
start = start + 1
assert end == len(self.graph().node), "Graph is not a DAG"
self.graph().ClearField("node")
self.graph().node.extend(sorted_nodes)
def clean_initializers(self):
return _clean_initializers_helper(self.graph(), self.model)
def _check_init(self, init, test=None):
if init.data_type == onnx.TensorProto.FLOAT8E4M3FN:
if init.HasField("raw_data"):
b = list(init.raw_data)
if any((i & 127) == 127 for i in b):
raise ValueError(f"Initializer {init.name!r} has nan.")
return init
def _check_node(self, node):
"""
A quantization to float 8 does not use quantized bias but float 16 bias.
This function checks that DequantizeLinear is not used to
dequantize from float 16.
"""
if node.op_type == "DequantizeLinear":
zero_point = node.input[2]
init = self.get_initializer(zero_point)
dtype = init.data_type
if dtype in {
onnx.TensorProto.FLOAT16,
onnx.TensorProto.FLOAT,
onnx.TensorProto.DOUBLE,
onnx.TensorProto.BFLOAT16,
}:
raise RuntimeError(f"Unsupported DequantizeLinear operator, dequantization from {dtype}.")
return node
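# Illustrative usage sketch (added for clarity; not part of the original module).
# The file paths are hypothetical; the helpers used below are all defined in this class.
def _onnx_model_usage_example(path="model.onnx"):
    wrapper = ONNXModel(onnx.load(path))
    wrapper.replace_gemm_with_matmul()  # rewrite eligible Gemm nodes into MatMul (+ Add)
    wrapper.clean_initializers()        # drop initializers that are no longer referenced
    wrapper.save_model_to_file("model_cleaned.onnx", use_external_data_format=False)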

View File

@@ -0,0 +1,2 @@
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger

View File

@@ -0,0 +1,119 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearActivation(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def QuantizeClipRelu(self): # noqa: N802
node = self.node
assert node.op_type == "Relu" or node.op_type == "Clip"
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
# If input to this node is not quantized then keep this node
        # If the activation is symmetric, do not quantize this op and simply fall back to the default handling
if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
return super().quantize()
quantized_value = self.quantizer.quantized_value_map[node.input[0]]
self.quantizer.quantized_value_map[node.output[0]] = quantized_value
def quantize(self):
node = self.node
if node.op_type == "Relu" or node.op_type == "Clip":
self.QuantizeClipRelu()
return
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
sigmoid_nnapi_mode = (
node.op_type == "Sigmoid"
and nnapi_sigmoid_option in self.quantizer.extra_options
and self.quantizer.extra_options[nnapi_sigmoid_option]
)
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
use_zeropoint = 0 if sigmoid_nnapi_mode else None
# No assert on op_type as it is controlled by registry
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_activation_name = ""
if node.name:
qlinear_activation_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_activation_inputs = [
quantized_input_names[0],
scale_names[0],
zero_point_names[0],
output_scale_name,
output_zp_name,
]
qlinear_activation_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_activation_inputs,
[qlinear_activation_output],
qlinear_activation_name,
**kwargs,
)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_activation_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
nodes.append(qlinear_activation_node)
self.quantizer.new_nodes += nodes
class QDQRemovableActivation(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# If input to this node is not quantized then keep this node
if not self.quantizer.is_tensor_quantized(node.input[0]):
return
if (
not self.quantizer.is_activation_symmetric
and not self.quantizer.qdq_keep_removable_activations
and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
):
self.quantizer.remove_node(self.node)
else:
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])

View File

@@ -0,0 +1,18 @@
from .base_operator import QuantOperatorBase
# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
node.input[0] = quantized_input_value.q_name
self.quantizer.new_nodes += [node]

View File

@@ -0,0 +1,73 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantize Attention
"""
class AttentionQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
"""
parameter node: Attention node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized Attention node.
"""
node = self.node
assert node.op_type == "Attention"
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
# is implemented
for attr in node.attribute:
if attr.name == "qkv_hidden_sizes":
return super().quantize()
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if quantized_input_names is None:
return super().quantize()
qattention_name = "" if not node.name else node.name + "_quant"
inputs = []
inputs.extend(quantized_input_names)
inputs.extend([node.input[2]])
inputs.extend(scale_names)
inputs.extend([node.input[3] if len(node.input) > 3 else ""])
inputs.extend(zero_point_names)
inputs.extend([node.input[4] if len(node.input) > 4 else ""])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
nodes.append(qattention_node)
self.quantizer.new_nodes += nodes

View File

@@ -0,0 +1,26 @@
class QuantOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_float_tensor(self.node.input[0])
def quantize(self):
"""
Given a node which does not support quantization, this method checks whether the input to
this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32
parameter node: Current node
parameter new_nodes_list: List of new nodes created before processing current node
return: List of new nodes created
"""
for _, node_input in enumerate(self.node.input):
dequantize_node = self.quantizer._dequantize_value(node_input)
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)
# Append the original node
self.quantizer.new_nodes.append(self.node)

View File

@@ -0,0 +1,72 @@
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearBinaryOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0, 1])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_binary_math_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_binary_math_inputs = []
# Input 0
qlinear_binary_math_inputs.append(quantized_input_names[0])
qlinear_binary_math_inputs.append(scale_names[0])
qlinear_binary_math_inputs.append(zero_point_names[0])
# Input 1
qlinear_binary_math_inputs.append(quantized_input_names[1])
qlinear_binary_math_inputs.append(scale_names[1])
qlinear_binary_math_inputs.append(zero_point_names[1])
# Output
qlinear_binary_math_inputs.append(output_scale_name)
qlinear_binary_math_inputs.append(output_zp_name)
qlinear_binary_math_node = onnx.helper.make_node(
"QLinear" + node.op_type,
qlinear_binary_math_inputs,
[qlinear_binary_math_output],
qlinear_binary_math_name,
**kwargs,
)
nodes.append(qlinear_binary_math_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_binary_math_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes

View File

@@ -0,0 +1,62 @@
import onnx
from ..quant_utils import ( # noqa: F401
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
ms_domain,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase # noqa: F401
class QLinearConcat(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
if not data_found or q_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qnode_name = node.name + "_quant" if node.name else ""
qlconcat_inputs = [output_scale_name, output_zp_name]
for i in range(len(q_input_names)):
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
qlconcat_node = onnx.helper.make_node(
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlconcat_node]

View File

@@ -0,0 +1,260 @@
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
find_by_name,
get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class ConvInteger(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def add_bias(self, nodes, scaled_output):
"""
Given a node, this function handles bias add by adding a "reshape" node on bias and an "add" node
parameter nodes: new nodes would be appended into nodes
parameter node: current node (Conv)
parameter scaled_output: output of quant conv without bias
parameter output: output of Conv
parameter bias_name: bias of Conv
return: the name of output
"""
node = self.node
model = self.quantizer.model
# Add tensors for the shape to be reshaped to
weight = find_by_name(node.input[1], model.initializer())
if weight is None:
raise ValueError(f"Expected {node.input[1]} to be an initializer")
        # Add reshape for correct broadcast
output = node.output[0]
reshape_input_data = node.input[2] # bias of Conv
reshape_input_shape = output + "_bias_reshape_shape"
reshape_output = output + "_bias_reshape_output"
shape = np.ones((len(weight.dims)), dtype=np.int64)
shape[1] = -1
init_shape = onnx.helper.make_tensor(
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
)
model.add_initializer(init_shape)
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
nodes.append(reshape_node)
# Add an Add operation for bias
add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
nodes.append(add_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv"
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
conv_integer_output = node.output[0] + "_output_quantized"
conv_integer_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
conv_integer_node = onnx.helper.make_node(
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
)
nodes.append(conv_integer_node)
# Add cast operation to cast convInteger output to float.
onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_op_output = conv_integer_output + "_cast_output"
cast_node = onnx.helper.make_node(
"Cast",
[conv_integer_output],
[cast_op_output],
conv_integer_output + "_cast",
            to=onnx_type,  # TODO: FLOAT or FLOAT16
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
if conv_integer_name:
scales_mul_op = conv_integer_name + "_scales_mul"
else:
scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
has_bias = len(node.input) == 3
scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
# and make the output of this node the same as output of original conv node.
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
scaled_output_name,
output_scale_mul_op,
)
)
if has_bias:
self.add_bias(nodes, scaled_output_name)
self.quantizer.new_nodes += nodes
class QLinearConv(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1],
onnx_proto.TensorProto.INT8,
0, # self.quantizer.weight_qType?
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
bias_present = False
if len(node.input) == 3:
if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
bias_present = True
qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_conv_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
qlinear_conv_inputs = []
# Input 0
qlinear_conv_inputs.append(quantized_input_names[0])
qlinear_conv_inputs.append(scale_names[0])
qlinear_conv_inputs.append(zero_point_names[0])
# Input 1
qlinear_conv_inputs.append(quantized_input_names[1])
qlinear_conv_inputs.append(scale_names[1])
qlinear_conv_inputs.append(zero_point_names[1])
# Output
qlinear_conv_inputs.append(output_scale_name)
qlinear_conv_inputs.append(output_zp_name)
if bias_present:
qlinear_conv_inputs.append(quantized_bias_name)
qlinear_conv_node = onnx.helper.make_node(
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
)
nodes.append(qlinear_conv_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_conv_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQConv(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Conv" or node.op_type == "ConvTranspose"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if node.op_type == "Conv" else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])

View File

@@ -0,0 +1,78 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
# For operators that support 8-bit operations directly and whose output can reuse
# input[0]'s type, zero point, and scale; for example, Transpose, Reshape, etc.
class Direct8BitOp(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
if not self.quantizer.force_quantize_no_input_check:
# Keep backward compatibility
# Quantize when input[0] is quantized already. Otherwise keep it.
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
if quantized_input_value is None:
self.quantizer.new_nodes += [node]
return
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
quantized_input_value.value_type,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
else:
            # Force quantize these ops if possible; use the exclude-node list if this is not what you want
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
super().quantize()
return
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_names[0]
node.output[0] = quantized_output_value.q_name
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQDirect8BitOp(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
if self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(self.node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)

View File

@@ -0,0 +1,121 @@
import logging
import onnx
from onnx import onnx_pb as onnx_proto # noqa: F401
from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
"""
class EmbedLayerNormalizationQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
return self.quantizer.should_quantize_node(self.node)
def quantize(self):
node = self.node
assert node.op_type == "EmbedLayerNormalization"
if len(node.output) > 2:
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
return super().quantize()
"""
Pre-quantization EmbedLayerNorm inputs:
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (float32)
[3] position_embedding (float32)
[4] segment_embedding (float32)
[5] gamma (float32)
[6] beta (float32)
[7] mask (int32) (optional)
"""
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
if quantized_input_names is None:
return super().quantize()
qembed_layer_norm_name = "" if not node.name else node.name + "_quant"
"""
Quantized Input Tensor List
[0] input_ids (int32)
[1] segment_ids (int32)
[2] word_embedding (uint8)
[3] position_embedding (uint8)
[4] segment_embedding (uint8)
[5] gamma (uint8)
[6] beta (uint8)
[7] mask (int32) (optional)
[8] word_embedding_scale (float)
[9] position_embedding_scale (float)
[10] segment_embedding_scale (float)
[11] gamma_scale (float)
[12] beta_scale (float)
[13] word_embedding_zero_point (uint8)
[14] position_embedding_zero_point (uint8)
[15] segment_embedding_zero_point (uint8)
[16] gamma_zero_point (uint8)
[17] beta_zero_point (uint8)
"""
inputs = []
# 'input_ids'
inputs.extend([node.input[0]])
# 'segment_ids'
inputs.extend([node.input[1]])
# 'word_embedding_quant'
inputs.extend([quantized_input_names[0]])
# 'position_embedding_quant'
inputs.extend([quantized_input_names[1]])
# 'segment_embedding_quant'
inputs.extend([quantized_input_names[2]])
# 'gamma_quant'
inputs.extend([quantized_input_names[3]])
# 'beta_quant'
inputs.extend([quantized_input_names[4]])
# 'mask' (optional)
inputs.extend([node.input[7] if len(node.input) > 7 else ""])
# Add all scales:
inputs.extend([scale_names[0]])
inputs.extend([scale_names[1]])
inputs.extend([scale_names[2]])
inputs.extend([scale_names[3]])
inputs.extend([scale_names[4]])
# Add all zero points:
inputs.extend([zero_point_names[0]])
inputs.extend([zero_point_names[1]])
inputs.extend([zero_point_names[2]])
inputs.extend([zero_point_names[3]])
inputs.extend([zero_point_names[4]])
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qembed_layer_norm_node = onnx.helper.make_node(
"QEmbedLayerNormalization",
inputs,
node.output,
qembed_layer_norm_name,
**kwargs,
)
nodes.append(qembed_layer_norm_node)
self.quantizer.new_nodes += nodes

View File

@@ -0,0 +1,64 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
"""
Quantize Gather
"""
class GatherQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
return False
return self.quantizer.is_valid_quantize_weight(self.node.input[0])
def quantize(self):
node = self.node
assert node.op_type == "Gather"
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
gather_new_output,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
node.output[0] = gather_new_output
node.input[0] = quantized_input_names[0]
nodes.append(node)
self.quantizer.new_nodes += nodes
class QDQGather(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gather" or node.op_type == "GatherElements"
if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
self.quantizer.quantize_activation_tensor(node.input[0])
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
elif self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)

View File

@@ -0,0 +1,62 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QGlobalAveragePool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "GlobalAveragePool"
# If input to this node is not quantized then keep this node.
if node.input[0] not in self.quantizer.quantized_value_map:
return super().quantize()
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
        # Create an entry for the output quantized value.
(
data_found,
output_scale_name_from_parameter,
output_zp_name_from_parameter,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
        # Just use the input scale and zero point if parameters for the output are not specified.
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
kwargs["channels_last"] = 0
qnode_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_value.q_name,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
output_scale_name,
output_zp_name,
],
[quantized_output_value.q_name],
qnode_name,
**kwargs,
)
self.quantizer.new_nodes += [qnode]

View File

@@ -0,0 +1,172 @@
import logging
import numpy as np # noqa: F401
import onnx
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
find_by_name, # noqa: F401
get_mul_node, # noqa: F401
ms_domain,
)
from .base_operator import QuantOperatorBase # noqa: F401
from .matmul import QOpMatMul
from .qdq_base_operator import QDQOperatorBase
def is_B_transposed(gemm_node): # noqa: N802
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] # noqa: N806
if transB_attribute:
return onnx.helper.get_attribute_value(transB_attribute[0]) > 0
return False
def get_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if beta_attribute:
return onnx.helper.get_attribute_value(beta_attribute[0])
return 1.0
def set_default_beta(gemm_node):
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
if beta_attribute:
beta_attribute[0].f = 1.0
return 1.0
class QLinearGemm(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1],
self.quantizer.weight_qType,
0 if is_B_transposed(node) else 1,
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
scale_names.append(quant_weight_tuple[2])
else:
# Get Quantized from both activation(input[0]) and weight(input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
if not data_found or quantized_input_names is None:
return super().quantize()
quantized_bias_name = ""
if len(node.input) == 3:
if not self.quantizer.is_input_a_initializer(node.input[2]):
return super().quantize()
# Note: if the quantized type is float 8, the bias is converted into float 16.
# cublasLtMatMul only supports (b)float16 or float32 bias.
quantized_bias_name = self.quantizer.quantize_bias_static(
node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qgemm_name = node.name + "_quant" if node.name else ""
kwargs = {}
for attribute in node.attribute:
if attribute.name != "beta":
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# generate input
qgemm_inputs = []
for i in range(2):
qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
nodes.append(qgemm_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qgemm_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
node_type=node.op_type,
node_qtype=self.quantizer.weight_qType,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQGemm(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Gemm"
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
self.quantizer.quantize_activation_tensor(node.output[0])
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=0 if is_B_transposed(node) else 1
)
if is_weight_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
else:
self.quantizer.quantize_weight_tensor(node.input[1])
if len(node.input) == 3:
if self.quantizer.is_input_a_initializer(node.input[2]):
self.quantizer.quantize_bias_tensor(
node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
)
set_default_beta(self.node)
else:
logging.warning(
f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
)

View File

@@ -0,0 +1,121 @@
import numpy
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain # noqa: F401
from .base_operator import QuantOperatorBase
"""
Quantize LSTM
"""
class LSTMQuant(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
"""
parameter node: LSTM node.
parameter new_nodes_list: List of new nodes created before processing this node.
return: a list of nodes in topological order that represents quantized LSTM node.
"""
node = self.node
assert node.op_type == "LSTM"
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
node.input[2]
):
super().quantize()
return
model = self.quantizer.model
W = model.get_initializer(node.input[1]) # noqa: N806
R = model.get_initializer(node.input[2]) # noqa: N806
if len(W.dims) != 3 or len(R.dims) != 3:
super().quantize()
return
[W_num_dir, W_4_hidden_size, W_input_size] = W.dims # noqa: N806
[R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims # noqa: N806
if self.quantizer.is_per_channel():
del W.dims[0]
del R.dims[0]
W.dims[0] = W_num_dir * W_4_hidden_size
R.dims[0] = R_num_dir * R_4_hidden_size
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1],
onnx_proto.TensorProto.INT8,
0, # self.quantizer.weight_qType?
)
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[2],
onnx_proto.TensorProto.INT8,
0, # self.quantizer.weight_qType?
)
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) # noqa: N806
W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) # noqa: N806
R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) # noqa: N806
W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) # noqa: N806
R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) # noqa: N806
W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) # noqa: N806
R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) # noqa: N806
W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806
model.remove_initializers([W_quant_weight, R_quant_weight])
model.add_initializer(W_quant_transposed)
model.add_initializer(R_quant_transposed)
W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) # noqa: N806
R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) # noqa: N806
W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) # noqa: N806
R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) # noqa: N806
if self.quantizer.is_per_channel():
W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]
inputs = []
input_len = len(node.input)
inputs.extend([node.input[0]])
inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
inputs.extend([node.input[3] if input_len > 3 else ""])
inputs.extend([node.input[4] if input_len > 4 else ""])
inputs.extend([node.input[5] if input_len > 5 else ""])
inputs.extend([node.input[6] if input_len > 6 else ""])
inputs.extend([node.input[7] if input_len > 7 else ""])
inputs.extend(
[
quant_input_weight_tuple[2],
quant_input_weight_tuple[1],
quant_recurrent_weight_tuple[2],
quant_recurrent_weight_tuple[1],
]
)
kwargs = {}
for attribute in node.attribute:
if attribute.name == "layout":
continue
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
quant_lstm_name = "" if not node.name else node.name + "_quant"
quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
self.quantizer.new_nodes.append(quant_lstm_node)
dequantize_node = self.quantizer._dequantize_value(node.input[0])
if dequantize_node is not None:
self.quantizer.new_nodes.append(dequantize_node)

View File

@@ -0,0 +1,231 @@
import itertools
import logging
import onnx
from onnx import onnx_pb as onnx_proto
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QOpMatMul(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def should_quantize(self):
if not self.quantizer.should_quantize_node(self.node):
logging.debug(f"Ignore MatMul {self.node.name}]")
return False
if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
not self.quantizer.is_float_tensor(self.node.input[0])
):
logging.info(f"Ignore MatMul due to non float inputs {self.node.name}]")
return False
# do not quantize non-constant B matrices for matmul
if self.quantizer.q_matmul_const_b_only:
if not self.quantizer.find_initializer_in_path(self.node.input[1]):
logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
return False
return True
"""
Used when quantize mode is QuantizationMode.IntegerOps.
"""
class MatMulInteger(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get quantized tensors for both the activation (input[0]) and the weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
matmul_integer_output = node.output[0] + "_output_quantized"
matmul_integer_name = node.name + "_quant" if node.name else ""
matmul_integer_node = onnx.helper.make_node(
"MatMulInteger",
quantized_input_names + zero_point_names,
[matmul_integer_output],
matmul_integer_name,
)
nodes.append(matmul_integer_node)
# Add cast operation to cast matmulInteger output to float.
cast_op_output = matmul_integer_output + "_cast_output"
otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
cast_node = onnx.helper.make_node(
"Cast",
[matmul_integer_output],
[cast_op_output],
matmul_integer_output + "_cast",
to=otype,
)
nodes.append(cast_node)
# Add mul operation to multiply scales of two inputs.
assert len(scale_names) == 2
scales_mul_op = (
matmul_integer_name + "_scales_mul"
if matmul_integer_name
else scale_names[0] + "_" + scale_names[1] + "_mul"
)
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
if scales_mul_node is None:
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
nodes.append(scales_mul_node)
scales_mul_op_output = scales_mul_node.output[0]
# Add mul operation to multiply mul_scales_op result with output of MatMulInteger
# and make the output of this node the same as output of original matmul node.
output_scale_mul_op = ""
if matmul_integer_name:
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
nodes.append(
get_mul_node(
[cast_op_output, scales_mul_op_output],
node.output[0],
output_scale_mul_op,
)
)
self.quantizer.new_nodes += nodes
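# Note on the reconstruction above: with A ≈ a_scale * (A_q - a_zp) and
# B ≈ b_scale * (B_q - b_zp), it follows that
#   A @ B ≈ (a_scale * b_scale) * ((A_q - a_zp) @ (B_q - b_zp)).
# MatMulInteger computes (A_q - a_zp) @ (B_q - b_zp) in int32, the Cast converts it to the
# output float type, and the two Mul nodes apply the combined scale (a_scale * b_scale).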
"""
Used when quantize mode is QuantizationMode.QLinearOps
"""
class QLinearMatMul(QOpMatMul):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
# Get quantized tensors for both the activation (input[0]) and the weight (input[1])
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
(
quantized_input_names_weight,
zero_point_names_weight,
scale_names_weight,
nodes_weight,
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
quantized_input_names.extend(quantized_input_names_weight)
zero_point_names.extend(zero_point_names_weight)
scale_names.extend(scale_names_weight)
nodes.extend(nodes_weight)
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
if not data_found or quantized_input_names is None:
return super().quantize()
qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_matmul_name = node.name + "_quant" if node.name else ""
qlinear_matmul_inputs = []
# Input 0
qlinear_matmul_inputs.append(quantized_input_names[0])
qlinear_matmul_inputs.append(scale_names[0])
qlinear_matmul_inputs.append(zero_point_names[0])
# Input 1
qlinear_matmul_inputs.append(quantized_input_names[1])
qlinear_matmul_inputs.append(scale_names[1])
qlinear_matmul_inputs.append(zero_point_names[1])
# Output quantization parameter
qlinear_matmul_inputs.append(output_scale_name)
qlinear_matmul_inputs.append(output_zp_name)
domain = (
"com.microsoft"
if self.quantizer.weight_qType
in {
onnx_proto.TensorProto.FLOAT8E4M3FN,
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
onnx_proto.TensorProto.FLOAT8E5M2,
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
}
else ""
)
qlinear_matmul_node = onnx.helper.make_node(
"QLinearMatMul",
qlinear_matmul_inputs,
[qlinear_matmul_output],
qlinear_matmul_name,
domain=domain,
)
nodes.append(qlinear_matmul_node)
# Create an entry for this quantized value
q_output = QuantizedValue(
node.output[0],
qlinear_matmul_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
self.quantizer.new_nodes += nodes
class QDQMatMul(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MatMul"
if self.disable_qdq_for_node_output:
nodes_to_iterate = node.input
else:
nodes_to_iterate = itertools.chain(node.input, node.output)
for tensor_name in nodes_to_iterate:
if find_by_name(tensor_name, self.quantizer.model.initializer()):
is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
tensor_name, default_axis=1, op_type=node.op_type
)
if is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
else:
self.quantizer.quantize_weight_tensor(tensor_name)
else:
self.quantizer.quantize_activation_tensor(tensor_name)

View File

@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QMaxPool(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if version is less than 12, go to normal quantize.
if self.quantizer.opset_version < 12:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQMaxPool(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "MaxPool"
# if opset version is less than 12, leave the node unchanged
if self.quantizer.opset_version < 12:
return
# Direct 8bits op
return super().quantize()

View File

@@ -0,0 +1,40 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from .qdq_base_operator import QDQOperatorBase
class QDQNormalization(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}
# Input
self.quantizer.quantize_activation_tensor(node.input[0])
# Scale
scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
node.input[1], default_axis=1, op_type=node.op_type
)
if scale_is_per_channel:
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
elif scale_is_initializer:
self.quantizer.quantize_weight_tensor(node.input[1])
else:
self.quantizer.quantize_activation_tensor(node.input[1])
# Bias
if len(node.input) > 2 and node.input[2]:
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
# Output
if not self.disable_qdq_for_node_output:
for output_name in node.output:
self.quantizer.quantize_activation_tensor(output_name)

View File

@@ -0,0 +1,172 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from __future__ import annotations
from typing import Any
import numpy as np
import onnx
from ..quant_utils import (
TENSOR_NAME_QUANT_SUFFIX,
QuantizedValue,
QuantizedValueType,
attribute_to_kwarg,
quantize_nparray,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QPad(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Pad"
# Only opset 11 and later support the optional constant_value input.
# If input[0] is not quantized, do not quantize this node.
if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
super().quantize()
return
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
kwargs = {}
for attribute in node.attribute:
kv = attribute_to_kwarg(attribute)
kwargs.update(kv)
if "mode" not in kwargs or kwargs["mode"] == b"constant":
if len(node.input) > 2 and node.input[2] != "": # There is 3rd input 'constant_value'
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
if zp_tensor is None or scale_tensor is None:
super().quantize()
return
padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
if padding_constant_initializer is not None:
zp_array = onnx.numpy_helper.to_array(zp_tensor)
zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
scale_array = onnx.numpy_helper.to_array(scale_tensor)
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
quantized_padding_constant_array = quantize_nparray(
self.quantizer.activation_qType,
padding_constant_array,
scale_value,
zp_value,
)
quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
quantized_padding_constant_array,
quantized_padding_constant_name,
)
# Assume this padding constant initializer is only used by this node
self.quantizer.model.remove_initializer(padding_constant_initializer)
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
node.input[2] = quantized_padding_constant_name
else:
# TODO: check quantize_inputs after sub graph is supported
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
node,
2,
self.quantizer.activation_qType,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
initial_type=scale_tensor.data_type,
)
self.quantizer.new_nodes.extend(pad_value_qnodes)
node.input[2] = pad_value_qnodes[0].output[0]
else:
# In quantized format, the `zero` before quantization is mapped
# to quantized_input_value.zp_name. Thus, padding the original
# tensor with 0 becomes padding the quantized tensor with the
# zero point.
if len(node.input) == 2:
# Feed quantization's zero point to padding node.
node.input.append(quantized_input_value.zp_name)
else:
# Assign quantization's zero point to padding node.
assert node.input[2] == ""
node.input[2] = quantized_input_value.zp_name
# Create an entry for output quantized value
quantized_output_value = QuantizedValue(
node.output[0],
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
quantized_input_value.scale_name,
quantized_input_value.zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
node.input[0] = quantized_input_value.q_name
node.output[0] = quantized_output_value.q_name
self.quantizer.new_nodes += [node]
class QDQPad(QDQOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
"""
Returns the Pad's constant padding value. Returns `None` if the padding value is
not constant (i.e., comes from a dynamic input).
"""
const_val = None
onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
if onnx_tensor_type is None:
return None
np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
if self.quantizer.opset_version < 11:
const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
elif len(self.node.input) >= 3 and self.node.input[2]:
const_val = self.quantizer.model.get_constant_value(self.node.input[2])
else:
const_val = np.array(0, dtype=np_dtype)
return const_val
def _should_quantize_output_same_as_input(self) -> bool:
"""
Returns true if Pad's output should use the same quantization parameters as input[0]
"""
attrs_dict = {}
for attribute in self.node.attribute:
kv = attribute_to_kwarg(attribute)
attrs_dict.update(kv)
pad_mode = attrs_dict.get("mode", b"constant")
if pad_mode in (b"reflect", b"edge", b"wrap"):
# These modes pad the output with a value that already exists in the input.
# So, we can quantize the output the same as the input.
return True
# For 'constant' mode, if padding with 0, we can also quantize the output the same as the input
# because our quantization floating-point range always includes 0.
if pad_mode == b"constant":
pad_val = self._get_pad_const_val(attrs_dict)
if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
return float(pad_val.item()) == 0
return False
def quantize(self):
assert self.node.op_type == "Pad"
for input_name in self.node.input:
if input_name:
self.quantizer.quantize_activation_tensor(input_name)
if not self.disable_qdq_for_node_output:
if self._should_quantize_output_same_as_input():
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
else:
self.quantizer.quantize_activation_tensor(self.node.output[0])
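# Note: padding with the zero point reproduces a real-valued padding of 0, since quantization
# maps 0 exactly to the zero point: q(0) = round(0 / scale) + zero_point = zero_point. This is
# why 'constant' mode with a zero padding value can reuse the input's quantization parameters
# for the output.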

View File

@@ -0,0 +1,67 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearPool(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear pool node for given type (AveragePool, etc)
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes

View File

@@ -0,0 +1,22 @@
import itertools
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray # noqa: F401
from .base_operator import QuantOperatorBase # noqa: F401
class QDQOperatorBase:
def __init__(self, onnx_quantizer, onnx_node):
self.quantizer = onnx_quantizer
self.node = onnx_node
self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization
def quantize(self):
node = self.node
if self.disable_qdq_for_node_output:
tensors_to_quantize = node.input
else:
tensors_to_quantize = itertools.chain(node.input, node.output)
for tensor_name in tensors_to_quantize:
self.quantizer.quantize_activation_tensor(tensor_name)

View File

@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
class QResize(Direct8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, go to normal quantize.
if self.quantizer.opset_version < 11:
super(Direct8BitOp, self).quantize()
return
# Direct 8bits op
return super().quantize()
class QDQResize(QDQDirect8BitOp):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
assert node.op_type == "Resize"
# if version is less than 11, just keep this node
if self.quantizer.opset_version < 11:
return
# Direct 8bits op
return super().quantize()

View File

@@ -0,0 +1,74 @@
import onnx
import onnx.helper
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
class QLinearSoftmax(QuantOperatorBase):
def quantize(self):
node = self.node
# set limitations for softmax output scale and zp, because the output of softmax is always 0-1
if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
out_scale = 1 / 256.0
out_zero_point = 0
else:
out_scale = 1 / 256.0
out_zero_point = -128
# only try to quantize when given quantization parameters for it
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)
# get quantized input tensor names, quantize input if needed
(
quantized_input_names,
input_zero_point_names,
input_scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if not data_found or quantized_input_names is None:
return super().quantize()
# Create an entry for output quantized value.
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
quantized_output_value = QuantizedValue(
node.output[0],
qlinear_output_name,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
# Create qlinear softmax node for given type
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
# give QLinearSoftmax the real opset_version; otherwise its default SinceVersion would be 1
kwargs["opset"] = self.quantizer.opset_version
qlinear_node_name = node.name + "_quant" if node.name else ""
qnode = onnx.helper.make_node(
"QLinear" + node.op_type,
[
quantized_input_names[0],
input_scale_names[0],
input_zero_point_names[0],
output_scale_name,
output_zp_name,
],
[qlinear_output_name],
qlinear_node_name,
**kwargs,
)
# add all newly created nodes
nodes.append(qnode)
self.quantizer.new_nodes += nodes
return None
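# Note: with out_scale = 1/256 the quantized output spans [0, 255/256], using zero_point 0 for
# uint8 and zero_point -128 for int8. This closely matches Softmax's fixed [0, 1] output range,
# which is why these defaults are supplied to _get_quantization_params above.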

View File

@@ -0,0 +1,63 @@
import onnx
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QSplit(QuantOperatorBase):
def __init__(self, onnx_quantizer, onnx_node):
super().__init__(onnx_quantizer, onnx_node)
def quantize(self):
node = self.node
(
quantized_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [0])
if quantized_input_names is None:
return super().quantize()
quantized_node_name = ""
if node.name:
quantized_node_name = node.name + "_quant"
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
# Outputs derive their scale/zero point from the input
quantized_output_names = []
for output_name in node.output:
quantized_output_name = output_name + "quantized"
quantized_output_names.append(quantized_output_name)
q_output = QuantizedValue(
output_name,
quantized_output_name,
scale_names[0],
zero_point_names[0],
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[output_name] = q_output
if len(node.input) > 1:
quantized_input_names.extend(node.input[1:])
quantized_node = onnx.helper.make_node(
node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
)
nodes.append(quantized_node)
self.quantizer.new_nodes += nodes
class QDQSplit(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Split"
if not self.quantizer.is_tensor_quantized(node.input[0]):
self.quantizer.quantize_activation_tensor(node.input[0])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)

View File

@@ -0,0 +1,87 @@
import onnx
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase
class QLinearWhere(QuantOperatorBase):
def should_quantize(self):
return True
def quantize(self):
node = self.node
assert node.op_type == "Where"
if not self.quantizer.force_quantize_no_input_check:
self.quantizer.new_nodes += [node]
return
(
data_found,
output_scale_name,
output_zp_name,
_,
_,
) = self.quantizer._get_quantization_params(node.output[0])
(
q_input_names,
zero_point_names,
scale_names,
nodes,
) = self.quantizer.quantize_activation(node, [1, 2])
if not data_found or q_input_names is None:
return super().quantize()
qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
qlinear_output_name = node.name + "_quant" if node.name else ""
q_output = QuantizedValue(
node.output[0],
qlinear_output,
output_scale_name,
output_zp_name,
QuantizedValueType.Input,
)
self.quantizer.quantized_value_map[node.output[0]] = q_output
kwargs = {}
for attribute in node.attribute:
kwargs.update(attribute_to_kwarg(attribute))
kwargs["domain"] = ms_domain
qlwhere_inputs = [
node.input[0],
q_input_names[0],
scale_names[0],
zero_point_names[0],
q_input_names[1],
scale_names[1],
zero_point_names[1],
output_scale_name,
output_zp_name,
]
qlwhere_node = onnx.helper.make_node(
"QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_output_name, **kwargs
)
self.quantizer.new_nodes += nodes
self.quantizer.new_nodes += [qlwhere_node]
class QDQWhere(QDQOperatorBase):
def quantize(self):
node = self.node
assert node.op_type == "Where"
if self.quantizer.force_quantize_no_input_check:
if not self.quantizer.is_tensor_quantized(node.input[1]):
self.quantizer.quantize_activation_tensor(node.input[1])
if not self.quantizer.is_tensor_quantized(node.input[2]):
self.quantizer.quantize_activation_tensor(node.input[2])
if not self.disable_qdq_for_node_output:
for output in node.output:
self.quantizer.quantize_activation_tensor(output)
elif (
self.quantizer.is_tensor_quantized(node.input[1])
and self.quantizer.is_tensor_quantized(node.input[2])
and not self.disable_qdq_for_node_output
):
for output in node.output:
self.quantizer.quantize_activation_tensor(output)

View File

@@ -0,0 +1,141 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import argparse
import logging
import sys
from .shape_inference import quant_pre_process
logger = logging.getLogger(__name__)
def parse_arguments():
parser = argparse.ArgumentParser(
description="""Model optimizer and shape inferencer, in preparation for quantization,
Consists of three optional steps:
1. Symbolic shape inference (best for transformer models).
2. Model optimization.
3. ONNX shape inference.
Model quantization with QDQ format, i.e. inserting QuantizeLinear/DeQuantizeLinear on
the tensor, requires tensor shape information to perform its best. Currently, shape inferencing
works best with optimized model. As a result, it is highly recommended to run quantization
on optimized model with shape information. This is the tool for optimization and shape
inferencing.
Essentially this tool performs the following three (skippable) steps:
1. Symbolic shape inference.
2. Model optimization
3. ONNX shape inference"""
)
parser.add_argument("--input", required=True, help="Path to the input model file")
parser.add_argument("--output", required=True, help="Path to the output model file")
parser.add_argument(
"--skip_optimization",
type=bool,
default=False,
help="Skip model optimization step if true. It's a known issue that ORT"
" optimization has difficulty with model size greater than 2GB, rerun with"
" this option to get around this issue.",
)
parser.add_argument(
"--skip_onnx_shape",
type=bool,
default=False,
help="Skip ONNX shape inference. Symbolic shape inference is most effective"
" with transformer based models. Skipping all shape inferences may"
" reduce the effectiveness of quantization, as a tensor with unknown"
" shape can not be quantized.",
)
parser.add_argument(
"--skip_symbolic_shape",
type=bool,
default=False,
help="Skip symbolic shape inference. Symbolic shape inference is most"
" effective with transformer based models. Skipping all shape"
" inferences may reduce the effectiveness of quantization, as a tensor"
" with unknown shape can not be quantized.",
)
parser.add_argument(
"--auto_merge",
help="Automatically merge symbolic dims when confliction happens",
action="store_true",
default=False,
)
parser.add_argument(
"--int_max",
help="maximum value for integer to be treated as boundless for ops like slice",
type=int,
default=2**31 - 1,
)
parser.add_argument(
"--guess_output_rank",
help="guess output rank to be the same as input 0 for unknown ops",
action="store_true",
default=False,
)
parser.add_argument(
"--verbose",
help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed",
type=int,
default=0,
)
parser.add_argument(
"--save_as_external_data",
help="Saving an ONNX model to external data",
action="store_true",
default=False,
)
parser.add_argument(
"--all_tensors_to_one_file",
help="Saving all the external data to one file",
action="store_true",
default=False,
)
parser.add_argument(
"--external_data_location",
help="The file location to save the external file",
default=None,
)
parser.add_argument(
"--external_data_size_threshold",
help="The size threshold for external data",
type=int,
default=1024,
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
if args.skip_optimization and args.skip_onnx_shape and args.skip_symbolic_shape:
logger.error("Skipping all three steps, nothing to be done. Quitting...")
sys.exit()
if (not args.skip_optimization) and args.save_as_external_data:
logger.error("ORT model optimization does not support external data yet!")
sys.exit()
logger.info("input model: %s", args.input)
logger.info("output model: %s", args.output)
quant_pre_process(
args.input,
args.output,
args.skip_optimization,
args.skip_onnx_shape,
args.skip_symbolic_shape,
args.auto_merge,
args.int_max,
args.guess_output_rank,
args.verbose,
args.save_as_external_data,
args.all_tensors_to_one_file,
args.external_data_location,
args.external_data_size_threshold,
)
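# Example invocation (illustrative; file names are placeholders, and it is assumed this module
# is installed as onnxruntime.quantization.preprocess):
#   python -m onnxruntime.quantization.preprocess --input model.onnx --output model_opt.onnx
# Note that the skip_* options use argparse's type=bool, so any non-empty value (even "False")
# enables them; pass them only when the corresponding step should actually be skipped.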

View File

@@ -0,0 +1,389 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Utilities to run a given ONNX model, while saving input/output tensors of
eligible operator nodes.
A use case is to debug quantization induced accuracy drop. An AI engineer can
run the original float32 model and the quantized model with the same inputs,
then compare the corresponding activations between the two models to find
where the divergence is.
Example Usage:
```python
class ExampleDataReader(CalibrationDataReader):
def __init__(self):
...
def get_next(self):
...
input_data_reader = ExampleDataReader()
augmented_model_path = str(Path(self._tmp_model_dir.name).joinpath("augmented_model.onnx"))
modify_model_output_intermediate_tensors(path_to_onnx_model, augmented_model_path)
tensor_dict = collect_activations(augmented_model_path, input_data_reader)
```
`tensor_dict` points to a dictionary where the keys are tensor names and each value
is a list of tensors, one from each model run
"""
import logging
import math
import time
from collections.abc import Callable, Sequence
from pathlib import Path
import numpy
import onnx
from onnx import helper, numpy_helper
import onnxruntime
from .calibrate import CalibraterBase, CalibrationDataReader
from .onnx_model import ONNXModel
from .quant_utils import (
DEQUANT_OP_NAME,
DEQUANT_OUTPUT_SUFFIX,
QUANT_INPUT_SUFFIX,
TENSOR_NAME_QUANT_SUFFIX,
find_by_name,
load_model_with_shape_infer,
)
_TENSOR_SAVE_POSTFIX = "_ReshapedSavedOutput"
_TENSOR_SAVE_POSTFIX_LEN = len(_TENSOR_SAVE_POSTFIX)
def modify_model_output_intermediate_tensors(
input_model_path: str | Path,
output_model_path: str | Path,
op_types_for_saving: Sequence[str] | None = None,
save_as_external_data: bool = False,
) -> None:
"""Augment a given ONNX model to save node input/output tensors.
Add all input/output tensors of operator nodes to model outputs
so that their values can be retrieved for debugging purposes.
Args:
input_model_path: the path of the model to load.
output_model_path: the path where the augmented model is saved.
op_types_for_saving: Operator types for which the
input/output should be saved. By default, all
float32/float16 tensors are saved.
save_as_external_data: whether to save the augmented model with external data.
Returns:
None. The augmented model is written to `output_model_path`.
"""
if op_types_for_saving is None:
op_types_for_saving = []
saver = CalibraterBase(input_model_path, op_types_to_calibrate=op_types_for_saving)
model_to_augment = saver.model
tensors, value_infos = saver.select_tensors_to_calibrate(model_to_augment)
reshape_shape_name = "LinearReshape_" + str(time.time())
reshape_shape = numpy_helper.from_array(numpy.array([-1], dtype=numpy.int64), reshape_shape_name)
model_to_augment.graph.initializer.append(reshape_shape)
for tensor_name in tensors:
reshape_output = tensor_name + _TENSOR_SAVE_POSTFIX
reshape_node = onnx.helper.make_node(
"Reshape",
inputs=[tensor_name, reshape_shape_name],
outputs=[reshape_output],
name=reshape_output,
)
model_to_augment.graph.node.append(reshape_node)
reshape_output_value_info = helper.make_tensor_value_info(
reshape_output, value_infos[tensor_name].type.tensor_type.elem_type, [-1]
)
model_to_augment.graph.output.append(reshape_output_value_info)
onnx.save(
model_to_augment,
output_model_path,
save_as_external_data=save_as_external_data,
)
def collect_activations(
augmented_model: str,
input_reader: CalibrationDataReader,
session_options=None,
execution_providers: Sequence[str] | None = None,
) -> dict[str, list[numpy.ndarray]]:
"""Run augmented model and collect activations tensors.
Args:
augmented_model: Path to the augmented model created by modify_model_output_intermediate_tensors().
input_reader: Logic for reading input for the model; the augmented model has the same
inputs as the original model.
session_options: Optional OnnxRuntime session options for controlling model run.
By default graph optimization is turned off
execution_providers: Collection of execution providers for running the model.
Only CPU EP is used by default.
Returns:
A dictionary where the key is the tensor name and the value is a list of tensors, one from each batch
"""
if session_options is None:
session_options = onnxruntime.SessionOptions()
session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
if execution_providers is None:
execution_providers = ["CPUExecutionProvider"]
inference_session = onnxruntime.InferenceSession(
augmented_model,
sess_options=session_options,
providers=execution_providers,
)
intermediate_outputs = []
for input_d in input_reader:
intermediate_outputs.append(inference_session.run(None, input_d))
if not intermediate_outputs:
raise RuntimeError("No data is collected while running augmented model!")
output_dict = {}
output_info = inference_session.get_outputs()
for batch in intermediate_outputs:
for output, output_data in zip(output_info, batch, strict=False):
if output.name.endswith(_TENSOR_SAVE_POSTFIX):
output_name = output.name[:-_TENSOR_SAVE_POSTFIX_LEN]
output_dict.setdefault(output_name, []).append(output_data)
return output_dict
_POST_QDQ_POSTFIX1 = DEQUANT_OUTPUT_SUFFIX + "_1"
def _add_pre_post_qdq_pair(
qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]],
activation_name: str,
pre_qdq_tensors: Sequence[numpy.ndarray] | None,
post_qdq_tensors: Sequence[numpy.ndarray] | None,
) -> None:
if post_qdq_tensors is not None and pre_qdq_tensors is not None:
qdq_cmp[activation_name] = {}
qdq_cmp[activation_name]["pre_qdq"] = pre_qdq_tensors
qdq_cmp[activation_name]["post_qdq"] = post_qdq_tensors
def create_activation_matching(
qdq_activations: dict[str, Sequence[numpy.ndarray]],
float_activations: dict[str, Sequence[numpy.ndarray]] | None = None,
) -> dict[str, dict[str, Sequence[numpy.ndarray]]]:
"""Comparing activation values to help debugging accuracy loss due to quantization.
This functions takes saved activations from the QDQ model and (optionally) the
float point model, and provides a data structure for comparing:
* from the qdq model, activation values before and after QDQ operation
* across both models, activations from the orignal model vs the corresponding
activations in the QDQ model
Arg:
qdq_activations: Output of `collect_activations`. This must be from a quantized
model with QDQ format.
float_activations: Output of `collect_activations`. This must be from the float
point model.
Returns:
Dict for comparing pre and post quantized activation tensors. E.g.
```
qdq_cmp = cmp_qdq_input_output(qdq_activations)
print(qdq_cmp['activation1']['pre_qdq'][0])
print(qdq_cmp['activation1'][`post_qdq'][0])
qdq_cmp = cmp_qdq_input_output(qdq_activations, float_activations)
print(qdq_cmp['activation1']['float'][0])
print(qdq_cmp['activation1']['pre_qdq'][0])
print(qdq_cmp['activation1'][`post_qdq'][0])
```
"""
qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]] = {}
for tensor_name, tensors in qdq_activations.items():
if tensor_name.endswith(QUANT_INPUT_SUFFIX):
pre_name = tensor_name[: -len(QUANT_INPUT_SUFFIX)]
post_qdq_tensors = qdq_activations.get(pre_name)
pre_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
elif tensor_name.endswith(DEQUANT_OUTPUT_SUFFIX):
pre_name = tensor_name[: -len(DEQUANT_OUTPUT_SUFFIX)]
pre_qdq_tensors = qdq_activations.get(pre_name)
post_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
elif tensor_name.endswith(_POST_QDQ_POSTFIX1):
pre_name = tensor_name[: -len(_POST_QDQ_POSTFIX1)]
pre_qdq_tensors = qdq_activations.get(pre_name)
post_qdq_tensors = tensors
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
if not float_activations:
return qdq_cmp
for act_name, act_values in qdq_cmp.items():
float_acts = float_activations.get(act_name)
if float_acts is not None:
act_values["float"] = float_acts
return qdq_cmp
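# Illustrative follow-up (a sketch; `qdq_activations`/`float_activations` are outputs of
# collect_activations): the matching built above can be summarized per tensor with
# compute_activation_error defined later in this file, e.g.
#   cmp = create_activation_matching(qdq_activations, float_activations)
#   errors = compute_activation_error(cmp)
#   for name, err in errors.items():
#       print(name, err["qdq_err"], err.get("xmodel_err"))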
def _run_dequantize_linear(
weight_tensor: numpy.ndarray, weight_scale: numpy.ndarray, weight_zp: numpy.ndarray, channel_axis: int
) -> numpy.ndarray | None:
assert weight_scale.shape == weight_zp.shape
if weight_zp.size == 1:
return (weight_tensor - weight_zp) * weight_scale
assert weight_zp.ndim == 1
reshape_dims = list(weight_tensor.shape) # deep copy
reshape_dims[channel_axis] = 1 # only one per channel for reshape
channel_count = weight_tensor.shape[channel_axis]
dequantized_weights = None
for i in range(channel_count):
per_channel_data = weight_tensor.take(i, channel_axis)
dequantized_per_channel_data = (per_channel_data - weight_zp[i]) * weight_scale[i]
if i == 0:
dequantized_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
else:
channel_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
dequantized_weights = numpy.concatenate((dequantized_weights, channel_weights), channel_axis)
if dequantized_weights is None:
return None
dequantized_weights = dequantized_weights.reshape(weight_tensor.shape)
return dequantized_weights
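# Note: per-channel dequantization above applies, slice by slice along `channel_axis`, the
# standard formula
#   W[..., i, ...] = (W_q[..., i, ...] - zero_point[i]) * scale[i]
# and concatenates the per-channel results back into the original weight shape.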
def create_weight_matching(float_model_path: str, qdq_model_path: str) -> dict[str, dict[str, numpy.ndarray]]:
"""Comparing weight values to help debugging accuracy loss due to quantization.
This functions takes the float model and the qdq model, and provides a data structure for comparing
their corresponding weights to locate quantization errors
Arg:
float_model_path: Path points to the float point model.
qdq_model_path: Path points to the qdq model.
Returns:
Dict for comparing weight tensors. E.g.
```
qdq_weight_cmp = create_weight_matching(float_model, qdq_model)
print(qdq_weight_cmp['activation1']['float'])
print(qdq_weight_cmp['activation1']['dequantized'])
```
"""
float_onnx_model = ONNXModel(load_model_with_shape_infer(Path(float_model_path)))
qdq_onnx_model = ONNXModel(load_model_with_shape_infer(Path(qdq_model_path)))
matched_weights: dict[str, dict[str, numpy.ndarray]] = {}
initializers = qdq_onnx_model.initializer()
for node in qdq_onnx_model.nodes():
if node.op_type != DEQUANT_OP_NAME:
continue # Only care about DQ node
weight_name: str = node.input[0]
weight_values = find_by_name(weight_name, initializers)
if not weight_values:
continue # Only care about DQ node with const inputs
if not weight_name.endswith(TENSOR_NAME_QUANT_SUFFIX):
logging.error(f"Model Error in '{qdq_model_path}': Dequantized tensor name '{weight_name}' not recognized!")
continue
axis = -1
for attr in node.attribute:
if attr.name == "axis":
axis = attr.i
weight_tensor = numpy_helper.to_array(weight_values)
weight_scale = numpy_helper.to_array(find_by_name(node.input[1], initializers))
if len(node.input) > 2:
weight_zp = numpy_helper.to_array(find_by_name(node.input[2], initializers))
else:
weight_zp = numpy.zeros(weight_scale.shape, dtype=numpy.int32)
# Perform dequantization:
if weight_scale.size == weight_zp.size == 1:
# Avoids confusion between a scalar and a tensor of one element.
weight_scale = weight_scale.reshape(())
weight_zp = weight_zp.reshape(())
if weight_scale.shape != weight_zp.shape:
raise RuntimeError(
f"scale and zero_point must have the same shape but {weight_scale.shape} != {weight_zp.shape}"
)
weight_quant = _run_dequantize_linear(weight_tensor, weight_scale, weight_zp, channel_axis=axis)
weight_name = weight_name[: -len(TENSOR_NAME_QUANT_SUFFIX)]
if weight_quant is None:
logging.error(f"Model Error in '{qdq_model_path}': '{weight_name}' per-channel quantization on 0 channel")
continue
float_values = find_by_name(weight_name, float_onnx_model.initializer())
if not float_values:
logging.error(f"Model Error in '{float_model_path}': weight tensor '{weight_name}' not found!")
continue
weight_float = numpy_helper.to_array(float_values)
matched_weights[weight_name] = {"float": weight_float, "dequantized": weight_quant}
return matched_weights
def compute_signal_to_quantization_noice_ratio(
x: Sequence[numpy.ndarray] | numpy.ndarray, y: Sequence[numpy.ndarray] | numpy.ndarray
) -> float:
if isinstance(x, numpy.ndarray):
xlist = [x]
else:
xlist = x
if isinstance(y, numpy.ndarray):
ylist = [y]
else:
ylist = y
if len(xlist) != len(ylist):
raise RuntimeError("Unequal number of tensors to compare!")
left = numpy.concatenate(xlist).flatten()
right = numpy.concatenate(ylist).flatten()
epsilon = numpy.finfo("float").eps
tensor_norm = max(numpy.linalg.norm(left), epsilon)
diff_norm = max(numpy.linalg.norm(left - right), epsilon)
res = tensor_norm / diff_norm
return 20 * math.log10(res)
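# Note: the returned value is the signal-to-quantization-noise ratio in decibels,
#   SQNR_dB = 20 * log10(||x|| / ||x - y||),
# where x is the reference tensor sequence (e.g. from the float model) and y its
# quantized/dequantized counterpart; both norms are clamped to machine epsilon to avoid
# division by zero.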
def compute_weight_error(
weights_match: dict[str, dict[str, numpy.ndarray]],
err_func: Callable[[numpy.ndarray, numpy.ndarray], float] = compute_signal_to_quantization_noice_ratio,
) -> dict[str, float]:
result: dict[str, float] = {}
for weight_name, weight_match in weights_match.items():
result[weight_name] = err_func(weight_match["float"], weight_match["dequantized"])
return result
def compute_activation_error(
activations_match: dict[str, dict[str, Sequence[numpy.ndarray]]],
err_func: Callable[
[Sequence[numpy.ndarray], Sequence[numpy.ndarray]], float
] = compute_signal_to_quantization_noice_ratio,
) -> dict[str, dict[str, float]]:
result: dict[str, dict[str, float]] = {}
for name, match in activations_match.items():
err_result: dict[str, float] = {}
err_result["qdq_err"] = err_func(match["pre_qdq"], match["post_qdq"])
float_activation = match["float"]
if float_activation:
err_result["xmodel_err"] = err_func(float_activation, match["post_qdq"])
result[name] = err_result
return result

View File

@@ -0,0 +1,953 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import copy
import logging
import tempfile
from collections.abc import Callable
from pathlib import Path
from typing import Any
import onnx
from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .quant_utils import (
MODEL_SIZE_THRESHOLD,
QuantFormat,
QuantizationMode,
QuantType,
load_model_with_shape_infer,
model_has_pre_process_metadata,
save_and_reload_model_with_shape_infer,
update_opset_version,
)
from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry
from .tensor_quant_overrides import TensorQuantOverridesHelper
class QuantConfig:
def __init__(
self,
activation_type=QuantType.QUInt8,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
):
"""
This is the Base class for both Static and Dynamic Quantize Configuration
Args:
activation_type:
quantization data type of activation. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
nodes_to_quantize:
List of nodes names to quantize. When this list is not None only the nodes in this list
are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
List of nodes names to exclude. The nodes in this list will be excluded from quantization
when it is not None.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
especially for per-channel mode
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
"""
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
self.op_types_to_quantize = op_types_to_quantize
self.per_channel = per_channel
self.reduce_range = reduce_range
self.weight_type = weight_type
self.activation_type = activation_type
self.nodes_to_quantize = nodes_to_quantize
self.nodes_to_exclude = nodes_to_exclude
self.use_external_data_format = use_external_data_format
class StaticQuantConfig(QuantConfig):
def __init__(
self,
calibration_data_reader: CalibrationDataReader,
calibrate_method=CalibrationMethod.MinMax,
quant_format=QuantFormat.QDQ,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
calibration_providers=None,
extra_options=None,
):
"""
This is the derived class for static Quantize Configuration
Args:
calibration_data_reader:
a calibration data reader. It enumerates calibration data and generates inputs for the original model.
calibrate_method:
Current calibration methods supported are MinMax, Entropy and Percentile.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
[ "CPUExecutionProvider" ].
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
Currently only dynamic mode is supported; more will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators like MaxPool and Transpose do not quantize if their input is
not already quantized. Set to True to force such operators to always quantize their input and
thus generate quantized output. The True behavior can also be disabled per node using nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
AddQDQPairToWeight = True/False :
Default is False which quantizes floating-point weight and feeds it to solely inserted
DeQuantizeLinear node. If True, it remains floating-point weight and inserts both
QuantizeLinear/DeQuantizeLinear nodes to weight.
OpTypesToExcludeOutputQuantization = list of op type :
Default is []. If any op types are specified, the output of ops with these
specific op types will not be quantized.
DedicatedQDQPair = True/False :
Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
inputs. If True, it will create identical and dedicated QDQ pair for each node.
QDQOpTypePerChannelSupportToAxis = dictionary :
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
effective only when per channel quantization is supported and per_channel is True. If specific
op type supports per channel quantization but not explicitly specified with channel axis,
default channel axis will be used.
CalibTensorRangeSymmetric = True/False :
Default is False. If enabled, the final range of tensor during calibration will be explicitly
set to symmetric to central point "0".
CalibMovingAverage = True/False :
Default is False. If enabled, the moving average of the minimum and maximum values will be
computed when the calibration method selected is MinMax.
CalibMovingAverageConstant = float :
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
minimum and maximum values. Effective only when the calibration method selected is MinMax and
when CalibMovingAverage is set to True.
QuantizeBias = True/False :
Default is True which quantizes floating-point biases and it solely inserts
a DeQuantizeLinear node. If False, it remains floating-point bias and does not insert
any quantization nodes associated with biases.
This extra option is only effective when quant_format is QuantFormat.QDQ.
SmoothQuant = True/False :
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
fake input channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
UseQDQContribOps = True/False :
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
contrib op implementations. The contrib op implementations may support features not standardized
into the ONNX specification (e.g., 16-bit quantization types).
MinimumRealRange = float|None :
Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
necessary for EPs like QNN that require a minimum floating-point range when determining
quantization parameters.
TensorQuantOverrides = dictionary :
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
per-channel quantization, the list contains a dictionary for each channel in the tensor.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
set `scale` or `zero_point`.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
set `scale` or `zero_point`.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if also set `scale` or `zero_point`.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if also set `scale` or `zero_point`.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
QDQDisableWeightAdjustForInt32Bias = True/False:
Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
has a scale (input_scale * weight_scale) that is too small.
execution_provider : An enum indicating the Execution Provider, such as: CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
"""
super().__init__(
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=op_types_to_quantize,
nodes_to_quantize=nodes_to_quantize,
nodes_to_exclude=nodes_to_exclude,
per_channel=per_channel,
reduce_range=reduce_range,
use_external_data_format=use_external_data_format,
)
self.calibration_data_reader = calibration_data_reader
self.calibrate_method = calibrate_method
self.quant_format = quant_format
self.calibration_providers = calibration_providers
self.extra_options = extra_options or {}
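# Illustrative sketch (not a prescribed configuration; `MyDataReader` is a hypothetical
# CalibrationDataReader implementation): constructing a StaticQuantConfig for QDQ quantization
# with a couple of the extra_options documented above.
#
#   config = StaticQuantConfig(
#       calibration_data_reader=MyDataReader(),
#       quant_format=QuantFormat.QDQ,
#       activation_type=QuantType.QUInt8,
#       weight_type=QuantType.QInt8,
#       per_channel=True,
#       extra_options={"ActivationSymmetric": False, "WeightSymmetric": True},
#   )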
def get_qdq_config(
model_input: str | Path | onnx.ModelProto,
calibration_data_reader: CalibrationDataReader,
calibrate_method=CalibrationMethod.MinMax,
calibrate_args: dict[str, Any] | None = None,
activation_type=QuantType.QUInt8,
weight_type=QuantType.QInt8,
activation_symmetric: bool = False,
weight_symmetric: bool | None = None,
per_channel: bool = False,
reduce_range: bool = False,
keep_removable_activations: bool = False,
min_real_range: float | None = None,
tensor_quant_overrides: dict[str, list[dict[str, Any]]] | None = None,
calibration_providers: list[str] | None = None,
op_types_to_quantize: list[str] | None = None,
nodes_to_exclude: list[str] | Callable[[onnx.ModelProto, onnx.NodeProto], bool] | None = None,
extra_options: dict | None = None,
) -> StaticQuantConfig:
"""
Returns a configuration suitable for quantizing the entire model to integer precision.
Params:
model_input: Path to the input model file or ModelProto.
calibration_data_reader: Calibration data reader.
calibrate_method: The calibration method. Defaults to MinMax.
activation_type: The default activation quantization type. Defaults to QUInt8.
weight_type: The default weight quantization type. Defaults to QInt8.
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
the zero-point values are 127 and 32,767, respectively.
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to None. If set to None, weight_symmetric is assumed true if a weight's quant type is a signed int.
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
Defaults to false. Alternatively, use the tensor-level `tensor_quant_overrides` to select individual operators
and their quantization axes.
reduce_range: quantize weights with 1 less bit of precision (e.g., 7 bits for QInt8). Defaults to false.
    May improve accuracy for some models running on non-VNNI machines, especially in per-channel mode.
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
be removed, and will be explicitly represented in the QDQ model. If false, these activations
are automatically removed if activations are asymmetrically quantized. Keeping these activations
is necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
min_real_range: Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
is less than the specified minimum range, rmax will be set to rmin + min_real_range.
tensor_quant_overrides: tensor-level quantization overrides. Defaults to None.
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
key must be present in the first dictionary for per-channel quantization.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
    `scale` or `zero_point` is also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
    `scale` or `zero_point` is also set. Only valid for initializers.
'rmax' = Float : Override the maximum real tensor value in calibration data.
    Invalid if `scale` or `zero_point` is also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
    Invalid if `scale` or `zero_point` is also set.
'convert' = Dict : A nested dictionary with the same keys for an activation
tensor that should be converted to another quantization type.
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
other nodes get the original type. If not specified,
assume all consumer nodes get the converted type.
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
[ "CPUExecutionProvider" ].
op_types_to_quantize: List of operator types to quantize. If None, all operators other than Cast, DequantizeLinear,
and QuantizeLinear are quantized.
nodes_to_exclude: List of node names to exclude from quantization. Alternatively, a function can be provided that
    accepts an onnx.ModelProto and an onnx.NodeProto as arguments and returns true if the given onnx.NodeProto
    should be excluded from quantization.
extra_options: Additional options specified as string key/value pairs. Refer to the documentation for
`quantize_static` for valid keys and values.
Returns:
A StaticQuantConfig object
"""
q16_types = {QuantType.QInt16, QuantType.QUInt16}
q4_types = {QuantType.QInt4, QuantType.QUInt4}
op_types_to_exclude = {"Cast", "DequantizeLinear", "QuantizeLinear"}
model = (
model_input
if isinstance(model_input, onnx.ModelProto)
else onnx.load_model(model_input, load_external_data=False)
)
op_types = set()
model_has_external_data = False
overrides_helper = TensorQuantOverridesHelper(
copy.deepcopy(tensor_quant_overrides) if tensor_quant_overrides else {}
)
# check if the model has external data.
for initializer in model.graph.initializer:
if onnx.external_data_helper.uses_external_data(initializer):
model_has_external_data = True
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
nodes_to_exclude_set = set(nodes_to_exclude) if isinstance(nodes_to_exclude, list) else set()
# Iterate through nodes to get all operator types in the model and
# call user's function to filter out nodes from quantization.
for node in model.graph.node:
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
continue
if node.name in nodes_to_exclude_set:
continue
if callable(nodes_to_exclude) and nodes_to_exclude(model, node):
nodes_to_exclude_set.add(node.name)
else:
op_types.add(node.op_type)
final_extra_options = {
"MinimumRealRange": min_real_range,
"QDQKeepRemovableActivations": keep_removable_activations,
"ActivationSymmetric": activation_symmetric,
"WeightSymmetric": weight_symmetric,
"ForceQuantizeNoInputCheck": True,
"TensorQuantOverrides": overrides_helper.get_dict(),
}
# Pass along known calibration options
if calibrate_args:
calib_extra_options_keys = [
("symmetric", "CalibTensorRangeSymmetric"),
("moving_average", "CalibMovingAverage"),
("averaging_constant", "CalibMovingAverageConstant"),
("max_intermediate_outputs", "CalibMaxIntermediateOutputs"),
("percentile", "CalibPercentile"),
]
calib_extra_options = {
key: calibrate_args.get(name) for (name, key) in calib_extra_options_keys if name in calibrate_args
}
final_extra_options.update(calib_extra_options)
# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
# on Q/DQ operators if using 16-bit or 4-bit quantization.
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
if onnx_opset.version < 21:
opset21_types = q16_types.union(q4_types)
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
final_extra_options["UseQDQContribOps"] = True
# Allow user's extra_options to override our final_extra_options.
if extra_options:
final_extra_options.update(extra_options)
return StaticQuantConfig(
calibration_data_reader,
calibrate_method=calibrate_method,
quant_format=QuantFormat.QDQ,
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=(
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(op_types_to_exclude))
),
nodes_to_exclude=list(nodes_to_exclude_set),
per_channel=per_channel,
reduce_range=reduce_range,
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
calibration_providers=calibration_providers,
extra_options=final_extra_options,
)
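# Illustrative sketch (not part of the original module): typical use of get_qdq_config(). The model path
# "model.onnx" is a hypothetical placeholder; a loaded onnx.ModelProto is also accepted, and any
# CalibrationDataReader implementation works.
def _example_get_qdq_config(data_reader: CalibrationDataReader) -> StaticQuantConfig:
    return get_qdq_config(
        "model.onnx",
        data_reader,
        activation_type=QuantType.QUInt16,  # for opset < 21 this automatically enables UseQDQContribOps
        weight_type=QuantType.QInt8,
        per_channel=True,
        # Nodes can be excluded by name or, as here, with a callable filter.
        nodes_to_exclude=lambda model, node: node.op_type == "Softmax",
    )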
class DynamicQuantConfig(QuantConfig):
def __init__(
self,
weight_type=QuantType.QInt8,
op_types_to_quantize=None,
nodes_to_quantize=None,
nodes_to_exclude=None,
per_channel=False,
reduce_range=False,
use_external_data_format=False,
extra_options=None,
):
"""
This is a class for dynamic Quant Configuration
Args:
extra_options: key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
Default is False. If enabled, subgraphs will be quantized. Dynamic mode is currently supported;
    more will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose do not quantize if their input is not
    already quantized. Set to True to force such operators to always quantize their input and thus generate
    quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
"""
super().__init__(
op_types_to_quantize=op_types_to_quantize,
per_channel=per_channel,
reduce_range=reduce_range,
weight_type=weight_type,
nodes_to_quantize=nodes_to_quantize,
nodes_to_exclude=nodes_to_exclude,
use_external_data_format=use_external_data_format,
)
self.extra_options = extra_options or {}
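# Illustrative sketch (not part of the original module): a DynamicQuantConfig that quantizes only MatMul
# weights to int8, relying on the MatMulConstBOnly behavior described in the docstring above.
def _example_dynamic_quant_config() -> DynamicQuantConfig:
    return DynamicQuantConfig(
        weight_type=QuantType.QInt8,
        op_types_to_quantize=["MatMul"],
        extra_options={"MatMulConstBOnly": True},
    )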
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
raise ValueError(
"ONNXRuntime quantization doesn't support data format:"
"activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8"
)
if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
raise ValueError(
f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
)
if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:
raise ValueError(
"ONNXRuntime quantization doesn't support data format: activation_type=QuantType.QFLOAT8E4M3FN, "
f"weight_type={weight_type}!=QuantType.QFLOAT8E4M3FN"
)
q16_types = [QuantType.QInt16, QuantType.QUInt16]
if (activation_type in q16_types or weight_type in q16_types) and quant_format != QuantFormat.QDQ:
raise ValueError("Only QuantFormat.QDQ supports 16-bit quantization types.")
if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
logging.warning(
"Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
"Or it will lead to bad performance on x64."
)
def quantize_static(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
calibration_data_reader: CalibrationDataReader,
quant_format=QuantFormat.QDQ,
op_types_to_quantize=None,
per_channel=False,
reduce_range=False,
activation_type=QuantType.QInt8,
weight_type=QuantType.QInt8,
nodes_to_quantize=None,
nodes_to_exclude=None,
use_external_data_format=False,
calibrate_method=CalibrationMethod.MinMax,
calibration_providers=None,
extra_options=None,
):
"""
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
It is recommended to use the QuantFormat.QDQ format (available since onnxruntime 1.11) with
activation_type = QuantType.QInt8 and weight_type = QuantType.QInt8. If the model is targeted to GPU/TRT,
symmetric activations and weights are required. If the model is targeted to CPU, asymmetric activations and
symmetric weights are recommended for a balance of performance and accuracy.
Args:
model_input: file path of model or ModelProto to quantize
model_output: file path of quantized model
calibration_data_reader: a calibration data reader. It
enumerates calibration data and generates inputs for the
original model.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear nodes on the tensors.
activation_type:
quantization data type of activation. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
calibrate_method:
Current calibration methods supported are MinMax and Entropy.
Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7 bits. It may improve accuracy for some models running on non-VNNI machines,
especially in per-channel mode.
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
nodes_to_quantize:
    List of node names to quantize. When this list is not None, only the nodes in this list
    are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
    List of node names to exclude. The nodes in this list will be excluded from quantization
    when it is not None.
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
[ "CPUExecutionProvider" ]
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraphs will be quantized.
    Dynamic mode is currently supported; more will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose do not quantize if their input is not
    already quantized. Set to True to force such operators to always quantize their input and thus generate
    quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
AddQDQPairToWeight = True/False :
Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
    DeQuantizeLinear node. If True, the floating-point weight is kept and both
    QuantizeLinear/DeQuantizeLinear nodes are inserted for the weight.
OpTypesToExcludeOutputQuantization = list of op type :
Default is []. If any op types are specified, the outputs of ops with these op types
    will not be quantized.
DedicatedQDQPair = True/False :
Default is False. By default, when inserting QDQ pairs, multiple nodes can share a single QDQ pair
    as their input. If True, an identical but dedicated QDQ pair is created for each node.
QDQOpTypePerChannelSupportToAxis = dictionary :
Default is {}. Sets the channel axis for specific op types, for example {'MatMul': 1}. It is
    effective only when per-channel quantization is supported and per_channel is True. If an op type
    supports per-channel quantization but is not explicitly listed with a channel axis, the default
    channel axis will be used.
CalibTensorRangeSymmetric = True/False :
Default is False. If enabled, the final range of each tensor during calibration will be explicitly
    made symmetric around the central point 0.
CalibStridedMinMax = Optional[int] :
Default is None. If set to an integer, during calculation of the min-max, only a stride's worth of
    data is used at a time, and all partial results are merged at the end.
CalibMovingAverage = True/False :
Default is False. If enabled, the moving average of the minimum and maximum values will be
computed when the calibration method selected is MinMax.
CalibMovingAverageConstant = float :
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
minimum and maximum values. Effective only when the calibration method selected is MinMax and
when CalibMovingAverage is set to True.
CalibMaxIntermediateOutputs = Optional[int] :
Default is None. If set to an integer, during calculation of the min-max range of the tensors,
    at most this number of outputs is loaded before computing and merging the range. This produces
    the same result as computing with None, but is more memory efficient.
SmoothQuant = True/False :
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
fake input channel quantization.
SmoothQuantAlpha = float :
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
and activation quantization. A larger alpha value could be used on models with more significant
activation outliers to migrate more quantization difficulty to weights.
SmoothQuantFolding = True/False :
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
SmoothQuant will be folded into the previous op if the previous op is foldable.
UseQDQContribOps = True/False :
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
contrib op implementations. The contrib op implementations may support features not standardized
into the ONNX specification (e.g., 16-bit quantization types).
MinimumRealRange = float|None :
Default is None. If set to a floating-point value, the calculation of the quantization parameters
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
necessary for EPs like QNN that require a minimum floating-point range when determining
quantization parameters.
TensorQuantOverrides = dictionary :
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
per-channel quantization, the list contains a dictionary for each channel in the tensor.
Each dictionary contains optional overrides with the following keys and values.
'quant_type' = QuantType : The tensor's quantization data type.
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
    `scale` or `zero_point` is also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
    `scale` or `zero_point` is also set.
'rmax' = Float : Override the maximum real tensor value in calibration data.
    Invalid if `scale` or `zero_point` is also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
    Invalid if `scale` or `zero_point` is also set.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
QDQDisableWeightAdjustForInt32Bias = True/False:
Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
has a scale (input_scale * weight_scale) that is too small.
"""
if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
if calibrate_method != CalibrationMethod.Distribution:
raise ValueError("Only Distribution calibration method is supported for float quantization.")
extra_options = extra_options or {}
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
mode = QuantizationMode.QLinearOps
if not op_types_to_quantize or len(op_types_to_quantize) == 0:
q_linear_ops = list(QLinearOpsRegistry.keys())
qdq_ops = list(QDQRegistry.keys())
op_types_to_quantize = list(set(q_linear_ops + qdq_ops))
model = (
save_and_reload_model_with_shape_infer(model_input)
if isinstance(model_input, onnx.ModelProto)
else load_model_with_shape_infer(Path(model_input))
)
pre_processed: bool = model_has_pre_process_metadata(model)
if not pre_processed:
logging.warning(
"Please consider to run pre-processing before quantization. Refer to example: "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
calib_extra_options_keys = [
("CalibTensorRangeSymmetric", "symmetric"),
("CalibMovingAverage", "moving_average"),
("CalibMovingAverageConstant", "averaging_constant"),
("CalibMaxIntermediateOutputs", "max_intermediate_outputs"),
("CalibPercentile", "percentile"),
]
calib_extra_options = {
key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
}
if extra_options.get("SmoothQuant", False):
import importlib # noqa: PLC0415
try:
importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
except Exception as e:
logging.error(f"{e}.")
raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e
from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant # noqa: PLC0415
def inc_dataloader():
data_reader = copy.deepcopy(calibration_data_reader)
for data in data_reader:
yield data, None
orig_nodes = [i.name for i in model.graph.node]
dataloader = inc_dataloader()
sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
del dataloader
model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
model.save(model_input)
nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration
updated_model = update_opset_version(model, weight_type)
is_model_updated = updated_model is not model
if is_model_updated:
model = updated_model
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if is_model_updated:
# Update model_input and avoid using the original one
model_input = copy.deepcopy(model)
if isinstance(model_input, onnx.ModelProto):
output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
onnx.save_model(
model_input,
output_path,
save_as_external_data=True,
)
model_input = output_path
calibrator = create_calibrator(
Path(model_input),
op_types_to_quantize,
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
providers=calibration_providers,
extra_options=calib_extra_options,
)
stride = extra_options.get("CalibStridedMinMax", None)
if stride:
total_data_size = len(calibration_data_reader)
if total_data_size % stride != 0:
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")
for start in range(0, total_data_size, stride):
end_index = start + stride
calibration_data_reader.set_range(start_index=start, end_index=end_index)
calibrator.collect_data(calibration_data_reader)
else:
calibrator.collect_data(calibration_data_reader)
tensors_range = calibrator.compute_data()
if not isinstance(tensors_range, TensorsData):
raise TypeError(
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
)
del calibrator
check_static_quant_arguments(quant_format, activation_type, weight_type)
if quant_format is QuantFormat.QOperator:
quantizer = ONNXQuantizer(
model,
per_channel,
reduce_range,
mode,
True, # static
weight_type,
activation_type,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
else:
quantizer = QDQQuantizer(
model,
per_channel,
reduce_range,
weight_type,
activation_type,
tensors_range,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
if not pre_processed:
logging.warning(
"Please consider pre-processing before quantization. See "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
if extra_options.get("SmoothQuant", False):
sq_path.cleanup()
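# Illustrative sketch (not part of the original module): a minimal CalibrationDataReader and a
# quantize_static() call. The input name "input", its shape, and the file paths are hypothetical
# placeholders; real calibration data should come from a representative dataset.
def _example_quantize_static():
    import numpy as np  # local import so the sketch stays self-contained

    class _RandomDataReader(CalibrationDataReader):
        def __init__(self, num_samples: int = 8):
            self._data = iter(
                [{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)} for _ in range(num_samples)]
            )

        def get_next(self):
            # Return None when exhausted, as the calibrator expects.
            return next(self._data, None)

    quantize_static(
        "model.onnx",  # hypothetical input path
        "model.quant.onnx",  # hypothetical output path
        _RandomDataReader(),
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QInt8,
        extra_options={"ActivationSymmetric": False, "WeightSymmetric": True},
    )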
def quantize_dynamic(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
op_types_to_quantize=None,
per_channel=False,
reduce_range=False,
weight_type=QuantType.QInt8,
nodes_to_quantize=None,
nodes_to_exclude=None,
use_external_data_format=False,
extra_options=None,
):
"""Given an onnx model, create a quantized onnx model and save it into a file
Args:
model_input: file path of model or ModelProto to quantize
model_output: file path of quantized model
op_types_to_quantize:
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
It quantizes all supported operators by default.
per_channel: quantize weights per channel
reduce_range:
quantize weights with 7 bits. It may improve accuracy for some models running on non-VNNI machines,
especially in per-channel mode.
weight_type:
quantization data type of weight. Please refer to
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
nodes_to_quantize:
    List of node names to quantize. When this list is not None, only the nodes in this list
    are quantized.
example:
[
'Conv__224',
'Conv__252'
]
nodes_to_exclude:
    List of node names to exclude. The nodes in this list will be excluded from quantization
    when it is not None.
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
extra_options:
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
    Default is False. If enabled, subgraphs will be quantized. Dynamic mode is currently supported;
    more will be supported in the future.
ForceQuantizeNoInputCheck = True/False :
By default, some latent operators such as MaxPool and Transpose do not quantize if their input is not
    already quantized. Set to True to force such operators to always quantize their input and thus generate
    quantized output. This behavior can still be disabled per node via nodes_to_exclude.
MatMulConstBOnly = True/False:
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
"""
extra_options = extra_options or {}
nodes_to_exclude = nodes_to_exclude or []
nodes_to_quantize = nodes_to_quantize or []
op_types_to_quantize = op_types_to_quantize or []
mode = QuantizationMode.IntegerOps
if not op_types_to_quantize or len(op_types_to_quantize) == 0:
op_types_to_quantize = list(IntegerOpsRegistry.keys())
model = (
save_and_reload_model_with_shape_infer(model_input)
if isinstance(model_input, onnx.ModelProto)
else load_model_with_shape_infer(Path(model_input))
)
pre_processed: bool = model_has_pre_process_metadata(model)
if not pre_processed:
logging.warning(
"Please consider to run pre-processing before quantization. Refer to example: "
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
"/cpu/ReadMe.md "
)
if "MatMulConstBOnly" not in extra_options:
extra_options["MatMulConstBOnly"] = True
model = update_opset_version(model, weight_type)
quantizer = ONNXQuantizer(
model,
per_channel,
reduce_range,
mode,
False, # static
weight_type,
QuantType.QUInt8, # dynamic activation only supports uint8
None,
nodes_to_quantize,
nodes_to_exclude,
op_types_to_quantize,
extra_options,
)
quantizer.quantize_model()
quantizer.model.save_model_to_file(model_output, use_external_data_format)
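# Illustrative sketch (not part of the original module): dynamic quantization of the weights in a
# (hypothetical) model file; activations are quantized on the fly at inference time.
def _example_quantize_dynamic():
    quantize_dynamic(
        "model.onnx",  # hypothetical input path
        "model.quant.onnx",  # hypothetical output path
        weight_type=QuantType.QInt8,
        per_channel=False,
        extra_options={"WeightSymmetric": True, "MatMulConstBOnly": True},
    )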
def quantize(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
quant_config: QuantConfig,
):
"""Quantize a model with QuantConfig.
Args:
model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize.
model_output (str | Path): Path to save the quantized model.
quant_config (QuantConfig | WeightOnlyQuantConfig): Quantization Configuration.
"""
if isinstance(quant_config, StaticQuantConfig):
quantize_static(
model_input,
model_output,
quant_config.calibration_data_reader,
calibrate_method=quant_config.calibrate_method,
quant_format=quant_config.quant_format,
activation_type=quant_config.activation_type,
weight_type=quant_config.weight_type,
op_types_to_quantize=quant_config.op_types_to_quantize,
nodes_to_quantize=quant_config.nodes_to_quantize,
nodes_to_exclude=quant_config.nodes_to_exclude,
per_channel=quant_config.per_channel,
reduce_range=quant_config.reduce_range,
use_external_data_format=quant_config.use_external_data_format,
calibration_providers=quant_config.calibration_providers,
extra_options=quant_config.extra_options,
)
elif isinstance(quant_config, DynamicQuantConfig):
quantize_dynamic(
model_input,
model_output,
weight_type=quant_config.weight_type,
op_types_to_quantize=quant_config.op_types_to_quantize,
nodes_to_quantize=quant_config.nodes_to_quantize,
nodes_to_exclude=quant_config.nodes_to_exclude,
per_channel=quant_config.per_channel,
reduce_range=quant_config.reduce_range,
use_external_data_format=quant_config.use_external_data_format,
extra_options=quant_config.extra_options,
)
else:
# training package doesn't have quantize_matmul_4bits, avoid global import
from .matmul_nbits_quantizer import MatMulNBitsQuantizer, WeightOnlyQuantConfig # noqa: PLC0415
if isinstance(quant_config, WeightOnlyQuantConfig):
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load(model_input)
quant = MatMulNBitsQuantizer(model, algo_config=quant_config)
quant.process()
quant.model.save_model_to_file(model_output, True)
else:
raise TypeError(
"Invalid quantization config type, it must be either StaticQuantConfig, "
"DynamicQuantConfig, or WeightOnlyQuantConfig."
)
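# Illustrative sketch (not part of the original module): quantize() dispatches on the config type, so the
# same entry point covers the static, dynamic, and weight-only flows. The paths are hypothetical
# placeholders and any CalibrationDataReader implementation can be used.
def _example_quantize_with_config(data_reader: CalibrationDataReader):
    config = get_qdq_config("model.onnx", data_reader, per_channel=True)
    quantize("model.onnx", "model.qdq.onnx", config)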

View File

@@ -0,0 +1,109 @@
from .operators.activation import QDQRemovableActivation, QLinearActivation
from .operators.argmax import QArgMax
from .operators.attention import AttentionQuant
from .operators.base_operator import QuantOperatorBase
from .operators.binary_op import QLinearBinaryOp
from .operators.concat import QLinearConcat
from .operators.conv import ConvInteger, QDQConv, QLinearConv
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
from .operators.embed_layernorm import EmbedLayerNormalizationQuant
from .operators.gather import GatherQuant, QDQGather
from .operators.gavgpool import QGlobalAveragePool
from .operators.gemm import QDQGemm, QLinearGemm
from .operators.lstm import LSTMQuant
from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul
from .operators.maxpool import QDQMaxPool, QMaxPool
from .operators.norm import QDQNormalization
from .operators.pad import QDQPad, QPad
from .operators.pooling import QLinearPool
from .operators.qdq_base_operator import QDQOperatorBase
from .operators.resize import QDQResize, QResize
from .operators.softmax import QLinearSoftmax
from .operators.split import QDQSplit, QSplit
from .operators.where import QDQWhere, QLinearWhere
from .quant_utils import QuantizationMode
CommonOpsRegistry = {
"Gather": GatherQuant,
"Transpose": Direct8BitOp,
"EmbedLayerNormalization": EmbedLayerNormalizationQuant,
}
IntegerOpsRegistry = {
"Conv": ConvInteger,
"MatMul": MatMulInteger,
"Attention": AttentionQuant,
"LSTM": LSTMQuant,
}
IntegerOpsRegistry.update(CommonOpsRegistry)
QLinearOpsRegistry = {
"ArgMax": QArgMax,
"Conv": QLinearConv,
"Gemm": QLinearGemm,
"MatMul": QLinearMatMul,
"Add": QLinearBinaryOp,
"Mul": QLinearBinaryOp,
"Relu": QLinearActivation,
"Clip": QLinearActivation,
"LeakyRelu": QLinearActivation,
"Sigmoid": QLinearActivation,
"MaxPool": QMaxPool,
"GlobalAveragePool": QGlobalAveragePool,
"Split": QSplit,
"Pad": QPad,
"Reshape": Direct8BitOp,
"Squeeze": Direct8BitOp,
"Unsqueeze": Direct8BitOp,
"Resize": QResize,
"AveragePool": QLinearPool,
"Concat": QLinearConcat,
"Softmax": QLinearSoftmax,
"Where": QLinearWhere,
}
QLinearOpsRegistry.update(CommonOpsRegistry)
QDQRegistry = {
"Conv": QDQConv,
"ConvTranspose": QDQConv,
"Gemm": QDQGemm,
"Clip": QDQRemovableActivation,
"Relu": QDQRemovableActivation,
"Reshape": QDQDirect8BitOp,
"Transpose": QDQDirect8BitOp,
"Squeeze": QDQDirect8BitOp,
"Unsqueeze": QDQDirect8BitOp,
"Resize": QDQResize,
"MaxPool": QDQMaxPool,
"AveragePool": QDQDirect8BitOp,
"Slice": QDQDirect8BitOp,
"Pad": QDQPad,
"MatMul": QDQMatMul,
"Split": QDQSplit,
"Gather": QDQGather,
"GatherElements": QDQGather,
"Where": QDQWhere,
"InstanceNormalization": QDQNormalization,
"LayerNormalization": QDQNormalization,
"BatchNormalization": QDQNormalization,
"TopK": QDQDirect8BitOp,
}
def CreateDefaultOpQuantizer(onnx_quantizer, node): # noqa: N802
return QuantOperatorBase(onnx_quantizer, node)
def CreateOpQuantizer(onnx_quantizer, node): # noqa: N802
registry = IntegerOpsRegistry if onnx_quantizer.mode == QuantizationMode.IntegerOps else QLinearOpsRegistry
if node.op_type in registry:
op_quantizer = registry[node.op_type](onnx_quantizer, node)
if op_quantizer.should_quantize():
return op_quantizer
return QuantOperatorBase(onnx_quantizer, node)
def CreateQDQQuantizer(onnx_quantizer, node): # noqa: N802
if node.op_type in QDQRegistry:
return QDQRegistry[node.op_type](onnx_quantizer, node)
return QDQOperatorBase(onnx_quantizer, node)
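# Illustrative sketch (not part of the original module): the registries above are plain dicts keyed by
# op type, so callers can inspect which op types each quantization mode supports.
def _example_list_supported_ops():
    return {
        "dynamic (IntegerOps)": sorted(IntegerOpsRegistry),
        "static (QLinearOps)": sorted(QLinearOpsRegistry),
        "static (QDQ)": sorted(QDQRegistry),
    }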

View File

@@ -0,0 +1,209 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
import tempfile
import traceback
from pathlib import Path
import onnx
import onnxruntime
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference
from onnxruntime.transformers.onnx_utils import extract_raw_data_from_model, has_external_data
from .fusions import ReplaceUpsampleWithResize
from .onnx_model import ONNXModel
from .quant_utils import add_pre_process_metadata, save_and_reload_model_with_shape_infer
logger = logging.getLogger(__name__)
def quant_pre_process(
input_model: str | Path | onnx.ModelProto | None = None,
output_model_path: str | Path | None = None,
skip_optimization: bool = False,
skip_onnx_shape: bool = False,
skip_symbolic_shape: bool = False,
auto_merge: bool = False,
int_max: int = 2**31 - 1,
guess_output_rank: bool = False,
verbose: int = 0,
save_as_external_data: bool = False,
all_tensors_to_one_file: bool = False,
external_data_location: str | None = None,
external_data_size_threshold: int = 1024,
**deprecated_kwargs,
) -> None:
"""Shape inference and model optimization, in preparation for quantization.
Args:
input_model: Path to the input model file or ModelProto
output_model_path: Path to the output model file
skip_optimization: Skip model optimization step if true. This may result in ONNX shape
inference failure for some models.
skip_onnx_shape: Skip ONNX shape inference. Symbolic shape inference is most effective
with transformer based models. Skipping all shape inferences may
reduce the effectiveness of quantization, as a tensor with unknown
shape can not be quantized.
skip_symbolic_shape: Skip symbolic shape inference. Symbolic shape inference is most
effective with transformer based models. Skipping all shape
inferences may reduce the effectiveness of quantization, as a tensor
with unknown shape can not be quantized.
auto_merge: For symbolic shape inference, automatically merge symbolic dims when
conflict happens.
int_max: For symbolic shape inference, specify the maximum integer value to be
    treated as boundless for ops like Slice
guess_output_rank: Guess output rank to be the same as input 0 for unknown ops
verbose: Log level for shape inference: 0 = turn off, 1 = warnings, 3 = detailed
save_as_external_data: Whether to save the ONNX model with external data
all_tensors_to_one_file: Whether to save all the external data to one file
external_data_location: The file location for the saved external data
external_data_size_threshold: The size threshold for external data
"""
if input_model is None:
input_model = deprecated_kwargs.pop("input_model_path", None)
assert input_model is not None
assert output_model_path is not None, "output_model_path is required."
with tempfile.TemporaryDirectory(prefix="pre.quant.") as quant_tmp_dir:
temp_path = Path(quant_tmp_dir)
model = None
if not skip_symbolic_shape:
logger.info("Performing symbolic shape inference...")
loaded_model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model)
model = SymbolicShapeInference.infer_shapes(
loaded_model,
int_max,
auto_merge,
guess_output_rank,
verbose,
)
# Since Upsample is deprecated after opset v10, and the model's opset will
# be upgraded to at least v11 during quantization, we need to replace Upsample
# with Resize first to avoid generating an invalid model.
if model:
ai_onnx_domain = [opset for opset in model.opset_import if not opset.domain or opset.domain == "ai.onnx"]
if len(ai_onnx_domain) == 1:
opset_version = ai_onnx_domain[0].version
if opset_version < 10:
ReplaceUpsampleWithResize(ONNXModel(model), opset_version).apply()
model.opset_import.remove(ai_onnx_domain[0])
opset_version = 11
model.opset_import.extend([onnx.helper.make_opsetid("", opset_version)])
model = onnx.version_converter.convert_version(model, opset_version)
model = save_and_reload_model_with_shape_infer(model)
if not skip_optimization:
# Use ORT optimizers (native code) to optimize model
if not skip_symbolic_shape:
# Need to save the inferred model to a file in order to run the optimizer
input_model = str(temp_path / "symbolic_shape_inferred.onnx")
if save_as_external_data:
onnx.save_model(
model,
input_model,
save_as_external_data=True,
all_tensors_to_one_file=all_tensors_to_one_file,
size_threshold=external_data_size_threshold,
convert_attribute=False,
)
else:
onnx.save(model, input_model)
model = None
opt_model_path = str(temp_path / "optimized.onnx")
try:
sess_option = onnxruntime.SessionOptions()
sess_option.optimized_model_filepath = opt_model_path
sess_option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
# For large model, extract external data from model and add to session options
if isinstance(input_model, onnx.ModelProto):
if has_external_data(input_model):
raise ValueError(
"ModelProto has external data not loaded into memory, ORT cannot create session. "
"Please load external data before calling this function. "
"See https://onnx.ai/onnx/repo-docs/ExternalData.html for more information."
)
external_names, external_values = extract_raw_data_from_model(input_model)
sess_option.add_external_initializers(list(external_names), list(external_values))
input_model = input_model.SerializeToString()
# the saved optimized model otherwise points to the original external data file name
# which is not available relative to the optimized model file
elif skip_symbolic_shape and save_as_external_data:
sess_option.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", "optimized.onnx.data"
)
sess = onnxruntime.InferenceSession(input_model, sess_option, providers=["CPUExecutionProvider"])
# Close the session to avoid the cleanup error on Windows for temp folders
# https://github.com/microsoft/onnxruntime/issues/17627
del sess
except Exception:
logger.error(
"ONNX Runtime Model Optimization Failed! Consider rerun with option `--skip_optimization'."
)
logger.error(traceback.format_exc())
input_model = opt_model_path
if not skip_onnx_shape:
# ONNX shape inference.
# According to the docs, infer_shapes_path should be used for 2GB+ models.
# If skip_optimization is specified, we could be dealing with a large model,
# so to be on the safe side, save the model to a file first.
if model is not None:
input_model = str(temp_path / "symbolic_shape_inferred.onnx")
if save_as_external_data:
onnx.save_model(
model,
input_model,
save_as_external_data=True,
all_tensors_to_one_file=all_tensors_to_one_file,
size_threshold=external_data_size_threshold,
convert_attribute=False,
)
else:
onnx.save(model, input_model)
model = None
if isinstance(input_model, onnx.ModelProto):
    # Serialize the in-memory ModelProto so infer_shapes_path can operate on a file.
    input_model_path = str(Path(quant_tmp_dir) / "model_input.onnx")
    onnx.save_model(
        input_model,
        input_model_path,
        save_as_external_data=True,
        all_tensors_to_one_file=all_tensors_to_one_file,
        size_threshold=external_data_size_threshold,
        convert_attribute=False,
    )
    input_model = input_model_path
inferred_model_path = str(temp_path / "onnx_shape_inferred.onnx")
onnx.shape_inference.infer_shapes_path(input_model, inferred_model_path)
model = onnx.load(inferred_model_path)
if model is None:
model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model)
add_pre_process_metadata(model)
if save_as_external_data:
onnx.save_model(
model,
output_model_path,
save_as_external_data=True,
all_tensors_to_one_file=all_tensors_to_one_file,
location=external_data_location,
size_threshold=external_data_size_threshold,
convert_attribute=False,
)
else:
onnx.save(model, output_model_path)
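# Illustrative sketch (not part of the original module): a typical pre-processing call before
# quantization. The paths are hypothetical placeholders; for models larger than 2GB, pass
# save_as_external_data=True as described in the docstring above.
def _example_quant_pre_process():
    quant_pre_process(
        input_model="model.onnx",
        output_model_path="model.preproc.onnx",
        skip_optimization=False,
        skip_onnx_shape=False,
        skip_symbolic_shape=False,
    )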

View File

@@ -0,0 +1,256 @@
import argparse
import json
import os
import numpy as np
import onnx
import onnxruntime
from onnxruntime.quantization import QuantFormat, QuantType, StaticQuantConfig, quantize
from onnxruntime.quantization.calibrate import CalibrationDataReader, CalibrationMethod
class OnnxModelCalibrationDataReader(CalibrationDataReader):
def __init__(self, model_path):
self.model_dir = os.path.dirname(model_path)
data_dirs = [
os.path.join(self.model_dir, a) for a in os.listdir(self.model_dir) if a.startswith("test_data_set_")
]
model_inputs = onnxruntime.InferenceSession(model_path).get_inputs()
name2tensors = []
for data_dir in data_dirs:
name2tensor = {}
data_paths = [os.path.join(data_dir, a) for a in sorted(os.listdir(data_dir))]
data_ndarrays = [self.read_onnx_pb_data(data_path) for data_path in data_paths]
for model_input, data_ndarray in zip(model_inputs, data_ndarrays, strict=False):
name2tensor[model_input.name] = data_ndarray
name2tensors.append(name2tensor)
assert len(name2tensors) == len(data_dirs)
assert len(name2tensors[0]) == len(model_inputs)
self.calibration_data = iter(name2tensors)
def get_next(self) -> dict:
"""generate the input data dict for ONNXinferenceSession run"""
return next(self.calibration_data, None)
def read_onnx_pb_data(self, file_pb):
tensor = onnx.TensorProto()
with open(file_pb, "rb") as f:
tensor.ParseFromString(f.read())
ret = onnx.numpy_helper.to_array(tensor)
return ret
def parse_arguments():
parser = argparse.ArgumentParser(description="The arguments for static quantization")
parser.add_argument("-i", "--input_model_path", required=True, help="Path to the input onnx model")
parser.add_argument(
"-o", "--output_quantized_model_path", required=True, help="Path to the output quantized onnx model"
)
parser.add_argument(
"--activation_type",
choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
default="quint8",
help="Activation quantization type used",
)
parser.add_argument(
"--weight_type",
choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
default="qint8",
help="Weight quantization type used",
)
parser.add_argument("--enable_subgraph", action="store_true", help="If set, subgraph will be quantized.")
parser.add_argument(
"--force_quantize_no_input_check",
action="store_true",
help="By default, some latent operators like maxpool, transpose, do not quantize if their input is not"
" quantized already. Setting to True to force such operator always quantize input and so generate"
" quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.",
)
parser.add_argument(
"--matmul_const_b_only",
action="store_true",
help="If set, only MatMul with const B will be quantized.",
)
parser.add_argument(
"--add_qdq_pair_to_weight",
action="store_true",
help="If set, it remains floating-point weight and inserts both QuantizeLinear/DeQuantizeLinear"
" nodes to weight.",
)
parser.add_argument(
"--dedicated_qdq_pair",
action="store_true",
help="If set, it will create identical and dedicated QDQ pair for each node.",
)
parser.add_argument(
"--op_types_to_exclude_output_quantization",
nargs="+",
default=[],
help="If any op type is specified, it won't quantize the output of ops with this specific op types.",
)
parser.add_argument(
"--calibration_method",
default="minmax",
choices=["minmax", "entropy", "percentile", "distribution"],
help="Calibration method used",
)
parser.add_argument("--quant_format", default="qdq", choices=["qdq", "qoperator"], help="Quantization format used")
parser.add_argument(
"--calib_tensor_range_symmetric",
action="store_true",
help="If enabled, the final range of tensor during calibration will be explicitly"
" set to symmetric to central point 0",
)
# TODO: --calib_strided_minmax"
# TODO: --calib_moving_average_constant"
# TODO: --calib_max_intermediate_outputs"
parser.add_argument(
"--calib_moving_average",
action="store_true",
help="If enabled, the moving average of"
" the minimum and maximum values will be computed when the calibration method selected is MinMax.",
)
parser.add_argument(
"--disable_quantize_bias",
action="store_true",
help="Whether to quantize floating-point biases by solely inserting a DeQuantizeLinear node"
" If not set, it remains floating-point bias and does not insert any quantization nodes"
" associated with biases.",
)
# TODO: Add arguments related to Smooth Quant
parser.add_argument(
"--use_qdq_contrib_ops",
action="store_true",
help="If set, the inserted QuantizeLinear and DequantizeLinear ops will have the com.microsoft domain,"
" which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear contrib op implementations.",
)
parser.add_argument(
"--minimum_real_range",
type=float,
default=0.0001,
help="If set to a floating-point value, the calculation of the quantization parameters"
" (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)"
" is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is"
" necessary for EPs like QNN that require a minimum floating-point range when determining "
" quantization parameters.",
)
parser.add_argument(
"--qdq_keep_removable_activations",
action="store_true",
help="If set, removable activations (e.g., Clip or Relu) will not be removed,"
" and will be explicitly represented in the QDQ model.",
)
parser.add_argument(
"--qdq_disable_weight_adjust_for_int32_bias",
action="store_true",
help="If set, QDQ quantizer will not adjust the weight's scale when the bias"
" has a scale (input_scale * weight_scale) that is too small.",
)
parser.add_argument("--per_channel", action="store_true", help="Whether using per-channel quantization")
parser.add_argument(
"--nodes_to_quantize",
nargs="+",
default=None,
help="List of nodes names to quantize. When this list is not None only the nodes in this list are quantized.",
)
parser.add_argument(
"--nodes_to_exclude",
nargs="+",
default=None,
help="List of nodes names to exclude. The nodes in this list will be excluded from quantization when it is not None.",
)
parser.add_argument(
"--op_per_channel_axis",
nargs=2,
action="append",
metavar=("OP_TYPE", "PER_CHANNEL_AXIS"),
default=[],
help="Set channel axis for specific op type, for example: --op_per_channel_axis MatMul 1, and it's"
" effective only when per channel quantization is supported and per_channel is True. If specific"
" op type supports per channel quantization but not explicitly specified with channel axis,"
" default channel axis will be used.",
)
parser.add_argument("--tensor_quant_overrides", help="Set the json file for tensor quantization overrides.")
return parser.parse_args()
def get_tensor_quant_overrides(file):
# TODO: Enhance the function to handle more real cases of json file
if not file:
return {}
with open(file) as f:
quant_override_dict = json.load(f)
for tensor in quant_override_dict:
for enc_dict in quant_override_dict[tensor]:
enc_dict["scale"] = np.array(enc_dict["scale"], dtype=np.float32)
enc_dict["zero_point"] = np.array(enc_dict["zero_point"])
return quant_override_dict
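# Illustrative sketch (not part of the original script): writing a minimal overrides JSON file in the
# shape get_tensor_quant_overrides() expects. The tensor name "conv1_output" and the values are
# hypothetical placeholders; "scale" and "zero_point" are converted to numpy arrays by the loader above.
def write_example_tensor_quant_overrides(path: str = "overrides.json"):
    example = {"conv1_output": [{"scale": 0.015, "zero_point": 128}]}
    with open(path, "w") as f:
        json.dump(example, f, indent=2)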
def main():
args = parse_arguments()
data_reader = OnnxModelCalibrationDataReader(model_path=args.input_model_path)
arg2quant_type = {
"qint8": QuantType.QInt8,
"quint8": QuantType.QUInt8,
"qint16": QuantType.QInt16,
"quint16": QuantType.QUInt16,
"qint4": QuantType.QInt4,
"quint4": QuantType.QUInt4,
"qfloat8e4m3fn": QuantType.QFLOAT8E4M3FN,
}
activation_type = arg2quant_type[args.activation_type]
weight_type = arg2quant_type[args.weight_type]
qdq_op_type_per_channel_support_to_axis = dict(args.op_per_channel_axis)
extra_options = {
"EnableSubgraph": args.enable_subgraph,
"ForceQuantizeNoInputCheck": args.force_quantize_no_input_check,
"MatMulConstBOnly": args.matmul_const_b_only,
"AddQDQPairToWeight": args.add_qdq_pair_to_weight,
"OpTypesToExcludeOutputQuantization": args.op_types_to_exclude_output_quantization,
"DedicatedQDQPair": args.dedicated_qdq_pair,
"QDQOpTypePerChannelSupportToAxis": qdq_op_type_per_channel_support_to_axis,
"CalibTensorRangeSymmetric": args.calib_tensor_range_symmetric,
"CalibMovingAverage": args.calib_moving_average,
"QuantizeBias": not args.disable_quantize_bias,
"UseQDQContribOps": args.use_qdq_contrib_ops,
"MinimumRealRange": args.minimum_real_range,
"QDQKeepRemovableActivations": args.qdq_keep_removable_activations,
"QDQDisableWeightAdjustForInt32Bias": args.qdq_disable_weight_adjust_for_int32_bias,
# Load json file for encoding override
"TensorQuantOverrides": get_tensor_quant_overrides(args.tensor_quant_overrides),
}
arg2calib_method = {
"minmax": CalibrationMethod.MinMax,
"entropy": CalibrationMethod.Entropy,
"percentile": CalibrationMethod.Percentile,
"distribution": CalibrationMethod.Distribution,
}
arg2quant_format = {
"qdq": QuantFormat.QDQ,
"qoperator": QuantFormat.QOperator,
}
sqc = StaticQuantConfig(
calibration_data_reader=data_reader,
calibrate_method=arg2calib_method[args.calibration_method],
quant_format=arg2quant_format[args.quant_format],
activation_type=activation_type,
weight_type=weight_type,
op_types_to_quantize=None,
nodes_to_quantize=args.nodes_to_quantize,
nodes_to_exclude=args.nodes_to_exclude,
per_channel=args.per_channel,
reduce_range=False,
use_external_data_format=False,
calibration_providers=None, # Use CPUExecutionProvider
extra_options=extra_options,
)
quantize(model_input=args.input_model_path, model_output=args.output_quantized_model_path, quant_config=sqc)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,520 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations
import json
from collections.abc import MutableMapping
from dataclasses import dataclass
from typing import Any
import onnx
from .quant_utils import QuantType
@dataclass
class QuantTypeInfo: # noqa: PLW1641
"""
The quantization type information for a tensor override.
"""
quant_type: QuantType
symmetric: bool | None = None # If None, assumes default is used.
reduce_range: bool | None = None # If None, assumes default is used.
axis: int | None = None # If None, assumes per-tensor quantization
def __eq__(self, other: object):
if isinstance(other, QuantTypeInfo):
return (
self.quant_type == other.quant_type
and (self.symmetric is None or other.symmetric is None or self.symmetric == other.symmetric)
and (self.reduce_range is None or other.reduce_range is None or self.reduce_range == other.reduce_range)
and (self.axis == other.axis)
)
return NotImplemented
@staticmethod
def load_from_dict(
raw_dict: dict[str, Any],
default_qtype: QuantType | None = None,
default_symmetric: bool | None = None,
default_reduce_range: bool | None = None,
) -> QuantTypeInfo:
return QuantTypeInfo(
raw_dict.get("quant_type", default_qtype),
raw_dict.get("symmetric", default_symmetric),
raw_dict.get("reduce_range", default_reduce_range),
raw_dict.get("axis"),
)
def save_to_dict(self, raw_dict: dict[str, Any]):
raw_dict["quant_type"] = self.quant_type
if self.symmetric is not None:
raw_dict["symmetric"] = self.symmetric
if self.reduce_range is not None:
raw_dict["reduce_range"] = self.reduce_range
if self.axis is not None:
raw_dict["axis"] = self.axis
class TensorQuantOverridesHelper(MutableMapping):
"""
Utility wrapper over the tensor quantization overrides passed via extra_options.
"""
def __init__(self, raw_overrides: dict[str, list[dict[str, Any]]]):
self.overrides = raw_overrides
self.quant_types = None
self.keys_unsupported_with_scale_zp = {"symmetric", "reduce_range", "rmax", "rmin"}
def has_per_tensor_overrides(self, tensor_name: str) -> bool:
overrides_list = self.overrides.get(tensor_name)
return overrides_list and "axis" not in overrides_list[0]
def has_per_channel_overrides(self, tensor_name: str) -> bool:
overrides_list = self.overrides.get(tensor_name)
return overrides_list and "axis" in overrides_list[0]
def overrides_scale_zp(self, tensor_name: str) -> bool:
overrides_list = self.overrides.get(tensor_name)
return overrides_list and ("scale" in overrides_list[0]) and ("zero_point" in overrides_list[0])
def get_per_tensor_overrides(
self,
tensor_name: str,
default_val: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
default_list_val = [default_val] if default_val is not None else None
overrides_list = self.overrides.get(tensor_name, default_list_val)
if overrides_list and "axis" in overrides_list[0]:
raise ValueError(
f"Expected tensor '{tensor_name}' to use per-tensor quantization overrides, "
f"but found per-channel overrides."
)
return overrides_list[0] if overrides_list else None
def get_per_channel_overrides(
self,
tensor_name: str,
default_val: list[dict[str, Any]] | None = None,
) -> list[dict[str, Any]] | None:
overrides_list = self.overrides.get(tensor_name, default_val)
if not overrides_list:
return None
if "axis" not in overrides_list[0]:
raise ValueError(
f"Expected tensor '{tensor_name}' to have per-channel quantization overrides (axis value is missing).",
)
return overrides_list
def get_quant_types(self) -> set[QuantType]:
if self.quant_types is not None:
return self.quant_types
self.quant_types = set()
if self.overrides:
for quant_overrides_list in self.overrides.values():
for quant_overrides in quant_overrides_list:
if "quant_type" in quant_overrides:
self.quant_types.add(quant_overrides["quant_type"])
if "convert" in quant_overrides and "quant_type" in quant_overrides["convert"]:
self.quant_types.add(quant_overrides["convert"]["quant_type"])
return self.quant_types
def _is_valid_per_tensor(
self,
initializers,
default_activation_qtype,
tensor_name: str,
quant_overrides: dict[str, Any],
) -> tuple[bool, str | None]:
if not isinstance(quant_overrides, dict):
return (
False,
f"Tensor quantization overrides for '{tensor_name}' are not in a dict",
)
is_initializer = tensor_name in initializers
quant_type = quant_overrides.get("quant_type")
if quant_type:
self.quant_types.add(quant_type)
has_scale = "scale" in quant_overrides
has_zero_point = "zero_point" in quant_overrides
if (has_scale and not has_zero_point) or (has_zero_point and not has_scale):
return (
False,
"Must provide both 'scale' and 'zero_point' if one of the overrides is provided",
)
if has_scale:
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides))
if keys:
return (
False,
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
)
if "reduce_range" in quant_overrides and not is_initializer:
return (
False,
f"Option 'reduce_range' is only supported for initializers, not for activation {tensor_name}",
)
if "convert" in quant_overrides:
if is_initializer:
return False, "Cannot use 'convert' override for initializers"
if "quant_type" not in quant_overrides["convert"]:
return False, f"'convert' options (tensor '{tensor_name}') must specify a 'quant_type'"
if "reduce_range" in quant_overrides["convert"]:
return (
False,
f"Option 'reduce_range' is only supported for initializers, not for activation {tensor_name}",
)
convert_quant_type = quant_overrides["convert"]["quant_type"]
original_quant_type = quant_type if quant_type is not None else default_activation_qtype
if convert_quant_type == original_quant_type:
return (
False,
f"'convert' quant_type must differ from original quant_type (tensor '{tensor_name}')",
)
convert_has_scale = "scale" in quant_overrides["convert"]
convert_has_zero_point = "zero_point" in quant_overrides["convert"]
if (convert_has_scale and not convert_has_zero_point) or (convert_has_zero_point and not convert_has_scale):
return (
False,
f"Must provide both 'scale' and 'zero_point' if one of the overrides is provided (tensor '{tensor_name}')",
)
if convert_has_scale:
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides["convert"]))
if keys:
return (
False,
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point' "
f"(tensor '{tensor_name}')",
)
self.quant_types.add(convert_quant_type)
return True, None
def _is_valid_per_channel(
self,
initializers,
tensor_name: str,
quant_overrides_list: list[dict[str, Any]],
) -> tuple[bool, str | None]:
is_initializer = tensor_name in initializers
if not is_initializer:
return (
False,
f"Tensor '{tensor_name}' has per-channel overrides, but is not an initializer",
)
axis = quant_overrides_list[0].get("axis")
if axis is None:
return (
False,
f"Per-channel overrides for tensor {tensor_name} is missing an 'axis' value in "
"the first channel dictionary.",
)
weight_shape = list(initializers[tensor_name].dims)
weight_rank = len(weight_shape)
norm_axis = axis
if norm_axis < 0:
norm_axis += weight_rank
if norm_axis < 0 or norm_axis >= len(weight_shape):
return (
False,
f"Axis override value is out-of-bounds for tensor {tensor_name} (rank {len(weight_shape)})",
)
if len(quant_overrides_list) > 1 and len(quant_overrides_list) != weight_shape[norm_axis]:
return (
False,
f"Incorrect number of channel overrides for tensor {tensor_name} (axis {axis}), "
f"expected {weight_shape[axis]}, but found {len(quant_overrides_list)}.",
)
if "convert" in quant_overrides_list[0]:
return False, f"Cannot use 'convert' override for initializers, such as {tensor_name}."
quant_type = quant_overrides_list[0].get("quant_type")
if quant_type:
self.quant_types.add(quant_type)
symmetric = quant_overrides_list[0].get("symmetric")
reduce_range = quant_overrides_list[0].get("reduce_range")
has_scale = "scale" in quant_overrides_list[0]
has_zero_point = "zero_point" in quant_overrides_list[0]
has_scale_zp = has_scale and has_zero_point
if (has_scale and not has_zero_point) or (has_zero_point and not has_scale):
return (
False,
"Must provide both 'scale' and 'zero_point' if one of the overrides is provided",
)
if has_scale_zp:
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides_list[0]))
if keys:
return (
False,
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
)
has_rmin = "rmin" in quant_overrides_list[0]
has_rmax = "rmax" in quant_overrides_list[0]
has_rmin_rmax = has_rmin and has_rmax
if (has_rmin and not has_rmax) or (not has_rmin and has_rmax):
return (
False,
"Must provide both 'rmin' and 'rmax' if one is provided",
)
for index, quant_overrides in enumerate(quant_overrides_list[1:]):
if not isinstance(quant_overrides, dict):
return (
False,
f"Tensor quantization overrides at index {index} for '{tensor_name}' are not in a dict",
)
if "convert" in quant_overrides:
return False, f"Cannot use 'convert' override for initializers, such as {tensor_name}."
# For per-channel quantization, all channels must use the same quantization type, axis, symmetric
# and reduce_range values. And, if specified, they must be present in the first channel dict
# (i.e., quant_overrides_list[0]).
if "quant_type" in quant_overrides and quant_type != quant_overrides["quant_type"]:
return (
False,
"Channel quantization types for tensor '{tensor_name}' do not match at index {index}.",
)
if "axis" in quant_overrides and axis != quant_overrides["axis"] and norm_axis != quant_overrides["axis"]:
return (
False,
"Channel axis for tensor '{tensor_name}' does not match at index {index}.",
)
if "symmetric" in quant_overrides and symmetric != quant_overrides["symmetric"]:
return (
False,
"Channel symmetric value for tensor '{tensor_name}' does not match at index {index}.",
)
if "reduce_range" in quant_overrides and reduce_range != quant_overrides["reduce_range"]:
return (
False,
"Channel reduce_range value for tensor '{tensor_name}' does not match at index {index}.",
)
# If override scale/zp, must do so for all channels.
chan_has_scale_zp = "scale" in quant_overrides and "zero_point" in quant_overrides
if has_scale_zp and not chan_has_scale_zp:
return (
False,
"Per-channel overrides that specify scale/zero_point must do so for all channels, "
f"but tensor '{tensor_name}' is missing them at index {index}.",
)
if chan_has_scale_zp:
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides))
if keys:
return (
False,
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
)
# If override rmin/rmax, must do so for all channels.
chan_has_rmin_rmax = "rmin" in quant_overrides and "rmax" in quant_overrides
if has_rmin_rmax and not chan_has_rmin_rmax:
return (
False,
"Per-channel overrides that specify rmin/rmax must do so for all channels, "
f"but tensor '{tensor_name}' is missing them at index {index}.",
)
return True, None
def is_valid(
self,
initializers: dict[str, onnx.TensorProto],
activation_names: set[str],
default_activation_qtype,
) -> tuple[bool, str | None]:
self.quant_types = set()
# Validate that compatible/valid overrides are provided.
if self.overrides:
for tensor_name, quant_overrides_list in self.overrides.items():
if tensor_name not in initializers and tensor_name not in activation_names:
return False, f"Tensor '{tensor_name}' in TensorQuantOverrides is not present in the model"
if not isinstance(quant_overrides_list, list):
return False, f"Tensor quantization overrides for '{tensor_name}' are not in a list"
if not quant_overrides_list:
continue
if not isinstance(quant_overrides_list[0], dict):
return False, f"Tensor quantization overrides at index 0 for '{tensor_name}' are not in a dict"
if not quant_overrides_list[0]:
continue
axis = quant_overrides_list[0].get("axis")
is_per_channel = len(quant_overrides_list) > 1 or axis is not None
if is_per_channel:
valid, err = self._is_valid_per_channel(initializers, tensor_name, quant_overrides_list)
else:
valid, err = self._is_valid_per_tensor(
initializers, default_activation_qtype, tensor_name, quant_overrides_list[0]
)
if not valid:
return valid, err
return True, None
def update_tensor_overrides(
self,
tensor_name: str,
new_vals: dict[str, Any],
channels: list[int] | None = None,
overwrite: bool = True,
) -> bool:
if not new_vals:
return False
channels = set(channels) if channels is not None else None
have_overrides = self.overrides.get(tensor_name)
# If `overwrite` is False, check if we would overwrite anything.
do_update = True
if not overwrite and have_overrides:
for channel, overrides in enumerate(self.overrides[tensor_name]):
if channels is not None and channel not in channels:
continue
if set(new_vals).intersection(set(overrides)):
do_update = False
break
# Do the update if `overwrite` is True or if nothing is overwritten (do not want partial overwrites).
if do_update:
if not have_overrides:
self.overrides[tensor_name] = [{}]
for channel, overrides in enumerate(self.overrides[tensor_name]):
if channels is not None and channel not in channels:
continue
overrides.update(new_vals)
return do_update
def get_node_output_qtype_info(
self,
output_name: str,
default_qtype: QuantType | None,
default_symmetric: bool | None = None,
) -> QuantTypeInfo:
# Outputs are activations, which do not support 'reduce_range' or 'axis'
if output_name not in self.overrides:
return QuantTypeInfo(default_qtype, default_symmetric)
tensor_overrides = self.overrides[output_name][0]
return QuantTypeInfo(
tensor_overrides.get("quant_type", default_qtype),
tensor_overrides.get("symmetric", default_symmetric),
)
def get_node_input_qtype_info(
self,
input_name: str,
node_name: str,
default_qtype: QuantType | None,
default_symmetric: bool | None = None,
default_reduce_range: bool | None = None,
) -> QuantTypeInfo:
if input_name not in self.overrides or not self.overrides[input_name]:
return QuantTypeInfo(default_qtype, default_symmetric, default_reduce_range)
# Get the first overrides dict in the list. This works for both per-tensor and per-channel
# quantization because all channels must use the same quant type.
tensor_overrides = self.overrides[input_name][0]
producer_type = tensor_overrides.get("quant_type", default_qtype)
if "convert" not in tensor_overrides:
return QuantTypeInfo(
producer_type,
tensor_overrides.get("symmetric", default_symmetric),
tensor_overrides.get("reduce_range", default_reduce_range),
tensor_overrides.get("axis"),
)
# This tensor is converted. Check if the node gets the original qtype or the converted qtype.
convert_dict = tensor_overrides["convert"]
qtype_info = QuantTypeInfo(
producer_type,
convert_dict.get("symmetric", default_symmetric),
# Converted tensors are not initializers, so do not have 'axis' or 'reduce_range'.
)
# Check if all nodes receive the converted type (i.e., recv_nodes is None) or this node
# is in the list of consumers (recv_nodes).
if ("recv_nodes" not in convert_dict) or (node_name in convert_dict["recv_nodes"]):
qtype_info.quant_type = convert_dict["quant_type"]
return qtype_info
def pprint_str(self, indent=None) -> str:
return json.dumps(self.overrides, default=str, indent=indent)
def empty(self) -> bool:
return not self.overrides
def get_dict(self) -> dict[str, list[dict[str, Any]]]:
return self.overrides
# Required implementations of abstract methods in collections.abc.MutableMapping
# so that this class can be used like a dict.
def __setitem__(self, key: str, value: list[dict]):
self.overrides[key] = value
def __getitem__(self, key: str) -> list[dict]:
return self.overrides[key]
def __delitem__(self, key: str):
del self.overrides[key]
def __iter__(self):
return iter(self.overrides)
def __len__(self):
return len(self.overrides)
def __str__(self) -> str:
return str(self.overrides)
def __repr__(self) -> str:
return f"{super().__repr__()}, TensorQuantOverridesHelper({self.overrides})"

View File

@@ -0,0 +1,10 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
# appended to the __init__.py in the onnxruntime module's 'tools' folder from /tools/python/util/__init__append.py
import importlib.util
have_torch = importlib.util.find_spec("torch")
if have_torch:
from .pytorch_export_helpers import infer_input_info # noqa: F401

View File

@@ -0,0 +1,47 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import logging
import pathlib
# need this before the mobile helper imports for some reason
logging.basicConfig(format="%(levelname)s: %(message)s")
from .mobile_helpers import usability_checker # noqa: E402
def check_usability():
parser = argparse.ArgumentParser(
description="""Analyze an ONNX model to determine how well it will work in mobile scenarios.""",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("--log_level", choices=["debug", "info"], default="info", help="Logging level")
parser.add_argument("model_path", help="Path to ONNX model to check", type=pathlib.Path)
args = parser.parse_args()
logger = logging.getLogger("check_usability")
if args.log_level == "debug":
logger.setLevel(logging.DEBUG)
elif args.log_level == "info":
logger.setLevel(logging.INFO)
elif args.log_level == "warning":
logger.setLevel(logging.WARNING)
else:
logger.setLevel(logging.ERROR)
try_eps = usability_checker.analyze_model(args.model_path, skip_optimize=False, logger=logger)
if try_eps:
logger.info(
"As NNAPI or CoreML may provide benefits with this model it is recommended to compare the "
"performance of the model using the NNAPI EP on Android, and the CoreML EP on iOS, "
"against the performance using the CPU EP."
)
else:
logger.info("For optimal performance the model should be used with the CPU EP. ")
if __name__ == "__main__":
check_usability()
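# Example invocation (the module path is an assumption; the script is normally exposed via onnxruntime.tools):
#   python -m onnxruntime.tools.check_onnx_model_mobile_usability --log_level debug model.onnx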

View File

@@ -0,0 +1,380 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import annotations
import argparse
import contextlib
import enum
import os
import pathlib
import tempfile
import onnxruntime as ort
from .file_utils import files_from_file_or_dir, path_match_suffix_ignore_case
from .onnx_model_utils import get_optimization_level
from .ort_format_model import create_config_from_models
class OptimizationStyle(enum.Enum):
Fixed = 0
Runtime = 1
def _optimization_suffix(optimization_level_str: str, optimization_style: OptimizationStyle, suffix: str):
return "{}{}{}".format(
f".{optimization_level_str}" if optimization_level_str != "all" else "",
".with_runtime_opt" if optimization_style == OptimizationStyle.Runtime else "",
suffix,
)
def _create_config_file_path(
model_path_or_dir: pathlib.Path,
output_dir: pathlib.Path | None,
optimization_level_str: str,
optimization_style: OptimizationStyle,
enable_type_reduction: bool,
):
config_name = "{}{}".format(
"required_operators_and_types" if enable_type_reduction else "required_operators",
_optimization_suffix(optimization_level_str, optimization_style, ".config"),
)
if model_path_or_dir.is_dir():
return (output_dir or model_path_or_dir) / config_name
model_config_path = model_path_or_dir.with_suffix(f".{config_name}")
if output_dir is not None:
return output_dir / model_config_path.name
return model_config_path
def _create_session_options(
optimization_level: ort.GraphOptimizationLevel,
output_model_path: pathlib.Path,
custom_op_library: pathlib.Path,
session_options_config_entries: dict[str, str],
):
so = ort.SessionOptions()
so.optimized_model_filepath = str(output_model_path)
so.graph_optimization_level = optimization_level
if custom_op_library:
so.register_custom_ops_library(str(custom_op_library))
for key, value in session_options_config_entries.items():
so.add_session_config_entry(key, value)
return so
def _convert(
model_path_or_dir: pathlib.Path,
output_dir: pathlib.Path | None,
optimization_level_str: str,
optimization_style: OptimizationStyle,
custom_op_library: pathlib.Path,
create_optimized_onnx_model: bool,
allow_conversion_failures: bool,
target_platform: str,
session_options_config_entries: dict[str, str],
) -> list[pathlib.Path]:
model_dir = model_path_or_dir if model_path_or_dir.is_dir() else model_path_or_dir.parent
output_dir = output_dir or model_dir
optimization_level = get_optimization_level(optimization_level_str)
def is_model_file_to_convert(file_path: pathlib.Path):
if not path_match_suffix_ignore_case(file_path, ".onnx"):
return False
# ignore any files with an extension of .optimized.onnx which are presumably from previous executions
# of this script
if path_match_suffix_ignore_case(file_path, ".optimized.onnx"):
print(f"Ignoring '{file_path}'")
return False
return True
models = files_from_file_or_dir(model_path_or_dir, is_model_file_to_convert)
if len(models) == 0:
raise ValueError(f"No model files were found in '{model_path_or_dir}'")
providers = ["CPUExecutionProvider"]
# if the optimization level is greater than or equal to 'layout' we manually exclude the NCHWc transformer.
# It's not applicable to ARM devices, and creates a device specific model which won't run on all hardware.
# If someone really really really wants to run it they could manually create an optimized onnx model first,
# or they could comment out this code.
optimizer_filter = None
if (
(optimization_level == ort.GraphOptimizationLevel.ORT_ENABLE_ALL)
or (optimization_level == ort.GraphOptimizationLevel.ORT_ENABLE_LAYOUT)
) and target_platform != "amd64":
optimizer_filter = ["NchwcTransformer"]
converted_models = []
for model in models:
try:
relative_model_path = model.relative_to(model_dir)
(output_dir / relative_model_path).parent.mkdir(parents=True, exist_ok=True)
ort_target_path = (output_dir / relative_model_path).with_suffix(
_optimization_suffix(optimization_level_str, optimization_style, ".ort")
)
if create_optimized_onnx_model:
# Create an ONNX file with the same optimization level that will be used for the ORT format file.
# This allows the ONNX equivalent of the ORT format model to be easily viewed in Netron.
# If runtime optimizations are saved in the ORT format model, there may be some difference in the
# graphs at runtime between the ORT format model and this saved ONNX model.
optimized_target_path = (output_dir / relative_model_path).with_suffix(
_optimization_suffix(optimization_level_str, optimization_style, ".optimized.onnx")
)
so = _create_session_options(
optimization_level, optimized_target_path, custom_op_library, session_options_config_entries
)
if optimization_style == OptimizationStyle.Runtime:
# Limit the optimizations to those that can run in a model with runtime optimizations.
so.add_session_config_entry("optimization.minimal_build_optimizations", "apply")
print(f"Saving optimized ONNX model {model} to {optimized_target_path}")
_ = ort.InferenceSession(
str(model), sess_options=so, providers=providers, disabled_optimizers=optimizer_filter
)
# Load ONNX model, optimize, and save to ORT format
so = _create_session_options(
optimization_level, ort_target_path, custom_op_library, session_options_config_entries
)
so.add_session_config_entry("session.save_model_format", "ORT")
if optimization_style == OptimizationStyle.Runtime:
so.add_session_config_entry("optimization.minimal_build_optimizations", "save")
print(f"Converting optimized ONNX model {model} to ORT format model {ort_target_path}")
_ = ort.InferenceSession(
str(model), sess_options=so, providers=providers, disabled_optimizers=optimizer_filter
)
converted_models.append(ort_target_path)
# orig_size = os.path.getsize(onnx_target_path)
# new_size = os.path.getsize(ort_target_path)
# print("Serialized {} to {}. Sizes: orig={} new={} diff={} new:old={:.4f}:1.0".format(
# onnx_target_path, ort_target_path, orig_size, new_size, new_size - orig_size, new_size / orig_size))
except Exception as e:
print(f"Error converting {model}: {e}")
if not allow_conversion_failures:
raise
print(f"Converted {len(converted_models)}/{len(models)} models successfully.")
return converted_models
def parse_args():
parser = argparse.ArgumentParser(
os.path.basename(__file__),
description="""Convert the ONNX format model/s in the provided directory to ORT format models.
All files with a `.onnx` extension will be processed. For each one, an ORT format model will be created in the
given output directory, if specified, or the same directory.
A configuration file will also be created containing the list of required operators for all
converted models. This configuration file should be used as input to the minimal build via the
`--include_ops_by_config` parameter.
""",
)
parser.add_argument(
"--output_dir",
type=pathlib.Path,
help="Provide an output directory for the converted model/s and configuration file. "
"If unspecified, the converted ORT format model/s will be in the same directory as the ONNX model/s.",
)
parser.add_argument(
"--optimization_style",
nargs="+",
default=[OptimizationStyle.Fixed.name, OptimizationStyle.Runtime.name],
choices=[e.name for e in OptimizationStyle],
help="Style of optimization to perform on the ORT format model. "
"Multiple values may be provided. The conversion will run once for each value. "
"The general guidance is to use models optimized with "
f"'{OptimizationStyle.Runtime.name}' style when using NNAPI or CoreML and "
f"'{OptimizationStyle.Fixed.name}' style otherwise. "
f"'{OptimizationStyle.Fixed.name}': Run optimizations directly before saving the ORT "
"format model. This bakes in any platform-specific optimizations. "
f"'{OptimizationStyle.Runtime.name}': Run basic optimizations directly and save certain "
"other optimizations to be applied at runtime if possible. This is useful when using a "
"compiling EP like NNAPI or CoreML that may run an unknown (at model conversion time) "
"number of nodes. The saved optimizations can further optimize nodes not assigned to the "
"compiling EP at runtime.",
)
parser.add_argument(
"--enable_type_reduction",
action="store_true",
help="Add operator specific type information to the configuration file to potentially reduce "
"the types supported by individual operator implementations.",
)
parser.add_argument(
"--custom_op_library",
type=pathlib.Path,
default=None,
help="Provide path to shared library containing custom operator kernels to register.",
)
parser.add_argument(
"--save_optimized_onnx_model",
action="store_true",
help="Save the optimized version of each ONNX model. "
"This will have the same level of optimizations applied as the ORT format model.",
)
parser.add_argument(
"--allow_conversion_failures",
action="store_true",
help="Whether to proceed after encountering model conversion failures.",
)
parser.add_argument(
"--target_platform",
type=str,
default=None,
choices=["arm", "amd64"],
help="Specify the target platform where the exported model will be used. "
"This parameter can be used to choose between platform-specific options, "
"such as QDQIsInt8Allowed(arm), NCHWc (amd64) and NHWC (arm/amd64) format, different "
"optimizer level options, etc.",
)
parser.add_argument(
"model_path_or_dir",
type=pathlib.Path,
help="Provide path to ONNX model or directory containing ONNX model/s to convert. "
"All files with a .onnx extension, including those in subdirectories, will be "
"processed.",
)
parsed_args = parser.parse_args()
parsed_args.optimization_style = [OptimizationStyle[style_str] for style_str in parsed_args.optimization_style]
return parsed_args
def convert_onnx_models_to_ort(
model_path_or_dir: pathlib.Path,
output_dir: pathlib.Path | None = None,
optimization_styles: list[OptimizationStyle] | None = None,
custom_op_library_path: pathlib.Path | None = None,
target_platform: str | None = None,
save_optimized_onnx_model: bool = False,
allow_conversion_failures: bool = False,
enable_type_reduction: bool = False,
):
if output_dir is not None:
if not output_dir.is_dir():
output_dir.mkdir(parents=True)
output_dir = output_dir.resolve(strict=True)
optimization_styles = optimization_styles or []
# setting optimization level is not expected to be needed by typical users, but it can be set with this
# environment variable
optimization_level_str = os.getenv("ORT_CONVERT_ONNX_MODELS_TO_ORT_OPTIMIZATION_LEVEL", "all")
model_path_or_dir = model_path_or_dir.resolve()
custom_op_library = custom_op_library_path.resolve() if custom_op_library_path else None
if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file():
raise FileNotFoundError(f"Model path '{model_path_or_dir}' is not a file or directory.")
if custom_op_library and not custom_op_library.is_file():
raise FileNotFoundError(f"Unable to find custom operator library '{custom_op_library}'")
session_options_config_entries = {}
if target_platform is not None and target_platform == "arm":
session_options_config_entries["session.qdqisint8allowed"] = "1"
else:
session_options_config_entries["session.qdqisint8allowed"] = "0"
for optimization_style in optimization_styles:
print(
f"Converting models with optimization style '{optimization_style.name}' and level '{optimization_level_str}'"
)
converted_models = _convert(
model_path_or_dir=model_path_or_dir,
output_dir=output_dir,
optimization_level_str=optimization_level_str,
optimization_style=optimization_style,
custom_op_library=custom_op_library,
create_optimized_onnx_model=save_optimized_onnx_model,
allow_conversion_failures=allow_conversion_failures,
target_platform=target_platform,
session_options_config_entries=session_options_config_entries,
)
with contextlib.ExitStack() as context_stack:
if optimization_style == OptimizationStyle.Runtime:
# Convert models again without runtime optimizations.
# Runtime optimizations may not end up being applied, so we need to use both converted models with and
# without runtime optimizations to get a complete set of ops that may be needed for the config file.
model_dir = model_path_or_dir if model_path_or_dir.is_dir() else model_path_or_dir.parent
temp_output_dir = context_stack.enter_context(
tempfile.TemporaryDirectory(dir=model_dir, suffix=".without_runtime_opt")
)
session_options_config_entries_for_second_conversion = session_options_config_entries.copy()
# Limit the optimizations to those that can run in a model with runtime optimizations.
session_options_config_entries_for_second_conversion["optimization.minimal_build_optimizations"] = (
"apply"
)
print(
"Converting models again without runtime optimizations to generate a complete config file. "
"These converted models are temporary and will be deleted."
)
converted_models += _convert(
model_path_or_dir=model_path_or_dir,
output_dir=temp_output_dir,
optimization_level_str=optimization_level_str,
optimization_style=OptimizationStyle.Fixed,
custom_op_library=custom_op_library,
create_optimized_onnx_model=False, # not useful as they would be created in a temp directory
allow_conversion_failures=allow_conversion_failures,
target_platform=target_platform,
session_options_config_entries=session_options_config_entries_for_second_conversion,
)
print(
f"Generating config file from ORT format models with optimization style '{optimization_style.name}' and level '{optimization_level_str}'"
)
config_file = _create_config_file_path(
model_path_or_dir,
output_dir,
optimization_level_str,
optimization_style,
enable_type_reduction,
)
create_config_from_models(converted_models, config_file, enable_type_reduction)
if __name__ == "__main__":
args = parse_args()
convert_onnx_models_to_ort(
args.model_path_or_dir,
output_dir=args.output_dir,
optimization_styles=args.optimization_style,
custom_op_library_path=args.custom_op_library,
target_platform=args.target_platform,
save_optimized_onnx_model=args.save_optimized_onnx_model,
allow_conversion_failures=args.allow_conversion_failures,
enable_type_reduction=args.enable_type_reduction,
)
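# Illustrative programmatic use (paths are hypothetical); this mirrors what the __main__ block
# above does with parsed command-line arguments.
#   convert_onnx_models_to_ort(
#       pathlib.Path("models"),
#       optimization_styles=[OptimizationStyle.Fixed, OptimizationStyle.Runtime],
#       enable_type_reduction=True,
#   )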

View File

@@ -0,0 +1,47 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import annotations
import os
import pathlib
import typing
def path_match_suffix_ignore_case(path: pathlib.Path | str, suffix: str) -> bool:
"""
Returns whether `path` ends in `suffix`, ignoring case.
"""
if not isinstance(path, str):
path = str(path)
return path.casefold().endswith(suffix.casefold())
def files_from_file_or_dir(
file_or_dir_path: pathlib.Path | str, predicate: typing.Callable[[pathlib.Path], bool] = lambda _: True
) -> list[pathlib.Path]:
"""
Gets the files in `file_or_dir_path` satisfying `predicate`.
If `file_or_dir_path` is a file, the single file is considered. Otherwise, all files in the directory are
considered.
:param file_or_dir_path: Path to a file or directory.
:param predicate: Predicate to determine if a file is included.
:return: A list of files.
"""
if not isinstance(file_or_dir_path, pathlib.Path):
file_or_dir_path = pathlib.Path(file_or_dir_path)
selected_files = []
def process_file(file_path: pathlib.Path):
if predicate(file_path):
selected_files.append(file_path)
if file_or_dir_path.is_dir():
for root, _, files in os.walk(file_or_dir_path):
for file in files:
file_path = pathlib.Path(root, file)
process_file(file_path)
else:
process_file(file_or_dir_path)
return selected_files
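# Illustrative use (directory path is hypothetical): collect all .onnx files under a directory,
# reusing the case-insensitive suffix helper defined above.
#   onnx_files = files_from_file_or_dir("models", lambda p: path_match_suffix_ignore_case(p, ".onnx"))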

View File

@@ -0,0 +1,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import logging
def get_logger(name, level=logging.DEBUG):
logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s")
logger = logging.getLogger(name)
logger.setLevel(level)
return logger
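# Illustrative use (logger name is arbitrary):
#   logger = get_logger("usability_checker", logging.INFO)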

View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import annotations
import argparse
import os
import pathlib
import sys
import onnx
from .onnx_model_utils import fix_output_shapes, make_dim_param_fixed, make_input_shape_fixed
def make_dynamic_shape_fixed_helper():
parser = argparse.ArgumentParser(
f"{os.path.basename(__file__)}:{make_dynamic_shape_fixed_helper.__name__}",
description="""
Assign a fixed value to a dim_param or input shape
Provide either dim_param and dim_value or input_name and input_shape.""",
)
parser.add_argument(
"--dim_param", type=str, required=False, help="Symbolic parameter name. Provide dim_value if specified."
)
parser.add_argument(
"--dim_value", type=int, required=False, help="Value to replace dim_param with in the model. Must be > 0."
)
parser.add_argument(
"--input_name",
type=str,
required=False,
help="Model input name to replace shape of. Provide input_shape if specified.",
)
parser.add_argument(
"--input_shape",
type=lambda x: [int(i) for i in x.split(",")],
required=False,
help="Shape to use for input_shape. Provide comma separated list for the shape. "
"All values must be > 0. e.g. --input_shape 1,3,256,256",
)
parser.add_argument("input_model", type=pathlib.Path, help="Provide path to ONNX model to update.")
parser.add_argument("output_model", type=pathlib.Path, help="Provide path to write updated ONNX model to.")
args = parser.parse_args()
if (
(args.dim_param and args.input_name)
or (not args.dim_param and not args.input_name)
or (args.dim_param and (not args.dim_value or args.dim_value < 1))
or (args.input_name and (not args.input_shape or any(value < 1 for value in args.input_shape)))
):
print("Invalid usage.")
parser.print_help()
sys.exit(-1)
model = onnx.load(str(args.input_model.resolve(strict=True)))
if args.dim_param:
make_dim_param_fixed(model.graph, args.dim_param, args.dim_value)
else:
make_input_shape_fixed(model.graph, args.input_name, args.input_shape)
# update the output shapes to make them fixed if possible.
fix_output_shapes(model)
onnx.save(model, str(args.output_model.resolve()))
if __name__ == "__main__":
make_dynamic_shape_fixed_helper()
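# Example invocations (model file names are hypothetical; the module path matches the
# "python -m onnxruntime.tools.make_dynamic_shape_fixed" hint printed by the usability checker):
#   python -m onnxruntime.tools.make_dynamic_shape_fixed --dim_param batch --dim_value 1 model.onnx model.fixed.onnx
#   python -m onnxruntime.tools.make_dynamic_shape_fixed --input_name image --input_shape 1,3,256,256 model.onnx model.fixed.onnx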

View File

@@ -0,0 +1,50 @@
<!--
Keep in sync with doco generated from /docs/execution-providers/CoreML-ExecutionProvider.md on the gh_pages branch
-->
|Operator|Note|
|--------|------|
|ai.onnx:Add||
|ai.onnx:Argmax||
|ai.onnx:AveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
|ai.onnx:Cast||
|ai.onnx:Clip||
|ai.onnx:Concat||
|ai.onnx:Conv|Only 1D/2D Conv is supported.<br/>Bias if provided must be constant.|
|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>output_padding must have default values.|
|ai.onnx:DepthToSpace|If 'mode' is 'CRD' the input must have a fixed shape.|
|ai.onnx:Div||
|ai.onnx:Erf||
|ai.onnx:Gemm|Input B must be constant.|
|ai.onnx:Gelu||
|ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
|ai.onnx:GlobalMaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
|ai.onnx:GridSample|4D input.<br/>'mode' of 'linear' or 'zeros'.<br/>(mode==linear && padding_mode==reflection && align_corners==0) is not supported.|
|ai.onnx:GroupNormalization||
|ai.onnx:InstanceNormalization||
|ai.onnx:LayerNormalization||
|ai.onnx:LeakyRelu||
|ai.onnx:MatMul|Only support for transA == 0, alpha == 1.0 and beta == 1.0 is currently implemented.|
|ai.onnx:MaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
|ai.onnx:Max||
|ai.onnx:Mul||
|ai.onnx:Pow|Only supports cases when both inputs are fp32.|
|ai.onnx:PRelu||
|ai.onnx:Reciprocal|CoreML requires an `epsilon` value (default 1e-4) that ONNX does not provide.|
|ai.onnx:ReduceSum||
|ai.onnx:ReduceMean||
|ai.onnx:ReduceMax||
|ai.onnx:Relu||
|ai.onnx:Reshape||
|ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.|
|ai.onnx:Round||
|ai.onnx:Shape||
|ai.onnx:Slice|starts/ends/axes/steps must be constant initializers.|
|ai.onnx:Split|If provided, `splits` must be constant.|
|ai.onnx:Sub||
|ai.onnx:Sigmoid||
|ai.onnx:Softmax||
|ai.onnx:Sqrt||
|ai.onnx:Squeeze||
|ai.onnx:Tanh||
|ai.onnx:Transpose||
|ai.onnx:Unsqueeze||

View File

@@ -0,0 +1,43 @@
<!--
Keep in sync with doco generated from /docs/execution-providers/CoreML-ExecutionProvider.md on the gh_pages branch
-->
|Operator|Note|
|--------|------|
|ai.onnx:Add||
|ai.onnx:ArgMax||
|ai.onnx:AveragePool|Only 2D Pool is supported.|
|ai.onnx:BatchNormalization||
|ai.onnx:Cast||
|ai.onnx:Clip||
|ai.onnx:Concat||
|ai.onnx:Conv|Only 1D/2D Conv is supported.<br/>Weights and bias should be constant.|
|ai.onnx:DepthToSpace|Only DCR mode DepthToSpace is supported.|
|ai.onnx:Div||
|ai.onnx:Flatten||
|ai.onnx:Gather|Input `indices` with scalar value is not supported.|
|ai.onnx:Gemm|Input B should be constant.|
|ai.onnx:GlobalAveragePool|Only 2D Pool is supported.|
|ai.onnx:GlobalMaxPool|Only 2D Pool is supported.|
|ai.onnx:LeakyRelu||
|ai.onnx:LRN||
|ai.onnx:MatMul|Input B should be constant.|
|ai.onnx:MaxPool|Only 2D Pool is supported.|
|ai.onnx:Mul||
|ai.onnx:Pad|Only constant mode and last two dim padding is supported.<br/>Input pads and constant_value should be constant.<br/>If provided, axes should be constant.|
|ai.onnx:Pow|Only supports cases when both inputs are fp32.|
|ai.onnx:PRelu|Input slope should be constant.<br/>Input slope should either have shape [C, 1, 1] or have 1 element.|
|ai.onnx:Reciprocal||
|ai.onnx:ReduceSum||
|ai.onnx:Relu||
|ai.onnx:Reshape||
|ai.onnx:Resize|4D input.<br/>`coordinate_transformation_mode` == `asymmetric`.<br/>`mode` == `linear` or `nearest`.<br/>`nearest_mode` == `floor`.<br/>`exclude_outside` == false<br/>`scales` or `sizes` must be constant.|
|ai.onnx:Shape|Attribute `start` with non-default value is not supported.<br/>Attribute `end` is not supported.|
|ai.onnx:Sigmoid||
|ai.onnx:Slice|Inputs `starts`, `ends`, `axes`, and `steps` should be constant. Empty slice is not supported.|
|ai.onnx:Softmax||
|ai.onnx:Split|If provided, `splits` must be constant.|
|ai.onnx:Squeeze||
|ai.onnx:Sqrt||
|ai.onnx:Sub||
|ai.onnx:Tanh||
|ai.onnx:Transpose||

View File

@@ -0,0 +1,58 @@
<!--
Keep in sync with doco generated from /docs/execution-providers/NNAPI-ExecutionProvider.md on the gh_pages branch
-->
|Operator|Note|
|--------|------|
|ai.onnx:Abs||
|ai.onnx:Add||
|ai.onnx:AveragePool|Only 2D Pool is supported.|
|ai.onnx:BatchNormalization||
|ai.onnx:Cast||
|ai.onnx:Clip||
|ai.onnx:Concat||
|ai.onnx:Conv|Only 2D Conv is supported.<br/>Weights and bias should be constant.|
|ai.onnx:DepthToSpace|Only DCR mode DepthToSpace is supported.|
|ai.onnx:DequantizeLinear|All quantization scales and zero points should be constant.|
|ai.onnx:Div||
|ai.onnx:Elu||
|ai.onnx:Exp||
|ai.onnx:Flatten||
|ai.onnx:Floor||
|ai.onnx:Gather|Input indices should be constant if not int32 type.|
|ai.onnx:Gemm|If input B is not constant, transB should be 1.|
|ai.onnx:GlobalAveragePool|Only 2D Pool is supported.|
|ai.onnx:GlobalMaxPool|Only 2D Pool is supported.|
|ai.onnx:Identity||
|ai.onnx:LeakyRelu||
|ai.onnx:Log||
|ai.onnx:LRN||
|ai.onnx:MatMul||
|ai.onnx:MaxPool|Only 2D Pool is supported.|
|ai.onnx:Max||
|ai.onnx:Min||
|ai.onnx:Mul||
|ai.onnx:Neg||
|ai.onnx:Pad|Only constant mode Pad is supported.<br/>Input pads and constant_value should be constant.<br/>Input pads values should be non-negative.|
|ai.onnx:Pow||
|ai.onnx:PRelu||
|ai.onnx:QLinearConv|Only 2D Conv is supported.<br/>Weights and bias should be constant.<br/>All quantization scales and zero points should be constant.|
|ai.onnx:QLinearMatMul|All quantization scales and zero points should be constant.|
|ai.onnx:QuantizeLinear|All quantization scales and zero points should be constant.|
|ai.onnx:ReduceMean||
|ai.onnx:Relu||
|ai.onnx:Reshape||
|ai.onnx:Resize|Only 2D Resize is supported.|
|ai.onnx:Sigmoid||
|ai.onnx:Sin||
|ai.onnx:Slice||
|ai.onnx:Softmax||
|ai.onnx:Split|Number of splits must evenly divide split axis size. Input split should be constant if provided.|
|ai.onnx:Sqrt||
|ai.onnx:Squeeze|Input axes should be constant.|
|ai.onnx:Sub||
|ai.onnx:Tanh||
|ai.onnx:Transpose||
|ai.onnx:Unsqueeze|Input axes should be constant.|
|com.microsoft:QLinearAdd|All quantization scales and zero points should be constant.|
|com.microsoft:QLinearAveragePool|Only 2D Pool is supported.<br/>All quantization scales and zero points should be constant.|
|com.microsoft:QLinearSigmoid|All quantization scales and zero points should be constant.|

View File

@@ -0,0 +1,738 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import annotations
import argparse
import logging
import os
import pathlib
import tempfile
from collections import deque
from enum import IntEnum
import onnx
from ..onnx_model_utils import ModelProtoWithShapeInfo, get_producer_consumer_maps, is_fixed_size_tensor, optimize_model
class _SupportedOpsChecker:
"""
Class to process the md file with list of supported ops and caveats for an execution provider.
e.g. /tools/ci_build/github/android/nnapi_supported_ops.md
/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
/tools/ci_build/github/apple/coreml_supported_neuralnetwork_ops.md
"""
def __init__(self, filename):
self._filename = filename
self._ops = {} # op to caveats
self._ops_seen = set()
with open(filename) as f:
for line in f:
# we're looking for a markdown table with 2 columns. first is op name. second is caveats
# op name is domain:op
if line.startswith("|"):
pieces = line.strip().split("|")
if len(pieces) == 4: # pre-first '|'. op, caveat, post-last '|'
domain_op = pieces[1]
caveat = pieces[2]
caveat = caveat.replace("<br/>", " ") # remove some HTML tags
# skip lines that don't have the ':' which separates the domain and op
# e.g. the table header will fail this check
if ":" in domain_op:
self._ops[domain_op] = caveat
def is_op_supported(self, node):
domain = node.domain if node.domain else "ai.onnx"
domain_op = domain + ":" + node.op_type
is_supported = domain_op in self._ops
if is_supported:
self._ops_seen.add(domain_op)
return is_supported
def get_caveats(self):
caveats = []
for op in sorted(self._ops_seen):
caveat = self._ops[op]
if caveat:
caveats.append(f"{op}:{caveat}")
return caveats
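# Illustrative row from the supported-ops tables above and how it is parsed:
#   |ai.onnx:Conv|Only 2D Conv is supported.<br/>Weights and bias should be constant.|
# line.strip().split("|") gives ['', 'ai.onnx:Conv', 'Only 2D Conv is supported. ...', ''] (4 pieces),
# so pieces[1] is the domain:op key and pieces[2] is the caveat.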
class PartitioningInfo:
class TryWithEP(IntEnum):
NO = 0
MAYBE = 1
YES = 2
def __init__(
self,
num_nodes: int,
num_supported_nodes: int,
num_partitions: int,
supported_ops_checker: _SupportedOpsChecker,
supported_groups: list[onnx.NodeProto],
unsupported_ops: set[str],
nodes_unsupported_due_to_op: int,
nodes_unsupported_due_to_dynamic_input: int,
num_unsupported_nodes_due_to_rank: int,
ops_with_unsupported_rank: set[str],
):
self.num_nodes = num_nodes
self.num_supported_nodes = num_supported_nodes
self.num_partitions = num_partitions
self.supported_ops_checker = supported_ops_checker
self.supported_groups = supported_groups
self.unsupported_ops = unsupported_ops
self.nodes_unsupported_due_to_op = nodes_unsupported_due_to_op
self.nodes_unsupported_due_to_dynamic_input = nodes_unsupported_due_to_dynamic_input
self.num_unsupported_nodes_due_to_rank = num_unsupported_nodes_due_to_rank
self.ops_with_unsupported_rank = ops_with_unsupported_rank
self.num_subgraphs = 0
self.num_nodes_in_subgraphs = 0
def merge(self, other: PartitioningInfo):
"""
Merge the information from another PartitioningInfo instance into this one.
"""
self.num_nodes += other.num_nodes
self.num_supported_nodes += other.num_supported_nodes
self.num_partitions += other.num_partitions
self.supported_groups.extend(other.supported_groups)
self.unsupported_ops.update(other.unsupported_ops)
self.nodes_unsupported_due_to_op += other.nodes_unsupported_due_to_op
self.nodes_unsupported_due_to_dynamic_input += other.nodes_unsupported_due_to_dynamic_input
self.num_unsupported_nodes_due_to_rank += other.num_unsupported_nodes_due_to_rank
self.ops_with_unsupported_rank.update(other.ops_with_unsupported_rank)
# hard assumption that we merge into the main graph partitioning info
self.num_subgraphs += 1
self.num_nodes_in_subgraphs += other.num_nodes
def suitability(self):
# semi-arbitrary choices that err on the side of MAYBE.
# having 1 partition is always preferred, but if that is small it may not be useful.
# having 2 partitions may be okay if they cover most nodes
# with more than 2 partitions the device copy cost is almost guaranteed to outweigh the benefit of using the NPU
# NOTE: This assumes the EP is not CPU based and there is device copy overhead to consider
pct_supported = self.num_supported_nodes / self.num_nodes * 100
if self.num_partitions == 1:
if pct_supported > 75:
return PartitioningInfo.TryWithEP.YES
elif pct_supported > 50:
return PartitioningInfo.TryWithEP.MAYBE
else:
return PartitioningInfo.TryWithEP.NO
if self.num_partitions == 2:
if pct_supported > 75:
return PartitioningInfo.TryWithEP.MAYBE
else:
return PartitioningInfo.TryWithEP.NO
return PartitioningInfo.TryWithEP.NO
def print_analysis(self, logger: logging.Logger, ep_name: str):
"""
Analyze the partitioning information and log the analysis
:param logger: Logger to use
:param ep_name: Execution provider name to use in the log messages
"""
logger.info(
f"{self.num_partitions} partitions with a total of {self.num_supported_nodes}/{self.num_nodes} "
f"nodes can be handled by the {ep_name} EP."
)
if self.supported_groups:
logger.info(
f"\tPartition sizes: [{', '.join([str(len(partition)) for partition in self.supported_groups])}]"
)
# dump full groups if debug output is enabled
for group in self.supported_groups:
logger.debug(f"Nodes in group: {','.join([f'{node.op_type}:{node.name}' for node in group])}")
logger.info(f"Unsupported nodes due to operator={self.nodes_unsupported_due_to_op}")
if self.unsupported_ops:
logger.info(f"\tUnsupported ops: {','.join(sorted(self.unsupported_ops))}")
caveats = self.supported_ops_checker.get_caveats()
if caveats:
indent = " " * 5
logger.info(
"\tCaveats that have not been checked and may result in a node not actually being supported: "
f"{''.join([os.linesep + indent + caveat for caveat in caveats])}"
)
if self.nodes_unsupported_due_to_dynamic_input:
logger.info(
"Unsupported nodes due to input having a dynamic shape=%d",
self.nodes_unsupported_due_to_dynamic_input,
)
if self.num_unsupported_nodes_due_to_rank:
logger.info(f"Unsupported nodes due to rank of input data={self.num_unsupported_nodes_due_to_rank}")
logger.info(f"\tOps with unsupported rank: {','.join(sorted(self.ops_with_unsupported_rank))}")
if self.num_subgraphs > 0:
# TODO: CoreML has a flag. NNAPI doesn't. Either should be able to support a subgraph when treated as a
# separate graph (only extra detail would be making sure implicit inputs are handled).
# Merging the subgraph into the parent graph would be more complex.
# e.g. for CoreML we could potentially convert Loop to while_loop and If to cond if the subgraphs in the
# control flow node are fully supported.
# NNAPI also has While and If.
# It most likely will be necessary to support merging in If nodes with fully supported subgraphs,
# as the subgraphs in those are often very simple, so the performance cost of going to the CPU EP and back
# is high.
logger.info(
f"{self.num_nodes_in_subgraphs} nodes are in {self.num_subgraphs} subgraphs. "
"Check EP as to whether subgraphs are supported."
)
pct_nodes_using_ep = self.num_supported_nodes / self.num_nodes * 100
if self.num_partitions == 0:
logger.info(f"{ep_name} cannot run any nodes in this model.")
elif self.num_partitions == 1:
if pct_nodes_using_ep > 75:
logger.info(
f"{ep_name} should work well for this model as there is one partition "
f"covering {pct_nodes_using_ep:.1f}% of the nodes in the model."
)
elif pct_nodes_using_ep > 50:
logger.info(
f"{ep_name} may work well for this model, however only {pct_nodes_using_ep:.1f}% of nodes "
"will use it. Performance testing is required to validate."
)
else:
logger.info(
f"{ep_name} will probably not work will for this model as only {pct_nodes_using_ep:.2f}% "
"of nodes will use it."
)
elif self.num_partitions == 2 and pct_nodes_using_ep > 75:
logger.info(
f"{ep_name} can be considered for this model as there are two partitions "
f"covering {pct_nodes_using_ep:.1f}% of the nodes. "
"Performance testing is required to validate."
)
else:
logger.info(
f"{ep_name} is not recommended with this model as there are {self.num_partitions} partitions "
f"covering {pct_nodes_using_ep:.1f}% of the nodes in the model. "
"This will most likely result in worse performance than just using the CPU EP."
)
def _check_partitioning_for_graph(
graph: onnx.GraphProto,
node_to_producers: dict[onnx.NodeProto, set[onnx.NodeProto]],
node_to_consumers: dict[onnx.NodeProto, set[onnx.NodeProto]],
supported_ops_checker: _SupportedOpsChecker,
outer_scope_initializers: set[str],
require_fixed_input_sizes: bool,
value_info: dict[str, onnx.ValueInfoProto],
max_rank: int = 999, # max rank if EP has a limitation
):
# initializers have fixed sizes.
initializers = [i.name for i in graph.initializer]
def _is_fixed_shape_value(value):
if value in value_info:
return is_fixed_size_tensor(value_info[value])
if value in initializers or value in outer_scope_initializers:
return True
# if something has an unknown shape (e.g. something downstream of a Reshape with dynamic input for the shape)
# it won't have an entry in value_info
return False
#
# Replicate logic from /onnxruntime/core/providers/partitioning_utils.cc:CreateSupportedPartitionNodeGroups
# to roughly estimate number of partitions for nodes that is_node_supported_fn returns true for.
#
# We keep the structure and variable names as close as possible to the C++ implementation to simplify keeping them
# in sync if future updates are needed.
#
# NOTE: CreateSupportedPartitionNodeGroups was recently updated to be QDQ aware so that partitions did not split
# QDQ node groups. This code does not need to be QDQ aware as splitting a QDQ node group does not affect the total
# number of partitions or supported nodes.
#
# we don't currently support a callback for additional group closure checks in the python implementation
on_group_closed_fn = None
supported_groups = []
# number of inputs from unprocessed nodes (in-degree) per node
in_degree = {}
# nodes that are ready to process
nodes_to_process = deque() # deque of Node instances
# nodes that will be processed when considering the next partition node group
nodes_to_process_with_next_group = deque()
# initialize in-degrees and find root nodes
for node in graph.node:
node_input_edge_count = len(node_to_producers[node]) if node in node_to_producers else 0
in_degree[node] = node_input_edge_count
if node_input_edge_count == 0:
# node is only dependent on graph input or initializers
nodes_to_process.append(node)
supported_group = []
# the partition node group's border is the aggregate of its nodes' output nodes
supported_group_border = set()
num_supported_nodes = 0
num_unsupported_nodes_due_to_op = 0
num_unsupported_nodes_due_to_dynamic_input = 0
num_unsupported_nodes_due_to_rank = 0
unsupported_ops = set()
ops_with_unsupported_rank = set()
def close_group():
if supported_group:
keep_partition = not on_group_closed_fn or on_group_closed_fn(supported_group)
if keep_partition:
supported_groups.append(supported_group.copy())
supported_group.clear()
supported_group_border.clear()
while nodes_to_process or nodes_to_process_with_next_group:
if not nodes_to_process:
close_group()
nodes_to_process = nodes_to_process_with_next_group
nodes_to_process_with_next_group = deque()
continue
node = nodes_to_process.popleft()
is_op_supported = supported_ops_checker.is_op_supported(node)
is_input_shape_supported = not require_fixed_input_sizes or all(_is_fixed_shape_value(i) for i in node.input)
is_rank_supported = True
if value_info:
for node_input in node.input:
if node_input and node_input in value_info and value_info[node_input].type.HasField("tensor_type"):
input_rank = len(value_info[node_input].type.tensor_type.shape.dim)
if input_rank > max_rank:
is_rank_supported = False
break
# special-case if we can infer the rank from the length of the 'perms' Transpose attribute
# e.g. this works with SegmentAnything where dynamic Reshape operators result in no shape info.
if node.op_type == "Transpose" and len(node.attribute[0].ints) > max_rank:
is_rank_supported = False
is_node_supported = is_op_supported and is_input_shape_supported and is_rank_supported
if not is_node_supported:
if node in supported_group_border:
# an unsupported node on the border will be processed after the current partition node group
# so skip any additional processing/counting here
nodes_to_process_with_next_group.append(node)
continue
if not is_op_supported:
unsupported_ops.add(f"{node.domain if node.domain else 'ai.onnx'}:{node.op_type}")
num_unsupported_nodes_due_to_op += 1
if not is_input_shape_supported:
num_unsupported_nodes_due_to_dynamic_input += 1
if not is_rank_supported:
num_unsupported_nodes_due_to_rank += 1
ops_with_unsupported_rank.add(f"{node.domain if node.domain else 'ai.onnx'}:{node.op_type}")
if is_node_supported:
num_supported_nodes += 1
# add node to the partition node group
supported_group.append(node)
# remove node from the border and add its outputs to the border
if node in supported_group_border: # noqa: FURB132
supported_group_border.remove(node)
# for each consumer node add to supported_group_border
if node in node_to_consumers:
for consumer in node_to_consumers[node]:
supported_group_border.add(consumer)
# adjust in-degrees of the node outputs and add any new nodes to process
if node in node_to_consumers:
for consumer in node_to_consumers[node]:
consumer_node_in_degree = in_degree[consumer]
consumer_node_in_degree -= 1
if consumer_node_in_degree == 0:
nodes_to_process.append(consumer)
in_degree[consumer] = consumer_node_in_degree
close_group()
num_nodes = len(graph.node)
num_partitions = len(supported_groups)
info = PartitioningInfo(
num_nodes,
num_supported_nodes,
num_partitions,
supported_ops_checker,
supported_groups,
unsupported_ops,
num_unsupported_nodes_due_to_op,
num_unsupported_nodes_due_to_dynamic_input,
num_unsupported_nodes_due_to_rank,
ops_with_unsupported_rank,
)
return info
def check_partitioning(
main_graph: onnx.GraphProto,
supported_ops_checker: _SupportedOpsChecker,
require_fixed_input_sizes: bool,
max_rank: int = 999,
) -> PartitioningInfo:
"""
Estimate the partitions the graph will be split into for nodes that is_node_supported_fn returns true for.
The check on whether a node is supported is purely based on the operator type. Additional limitations
(e.g. NNAPI EP only supports 2D Conv) are not checked, so partitions may not be 100% accurate. The limitations
for operators in the partitions are printed so the user can manually check.
:param main_graph: Graph to process
:param supported_ops_checker: Checker with info on supported ops.
:param require_fixed_input_sizes: If True, require that the inputs to a potentially supported node are fixed size
tensors for it to be considered as supported. This requires
onnx.shape_inference.infer_shapes to have been run on the model to populate the
shape information.
If False, shapes are ignored during the check.
:param max_rank: Set if EP has a limitation on the rank of tensors it supports.
:return PartitioningInfo instance with details
"""
if require_fixed_input_sizes and len(main_graph.value_info) == 0 and len(main_graph.node) > 1:
raise ValueError("Run onnx.shape_inference.infer_shapes on the model to populate the shape information.")
# create lookup map from ValueInfo for efficiency
def _update_value_info(graph: onnx.GraphProto, value_to_shape: dict[str, onnx.ValueInfoProto]):
for v in graph.input:
value_to_shape[v.name] = v
for v in graph.output:
value_to_shape[v.name] = v
for v in graph.value_info:
value_to_shape[v.name] = v
# the producer/consumer maps are for the entire model
node_to_producers, node_to_consumers = get_producer_consumer_maps(main_graph)
def _check_graph(
graph: onnx.GraphProto,
outer_scope_value_info: dict[str, onnx.ValueInfoProto] | None,
outer_scope_initializers: set[str] | None = None,
partitioning_info: PartitioningInfo | None = None,
) -> PartitioningInfo:
if outer_scope_value_info is not None:
# extend value info if we're using it. we replace any value shadowed with a local one
value_info = outer_scope_value_info.copy()
_update_value_info(graph, value_info)
else:
value_info = {}
if outer_scope_initializers is None:
outer_scope_initializers = set()
info = _check_partitioning_for_graph(
graph,
node_to_producers,
node_to_consumers,
supported_ops_checker,
outer_scope_initializers,
require_fixed_input_sizes,
value_info,
max_rank,
)
if partitioning_info:
# merge in subgraph info
partitioning_info.merge(info)
else:
# main graph info
partitioning_info = info
# setup outer scope initializers. we copy the input set as a model may have multiple subgraphs
# on multiple levels, so we need to keep the set for each descent separate
subgraph_outer_scope_initializers = set(outer_scope_initializers)
for initializer in graph.initializer:
subgraph_outer_scope_initializers.add(initializer.name)
for node in graph.node:
# recurse into nodes with subgraphs
for attr in node.attribute:
if attr.HasField("g"):
subgraph = attr.g
partitioning_info = _check_graph(
subgraph, value_info, subgraph_outer_scope_initializers, partitioning_info
)
return partitioning_info
aggregated_partitioning_info = _check_graph(main_graph, {} if require_fixed_input_sizes else None)
return aggregated_partitioning_info
def _check_ep_partitioning(
model: onnx.ModelProto, supported_ops_config: pathlib.Path, require_fixed_input_sizes: bool, max_rank: int = 999
):
supported_ops = _SupportedOpsChecker(supported_ops_config)
partition_info = check_partitioning(model.graph, supported_ops, require_fixed_input_sizes, max_rank)
return partition_info
def check_nnapi_partitions(model, require_fixed_input_sizes: bool):
# if we're running in the ORT python package the file should be local. otherwise assume we're running from the
# ORT repo
script_dir = pathlib.Path(__file__).parent
local_config = script_dir / "nnapi_supported_ops.md"
if local_config.exists():
config_path = local_config
else:
ort_root = script_dir.parents[3]
config_path = ort_root / "tools" / "ci_build" / "github" / "android" / "nnapi_supported_ops.md"
return _check_ep_partitioning(model, config_path, require_fixed_input_sizes)
def check_coreml_partitions(model: onnx.ModelProto, require_fixed_input_sizes: bool, config_filename: str):
# if we're running in the ORT python package the file should be local. otherwise assume we're running from the
# ORT repo
script_dir = pathlib.Path(__file__).parent
local_config = script_dir / config_filename
if local_config.exists():
config_path = local_config
else:
ort_root = script_dir.parents[3]
config_path = ort_root / "tools" / "ci_build" / "github" / "apple" / config_filename
max_rank = 5
return _check_ep_partitioning(model, config_path, require_fixed_input_sizes, max_rank)
def check_shapes(graph: onnx.GraphProto, logger: logging.Logger | None = None):
"""
Check the shapes of graph inputs, values and graph outputs to determine if they have static or dynamic sizes.
NNAPI does not support dynamically sized values. CoreML does, but it will most likely cost performance.
:param graph: Graph to check. If shape inferencing has been run the checks on values will be meaningful.
:param logger: Optional logger for diagnostic information.
:return: Tuple of List of inputs with dynamic shapes, Number of dynamic values found
"""
# it's OK if the input is dynamically sized and we do a Resize early to a fixed size.
# it's not good if lots of ops have dynamic inputs
num_fixed_values = 0
num_dynamic_values = 0
dynamic_inputs = []
for i in graph.input:
if not is_fixed_size_tensor(i):
dynamic_inputs.append(i)
# split/join to remove repeated whitespace and newlines from str(i)
if logger:
logger.info(f"Input is not a fixed size tensor: {' '.join(str(i).split())}")
num_dynamic_values += 1
else:
num_fixed_values += 1
dynamic_outputs = []
for o in graph.output:
if not is_fixed_size_tensor(o):
dynamic_outputs.append(o)
if logger:
logger.info(f"Output is not a fixed size tensor: {' '.join(str(o).split())}")
num_dynamic_values += 1
else:
num_fixed_values += 1
# check we have value info.
# special case some test graphs with a single node which only have graph input and output values, and
# a model where all inputs are dynamic (results in no value_info)
if not graph.value_info and not (len(graph.node) == 1 or len(dynamic_inputs) == len(graph.input)):
if logger:
logger.warning(
"Unable to check shapes within model. ONNX shape inferencing should be run on the model prior to checking."
)
for vi in graph.value_info:
if is_fixed_size_tensor(vi):
num_fixed_values += 1
else:
num_dynamic_values += 1
if logger:
logger.info(
f"Num values with fixed shape={num_fixed_values}. Num values with dynamic shape={num_dynamic_values}"
)
if dynamic_inputs and logger:
if dynamic_outputs:
logger.info(
"Model has dynamic inputs and outputs. Consider re-exporting model with fixed sizes "
"if NNAPI or CoreML can be used with this model."
)
else:
logger.info(
"""Model has dynamically sized inputs but fixed sized outputs.
If the sizes become fixed early in the model (e.g. pre-processing of a dynamic input size
results in a fixed input size for the majority of the model) performance with NNAPI and CoreML,
if applicable, should not be significantly impacted."""
)
return dynamic_inputs, num_dynamic_values
def checker(model_path: pathlib.Path, logger: logging.Logger):
model_with_shape_info_wrapper = ModelProtoWithShapeInfo(model_path)
model_with_shape_info = model_with_shape_info_wrapper.model_with_shape_info
dynamic_inputs, num_dynamic_values = check_shapes(model_with_shape_info.graph)
def check_ep(ep_name, checker_func):
logger.info(f"Checking {ep_name}")
# check with shape info first so supported nodes takes into account values with dynamic shapes
require_fixed_input_sizes = True
partition_info = checker_func(model_with_shape_info, require_fixed_input_sizes)
if logger.getEffectiveLevel() <= logging.INFO:
partition_info.print_analysis(logger, ep_name)
suitability = partition_info.suitability()
logger.info(f"Model should perform well with {ep_name} as is: {suitability.name}")
if suitability != PartitioningInfo.TryWithEP.YES and dynamic_inputs:
logger.info("--------")
logger.info("Checking if model will perform better if the dynamic shapes are fixed...")
require_fixed_input_sizes = False
partition_info_with_fixed_shapes = checker_func(model_with_shape_info, require_fixed_input_sizes)
if logger.getEffectiveLevel() <= logging.INFO:
# analyze and log detailed info
logger.info("Partition information if the model was updated to make the shapes fixed:")
partition_info_with_fixed_shapes.print_analysis(logger, ep_name)
fixed_shape_suitability = partition_info_with_fixed_shapes.suitability()
logger.info(
f"Model should perform well with {ep_name} if modified to have fixed input shapes: "
f"{fixed_shape_suitability.name}"
)
if fixed_shape_suitability != PartitioningInfo.TryWithEP.NO:
logger.info("Shapes can be altered using python -m onnxruntime.tools.make_dynamic_shape_fixed")
if fixed_shape_suitability.value > suitability.value:
suitability = fixed_shape_suitability
logger.info("================")
logger.info("")
return suitability
nnapi_suitability = check_ep("NNAPI", check_nnapi_partitions)
# Check for NeuralNetwork CoreML model
def check_nn_coreml(model: onnx.ModelProto, require_fixed_input_sizes):
return check_coreml_partitions(model, require_fixed_input_sizes, "coreml_supported_neuralnetwork_ops.md")
# Check for MLProgram CoreML model
def check_mlprogram_coreml(model: onnx.ModelProto, require_fixed_input_sizes):
return check_coreml_partitions(model, require_fixed_input_sizes, "coreml_supported_mlprogram_ops.md")
coreml_nn_suitability = check_ep("CoreML NeuralNetwork", check_nn_coreml)
coreml_mlprogram_suitability = check_ep("CoreML MLProgram", check_mlprogram_coreml)
if (
nnapi_suitability != PartitioningInfo.TryWithEP.YES
or coreml_nn_suitability != PartitioningInfo.TryWithEP.YES
or coreml_mlprogram_suitability != PartitioningInfo.TryWithEP.YES
) and logger.getEffectiveLevel() > logging.INFO:
logger.info("Re-run with log level of INFO for more details on the NNAPI/CoreML issues.")
return (
nnapi_suitability != PartitioningInfo.TryWithEP.NO
or coreml_nn_suitability != PartitioningInfo.TryWithEP.NO
or coreml_mlprogram_suitability != PartitioningInfo.TryWithEP.NO
)
def analyze_model(model_path: pathlib.Path, skip_optimize: bool = False, logger: logging.Logger | None = None):
"""
Analyze the provided model to determine if it's likely to work well with the NNAPI or CoreML Execution Providers
:param model_path: Model to analyze.
:param skip_optimize: Skip optimizing to BASIC level before checking. When exporting to ORT format we will do this
optimization.
:param logger: Logger for output
:return: True if either the NNAPI or CoreML Execution Providers may work well with this model.
"""
if not logger:
logger = logging.getLogger("usability_checker")
logger.setLevel(logging.INFO)
logger.info(f"Checking {model_path} for usability with ORT Mobile.")
with tempfile.TemporaryDirectory() as tmp:
if not skip_optimize:
tmp_path = pathlib.Path(tmp) / model_path.name
optimize_model(model_path, tmp_path, use_external_initializers=True)
model_path = tmp_path
try_eps = checker(model_path.resolve(strict=True), logger)
return try_eps
def parse_args():
parser = argparse.ArgumentParser(
os.path.basename(__file__), description="""Analyze an ONNX model for usability with ORT Mobile."""
)
parser.add_argument("--log_level", choices=["debug", "info"], default="info", help="Logging level")
parser.add_argument(
"--skip_optimize",
action="store_true",
help="Don't optimize the model to BASIC level prior to analyzing. "
"Optimization will occur when exporting the model to ORT format, so in general "
"should not be skipped unless you have a specific reason to do so.",
)
parser.add_argument("model_path", type=pathlib.Path, help="Provide path to ONNX model")
return parser.parse_args()
def run_analyze_model():
args = parse_args()
logger = logging.getLogger("default")
if args.log_level == "debug":
logger.setLevel(logging.DEBUG)
elif args.log_level == "info":
logger.setLevel(logging.INFO)
elif args.log_level == "warning":
logger.setLevel(logging.WARNING)
else:
logger.setLevel(logging.ERROR)
model_path = args.model_path.resolve()
analyze_model(model_path, args.skip_optimize, logger)
if __name__ == "__main__":
run_analyze_model()
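# --- Hypothetical usage sketch, not part of the original file; the script name and model path are placeholders. ---
# The checks above can be driven from the command line or programmatically, e.g.:
#
#   python usability_checker.py --log_level info /path/to/model.onnx
#
#   logger = logging.getLogger("usability_checker")
#   logger.setLevel(logging.INFO)
#   nnapi_or_coreml_may_help = analyze_model(pathlib.Path("model.onnx"), skip_optimize=False, logger=logger)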

View File

@@ -0,0 +1,169 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import copy
import json
import sys
from collections import OrderedDict
from pprint import pprint
from typing import Any
import onnx
TuningResults = dict[str, Any]
_TUNING_RESULTS_KEY = "tuning_results"
def _find_tuning_results_in_props(metadata_props):
for idx, prop in enumerate(metadata_props):
if prop.key == _TUNING_RESULTS_KEY:
return idx
return -1
def extract(model: onnx.ModelProto):
idx = _find_tuning_results_in_props(model.metadata_props)
if idx < 0:
return None
tuning_results_prop = model.metadata_props[idx]
return json.loads(tuning_results_prop.value)
def embed(model: onnx.ModelProto, tuning_results: list[TuningResults], overwrite=False):
idx = _find_tuning_results_in_props(model.metadata_props)
assert overwrite or idx < 0, "the supplied onnx file already has tuning results embedded!"
if idx >= 0:
model.metadata_props.pop(idx)
entry = model.metadata_props.add()
entry.key = _TUNING_RESULTS_KEY
entry.value = json.dumps(tuning_results)
return model
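# --- Hypothetical usage sketch, not part of the original file; paths are placeholders. ---
# Round trip using extract() and embed() defined above:
#
#   model = onnx.load_model("model_with_results.onnx")
#   tuning_results = extract(model)                       # None if nothing is embedded
#   model = embed(model, tuning_results, overwrite=True)
#   onnx.save_model(model, "model_updated.onnx")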
class Merger:
class EpAndValidators:
def __init__(self, ep: str, validators: dict[str, str]):
self.ep = ep
self.validators = copy.deepcopy(validators)
self.key = (ep, tuple(sorted(validators.items())))
def __hash__(self):
return hash(self.key)
def __eq__(self, other):
return self.ep == other.ep and self.key == other.key
def __init__(self):
self.ev_to_results = OrderedDict()
def merge(self, tuning_results: list[TuningResults]):
for trs in tuning_results:
self._merge_one(trs)
def get_merged(self):
tuning_results = []
for ev, flat_results in self.ev_to_results.items():
results = {}
trs = {
"ep": ev.ep,
"validators": ev.validators,
"results": results,
}
for (op_sig, params_sig), kernel_id in flat_results.items():
kernel_map = results.setdefault(op_sig, {})
kernel_map[params_sig] = kernel_id
tuning_results.append(trs)
return tuning_results
def _merge_one(self, trs: TuningResults):
ev = Merger.EpAndValidators(trs["ep"], trs["validators"])
flat_results = self.ev_to_results.setdefault(ev, {})
for op_sig, kernel_map in trs["results"].items():
for params_sig, kernel_id in kernel_map.items():
if (op_sig, params_sig) not in flat_results:
flat_results[(op_sig, params_sig)] = kernel_id
def parse_args():
parser = argparse.ArgumentParser()
sub_parsers = parser.add_subparsers(help="Command to execute", dest="cmd")
extract_parser = sub_parsers.add_parser("extract", help="Extract embedded tuning results from an onnx file.")
extract_parser.add_argument("input_onnx")
extract_parser.add_argument("output_json")
embed_parser = sub_parsers.add_parser("embed", help="Embed the tuning results into an onnx file.")
embed_parser.add_argument("--force", "-f", action="store_true", help="Overwrite the tuning results if it existed.")
embed_parser.add_argument("output_onnx", help="Path of the output onnx file.")
embed_parser.add_argument("input_onnx", help="Path of the input onnx file.")
embed_parser.add_argument("input_json", nargs="+", help="Path(s) of the tuning results file(s) to be embedded.")
merge_parser = sub_parsers.add_parser("merge", help="Merge multiple tuning results files as a single one.")
merge_parser.add_argument("output_json", help="Path of the output tuning results file.")
merge_parser.add_argument("input_json", nargs="+", help="Paths of the tuning results files to be merged.")
pprint_parser = sub_parsers.add_parser("pprint", help="Pretty print the tuning results.")
pprint_parser.add_argument("json_or_onnx", help="A tuning results json file or an onnx file.")
args = parser.parse_args()
if not args.cmd:
parser.print_help()
sys.exit(-1)
return args
def main():
args = parse_args()
if args.cmd == "extract":
tuning_results = extract(onnx.load_model(args.input_onnx))
if tuning_results is None:
sys.stderr.write(f"{args.input_onnx} does not have tuning results embedded!\n")
sys.exit(-1)
json.dump(tuning_results, open(args.output_json, "w")) # noqa: SIM115
elif args.cmd == "embed":
model = onnx.load_model(args.input_onnx)
merger = Merger()
for tuning_results in [json.load(open(f)) for f in args.input_json]: # noqa: SIM115
merger.merge(tuning_results)
model = embed(model, merger.get_merged(), args.force)
onnx.save_model(model, args.output_onnx)
elif args.cmd == "merge":
merger = Merger()
for tuning_results in [json.load(open(f)) for f in args.input_json]: # noqa: SIM115
merger.merge(tuning_results)
json.dump(merger.get_merged(), open(args.output_json, "w")) # noqa: SIM115
elif args.cmd == "pprint":
tuning_results = None
try: # noqa: SIM105
tuning_results = json.load(open(args.json_or_onnx)) # noqa: SIM115
except Exception:
# it might be an onnx file instead; try that later
pass
if tuning_results is None:
try:
model = onnx.load_model(args.json_or_onnx)
tuning_results = extract(model)
if tuning_results is None:
sys.stderr.write(f"{args.input_onnx} does not have tuning results embedded!\n")
sys.exit(-1)
except Exception:
pass
if tuning_results is None:
sys.stderr.write(f"{args.json_or_onnx} is not a valid tuning results file or onnx file!")
sys.exit(-1)
pprint(tuning_results)
else:
# invalid choice will be handled by the parser
pass
if __name__ == "__main__":
main()
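# --- Hypothetical command line sketch, not part of the original file; the script name and paths are placeholders. ---
#
#   python tuning_results_tool.py extract model_with_results.onnx tuning_results.json
#   python tuning_results_tool.py merge merged.json results_a.json results_b.json
#   python tuning_results_tool.py embed -f output.onnx input.onnx merged.json
#   python tuning_results_tool.py pprint output.onnx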

View File

@@ -0,0 +1,416 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import annotations
import logging
import pathlib
import onnx
from onnx import version_converter
import onnxruntime as ort
def iterate_graph_per_node_func(graph, per_node_func, **func_args):
"""
Iterate the graph including subgraphs calling the per_node_func for each node.
:param graph: Graph to iterate
:param per_node_func: Function to call for each node. Signature is fn(node: onnx:NodeProto, **kwargs)
:param func_args: The keyword args to pass through.
"""
for node in graph.node:
per_node_func(node, **func_args)
# recurse into subgraph for control flow nodes (Scan/Loop/If)
for attr in node.attribute:
if attr.HasField("g"):
iterate_graph_per_node_func(attr.g, per_node_func, **func_args)
def iterate_graph_per_graph_func(graph, per_graph_func, **func_args):
"""
Iterate the graph including subgraphs calling the per_graph_func for each Graph.
:param graph: Graph to iterate
:param per_graph_func: Function to call for each graph. Signature is fn(graph: onnx:GraphProto, **kwargs)
:param func_args: The keyword args to pass through.
"""
per_graph_func(graph, **func_args)
for node in graph.node:
# recurse into subgraph for control flow nodes (Scan/Loop/If)
for attr in node.attribute:
if attr.HasField("g"):
iterate_graph_per_graph_func(attr.g, per_graph_func, **func_args)
def get_opsets_imported(model: onnx.ModelProto):
"""
Get the opsets imported by the model
:param model: Model to check.
:return: Map of domain to opset.
"""
opsets = {}
for entry in model.opset_import:
# if empty it's ai.onnx
domain = entry.domain or "ai.onnx"
opsets[domain] = entry.version
return opsets
def update_onnx_opset(
model_path: pathlib.Path,
opset: int,
out_path: pathlib.Path | None = None,
logger: logging.Logger | None = None,
):
"""
Helper to update the opset of a model using onnx version_converter. Target opset must be greater than current opset.
:param model_path: Path to model to update
:param opset: Opset to update model to
:param out_path: Optional output path for updated model to be saved to.
:param logger: Optional logger for diagnostic output
:returns: Updated onnx.ModelProto
"""
model_path_str = str(model_path.resolve(strict=True))
if logger:
logger.info("Updating %s to opset %d", model_path_str, opset)
model = onnx.load(model_path_str)
new_model = version_converter.convert_version(model, opset)
if out_path:
onnx.save(new_model, str(out_path))
if logger:
logger.info("Saved updated model to %s", out_path)
return new_model
def optimize_model(
model_path: pathlib.Path,
output_path: pathlib.Path,
level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
log_level: int = 3,
use_external_initializers: bool = False,
):
"""
Optimize an ONNX model using ONNX Runtime to the specified level
:param model_path: Path to ONNX model
:param output_path: Path to save optimized model to.
:param level: onnxruntime.GraphOptimizationLevel to use. Default is ORT_ENABLE_BASIC.
:param log_level: Log level. Defaults to Error (3) so we don't get output about unused initializers being removed.
Warning (2) or Info (1) may be desirable in some scenarios.
:param use_external_initializers: Set flag to write initializers to an external file. Required if model > 2GB.
Requires onnxruntime 1.17+
"""
so = ort.SessionOptions()
so.optimized_model_filepath = str(output_path.resolve())
so.graph_optimization_level = level
so.log_severity_level = log_level
# save using external initializers so models > 2 GB are handled
if use_external_initializers:
major, minor, rest = ort.__version__.split(".", 3)
if (int(major), int(minor)) >= (1, 17):
so.add_session_config_entry("session.optimized_model_external_initializers_file_name", "external_data.pb")
else:
raise ValueError(
"ONNX Runtime 1.17 or higher required to save initializers as external data when optimizing model. "
f"Current ONNX Runtime version is {ort.__version__}"
)
# create session to optimize. this will write the updated model to output_path
_ = ort.InferenceSession(str(model_path.resolve(strict=True)), so, providers=["CPUExecutionProvider"])
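# --- Hypothetical usage sketch, not part of the original file; paths are placeholders. ---
#
#   optimize_model(pathlib.Path("model.onnx"), pathlib.Path("model.basic.onnx"),
#                  level=ort.GraphOptimizationLevel.ORT_ENABLE_BASIC)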
def _replace_symbolic_dim_value(graph: onnx.GraphProto, **kwargs):
param_to_replace = kwargs["dim_param"]
value = kwargs["value"]
def update_dim_values(value_infos):
for vi in value_infos:
if vi.type.HasField("tensor_type"):
shape = vi.type.tensor_type.shape
if shape:
for dim in shape.dim:
if dim.HasField("dim_param") and dim.dim_param == param_to_replace:
dim.Clear()
dim.dim_value = value
update_dim_values(graph.input)
update_dim_values(graph.output)
update_dim_values(graph.value_info)
def _remove_invalid_dim_values_impl(graph: onnx.GraphProto):
def clear_invalid_values(value):
if value.type.HasField("tensor_type"):
shape = value.type.tensor_type.shape
if shape:
for dim in shape.dim:
if dim.HasField("dim_value") and dim.dim_value < 1:
dim.Clear()
for i in graph.input:
clear_invalid_values(i)
for o in graph.output:
clear_invalid_values(o)
for vi in graph.value_info:
clear_invalid_values(vi)
def remove_invalid_dim_values(graph: onnx.GraphProto):
"""
Iterate the graph and subgraphs, unsetting any dim_value entries that have a value of less than 1.
These are typically erroneously inserted by a converter to represent a dynamic dimension.
:param graph: GraphProto to update
"""
iterate_graph_per_graph_func(graph, _remove_invalid_dim_values_impl)
def make_dim_param_fixed(graph: onnx.GraphProto, param_name: str, value: int):
"""
Iterate all values in the graph, replacing dim_param in a tensor shape with the provided value.
:param graph: GraphProto to update
:param param_name: dim_param to set
:param value: value to use
"""
iterate_graph_per_graph_func(graph, _replace_symbolic_dim_value, dim_param=param_name, value=value)
def make_input_shape_fixed(graph: onnx.GraphProto, input_name: str, fixed_shape: list[int]):
"""
Update the named graph input to set shape to the provided value. This can be used to set unknown dims as well
as to replace dim values.
If setting the input shape replaces a dim_param, update any other values in the graph that use the dim_param.
:param graph: Graph to update
:param input_name: Name of graph input to update.
:param fixed_shape: Shape to use.
"""
# remove any invalid dim values first. typically this is a dim_value of -1.
remove_invalid_dim_values(graph)
for i in graph.input:
if i.name == input_name:
if not i.type.HasField("tensor_type"):
raise ValueError(f"Input {input_name} is not a tensor")
# graph inputs are required to have a shape to provide the rank
shape = i.type.tensor_type.shape
if len(shape.dim) != len(fixed_shape):
raise ValueError(f"Rank mismatch. Existing:{len(shape.dim)} Replacement:{len(fixed_shape)}")
for idx, dim in enumerate(shape.dim):
# check any existing fixed dims match
if dim.HasField("dim_value"):
if dim.dim_value != fixed_shape[idx]:
raise ValueError(
f"Can't replace existing fixed size of {dim.dim_value} with {fixed_shape[idx]} "
f"for dimension {idx + 1}"
)
elif dim.HasField("dim_param"):
# replacing a dim_param so have to do that through the entire graph
make_dim_param_fixed(graph, dim.dim_param, fixed_shape[idx])
else:
# replacing an unknown dim
dim.Clear()
dim.dim_value = fixed_shape[idx]
return
raise ValueError(
f"Input {input_name} was not found in graph inputs. "
f"Valid input names are: {','.join([i.name for i in graph.input])}"
)
def fix_output_shapes(model: onnx.ModelProto):
"""
Update the output shapes of a model where the input shape/s were made fixed, if possible.
This is mainly to make the model usage clearer if the output shapes can be inferred from the new input shapes.
:param model: Model that had input shapes fixed.
"""
# get a version of the model with shape inferencing info in it. this will provide fixed output shapes if possible.
m2 = onnx.shape_inference.infer_shapes(model)
onnx.checker.check_model(m2)
for idx, o in enumerate(model.graph.output):
if not is_fixed_size_tensor(o):
new_o = m2.graph.output[idx]
if is_fixed_size_tensor(new_o):
o.type.tensor_type.shape.CopyFrom(new_o.type.tensor_type.shape)
def _create_producer_consumer_link(
node_to_producers: dict, node_to_consumers: dict, producer: onnx.NodeProto, consumer: onnx.NodeProto
):
"""
Create links between two nodes for a value produced by one and consumed by the other.
:param node_to_producers: Map of NodeProto to set of nodes that produce values the node consumes as inputs.
:param node_to_consumers: Map of NodeProto to set of nodes that consume values the node produces as outputs.
:param producer: Producer node
:param consumer: Consumer node
"""
if consumer not in node_to_producers:
node_to_producers[consumer] = set()
if producer not in node_to_consumers:
node_to_consumers[producer] = set()
# add entry mapping this node to the producer of this input
node_to_producers[consumer].add(producer)
node_to_consumers[producer].add(consumer)
def _map_node_dependencies(graph: onnx.GraphProto, node_to_producers: dict, node_to_consumers: dict):
graph_inputs = {i.name for i in graph.input}
initializers = {i.name for i in graph.initializer}
# map of value name to node that creates it. copy parent values but override if values get shadowed
producers = {}
implicit_inputs = set()
def is_local_value(value):
return value in producers or value in initializers or value in graph_inputs
for node in graph.node:
inputs = list(node.input)
for attr in node.attribute:
if attr.HasField("g"):
subgraph_implicit_inputs = _map_node_dependencies(attr.g, node_to_producers, node_to_consumers)
inputs += subgraph_implicit_inputs
for i in inputs:
if not i:
# missing optional input
continue
if is_local_value(i):
if i in producers:
producer = producers[i]
_create_producer_consumer_link(node_to_producers, node_to_consumers, producer, node)
else:
implicit_inputs.add(i)
for o in node.output:
producers[o] = node
return implicit_inputs
def get_producer_consumer_maps(graph: onnx.GraphProto):
"""
Get maps for connections between the node that produces each value and the nodes that consume the value.
Processing includes subgraphs. As the map key is a Node instance from the Graph there should be no ambiguity.
:param graph: Graph to process.
:return: Tuple with two maps.
First is node_to_producers map of a node to set of all nodes producing input it consumes.
Second is node_to_consumers map of a node to set of all nodes consuming output it creates.
e.g. NodeA and NodeB provide inputs to NodeC. NodeC provides input to NodeD
node_to_consumers[NodeA] = set([NodeC])
node_to_consumers[NodeB] = set([NodeC])
node_to_producers[NodeC] = set([NodeA, NodeB])
node_to_consumers[NodeC] = set([NodeD])
node_to_producers[NodeD] = set([NodeC])
"""
# use a hash of the object id for NodeProto.
# we need this for the partitioning checker where we keep maps with nodes as the key.
onnx.NodeProto.__hash__ = lambda self: id(self)
node_to_producers = {} # map of node instance to nodes producing input values it consumes
node_to_consumers = {} # map of node instance to nodes consuming output values it produces
implicit_inputs = _map_node_dependencies(graph, node_to_producers, node_to_consumers)
# top level graph should have no implicit inputs
if implicit_inputs:
raise ValueError(
f"This appears to be an invalid model with missing inputs of {','.join(sorted(implicit_inputs))}"
)
return node_to_producers, node_to_consumers
def is_fixed_size_tensor(value: onnx.ValueInfoProto):
"""
Check if value is a tensor with a fixed shape.
:param value: onnx.ValueInfoProto to check
:return: True if value is a tensor, with a shape, where all dimensions have fixed values.
"""
is_fixed = False
if value.type.HasField("tensor_type"):
shape = value.type.tensor_type.shape
if shape:
is_fixed = True # scalar has no dims so set to True and unset if we hit a dim without a valid value
for dim in shape.dim:
if dim.HasField("dim_value") and dim.dim_value > 0:
continue
# anything else means it's a dynamic value
is_fixed = False
break
return is_fixed
def get_optimization_level(level):
"""Convert string to GraphOptimizationLevel."""
if level == "disable":
return ort.GraphOptimizationLevel.ORT_DISABLE_ALL
if level == "basic":
# Constant folding and other optimizations that only use ONNX operators
return ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
if level == "extended":
# Optimizations using custom operators, excluding NCHWc and NHWC layout optimizers
return ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
if level == "layout":
# NCHWc and NHWC layout optimizers
return ort.GraphOptimizationLevel.ORT_ENABLE_LAYOUT
if level == "all":
return ort.GraphOptimizationLevel.ORT_ENABLE_ALL
raise ValueError("Invalid optimization level of " + level)
class ModelProtoWithShapeInfo:
"""
Class to load an ONNX model and run shape inferencing on it to populate the ValueInfo.
The model_with_shape_info property will contain the updated model.
If the model is > 2GB and uses external data, a temporary file is required to run shape inferencing successfully.
This helper class handles automatic removal of the temporary file.
"""
def __init__(self, model_path: pathlib.Path):
"""
:param model_path: Path to ONNX model to load and run shape inferencing on.
"""
self.model_path = model_path
model = onnx.load(str(model_path))
self.model_with_shape_info = onnx.shape_inference.infer_shapes(model, strict_mode=True)
# ONNX has a silent failure from the call to infer_shapes when the model is > 2GB.
# We detect that by checking the nodes in the returned model.
self._tmp_model_path = None
if len(model.graph.node) > 0 and len(self.model_with_shape_info.graph.node) == 0:
self._tmp_model_path = pathlib.Path(model_path).with_suffix(".temp_with_shapeinf.onnx")
onnx.shape_inference.infer_shapes_path(str(model_path), str(self._tmp_model_path), strict_mode=True)
self.model_with_shape_info = onnx.load(str(self._tmp_model_path))
def __del__(self):
if self._tmp_model_path:
self._tmp_model_path.unlink(missing_ok=True)
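# --- Hypothetical usage sketch, not part of the original file. ---
# Assumes a model whose input "input" has shape ["batch", 3, 224, 224]; names and sizes are placeholders.
#
#   model = onnx.load("model.onnx")
#   make_input_shape_fixed(model.graph, "input", [1, 3, 224, 224])
#   fix_output_shapes(model)   # propagate the now-fixed sizes to the outputs where possible
#   onnx.save(model, "model.fixed.onnx")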

View File

@@ -0,0 +1,85 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
# An offline, standalone script to sanitize (declassify) an ONNX model by randomizing the tensor data in its initializers.
# ORT performance may change, especially for generative models.
import argparse
from pathlib import Path
import numpy as np
from onnx import load_model, numpy_helper, onnx_pb, save_model
# An experimental, small threshold for differentiating shape data from weights.
# Tensor data larger than this size is assumed not to be shape data.
# Users may adjust this value as needed.
SIZE_THRESHOLD = 10
def graph_iterator(model, func):
graph_queue = [model.graph]
while graph_queue:
graph = graph_queue.pop(0)
func(graph)
for node in graph.node:
for attr in node.attribute:
if attr.type == onnx_pb.AttributeProto.AttributeType.GRAPH:
assert isinstance(attr.g, onnx_pb.GraphProto)
graph_queue.append(attr.g)
if attr.type == onnx_pb.AttributeProto.AttributeType.GRAPHS:
for g in attr.graphs:
assert isinstance(g, onnx_pb.GraphProto)
graph_queue.append(g)
def randomize_graph_initializer(graph):
for i_tensor in graph.initializer:
array = numpy_helper.to_array(i_tensor)
# TODO: need to find a better way to differentiate shape data and weights.
if array.size > SIZE_THRESHOLD:
random_array = np.random.uniform(array.min(), array.max(), size=array.shape).astype(array.dtype)
o_tensor = numpy_helper.from_array(random_array, i_tensor.name)
i_tensor.CopyFrom(o_tensor)
def main():
parser = argparse.ArgumentParser(description="Randomize the weights of an ONNX model")
parser.add_argument("-m", type=str, required=True, help="input onnx model path")
parser.add_argument("-o", type=str, required=True, help="output onnx model path")
parser.add_argument(
"--use_external_data_format",
required=False,
action="store_true",
help="Store or Save in external data format",
)
parser.add_argument(
"--all_tensors_to_one_file",
required=False,
action="store_true",
help="Save all tensors to one file",
)
args = parser.parse_args()
data_path = None
if args.use_external_data_format:
if Path(args.m).parent == Path(args.o).parent:
raise RuntimeError("Please specify output directory with different parent path to input directory.")
if args.all_tensors_to_one_file:
data_path = Path(args.o).name + ".data"
Path(args.o).parent.mkdir(parents=True, exist_ok=True)
onnx_model = load_model(args.m, load_external_data=args.use_external_data_format)
graph_iterator(onnx_model, randomize_graph_initializer)
save_model(
onnx_model,
args.o,
save_as_external_data=args.use_external_data_format,
all_tensors_to_one_file=args.all_tensors_to_one_file,
location=data_path,
)
if __name__ == "__main__":
main()
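# --- Hypothetical command line sketch, not part of the original file; the script name and paths are placeholders. ---
#
#   python randomize_weights.py -m input.onnx -o sanitized/output.onnx
#   python randomize_weights.py -m large_model.onnx -o sanitized/large_model.onnx \
#       --use_external_data_format --all_tensors_to_one_file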

View File

@@ -0,0 +1,164 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from __future__ import annotations
import argparse
import os
import sys
from timeit import default_timer as timer
import numpy as np
import onnxruntime as onnxrt
float_dict = {
"tensor(float16)": "float16",
"tensor(float)": "float32",
"tensor(double)": "float64",
}
integer_dict = {
"tensor(int32)": "int32",
"tensor(int8)": "int8",
"tensor(uint8)": "uint8",
"tensor(int16)": "int16",
"tensor(uint16)": "uint16",
"tensor(int64)": "int64",
"tensor(uint64)": "uint64",
}
def generate_feeds(sess, symbolic_dims: dict | None = None):
feeds = {}
symbolic_dims = symbolic_dims or {}
for input_meta in sess.get_inputs():
# replace any symbolic dimensions
shape = []
for dim in input_meta.shape:
if not dim:
# unknown dim
shape.append(1)
elif isinstance(dim, str):
# symbolic dim. see if we have a value otherwise use 1
if dim in symbolic_dims:
shape.append(int(symbolic_dims[dim]))
else:
shape.append(1)
else:
shape.append(dim)
if input_meta.type in float_dict:
feeds[input_meta.name] = np.random.rand(*shape).astype(float_dict[input_meta.type])
elif input_meta.type in integer_dict:
feeds[input_meta.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(
integer_dict[input_meta.type]
)
elif input_meta.type == "tensor(bool)":
feeds[input_meta.name] = np.random.randint(2, size=tuple(shape)).astype("bool")
else:
print(f"unsupported input type {input_meta.type} for input {input_meta.name}")
sys.exit(-1)
return feeds
# simple test program for loading onnx model, feeding all inputs and running the model num_iters times.
def run_model(
model_path,
num_iters=1,
debug=None,
profile=None,
symbolic_dims=None,
feeds=None,
override_initializers=True,
):
symbolic_dims = symbolic_dims or {}
if debug:
print(f"Pausing execution ready for debugger to attach to pid: {os.getpid()}")
print("Press key to continue.")
sys.stdin.read(1)
sess_options = None
if profile:
sess_options = onnxrt.SessionOptions()
sess_options.enable_profiling = True
sess_options.profile_file_prefix = os.path.basename(model_path)
sess = onnxrt.InferenceSession(
model_path,
sess_options=sess_options,
providers=onnxrt.get_available_providers(),
)
meta = sess.get_modelmeta()
if not feeds:
feeds = generate_feeds(sess, symbolic_dims)
if override_initializers:
# Starting with IR version 4, some initializers provide default values
# and can be overridden. For models with IR version < 4
# the list will be empty.
for initializer in sess.get_overridable_initializers():
shape = [dim if dim else 1 for dim in initializer.shape]
if initializer.type in float_dict:
feeds[initializer.name] = np.random.rand(*shape).astype(float_dict[initializer.type])
elif initializer.type in integer_dict:
feeds[initializer.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(
integer_dict[initializer.type]
)
elif initializer.type == "tensor(bool)":
feeds[initializer.name] = np.random.randint(2, size=tuple(shape)).astype("bool")
else:
print(f"unsupported initializer type {initializer.type} for initializer {initializer.name}")
sys.exit(-1)
start = timer()
for _i in range(num_iters):
outputs = sess.run([], feeds) # fetch all outputs
end = timer()
print(f"model: {meta.graph_name}")
print(f"version: {meta.version}")
print(f"iterations: {num_iters}")
print(f"avg latency: {((end - start) * 1000) / num_iters} ms")
if profile:
trace_file = sess.end_profiling()
print(f"trace file written to: {trace_file}")
return 0, feeds, num_iters > 0 and outputs
def main():
parser = argparse.ArgumentParser(description="Simple ONNX Runtime Test Tool.")
parser.add_argument("model_path", help="model path")
parser.add_argument(
"num_iters",
nargs="?",
type=int,
default=1000,
help="model run iterations. default=1000",
)
parser.add_argument(
"--debug",
action="store_true",
help="pause execution to allow attaching a debugger.",
)
parser.add_argument("--profile", action="store_true", help="enable chrome timeline trace profiling.")
parser.add_argument(
"--symbolic_dims",
default={},
type=lambda s: dict(x.split("=") for x in s.split(",")),
help="Comma separated name=value pairs for any symbolic dimensions in the model input. "
"e.g. --symbolic_dims batch=1,seqlen=5. "
"If not provided, the value of 1 will be used for all symbolic dimensions.",
)
args = parser.parse_args()
exit_code, _, _ = run_model(args.model_path, args.num_iters, args.debug, args.profile, args.symbolic_dims)
sys.exit(exit_code)
if __name__ == "__main__":
main()
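# --- Hypothetical command line sketch, not part of the original file; the script name and values are placeholders. ---
#
#   python onnx_test.py model.onnx 100 --profile --symbolic_dims batch=1,seqlen=128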

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import annotations
import argparse
import os
import pathlib
from .onnx_model_utils import get_optimization_level, optimize_model
def optimize_model_helper():
parser = argparse.ArgumentParser(
f"{os.path.basename(__file__)}:{optimize_model_helper.__name__}",
description="""
Optimize an ONNX model using ONNX Runtime to the specified level.
See https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html for more
details of the optimization levels.""",
)
parser.add_argument(
"--opt_level",
default="basic",
choices=["disable", "basic", "extended", "layout", "all"],
help="Optimization level to use.",
)
parser.add_argument(
"--log_level",
choices=["debug", "info", "warning", "error"],
type=str,
required=False,
default="error",
help="Log level. Defaults to Error so we don't get output about unused initializers "
"being removed. Warning or Info may be desirable in some scenarios.",
)
parser.add_argument("input_model", type=pathlib.Path, help="Provide path to ONNX model to update.")
parser.add_argument("output_model", type=pathlib.Path, help="Provide path to write optimized ONNX model to.")
args = parser.parse_args()
if args.log_level == "error":
log_level = 3
elif args.log_level == "debug":
log_level = 0 # ORT verbose level
elif args.log_level == "info":
log_level = 1
elif args.log_level == "warning":
log_level = 2
optimize_model(args.input_model, args.output_model, get_optimization_level(args.opt_level), log_level)
if __name__ == "__main__":
optimize_model_helper()
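# --- Hypothetical command line sketch, not part of the original file; the module path is an assumption. ---
#
#   python -m onnxruntime.tools.optimize_onnx_model --opt_level basic --log_level warning input.onnx output.onnx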

View File

@@ -0,0 +1,27 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
# need to add the path to the ORT flatbuffers python module before we import anything else here.
# we also auto-magically adjust to whether we're running from the ORT repo, or from within the ORT python package
script_dir = os.path.dirname(os.path.realpath(__file__))
fbs_py_schema_dirname = "ort_flatbuffers_py"
if os.path.isdir(os.path.join(script_dir, fbs_py_schema_dirname)):
# fbs bindings are in this directory, so we're running in the ORT python package
ort_fbs_py_parent_dir = script_dir
else:
# running directly from ORT repo, so fbs bindings are under onnxruntime/core/flatbuffers
ort_root = os.path.abspath(os.path.join(script_dir, "..", "..", "..", ".."))
ort_fbs_py_parent_dir = os.path.join(ort_root, "onnxruntime", "core", "flatbuffers")
sys.path.append(ort_fbs_py_parent_dir)
from .operator_type_usage_processors import ( # noqa: E402
GloballyAllowedTypesOpTypeImplFilter, # noqa: F401
OperatorTypeUsageManager, # noqa: F401
OpTypeImplFilterInterface, # noqa: F401
)
from .ort_model_processor import OrtFormatModelProcessor # noqa: E402, F401
from .utils import create_config_from_models # noqa: E402, F401

View File

@@ -0,0 +1,653 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from __future__ import annotations
import json
from abc import ABC, abstractmethod
import ort_flatbuffers_py.fbs as fbs
from .types import FbsTypeInfo, value_name_to_typestr
def _create_op_key(domain: str, optype: str):
return f"{domain}:{optype}"
def _ort_constant_for_domain(domain: str):
"""
Map a string domain value to the internal ONNX Runtime constant for that domain.
:param domain: Domain string to map.
:return: Internal ONNX Runtime constant
"""
# constants are defined in <ORT root>/include/onnxruntime/core/graph/constants.h
# This list is limited to just the domains we have processors for
domain_to_constant_map = {"ai.onnx": "kOnnxDomain", "ai.onnx.ml": "kMLDomain", "com.microsoft": "kMSDomain"}
if domain not in domain_to_constant_map:
raise ValueError(f"Domain {domain} not found in map to ONNX Runtime constant. Please update map.")
return domain_to_constant_map[domain]
def _reg_type_to_cpp_type(reg_type: str):
if reg_type == "string":
return "std::string"
return reg_type
def _split_reg_types(reg_types_str: str):
"""
Split on underscores but append "_t" to the previous element.
"""
tokens = reg_types_str.split("_")
reg_types = []
for token in tokens:
if token == "t" and len(reg_types) > 0:
reg_types[-1] += "_t"
else:
reg_types += [token]
return reg_types
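# e.g. _split_reg_types("float_int64_t_int64_t") returns ["float", "int64_t", "int64_t"], matching the
# concatenated type strings used in typed kernel registrations such as OneHot (see OneHotProcessor below).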
class TypeUsageProcessor(ABC):
"""
Abstract base class for processors which implement operator specific logic to determine the type or types required.
"""
def __init__(self, domain: str, optype: str):
self.domain = domain
self.optype = optype
self.name = _create_op_key(domain, optype)
@abstractmethod
def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict):
pass
def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None):
"""
Given the string from a kernel registration, determine if the registration is required or not.
:param type_in_registration: Type string from kernel registration
:param globally_allowed_types: Optional set of globally allowed types. If provided, these types take precedence
in determining the required types.
:return: True if required. False if not.
"""
# Not all operators have typed registrations, so this is optionally implemented by derived classes
raise RuntimeError(f"Did not expect processor for {self.name} to have typed registrations.")
def get_cpp_entry(self):
"""
Get the C++ code that specifies this operator's required types.
:return: List with any applicable C++ code for this operator's required types. One line per entry.
"""
# Not applicable for some ops, so return no lines by default.
return []
@abstractmethod
def to_config_entry(self):
"""
Generate a configuration file entry in JSON format with the required types for the operator.
:return: JSON string with required type information.
"""
@abstractmethod
def from_config_entry(self, entry: str):
"""
Re-create the types required from a configuration file entry created with to_config_entry.
NOTE: Any existing type information should be cleared prior to re-creating from a config file entry.
:param entry: Configuration file entry
"""
class DefaultTypeUsageProcessor(TypeUsageProcessor):
"""
Operator processor which tracks the types used for selected input/s and/or output/s.
"""
def __init__(
self,
domain: str,
optype: str,
inputs: list[int] = [0],  # noqa: B006
outputs: list[int] = [],  # noqa: B006
required_input_types: dict[int, set[str]] = {}, # noqa: B006
required_output_types: dict[int, set[str]] = {}, # noqa: B006
):
"""
Create DefaultTypeUsageProcessor. Types for one or more inputs and/or outputs can be tracked by the processor.
The default is to track the types required for input 0, as this is the most common use case in ONNX.
Required input and output types may be specified. These are only applicable to is_typed_registration_needed().
If a registration type matches a required type, the typed registration is needed.
There is a separate mechanism for specifying required types from C++ for kernels with untyped registration.
:param domain: Operator domain.
:param optype: Operator name.
:param inputs: Inputs to track. Zero based index. May be empty.
:param outputs: Outputs to track. Zero based index. May be empty.
:param required_input_types: Required input types. May be empty.
:param required_output_types: Required output types. May be empty.
"""
super().__init__(domain, optype)
self._input_types = {}
self._output_types = {}
for i in inputs:
self._input_types[i] = set()
for o in outputs:
self._output_types[o] = set()
if not inputs and not outputs:
raise ValueError("At least one input or output must be tracked")
self._required_input_types = required_input_types
self._required_output_types = required_output_types
def _is_type_enabled(self, reg_type, index, required_types, allowed_type_set):
cpp_type = _reg_type_to_cpp_type(reg_type)
return cpp_type in required_types.get(index, set()) or cpp_type in allowed_type_set
def is_input_type_enabled(self, reg_type, index, allowed_type_set=None):
"""Whether input type is enabled based on required and allowed types."""
if allowed_type_set is None:
allowed_type_set = self._input_types[index]
return self._is_type_enabled(reg_type, index, self._required_input_types, allowed_type_set)
def is_output_type_enabled(self, reg_type, index, allowed_type_set=None):
"""Whether output type is enabled based on required and allowed types."""
if allowed_type_set is None:
allowed_type_set = self._output_types[index]
return self._is_type_enabled(reg_type, index, self._required_output_types, allowed_type_set)
def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict):
for i in self._input_types:
if i >= node.InputsLength():
# Some operators have fewer inputs in earlier versions, where data that was provided as an attribute
# became an input in later versions to allow it to be dynamically provided. Allow for that.
# e.g. Slice-1 had attributes for the indices, and Slice-10 moved those to be inputs
# raise RuntimeError('Node has {} outputs. Tracker for {} incorrectly configured as it requires {}.'
# .format(node.OutputsLength(), self.name, o))
pass
else:
type_str = value_name_to_typestr(node.Inputs(i), value_name_to_typeinfo)
self._input_types[i].add(type_str)
for o in self._output_types:
# Don't know of any ops where the number of outputs changed across versions, so require a valid length
if o >= node.OutputsLength():
raise RuntimeError(
f"Node has {node.OutputsLength()} outputs. Tracker for {self.name} incorrectly configured as it requires {o}."
)
type_str = value_name_to_typestr(node.Outputs(o), value_name_to_typeinfo)
self._output_types[o].add(type_str)
def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None):
if 0 not in self._input_types:
# currently all standard typed registrations are for input 0.
# custom registrations can be handled by operator specific processors (e.g. OneHotProcessor below).
raise RuntimeError(f"Expected typed registration to use type from input 0. Node:{self.name}")
return self.is_input_type_enabled(type_in_registration, 0, globally_allowed_types)
def get_cpp_entry(self):
entries = []
domain = _ort_constant_for_domain(self.domain)
for i in sorted(self._input_types.keys()):
if self._input_types[i]:
entries.append(
"ORT_SPECIFY_OP_KERNEL_ARG_ALLOWED_TYPES({}, {}, Input, {}, {});".format(
domain, self.optype, i, ", ".join(sorted(self._input_types[i]))
)
)
for o in sorted(self._output_types.keys()):
if self._output_types[o]:
entries.append(
"ORT_SPECIFY_OP_KERNEL_ARG_ALLOWED_TYPES({}, {}, Output, {}, {});".format(
domain, self.optype, o, ", ".join(sorted(self._output_types[o]))
)
)
return entries
def to_config_entry(self):
# convert the sets of types to lists so they can easily be written out using the json module
aggregate_info = {"inputs": {}, "outputs": {}}
# filter out empty entries and sort the types
for i in sorted(self._input_types.keys()):
if self._input_types[i]:
aggregate_info["inputs"][i] = sorted(self._input_types[i])
for o in sorted(self._output_types.keys()):
if self._output_types[o]:
aggregate_info["outputs"][o] = sorted(self._output_types[o])
# remove any empty keys
if not aggregate_info["inputs"]:
aggregate_info.pop("inputs")
if not aggregate_info["outputs"]:
aggregate_info.pop("outputs")
entry = json.dumps(aggregate_info) if aggregate_info else None
return entry
def from_config_entry(self, entry: str):
self._input_types.clear()
self._output_types.clear()
aggregate_info = json.loads(entry)
if "inputs" in aggregate_info:
for i_str, values in aggregate_info["inputs"].items():
self._input_types[int(i_str)] = set(values)
if "outputs" in aggregate_info:
for o_str, values in aggregate_info["outputs"].items():
self._output_types[int(o_str)] = set(values)
class Input1TypedRegistrationProcessor(DefaultTypeUsageProcessor):
"""
Processor for operators where the second input type is used in a typed kernel registration.
"""
def __init__(self, domain: str, optype: str):
# init with tracking of input 1 only.
super().__init__(domain, optype, inputs=[1], outputs=[])
def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None):
return self.is_input_type_enabled(type_in_registration, 1, globally_allowed_types)
class Output0TypedRegistrationProcessor(DefaultTypeUsageProcessor):
"""
Processor for operators where the first output type is used in a typed kernel registration.
"""
def __init__(self, domain: str, optype: str):
# init with tracking of output 0 only.
super().__init__(domain, optype, inputs=[], outputs=[0])
def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None):
return self.is_output_type_enabled(type_in_registration, 0, globally_allowed_types)
class OneHotProcessor(TypeUsageProcessor):
"""
Processor for the OneHot operator, which requires custom logic as the type registration key is a concatenation of
the three types involved instead of a single type name.
"""
def __init__(self):
super().__init__("ai.onnx", "OneHot")
self._triples = set()
def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict):
type0 = value_name_to_typestr(node.Inputs(0), value_name_to_typeinfo)
type1 = value_name_to_typestr(node.Inputs(1), value_name_to_typeinfo)
type2 = value_name_to_typestr(node.Inputs(2), value_name_to_typeinfo)
# types in kernel registration are ordered this way: input (T1), output (T3), depth (T2)
key = (type0, type2, type1)
self._triples.add(key)
def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None):
# the OneHot registration involves a concatenation of the 3 types involved
reg_types = tuple([_reg_type_to_cpp_type(reg_type) for reg_type in _split_reg_types(type_in_registration)])
if globally_allowed_types is not None:
return all(reg_type in globally_allowed_types for reg_type in reg_types)
else:
return reg_types in self._triples
def to_config_entry(self):
if not self._triples:
return None
aggregate_info = {"custom": sorted(self._triples)}
entry = json.dumps(aggregate_info)
return entry
def from_config_entry(self, entry: str):
self._triples.clear()
aggregate_info = json.loads(entry)
if "custom" in aggregate_info:
self._triples = {tuple(triple) for triple in aggregate_info["custom"]}
def _create_operator_type_usage_processors():
"""
Create a set of processors that determine the required types for all enabled operators.
:return: Dictionary of operator key to processor. Key is 'domain:operator (e.g. ai.onnx:Cast)'.
"""
operator_processors = {}
def add(processor):
if processor.name in operator_processors:
raise RuntimeError("Duplicate processor for " + processor.name)
operator_processors[processor.name] = processor
# Starting with ops from:
# - Priority 1P models
# - Mobilenet + SSD Mobilenet + MobileBert
# - some known large kernels
#
# Ops we are ignoring currently so as not to produce meaningless/unused output:
# - Implementation is type agnostic:
# ai.onnx: If, Loop, Reshape, Scan, Shape, Squeeze, Tile, Unsqueeze
# com.microsoft: DynamicQuantizeMatMul, MatMulIntegerToFloat
# - Only one type supported in the ORT implementation:
# ai.onnx: NonMaxSuppression
# com.microsoft: FusedConv, FusedGemm, FusedMatMul
# - Implementation does not have any significant type specific code:
# ai.onnx: Concat, Flatten, Not, Reshape, Shape, Squeeze, Unsqueeze
#
default_processor_onnx_ops = [
"Abs",
"ArgMax",
"ArgMin",
"AveragePool",
"BatchNormalization",
"BitShift",
"Ceil",
"Clip",
"Conv",
"CumSum",
"Exp",
"Expand",
"Floor",
"Gemm",
"IsNaN",
"Log",
"LogSoftmax",
"LpNormalization",
"MatMul",
"Max",
"MaxPool",
"Mean",
"Min",
"NonZero",
"Pad",
"QLinearConv",
"QLinearMatMul",
"Range",
"Reciprocal",
"ReduceL1",
"ReduceL2",
"ReduceLogSum",
"ReduceLogSumExp",
"ReduceMax",
"ReduceMean",
"ReduceMin",
"ReduceProd",
"ReduceSum",
"ReduceSumSquare",
"Relu",
"Resize",
"ReverseSequence",
"RoiAlign",
"Round",
"Scatter",
"ScatterElements",
"ScatterND",
"Shrink",
"Sigmoid",
"Sign",
"Sin",
"Softmax",
"Split",
"SplitToSequence",
"Sqrt",
"Sum",
"Tanh",
"TopK",
"Transpose",
"Unique",
]
# ops that are used to manipulate shapes or indices so require int32_t and int64_t to be available
default_processor_onnx_ops_requiring_ints_for_input_0 = [
"Add",
"Concat",
"Div",
"Equal",
"Greater",
"Less",
"Mul",
"Neg", # used in tflite TransposeConv conversion
"Sub",
]
# NOTE: QLinearConv has ONNX and internal implementations
internal_ops = ["QLinearAdd", "QLinearMul", "QLinearConv"]
# TODO - review and add ML ops as needed
# ML Op notes.
# CastMap: Switch on value type of input map type, and output type
# DictVectorizer: Templatized on key+value of input so need to handle like OneHot with custom processor
# LabelEncoder: Implementation switches on input and output types (only supports string and int64 in T1 and T2)
# LinearClassifier: Internal switch on input type and also switch on output type
# SVMClassifier: ditto
# TreeEnsembleClassifier: Templatized on input type and also switch on output type
# ZipMap: Switch on output type (derived from attributes)
default_processor_onnxml_ops = []
[add(DefaultTypeUsageProcessor("ai.onnx", op)) for op in default_processor_onnx_ops]
[
add(DefaultTypeUsageProcessor("ai.onnx", op, required_input_types={0: {"int32_t", "int64_t"}}))
for op in default_processor_onnx_ops_requiring_ints_for_input_0
]
[add(DefaultTypeUsageProcessor("ai.onnx.ml", op)) for op in default_processor_onnxml_ops]
[add(DefaultTypeUsageProcessor("com.microsoft", op)) for op in internal_ops]
#
# Operators that require custom handling
#
# Cast switches on types of input 0 and output 0
add(DefaultTypeUsageProcessor("ai.onnx", "Cast", inputs=[0], outputs=[0]))
# Operators that switch on the type of input 0 and 1
add(DefaultTypeUsageProcessor("ai.onnx", "Gather", inputs=[0, 1]))
add(DefaultTypeUsageProcessor("ai.onnx", "GatherElements", inputs=[0, 1]))
add(DefaultTypeUsageProcessor("ai.onnx", "Pow", inputs=[0, 1]))
add(DefaultTypeUsageProcessor("ai.onnx", "Slice", inputs=[0, 1]))
# Operators that switch on output type
add(DefaultTypeUsageProcessor("ai.onnx", "ConstantOfShape", inputs=[], outputs=[0]))
# Random generator ops produce new data so we track the output type
onnx_random_ops = ["RandomNormal", "RandomNormalLike", "RandomUniform", "RandomUniformLike", "Multinomial"]
[add(DefaultTypeUsageProcessor("ai.onnx", op, inputs=[], outputs=[0])) for op in onnx_random_ops]
# Where always has a boolean first input so track the second input type for typed registration
add(Input1TypedRegistrationProcessor("ai.onnx", "Where"))
# we only support 'float' as input for [Dynamic]QuantizeLinear so just track the output type
# as that's what is used in the typed registration
add(Output0TypedRegistrationProcessor("ai.onnx", "QuantizeLinear"))
add(Output0TypedRegistrationProcessor("ai.onnx", "DynamicQuantizeLinear"))
# make sure all the dequantize types are enabled. we use int32_t for parts of GEMM and Conv so just
# enabling int8 and uint8 is not enough.
# TODO: Only apply required types to the global type list and ignore if it's model based per-op type reduction
add(
DefaultTypeUsageProcessor(
"ai.onnx", "DequantizeLinear", inputs=[0], required_input_types={0: {"int8_t", "uint8_t", "int32_t"}}
)
)
# OneHot concatenates type strings into a triple in the typed registration
# e.g. float_int64_t_int64_t
add(OneHotProcessor())
return operator_processors
class OpTypeImplFilterInterface(ABC):
"""
Class that filters operator implementations based on type.
"""
@abstractmethod
def is_typed_registration_needed(self, domain: str, optype: str, type_registration_str: str):
"""
Given the string from a kernel registration, determine if the registration is required or not.
:param domain: Operator domain.
:param optype: Operator type.
:param type_registration_str: Type string from kernel registration
:return: True if required. False if not.
"""
@abstractmethod
def get_cpp_entries(self):
"""
Get the C++ code that specifies the operator types to enable.
:return: List of strings. One line of C++ code per entry.
"""
class OperatorTypeUsageManager:
"""
Class to manage the operator type usage processors.
TODO: Currently the type tracking is not specific to a version of the operator.
It's unclear how/where version specific logic could/should be added, and it would add significant complexity
to track types on a per-version basis. Not clear there's enough benefit from doing so either.
"""
def __init__(self):
self._all_operator_processors = _create_operator_type_usage_processors() # all possible processors
self._operator_processors = {} # processors we have actually used so we can limit output to be meaningful
def _get_op_processor(self, key):
"Add the processor to _operator_processors as it is about to be used."
processor = None
if key in self._all_operator_processors:
if key not in self._operator_processors:
self._operator_processors[key] = self._all_operator_processors[key]
processor = self._operator_processors[key]
return processor
def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict):
"""
Process a Node and record info on the types used.
:param node: Node from ORT format model
:param value_name_to_typeinfo: Map of value names to TypeInfo instances
"""
optype = node.OpType().decode()
domain = node.Domain().decode() or "ai.onnx" # empty domain defaults to ai.onnx
key = _create_op_key(domain, optype)
op_processor = self._get_op_processor(key)
if op_processor:
op_processor.process_node(node, value_name_to_typeinfo)
def get_config_entry(self, domain: str, optype: str):
"""
Get the config entry specifying the types for this operator.
:param domain: Operator domain.
:param optype: Operator type.
:return: JSON string with type info if available, else None
"""
key = _create_op_key(domain, optype)
config_str = None
if key in self._operator_processors:
config_str = self._operator_processors[key].to_config_entry()
return config_str
def restore_from_config_entry(self, domain: str, optype: str, config_entry: str):
"""
Restore the per-operator type information from a configuration file entry.
:param domain: Operator domain.
:param optype: Operator type.
:param config_entry: JSON string with type info as created by get_config_entry
"""
key = _create_op_key(domain, optype)
op_processor = self._get_op_processor(key)
if op_processor:
op_processor.from_config_entry(config_entry)
def debug_dump(self):
print("C++ code that will be emitted:")
[print(cpp_line) for cpp_line in self.get_cpp_entries()]
print("Config file type information that will be returned by get_config_entry:")
for key in sorted(self._operator_processors.keys()):
entry = self._operator_processors[key].to_config_entry()
if entry:
print(f"{key} -> {entry}")
# roundtrip test to validate that we can initialize the processor from the entry and get the
# same values back
self._operator_processors[key].from_config_entry(entry)
assert entry == self._operator_processors[key].to_config_entry()
class _OpTypeImplFilter(OpTypeImplFilterInterface):
def __init__(self, manager):
self._manager = manager
def is_typed_registration_needed(self, domain: str, optype: str, type_registration_str: str):
needed = True # we keep the registration unless the per-operator processor says not to
key = _create_op_key(domain, optype)
if key in self._manager._operator_processors:
needed = self._manager._operator_processors[key].is_typed_registration_needed(
type_in_registration=type_registration_str, globally_allowed_types=None
)
return needed
def get_cpp_entries(self):
entries = []
for key in sorted(self._manager._operator_processors.keys()):
entries.extend(self._manager._operator_processors[key].get_cpp_entry())
return entries
def make_op_type_impl_filter(self):
"""
Creates an OpTypeImplFilterInterface instance from this manager.
Filtering uses the manager's operator type usage processor state.
"""
return OperatorTypeUsageManager._OpTypeImplFilter(self)
class GloballyAllowedTypesOpTypeImplFilter(OpTypeImplFilterInterface):
"""
Operator implementation filter which uses globally allowed types.
"""
_valid_allowed_types = set(FbsTypeInfo.tensordatatype_to_string.values()) # noqa: RUF012
def __init__(self, globally_allowed_types: set[str]):
self._operator_processors = _create_operator_type_usage_processors()
if not globally_allowed_types.issubset(self._valid_allowed_types):
raise ValueError(
f"Globally allowed types must all be valid. Invalid types: {sorted(globally_allowed_types - self._valid_allowed_types)}"
)
self._globally_allowed_types = globally_allowed_types
def is_typed_registration_needed(self, domain: str, optype: str, type_registration_str: str):
key = _create_op_key(domain, optype)
if key in self._operator_processors:
needed = self._operator_processors[key].is_typed_registration_needed(
type_in_registration=type_registration_str, globally_allowed_types=self._globally_allowed_types
)
else:
needed = _reg_type_to_cpp_type(type_registration_str) in self._globally_allowed_types
return needed
def get_cpp_entries(self):
return [
"ORT_SPECIFY_OP_KERNEL_GLOBAL_ALLOWED_TYPES({});".format(", ".join(sorted(self._globally_allowed_types)))
]
def global_type_list(self):
return self._globally_allowed_types
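# --- Hypothetical usage sketch, not part of the original file. ---
# The type names below assume the C++ style strings produced by FbsTypeInfo:
#
#   impl_filter = GloballyAllowedTypesOpTypeImplFilter({"float", "int64_t"})
#   impl_filter.is_typed_registration_needed("ai.onnx", "Cast", "float")    # True
#   impl_filter.is_typed_registration_needed("ai.onnx", "Cast", "double")   # False
#   impl_filter.get_cpp_entries()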

View File

@@ -0,0 +1,7 @@
# automatically generated by the FlatBuffers compiler, do not modify
# namespace: fbs
class ArgType(object):
INPUT = 0
OUTPUT = 1

Some files were not shown because too many files have changed in this diff.