chore: add virtual environment to the repository
- Add the backend_service/venv virtual environment
- Includes all Python dependency packages
- Note: the virtual environment is about 393 MB and contains 12,655 files
@@ -0,0 +1,78 @@
# automatically generated by the FlatBuffers compiler, do not modify

# namespace: CalTableFlatBuffers

import flatbuffers
from flatbuffers.compat import import_numpy

np = import_numpy()


class KeyValue:
    __slots__ = ["_tab"]

    @classmethod
    def GetRootAs(cls, buf, offset=0):  # noqa: N802
        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
        x = KeyValue()
        x.Init(buf, n + offset)
        return x

    @classmethod
    def GetRootAsKeyValue(cls, buf, offset=0):  # noqa: N802
        """This method is deprecated. Please switch to GetRootAs."""
        return cls.GetRootAs(buf, offset)

    # KeyValue
    def Init(self, buf, pos):  # noqa: N802
        self._tab = flatbuffers.table.Table(buf, pos)

    # KeyValue
    def Key(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        if o != 0:
            return self._tab.String(o + self._tab.Pos)
        return None

    # KeyValue
    def Value(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
        if o != 0:
            return self._tab.String(o + self._tab.Pos)
        return None


def Start(builder):  # noqa: N802
    builder.StartObject(2)


def KeyValueStart(builder):  # noqa: N802
    """This method is deprecated. Please switch to Start."""
    return Start(builder)


def AddKey(builder, key):  # noqa: N802
    builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0)


def KeyValueAddKey(builder, key):  # noqa: N802
    """This method is deprecated. Please switch to AddKey."""
    return AddKey(builder, key)


def AddValue(builder, value):  # noqa: N802
    builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0)


def KeyValueAddValue(builder, value):  # noqa: N802
    """This method is deprecated. Please switch to AddValue."""
    return AddValue(builder, value)


def End(builder):  # noqa: N802
    return builder.EndObject()


def KeyValueEnd(builder):  # noqa: N802
    """This method is deprecated. Please switch to End."""
    return End(builder)
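# Editor's note: the sketch below is not part of the generated file above. It shows how such a
# generated FlatBuffers table is typically written and read with the flatbuffers Python API;
# the key/value strings are invented, and the package layout of CalTableFlatBuffers is assumed.
import flatbuffers
from onnxruntime.quantization.CalTableFlatBuffers import KeyValue as kv

builder = flatbuffers.Builder(0)
key = builder.CreateString("Conv_0_output")   # tensor name (placeholder)
value = builder.CreateString("0.0123")        # serialized calibration value (placeholder)
kv.Start(builder)
kv.AddKey(builder, key)
kv.AddValue(builder, value)
builder.Finish(kv.End(builder))

entry = kv.KeyValue.GetRootAs(builder.Output(), 0)
assert entry.Key() == b"Conv_0_output"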
@@ -0,0 +1,90 @@
# automatically generated by the FlatBuffers compiler, do not modify

# namespace: CalTableFlatBuffers

import flatbuffers
from flatbuffers.compat import import_numpy

np = import_numpy()


class TrtTable:
    __slots__ = ["_tab"]

    @classmethod
    def GetRootAs(cls, buf, offset=0):  # noqa: N802
        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
        x = TrtTable()
        x.Init(buf, n + offset)
        return x

    @classmethod
    def GetRootAsTrtTable(cls, buf, offset=0):  # noqa: N802
        """This method is deprecated. Please switch to GetRootAs."""
        return cls.GetRootAs(buf, offset)

    # TrtTable
    def Init(self, buf, pos):  # noqa: N802
        self._tab = flatbuffers.table.Table(buf, pos)

    # TrtTable
    def Dict(self, j):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        if o != 0:
            x = self._tab.Vector(o)
            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
            x = self._tab.Indirect(x)
            from onnxruntime.quantization.CalTableFlatBuffers.KeyValue import KeyValue  # noqa: PLC0415

            obj = KeyValue()
            obj.Init(self._tab.Bytes, x)
            return obj
        return None

    # TrtTable
    def DictLength(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        if o != 0:
            return self._tab.VectorLen(o)
        return 0

    # TrtTable
    def DictIsNone(self):  # noqa: N802
        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
        return o == 0


def Start(builder):  # noqa: N802
    builder.StartObject(1)


def TrtTableStart(builder):  # noqa: N802
    """This method is deprecated. Please switch to Start."""
    return Start(builder)


def AddDict(builder, dict):  # noqa: N802
    builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0)


def TrtTableAddDict(builder, dict):  # noqa: N802
    """This method is deprecated. Please switch to AddDict."""
    return AddDict(builder, dict)


def StartDictVector(builder, numElems):  # noqa: N802
    return builder.StartVector(4, numElems, 4)


def TrtTableStartDictVector(builder, numElems):  # noqa: N802
    """This method is deprecated. Please switch to Start."""
    return StartDictVector(builder, numElems)


def End(builder):  # noqa: N802
    return builder.EndObject()


def TrtTableEnd(builder):  # noqa: N802
    """This method is deprecated. Please switch to End."""
    return End(builder)
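# Editor's note: an illustrative continuation of the previous sketch (not part of the generated
# file) that assembles a TrtTable whose Dict vector holds previously created KeyValue offsets.
# `builder` and `kv_offsets` are assumed to exist, and the EndVector() signature varies between
# flatbuffers versions (older releases take the element count as an argument).
from onnxruntime.quantization.CalTableFlatBuffers import TrtTable as tt

tt.StartDictVector(builder, len(kv_offsets))
for off in reversed(kv_offsets):  # FlatBuffers vectors are built back to front
    builder.PrependUOffsetTRelative(off)
dict_vec = builder.EndVector()

tt.Start(builder)
tt.AddDict(builder, dict_vec)
builder.Finish(tt.End(builder))

table = tt.TrtTable.GetRootAs(builder.Output(), 0)
print(table.DictLength())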
@@ -0,0 +1,19 @@
from .calibrate import (  # noqa: F401
    CalibraterBase,
    CalibrationDataReader,
    CalibrationMethod,
    MinMaxCalibrater,
    create_calibrator,
)
from .qdq_quantizer import QDQQuantizer  # noqa: F401
from .quant_utils import QuantFormat, QuantType, write_calibration_table  # noqa: F401
from .quantize import (
    DynamicQuantConfig,  # noqa: F401
    QuantizationMode,  # noqa: F401
    StaticQuantConfig,  # noqa: F401
    get_qdq_config,  # noqa: F401
    quantize,  # noqa: F401
    quantize_dynamic,  # noqa: F401
    quantize_static,  # noqa: F401
)
from .shape_inference import quant_pre_process  # noqa: F401
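# Editor's note: an illustrative use of the public API re-exported above (model paths, the input
# name "input", and the tensor shape are placeholders, not taken from this repository).
import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static


class RandomDataReader(CalibrationDataReader):
    """Feeds a few random batches to the calibrator; a real reader would yield validation data."""

    def __init__(self, n=8):
        self._batches = iter(np.random.rand(n, 1, 3, 224, 224).astype(np.float32))

    def get_next(self):
        batch = next(self._batches, None)
        return None if batch is None else {"input": batch}


quantize_static(
    "model_fp32.onnx",
    "model_int8.onnx",
    RandomDataReader(),
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
)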
@@ -0,0 +1,529 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import logging
from typing import Any

import numpy as np
import onnx
import onnx.numpy_helper

try:
    from onnx.reference.op_run import to_array_extended
except ImportError:
    # old version of onnx.
    to_array_extended = None

from .calibrate import TensorData
from .onnx_model import ONNXModel
from .quant_utils import (
    DEQUANT_OP_NAME,
    ONNX_TYPE_TO_NP_TYPE,
    QUANT_OP_NAME,
    TENSOR_NAME_QUANT_SUFFIX,
    find_by_name,
    get_opset_version,
    model_has_infer_metadata,
    normalize_axis,
    pack_bytes_to_4bit,
    quantize_data,
    quantize_nparray,
    save_and_reload_model_with_shape_infer,
    tensor_proto_to_array,
)
from .tensor_quant_overrides import TensorQuantOverridesHelper


class QuantizationParams:
    def __init__(self, **data: dict[str, Any]):
        self.data = {}
        for k, v in data.items():
            if not isinstance(k, str):
                raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.")
            if k != "axis" and not isinstance(v, (int, str, np.ndarray, float)):
                raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.")
            if k == "axis" and not isinstance(v, int) and v is not None:
                raise TypeError(f"Axis value must be an int or None, not {type(v)}.")
            if k == "scale" and v.dtype not in (np.float32, np.float16):
                raise ValueError(f"scale must be a float32 or float16 numpy element but is {v.dtype} for k={k!r}")
            self.data[k] = v

    def get(self, key, default_value=None):
        return self.data.get(key, default_value)

    def __iter__(self):
        yield from self.data

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, value):
        self.data[key] = value

    def __len__(self):
        return len(self.data)
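# Editor's note (not part of the original file): a minimal illustration of the validation the
# class above performs, with invented values:
#
#   params = QuantizationParams(
#       zero_point=np.array([0], dtype=np.uint8),
#       scale=np.array([0.1], dtype=np.float32),
#       axis=None,
#   )
#   params["scale"].dtype                      # -> float32
#   QuantizationParams(scale=np.array([0.1]))  # raises ValueError (float64 scale is rejected)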
class BaseQuantizer:
    def __init__(
        self,
        model,
        per_channel,
        reduce_range,
        weight_qType,
        activation_qType,
        tensors_range,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options=None,
    ):
        if not model_has_infer_metadata(model):
            model = save_and_reload_model_with_shape_infer(model)
        self.value_infos = {vi.name: vi for vi in model.graph.value_info}
        self.value_infos.update({ot.name: ot for ot in model.graph.output})
        self.value_infos.update({it.name: it for it in model.graph.input})

        self.model = ONNXModel(model)
        self.opset_version = get_opset_version(model)
        self.per_channel = per_channel  # weight-pack per channel
        self.reduce_range = reduce_range

        self.extra_options = extra_options if extra_options else {}
        self.enable_subgraph_quantization = (
            "EnableSubgraph" in self.extra_options and self.extra_options["EnableSubgraph"]
        )
        self.parent = None
        self.force_quantize_no_input_check = (
            "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"]
        )

        # If the user does not explicitly set "WeightSymmetric", then the weight's quantization type determines
        # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
        self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
        self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
        self.min_real_range = self.extra_options.get("MinimumRealRange")

        self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
        self.weight_qType = getattr(weight_qType, "tensor_type", weight_qType)

        """
        Dictionary specifying the min and max values for tensors. It has the following format:
            {
                "param_name": [min, max]
            }
        example:
            {
                'Conv_3:0': [np.float32(0), np.float32(0.5)],
                'Conv_4:0': [np.float32(1), np.float32(3.5)]
            }
        """
        if tensors_range is not None and any(not isinstance(t, TensorData) for t in tensors_range.values()):
            raise TypeError(
                f"tensors_range contains unexpected types { {type(v) for v in tensors_range.values()} }, not TensorData."
            )
        self.tensors_range = tensors_range
        self.nodes_to_quantize = nodes_to_quantize  # specific nodes to quantize
        self.nodes_to_exclude = nodes_to_exclude  # specific nodes to exclude
        self.op_types_to_quantize = op_types_to_quantize

        # Get tensor-level quantization overrides and ensure they are valid.
        self.tensor_quant_overrides = TensorQuantOverridesHelper(self.extra_options.get("TensorQuantOverrides", {}))

        self.initializers = {initzer.name: initzer for initzer in self.model.initializer()}
        overrides_valid, overrides_err = self.tensor_quant_overrides.is_valid(
            self.initializers, self.value_infos.keys(), activation_qType
        )
        if not overrides_valid:
            raise ValueError(overrides_err)

        self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types()

    def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool:
        if self._is_weight_symmetric is not None:
            return self._is_weight_symmetric  # Return value explicitly set by user.
        return weight_quant_type in (
            onnx.TensorProto.INT4,
            onnx.TensorProto.INT8,
            onnx.TensorProto.INT16,
            onnx.TensorProto.FLOAT8E4M3FN,
        )

    def quantize_model(self):
        raise NotImplementedError

    def is_input_a_initializer(self, input_name):
        initializer = find_by_name(input_name, self.model.initializer())
        return initializer is not None

    def is_per_channel(self):
        return self.per_channel

    def is_valid_quantize_weight(self, weight_name):
        weight = find_by_name(weight_name, self.model.initializer())
        if weight is not None:
            return weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16)
        if (not self.enable_subgraph_quantization) or (self.parent is None):
            return False
        return self.parent.is_valid_quantize_weight(weight_name)

    def should_quantize_node(self, node):
        if (
            self.nodes_to_quantize is not None
            and len(self.nodes_to_quantize) != 0
            and node.name not in self.nodes_to_quantize
        ):
            return False

        if node.op_type not in self.op_types_to_quantize:
            return False

        if node.op_type in (DEQUANT_OP_NAME, QUANT_OP_NAME):
            return False

        if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
            return False

        return True

    def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1.0):
        """
        Quantize the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
        """

        # get bias
        bias_initializer = find_by_name(bias_name, self.model.initializer())
        bias_data = tensor_proto_to_array(bias_initializer)
        quantized_bias_name = bias_name + TENSOR_NAME_QUANT_SUFFIX

        # quantize bias
        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
            data = np.asarray(bias_data)
            if data.dtype == np.float16:
                node_qtype = onnx.TensorProto.FLOAT16
            elif data.dtype == np.float32:
                node_qtype = onnx.TensorProto.FLOAT
            else:
                raise TypeError(f"Only float16 or float32 are supported with float 8 but bias dtype is {data.dtype}.")
            quantized_data = data.astype(np.float32)
            bias_scale = np.array([1], dtype=quantized_data.dtype)
            bias_scale_data = bias_scale.reshape(-1)
            packed_bias_initializer = onnx.numpy_helper.from_array(quantized_data, quantized_bias_name)
            self.model.initializer_extend([packed_bias_initializer])
            node_type = "Cast"
        else:
            # calculate scale for bias
            # TODO: This formula should be explained including why the scale is not estimated for the bias as well.
            bias_scale = input_scale * weight_scale * beta

            # Quantize by dividing by bias_scale
            quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64)
            quantized_data = quantized_data.round()

            # Clip quantized data to the range of an int32
            int32_min = np.float64(np.iinfo(np.int32).min)
            int32_max = np.float64(np.iinfo(np.int32).max)
            if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max):
                logging.warning(
                    f"Quantized bias `{bias_name}` exceeds the range of an int32. The bias scale is too small."
                )

            quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32)

            # update bias initializer
            bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)
            packed_bias_initializer = onnx.numpy_helper.from_array(bias_np_data, quantized_bias_name)
            self.model.initializer_extend([packed_bias_initializer])

            # The bias's scale dtype should match the original bias data's unquantized type (float32 or float16).
            bias_scale_data = np.asarray(bias_scale, dtype=bias_data.dtype).reshape(-1)
            node_type = "DequantizeLinear"
            node_qtype = self.weight_qType

        # update scale initializer
        quantized_bias_scale_name = quantized_bias_name + "_scale"
        packed_bias_scale_initializer = onnx.numpy_helper.from_array(bias_scale_data, quantized_bias_scale_name)
        self.model.initializer_extend([packed_bias_scale_initializer])

        # update zero initializer
        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
            tensor_type = self.weight_qType
        else:
            tensor_type = onnx.TensorProto.INT32

        quantized_bias_zp_name = quantized_bias_name + "_zero_point"
        if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
            packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, self.weight_qType, [1], [0.0])
        elif bias_scale.size > 1:
            bias_zp_data = np.zeros(bias_scale.shape, dtype=np.int32).reshape(-1)
            packed_bias_zp_initializer = onnx.numpy_helper.from_array(bias_zp_data, quantized_bias_zp_name)
        else:
            packed_bias_zp_initializer = onnx.helper.make_tensor(quantized_bias_zp_name, tensor_type, [], [0])
        self.model.initializer_extend([packed_bias_zp_initializer])

        return (
            quantized_bias_name,
            quantized_bias_scale_name,
            quantized_bias_zp_name,
            bias_scale_data,
            node_type,
            node_qtype,
        )
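    # Editor's note: a worked example (made-up numbers) of the int32 bias path above.
    # With input_scale = 0.02, weight_scale = 0.005 and beta = 1.0:
    #     bias_scale = 0.02 * 0.005 * 1.0 = 1e-4
    #     quantize:   round(0.37 / 1e-4) = 3700   (stored as int32)
    #     dequantize: 3700 * 1e-4 = 0.37          (recovered by the DequantizeLinear node)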
    def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_float_weight=False):
        """
        :param weight: TensorProto initializer
        :param qType: type to quantize to
        :param keep_float_weight: Whether to keep the weight in float. In some cases, we only want to quantize
                                  scale and zero point. If keep_float_weight is False, the weight is quantized;
                                  otherwise, it is left as float.
        :return: quantized weight name, zero point name, scale name
        """
        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
        q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX
        zp_name = weight.name + "_zero_point"
        scale_name = weight.name + "_scale"

        # Quantize weight data. Use quantization overrides if provided by the user.
        weight_data = tensor_proto_to_array(weight)
        quant_overrides = self.tensor_quant_overrides.get_per_tensor_overrides(weight.name, default_val={})
        if "quant_type" in quant_overrides:
            qType = quant_overrides["quant_type"].tensor_type  # noqa: N806

        if "scale" in quant_overrides and "zero_point" in quant_overrides:
            zero_point = np.array(quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[qType])
            scale = np.array(quant_overrides["scale"])
            q_weight_data = quantize_nparray(qType, weight_data.flatten(), scale, zero_point)
            assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
            assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                f"Unexpected dtype {zero_point.dtype}"
            )
            assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"

        else:
            symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric
            zero_point, scale, q_weight_data = quantize_data(
                weight_data.flatten(),
                qType,
                quant_overrides.get("symmetric", symmetric),
                reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range),
                min_real_range=self.min_real_range,
                rmin_override=quant_overrides.get("rmin"),
                rmax_override=quant_overrides.get("rmax"),
            )

            assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
            assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                f"Unexpected dtype {zero_point.dtype}"
            )
            assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"

        scale_dtype = weight.data_type
        scale_initializer = onnx.helper.make_tensor(scale_name, scale_dtype, [], scale.reshape((-1,)).tolist())
        zero_initializer = onnx.helper.make_tensor(zp_name, qType, [], zero_point.reshape((-1,)).tolist())
        self.model.initializer_extend([scale_initializer, zero_initializer])

        if not keep_float_weight:
            if self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN:
                q_weight_initializer = onnx.TensorProto()
                q_weight_initializer.data_type = self.weight_qType
                q_weight_initializer.dims.extend(weight.dims)
                q_weight_initializer.name = q_weight_name
                # Do not remove .flatten().copy(); numpy is not clear about data persistence.
                q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes()
                if to_array_extended is not None:
                    # This test should not be needed but it helped catch some issues
                    # with data persistence and tobytes.
                    check = to_array_extended(q_weight_initializer)
                    if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes():
                        raise RuntimeError(
                            f"The initializer of shape {weight_data.shape} could not be created, expecting "
                            f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.dims}"
                            f"\nraw={str(q_weight_initializer)[:200]}."
                        )
            elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
                if q_weight_data.dtype not in (np.int8, np.uint8):
                    raise RuntimeError(
                        f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
                    )

                # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
                # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
                packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes()))

                # We only use onnx.helper.make_tensor with raw data due to a bug: https://github.com/onnx/onnx/pull/6161
                q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True)
            else:
                q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape(
                    weight.dims
                )
                q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name)
            self.model.initializer_extend([q_weight_initializer])

        return q_weight_name, zp_name, scale_name
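    # Editor's note: an illustrative symmetric int8 case of what quantize_data can return for a
    # weight tensor (numbers invented; exact rounding and range conventions vary by implementation):
    # for values in [-1.0, 1.0], scale ≈ 1.0 / 127 ≈ 0.00787, zero_point = 0, and a weight value of
    # 0.5 maps to round(0.5 / 0.00787) = 64.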
    def quantize_weight_per_channel_impl(
        self,
        weight_name,
        weight_qType,
        channel_axis,
        reduce_range=True,
        keep_float_weight=False,
    ):
        # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there.
        initializer = find_by_name(weight_name, self.model.initializer())
        if initializer is None:
            raise ValueError(f"{weight_name} is not an initializer")

        weights = tensor_proto_to_array(initializer)
        weights_rank = len(weights.shape)
        is_axis_valid, axis_norm = normalize_axis(channel_axis, weights_rank)
        if not is_axis_valid:
            raise ValueError(
                f"Weight {weight_name} has a per-channel axis with value {channel_axis} that is "
                f"out-of-bounds for rank {weights_rank}"
            )

        channel_axis = axis_norm
        channel_count = weights.shape[channel_axis]
        quant_overrides_for_channels = self.tensor_quant_overrides.get_per_channel_overrides(
            weight_name, default_val=[{"axis": channel_axis}]
        )

        num_channel_overrides = len(quant_overrides_for_channels)
        if num_channel_overrides != 1 and num_channel_overrides != channel_count:
            raise ValueError(
                f"Per-channel tensor quantization overrides for {weight_name} must have "
                f"either 1 or {channel_count} elements in the list of dictionaries."
            )

        is_axis_override_valid, axis_override = normalize_axis(quant_overrides_for_channels[0]["axis"], weights_rank)
        if not is_axis_override_valid or axis_override != channel_axis:
            raise ValueError(
                f"Tensor quantization overrides for {weight_name} specify an unexpected axis. "
                f"Expected {channel_axis}, but got {quant_overrides_for_channels[0]['axis']}."
            )

        # If the user provides per-channel quantization overrides, all channels must use the same quant_type,
        # axis, symmetric, and reduce_range values. So, just use the first channel's values.
        if "quant_type" in quant_overrides_for_channels[0]:
            weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type  # noqa: N806

        symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType))
        reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range)
        zero_point_list = []
        scale_list = []
        quantized_per_channel_data_list = []
        weights_shape = list(weights.shape)
        reshape_dims = list(weights_shape)  # deep copy
        reshape_dims[channel_axis] = 1  # only one per channel for reshape
        for i in range(channel_count):
            per_channel_data = weights.take(i, channel_axis)
            channel_override_index = i if i < num_channel_overrides else 0
            channel_quant_overrides = quant_overrides_for_channels[channel_override_index]

            if "scale" in channel_quant_overrides and "zero_point" in channel_quant_overrides:
                zero_point = np.array(channel_quant_overrides["zero_point"], dtype=ONNX_TYPE_TO_NP_TYPE[weight_qType])
                scale = np.array(channel_quant_overrides["scale"])
                quantized_per_channel_data = quantize_nparray(
                    weight_qType, per_channel_data.flatten(), scale, zero_point
                )
                assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
                assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                    f"Unexpected dtype {zero_point.dtype}"
                )
                assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
                assert isinstance(quantized_per_channel_data, np.ndarray), (
                    f"Unexpected type {type(quantized_per_channel_data)}"
                )

            else:
                zero_point, scale, quantized_per_channel_data = quantize_data(
                    per_channel_data.flatten(),
                    weight_qType,
                    symmetric,
                    reduce_range=reduce_range,
                    min_real_range=self.min_real_range,
                    rmin_override=channel_quant_overrides.get("rmin"),
                    rmax_override=channel_quant_overrides.get("rmax"),
                )

                assert isinstance(zero_point, np.ndarray), f"Unexpected type {type(zero_point)}"
                assert zero_point.dtype != np.float32 and zero_point.dtype != np.float16, (
                    f"Unexpected dtype {zero_point.dtype}"
                )
                assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}"
                assert isinstance(quantized_per_channel_data, np.ndarray), (
                    f"Unexpected type {type(quantized_per_channel_data)}"
                )

            zero_point_list.append(zero_point)
            scale_list.append(scale)
            quantized_per_channel_data_list.append(np.asarray(quantized_per_channel_data).reshape(reshape_dims))

        # combine per_channel_data into one
        quantized_weights = np.concatenate(quantized_per_channel_data_list, channel_axis)
        q_weight_name = weight_name + TENSOR_NAME_QUANT_SUFFIX
        zp_name = weight_name + "_zero_point"
        scale_name = weight_name + "_scale"

        # Update packed weight, zero point, and scale initializers
        zero_scale_shape = [initializer.dims[channel_axis]]
        scale_initializer = onnx.helper.make_tensor(
            scale_name, initializer.data_type, zero_scale_shape, np.hstack(scale_list).tolist()
        )
        zero_initializer = onnx.helper.make_tensor(
            zp_name, weight_qType, zero_scale_shape, np.hstack(zero_point_list).tolist()
        )

        self.model.initializer_extend([scale_initializer, zero_initializer])

        if not keep_float_weight:
            if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4):
                if quantized_weights.dtype not in (np.int8, np.uint8):
                    raise RuntimeError(
                        f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values."
                    )

                # We do not use onnx.helper.pack_float32_to_4bit() due to performance.
                # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes.
                packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes()))

                # We only use onnx.helper.make_tensor with raw data due to a bug: https://github.com/onnx/onnx/pull/6161
                q_weight_initializer = onnx.helper.make_tensor(
                    q_weight_name, weight_qType, weights_shape, packed_data, raw=True
                )
                self.model.initializer_extend([q_weight_initializer])
            else:
                quantized_weights = np.asarray(
                    quantized_weights,
                    dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_qType),
                ).reshape(initializer.dims)
                q_weight_initializer = onnx.numpy_helper.from_array(quantized_weights, q_weight_name)
                self.model.initializer_extend([q_weight_initializer])

        return q_weight_name, zp_name, scale_name

    def adjust_tensor_ranges(self):
        if self.tensors_range is None:
            return

        for node in self.model.nodes():
            # adjust tensor_ranges for input of Clip and Relu node
            if node.op_type in ["Clip", "Relu"]:
                if not self.should_quantize_node(node):
                    continue
                if len(self.model.input_name_to_nodes()[node.input[0]]) != 1:
                    continue
                if node.input[0] not in self.tensors_range or node.output[0] not in self.tensors_range:
                    continue
                td = self.tensors_range[node.output[0]]
                if not isinstance(td, TensorData):
                    raise TypeError(f"Unexpected type {type(td)} for {node.output[0]!r}.")
                self.tensors_range[node.input[0]] = td
            # Adjust Softmax to range from 0.0 to 1.0
            elif node.op_type == "Softmax":
                if not self.should_quantize_node(node):
                    continue
                self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0))
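    # Editor's note: an illustrative effect of adjust_tensor_ranges (values invented): if calibration
    # recorded a Relu output range of [0.0, 4.2], the Relu input's range entry is overwritten with that
    # same TensorData, and any quantized Softmax output range is pinned to [0.0, 1.0].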
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
from .preprocess import qnn_preprocess_model  # noqa: F401
from .quant_config import get_qnn_qdq_config  # noqa: F401
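# Editor's note: an illustrative end-to-end flow (model paths and `data_reader` are placeholders)
# combining the two helpers re-exported above with the quantize() API:
from onnxruntime.quantization import QuantType, quantize
from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config, qnn_preprocess_model

changed = qnn_preprocess_model("model_fp32.onnx", "model_prep.onnx")
model_to_quantize = "model_prep.onnx" if changed else "model_fp32.onnx"

qnn_config = get_qnn_qdq_config(
    model_to_quantize,
    data_reader,  # a CalibrationDataReader, defined elsewhere
    activation_type=QuantType.QUInt16,
    weight_type=QuantType.QUInt8,
)
quantize(model_to_quantize, "model_qdq.onnx", qnn_config)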
@@ -0,0 +1,132 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import onnx

from ...fusions import Fusion
from ...onnx_model import ONNXModel


class FusionLpNormalization(Fusion):
    def __init__(self, model: ONNXModel, epsilon: float = 1e-12):
        super().__init__(model, "LpNormalization", "ReduceL2")
        self.epsilon = epsilon

    def fuse(
        self,
        reduce_node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """
        Interface function that tries to fuse a node sequence containing a ReduceL2 node into a single
        LpNormalization node.

        Pattern 1:
            [root] --> ReduceL2 -----> Clip --> Expand ----> Div -->
               |       (axis=-1)   (min=epsilon) (shape=root)  ^
               |       (keepdims=True)                         |
               |                                               |
               +-----------------------------------------------+
        Notes:
          - ReduceL2 must use the last axis, and keepdims == True
          - Clip must only have a min attribute that is ~1e-12
          - Expand must restore the shape to root.shape
          - The output of Expand must be the second input to Div.
        """
        if reduce_node.output[0] not in input_name_to_nodes:
            return

        # ReduceL2 must have one Clip child
        children = input_name_to_nodes[reduce_node.output[0]]
        if len(children) != 1 or children[0].op_type != "Clip":
            return

        # ReduceL2 must have keepdims == True
        keepdims = self.get_node_attribute(reduce_node, "keepdims")
        if not keepdims:
            return

        # ReduceL2 axes must refer only to the last dimension.
        # Axes became an input in opset 18. Before then, axes was an attribute.
        reduce_input_ttype = self.model.get_tensor_type(reduce_node.input[0])
        if not reduce_input_ttype:
            return

        reduce_input_shape = self.tensor_shape_to_list(reduce_input_ttype)
        if not reduce_input_shape:
            return

        axes = self.get_node_attribute(reduce_node, "axes")
        if not axes and len(reduce_node.input) > 1:
            axes = self.model.get_constant_value(reduce_node.input[1])

        if not axes or len(axes) != 1:
            return

        last_dim = len(reduce_input_shape) - 1
        if axes[0] != -1 and axes[0] != last_dim:
            return

        # Clip node must have a min attribute approximately equal to 1e-12
        clip_node = children[0]
        clip_min = self.get_node_attribute(clip_node, "min")
        if clip_min is None and len(clip_node.input) > 1:
            clip_min = self.model.get_constant_value(clip_node.input[1])

        clip_max = self.get_node_attribute(clip_node, "max")  # TODO: clip_max could be FLOAT_MAX
        if clip_max is None and len(clip_node.input) > 2:
            clip_max = self.model.get_constant_value(clip_node.input[2])

        if not (clip_max is None and clip_min is not None and clip_min > 0 and abs(clip_min - self.epsilon) < 1e-13):
            return

        if clip_node.output[0] not in input_name_to_nodes:
            return

        # Clip must have a single Expand child.
        children = input_name_to_nodes[clip_node.output[0]]
        if len(children) != 1 or children[0].op_type != "Expand":
            return

        expand_node = children[0]
        if expand_node.output[0] not in input_name_to_nodes:
            return

        # Expand must have a single Div child
        children = input_name_to_nodes[expand_node.output[0]]
        if len(children) != 1 or children[0].op_type != "Div":
            return

        div_node = children[0]

        # The first input to Div must be the root of the subgraph (i.e., reduce_node.input[0])
        # The second input to Div must be the output of the Expand.
        # As long as these two inputs go to the same Div node, then ONNX validation will ensure that
        # their shapes match.
        if div_node.input[0] != reduce_node.input[0]:
            return
        if div_node.input[1] != expand_node.output[0]:
            return

        subgraph_input = reduce_node.input[0]
        subgraph_output = div_node.output[0]

        subgraph_nodes = [reduce_node, clip_node, expand_node, div_node]
        if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
            return

        self.nodes_to_remove.extend(subgraph_nodes)
        fused_node = onnx.helper.make_node(
            self.fused_op_type,
            name=self.create_unique_node_name(),
            inputs=[subgraph_input],
            outputs=[subgraph_output],
            p=2,
            axis=-1,
        )
        self.nodes_to_add.append(fused_node)
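# Editor's note: a small numpy reference (not from this repository) for what the fused subgraph
# computes; LpNormalization with p=2 and axis=-1 corresponds to dividing by the (epsilon-clipped)
# L2 norm along the last axis:
import numpy as np


def lp_normalize_last_axis(x: np.ndarray, epsilon: float = 1e-12) -> np.ndarray:
    # Equivalent to the ReduceL2(keepdims) -> Clip(min=epsilon) -> Expand -> Div pattern above.
    norm = np.sqrt(np.sum(np.square(x), axis=-1, keepdims=True))
    return x / np.maximum(norm, epsilon)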
@@ -0,0 +1,162 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
"""Define SpaceToDepth fusion."""

import onnx

from ... import fusions, onnx_model


class FusionSpaceToDepth(fusions.Fusion):
    """Fusion for SpaceToDepth."""

    def __init__(self, model: onnx_model.ONNXModel):
        """Initialize.

        Args:
            model: An onnx_model.ONNXModel instance.
        """
        super().__init__(model, "SpaceToDepth", "Reshape")

    def _fuse_yolo(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Fuse for early version of YOLO.

        Pattern:

                 | [N, C, H, W]
              Reshape
                 | [N, C, H/blk, blk, W/blk, blk]
             Transpose
                 | [N, C, H/blk, W/blk, blk, blk]
              Reshape
                 | [N, C, H/blk * W/blk, blk * blk]
             Transpose
                 | [N, C, blk * blk, H/blk * W/blk]
              Reshape
                 | [N, C, blk * blk, H/blk, W/blk]
             Transpose
                 | [N, blk * blk, C, H/blk, W/blk]
              Reshape
                 | [N, blk * blk * C, H/blk, W/blk]

        This sequence can be fused into a single SpaceToDepth with blocksize `blk`. Note that unlike
        DepthToSpace, which supports DCR or CRD mode, SpaceToDepth only supports DCR mode in its latest
        opset version (13), which matches the pattern here.
        """
        reshape_node1 = node

        def get_target_child(parent_node, target_op_type):
            """Get target child of given node."""
            if parent_node.output[0] not in input_name_to_nodes:
                return None

            children = input_name_to_nodes[parent_node.output[0]]
            if len(children) > 1 or children[0].op_type != target_op_type:
                return None

            return children[0]

        if (
            (transpose_node1 := get_target_child(reshape_node1, "Transpose")) is None
            or (reshape_node2 := get_target_child(transpose_node1, "Reshape")) is None
            or (transpose_node2 := get_target_child(reshape_node2, "Transpose")) is None
            or (reshape_node3 := get_target_child(transpose_node2, "Reshape")) is None
            or (transpose_node3 := get_target_child(reshape_node3, "Transpose")) is None
            or (reshape_node4 := get_target_child(transpose_node3, "Reshape")) is None
        ):
            return False

        def get_tensor_shape(tensor_name):
            """Get shape for given tensor name."""
            tensor_type = self.model.get_tensor_type(tensor_name)
            if not tensor_type:
                return None

            tensor_shape = self.tensor_shape_to_list(tensor_type)
            if not tensor_shape:
                return None

            return tensor_shape

        if (
            (input_shape := get_tensor_shape(reshape_node1.input[0])) is None
            or (reshape_shape1 := get_tensor_shape(reshape_node1.output[0])) is None
            or (reshape_shape2 := get_tensor_shape(reshape_node2.output[0])) is None
            or (reshape_shape3 := get_tensor_shape(reshape_node3.output[0])) is None
            or (reshape_shape4 := get_tensor_shape(reshape_node4.output[0])) is None
        ):
            return False

        transpose_perm1 = self.get_node_attribute(transpose_node1, "perm")
        transpose_perm2 = self.get_node_attribute(transpose_node2, "perm")
        transpose_perm3 = self.get_node_attribute(transpose_node3, "perm")

        # Check rank.
        if (
            len(input_shape) != 4
            or len(reshape_shape1) != 6
            or len(reshape_shape2) != 4
            or len(reshape_shape3) != 5
            or len(reshape_shape4) != 4
        ):
            return False

        # Check shape and perm.
        batch, channel, height, width = input_shape
        blocksize = reshape_shape1[3]
        if (
            reshape_shape1 != [batch, channel, height // blocksize, blocksize, width // blocksize, blocksize]
            or transpose_perm1 != [0, 1, 2, 4, 3, 5]
            or reshape_shape2 != [batch, channel, (height // blocksize) * (width // blocksize), blocksize**2]
            or transpose_perm2 != [0, 1, 3, 2]
            or reshape_shape3 != [batch, channel, blocksize**2, height // blocksize, width // blocksize]
            or transpose_perm3 != [0, 2, 1, 3, 4]
            or reshape_shape4 != [batch, blocksize**2 * channel, height // blocksize, width // blocksize]
        ):
            return False

        self.nodes_to_remove.extend(
            [
                reshape_node1,
                transpose_node1,
                reshape_node2,
                transpose_node2,
                reshape_node3,
                transpose_node3,
                reshape_node4,
            ]
        )

        s2d_node = onnx.helper.make_node(
            self.fused_op_type,
            name=self.create_unique_node_name(),
            inputs=[reshape_node1.input[0]],
            outputs=[reshape_node4.output[0]],
            blocksize=blocksize,
        )
        self.nodes_to_add.append(s2d_node)

        return True

    def fuse(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Fuse a sequence of Reshape and Transpose nodes into a single SpaceToDepth node.

        Args:
            node: An onnx.NodeProto matching the specified search type (i.e., Reshape).
            input_name_to_nodes: A dict mapping tensor name to consumed nodes.
            output_name_to_node: A dict mapping tensor name to produced node.
        """
        self._fuse_yolo(node, input_name_to_nodes, output_name_to_node)
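# Editor's note: a numpy reference (illustrative, not from this repository) for the SpaceToDepth
# result that the fused node produces from the Reshape/Transpose chain documented above:
import numpy as np


def space_to_depth(x: np.ndarray, blk: int) -> np.ndarray:
    n, c, h, w = x.shape
    y = x.reshape(n, c, h // blk, blk, w // blk, blk)
    y = y.transpose(0, 3, 5, 1, 2, 4)  # blocks first, matching the [N, blk*blk, C, ...] ordering
    return y.reshape(n, c * blk * blk, h // blk, w // blk)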
@@ -0,0 +1,413 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
|
||||
import onnx
|
||||
|
||||
from ...quant_utils import QuantType
|
||||
from ...tensor_quant_overrides import QuantTypeInfo, TensorQuantOverridesHelper
|
||||
|
||||
|
||||
@dataclass
|
||||
class TensorTypeRequest:
|
||||
"""
|
||||
Bundles desired quantization type requests for a tensor. A distinction is made between the
|
||||
produced type and the consumed type.
|
||||
"""
|
||||
|
||||
# The tensor's quant type at the producer end. If None, assumed to be the default activation quant type.
|
||||
producer: QuantTypeInfo | None
|
||||
|
||||
# The tensor's quant type received by a set of consumer nodes.
|
||||
# If None, assumed to be the default activation quant type for all consumers.
|
||||
# consumers[1] is a set of consumer node names.
|
||||
consumers: tuple[QuantTypeInfo, set[str]] | None
|
||||
|
||||
|
||||
class MixedPrecisionTensorQuantOverridesFixer:
|
||||
"""
|
||||
Helper that generates tensor quantization overrides for mixed-precision QDQ models.
|
||||
|
||||
Specifically, this helper fixes an initial set of quantization overrides that assign a non-default
|
||||
activation quantization type to one or more tensors by doing the following:
|
||||
- Inferring which other tensors need to be overridden to the non-default activation quantization type.
|
||||
- Inserting quantization data type conversions.
|
||||
|
||||
Example:
|
||||
--------
|
||||
|
||||
Float model:
|
||||
|
||||
input_0 --> Op1 --> Op3 --> Op5 --> Op6 --> output_0
|
||||
^
|
||||
|
|
||||
input_1 --> Op2 -+-> Op4 ----+
|
||||
|
|
||||
+-> Op7 --> output_1
|
||||
|
|
||||
+-> Op8 --> output_2
|
||||
|
||||
If we'd like to quantize this model to uint8 precision, but would like to make sure tensor "Op4_out"
|
||||
is quantized to 16-bit, then we would specify the following initial tensor quantization overrides:
|
||||
|
||||
```
|
||||
init_overrides = {"Op4_out": [{"quant_type": QuantType.QUInt16}]}
|
||||
```
|
||||
|
||||
These initial overrides may not create a valid model because Op4 and Op5 may require both the input and output
|
||||
to be the same type (e.g., uint16). This helper fixes the overrides so that input/output data types
|
||||
are valid:
|
||||
|
||||
```
|
||||
overrides = TensorQuantOverridesHelper(init_overrides)
|
||||
|
||||
fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(overrides, model, QuantType.QUInt8)
|
||||
fixer.apply(
|
||||
default_activation_qtype=QuantType.QUInt8,
|
||||
default_activation_symmetric=False,
|
||||
)
|
||||
```
|
||||
|
||||
The above snippet generates the following "fixed" overrides (get via overrides.get_dict()):
|
||||
|
||||
{
|
||||
"Op2_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op4"}}}],
|
||||
"Op3_out": [{"quant_type": QUInt8, "convert": {"quant_type": QUInt16, "recv_nodes": {"Op5"}}}],
|
||||
"Op4_out": [{"quant_type": QUInt16}],
|
||||
"Op5_out": [{"quant_type": QUInt16, "convert": {"quant_type": QUInt8, "recv_nodes": {"Op6"}}}]
|
||||
}
|
||||
|
||||
How to interpret the fixed overrides:
|
||||
- Op2's output is consumed by Op4, Op7, and Op8. Op4 consumes the converted u16 type,
|
||||
but Op7 and Op8 consume the original u8 type.
|
||||
- Op3's output is converted from u8 to u16. Op5 consumes the converted u16 type.
|
||||
- Op4's output is just u16 (not converted). All consumers of Op4_out get the u16 type.
|
||||
- Op5's output is converted from u16 to u8. Op6 consumes the u8 type.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
overrides: TensorQuantOverridesHelper,
|
||||
producers: dict[str, onnx.NodeProto],
|
||||
consumers: dict[str, list[onnx.NodeProto]],
|
||||
value_infos: dict[str, onnx.ValueInfoProto],
|
||||
initializers: dict[str, onnx.TensorProto],
|
||||
):
|
||||
"""
|
||||
Params:
|
||||
overrides: The initial tensor quantization overrides to fix.
|
||||
producers: Dictionary that maps a tensor name to the producer node that generates the tensor.
|
||||
consumers: Dictionary that maps a tensor name to the consumer nodes that take the tensor as input.
|
||||
value_infos: Dictionary that maps a tensor name to its onnx.ValueInfoProto.
|
||||
initializers: Dictionary that maps an initializer name to its onnx.TensorProto.
|
||||
"""
|
||||
self.overrides = overrides
|
||||
self.consumers = consumers
|
||||
self.producers = producers
|
||||
self.value_infos = value_infos
|
||||
self.initializers = initializers
|
||||
|
||||
@staticmethod
|
||||
def create_from_model(
|
||||
overrides: TensorQuantOverridesHelper, model: onnx.ModelProto, default_activation_qtype: QuantType
|
||||
) -> MixedPrecisionTensorQuantOverridesFixer:
|
||||
"""
|
||||
Helper function that creates an instance of this class from a loaded ONNX model.
|
||||
|
||||
Params:
|
||||
overrides: The initial tensor quantization overrides to fix.
|
||||
model: Loaded ONNX model
|
||||
default_activation_qtype: The intended default activation quantization type.
|
||||
Used to validate the initial overrides.
|
||||
|
||||
Returns:
|
||||
Initialized MixedPrecisionTensorQuantOverridesFixer object
|
||||
"""
|
||||
model = onnx.shape_inference.infer_shapes(model) # Need to infer shapes to get value_infos
|
||||
|
||||
# Build dictionaries that enable convenient lookups of initializers and value_infos by name.
|
||||
initializers = {initializer.name: initializer for initializer in model.graph.initializer}
|
||||
value_infos = {vi.name: vi for vi in model.graph.value_info}
|
||||
value_infos.update({ot.name: ot for ot in model.graph.output})
|
||||
value_infos.update({it.name: it for it in model.graph.input})
|
||||
|
||||
# Ensure that the user-provided initial overrides are actually valid.
|
||||
valid, err = overrides.is_valid(initializers, set(value_infos), default_activation_qtype)
|
||||
if not valid:
|
||||
pprint_overrides = overrides.pprint_str(indent=4)
|
||||
logging.error(f"Provided invalid tensor quantization overrides:\n{pprint_overrides}")
|
||||
raise ValueError(err)
|
||||
|
||||
consumers = {}
|
||||
producers = {}
|
||||
|
||||
# Build dictionaries that map a tensor name to the consumer or producer nodes.
|
||||
for node in model.graph.node:
|
||||
for input_name in node.input:
|
||||
if input_name:
|
||||
if input_name not in consumers:
|
||||
consumers[input_name] = []
|
||||
|
||||
consumers[input_name].append(node)
|
||||
|
||||
for output_name in node.output:
|
||||
producers[output_name] = node
|
||||
|
||||
return MixedPrecisionTensorQuantOverridesFixer(overrides, producers, consumers, value_infos, initializers)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
default_activation_qtype: QuantType,
|
||||
default_activation_symmetric: bool,
|
||||
):
|
||||
"""
|
||||
Fixes the initial tensor quantization overrides (in-place) for use in mixed-precision QDQ models.
|
||||
|
||||
Params:
|
||||
default_activation_qtype: The intended default activation quantization type.
|
||||
default_activation_symmetric: The intended default symmetry used to quantize activations.
|
||||
"""
|
||||
type_requests = self.get_desired_tensor_types(default_activation_qtype, default_activation_symmetric)
|
||||
|
||||
# Use type requests to "fix" tensor quantization overrides by adding
|
||||
# quantization type conversions where necessary.
|
||||
for tensor_name, type_req in type_requests.items():
|
||||
all_consumers = {node.name for node in self.consumers.get(tensor_name, [])}
|
||||
has_producer_req = type_req.producer is not None
|
||||
has_consumer_req = bool(type_req.consumers)
|
||||
|
||||
# Only producer type: Add conversion back to default activation type
|
||||
if has_producer_req and not has_consumer_req:
|
||||
self._update_converted_tensor(
|
||||
tensor_name, type_req.producer, QuantTypeInfo(default_activation_qtype), all_consumers
|
||||
)
|
||||
# Only consumers
|
||||
elif not has_producer_req and has_consumer_req:
|
||||
prod_type_info = self.overrides.get_node_output_qtype_info(tensor_name, default_activation_qtype)
|
||||
consumer_type_info = type_req.consumers[0]
|
||||
|
||||
if prod_type_info != consumer_type_info:
|
||||
self._update_converted_tensor(
|
||||
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
|
||||
)
|
||||
else:
|
||||
if not self._check_nodes_are_not_convert_consumers(tensor_name, type_req.consumers[1]):
|
||||
raise ValueError(
|
||||
f"Tensor override for '{tensor_name}' converts the type for consumers that need the original type."
|
||||
)
|
||||
# Both producer and consumers
|
||||
elif has_producer_req and has_consumer_req:
|
||||
prod_type_info = type_req.producer
|
||||
consumer_type_info = type_req.consumers[0]
|
||||
|
||||
if prod_type_info != consumer_type_info:
|
||||
self._update_converted_tensor(
|
||||
tensor_name, prod_type_info, consumer_type_info, type_req.consumers[1]
|
||||
)
|
||||
else:
|
||||
consumers_for_original_type = all_consumers.difference(type_req.consumers[1])
|
||||
|
||||
if len(consumers_for_original_type) == 0:
|
||||
# All consumers want the overridden type, so no need for convert nodes!
|
||||
# Just add the override to the new new if not already present.
|
||||
if tensor_name not in self.overrides:
|
||||
self.overrides[tensor_name] = [{}]
|
||||
prod_type_info.save_to_dict(self.overrides[tensor_name][0])
|
||||
|
||||
assert "convert" not in self.overrides[tensor_name][0]
|
||||
else:
|
||||
# Some consumers don't want the overridden type.
|
||||
self._update_converted_tensor(
|
||||
tensor_name,
|
||||
prod_type_info,
|
||||
QuantTypeInfo(default_activation_qtype),
|
||||
consumers_for_original_type,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"TypeRequest for tensor {tensor_name} has no producer or consumers.")
|
||||
|
||||
# Done. Check if the overrides are valid.
|
||||
valid, err = self.overrides.is_valid(self.initializers, set(self.value_infos), default_activation_qtype)
|
||||
if not valid:
|
||||
pprint_overrides = self.overrides.pprint_str(indent=4)
|
||||
logging.error(
|
||||
f"Generated invalid tensor quantization overrides for mixed-precision QDQ model:\n{pprint_overrides}"
|
||||
)
|
||||
raise ValueError(err)
|
||||
|
||||
def get_desired_tensor_types(
|
||||
self,
|
||||
default_activation_qtype: QuantType,
|
||||
default_activation_symmetric: bool,
|
||||
) -> dict[str, TensorTypeRequest]:
|
||||
"""
|
||||
Iterates through the initial tensor quantization overrides and builds a set of TensorTypeRequests objects
|
||||
that describe the quantization types required at each tensor. These TensorTypeRequests objects are ultimately
|
||||
used to generated the "fixed" overrides.
|
||||
|
||||
Params:
|
||||
default_activation_qtype: The intended default activation quantization type.
|
||||
default_activation_symmetric: The intended default symmetry used to quantize activations.
|
||||
|
||||
Returns:
|
||||
TensorTypeRequest objects as a dict that maps a tensor name to its requested types.
|
||||
"""
|
||||
type_requests = {}
|
||||
default_activation_type_info = QuantTypeInfo(default_activation_qtype, default_activation_symmetric)
|
||||
|
||||
# Scan tensor overrides for type conversion requests.
|
||||
for tensor_name, override_list in self.overrides.items():
|
||||
if not self.__is_tensor_quantizable(tensor_name):
|
||||
continue # Skip non-quantizable tensors (e.g., not a float)
|
||||
|
||||
if tensor_name in self.initializers:
|
||||
continue # Skip initializers
|
||||
|
||||
if not override_list or len(override_list) > 1:
|
||||
continue # Skip per-channel stuff
|
||||
|
||||
override_dict = override_list[0]
|
||||
quant_type_info = QuantTypeInfo.load_from_dict(override_dict, default_activation_type_info.quant_type)
|
||||
producer_node = self.producers.get(tensor_name) # None if this is a model input
|
||||
|
||||
if quant_type_info != default_activation_type_info and "convert" not in override_dict:
|
||||
if producer_node is not None:
|
||||
self._add_type_requests_for_node(type_requests, quant_type_info, producer_node)
|
||||
|
||||
# Find all consumer nodes of `tensor_name` and update their inputs/outputs to the new type.
|
||||
for consumer_node in self.consumers.get(tensor_name, []):
|
||||
self._add_type_requests_for_node(type_requests, quant_type_info, consumer_node)
|
||||
|
||||
return type_requests
|
||||
|
||||
def _add_type_requests_for_node(
|
||||
self,
|
||||
type_requests: dict[str, TensorTypeRequest],
|
||||
quant_type_info: QuantTypeInfo,
|
||||
node: onnx.NodeProto,
|
||||
):
|
||||
"""
|
||||
Adds TensorTypeRequest objects for a given node, assuming that we want all its inputs and outputs
|
||||
to have the same quantization type (as specified by the `quant_type_info` parameter).
|
||||
|
||||
Params:
|
||||
type_requests: Dictionary of type requests to append to for this node.
|
||||
quant_type_info: The quantization type to use for inputs and outputs.
|
||||
node: The node for which the TensorTypeRequest objects are created and added to type_requests.
|
||||
"""
|
||||
# Add output side
|
||||
for output_name in node.output:
|
||||
if not self.__is_tensor_quantizable(output_name):
|
||||
continue
|
||||
|
||||
if output_name not in type_requests:
|
||||
type_requests[output_name] = TensorTypeRequest(quant_type_info, None)
|
||||
else:
|
||||
if (
|
||||
type_requests[output_name].producer is not None
|
||||
and type_requests[output_name].producer != quant_type_info
|
||||
):
|
||||
raise ValueError(f"Tensor {output_name} has multiple types.")
|
||||
|
||||
type_requests[output_name].producer = quant_type_info
|
||||
|
||||
# Add the consumer side
|
||||
for input_name in node.input:
|
||||
if input_name and input_name not in self.initializers and self.__is_tensor_quantizable(input_name):
|
||||
if input_name not in type_requests:
|
||||
type_requests[input_name] = TensorTypeRequest(None, None)
|
||||
|
||||
if type_requests[input_name].consumers is None:
|
||||
type_requests[input_name].consumers = (quant_type_info, set())
|
||||
|
||||
if type_requests[input_name].consumers[0] != quant_type_info:
|
||||
raise ValueError(f"Tensor {input_name} has consumers requesting different types.")
|
||||
|
||||
if not node.name:
|
||||
raise ValueError(
|
||||
f"Node of type {node.op_type} with output 0 {node.output[0]} does not have a name!"
|
||||
)
|
||||
|
||||
type_requests[input_name].consumers[1].add(node.name)
|
||||
|
||||
def _update_converted_tensor(
|
||||
self,
|
||||
tensor_name: str,
|
||||
producer_type_info: QuantTypeInfo,
|
||||
consumer_type_info: QuantTypeInfo,
|
||||
consumer_names: set[str],
|
||||
):
|
||||
"""
|
||||
Updates the tensor quantization overrides for a tensor that is converted from one type to another.
|
||||
|
||||
Params:
|
||||
tensor_name: The name of the tensor for which to update overrides.
|
||||
producer_type_info: Info for the tensor's produced type.
|
||||
consumer_type_info: Info for the tensor's consumed (i.e., converted) type.
|
||||
consumer_names: Names of the consumer nodes that consume the converted type.
|
||||
"""
|
||||
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
|
||||
self.overrides[tensor_name] = [{}]
|
||||
producer_type_info.save_to_dict(self.overrides[tensor_name][0])
|
||||
|
||||
overrides = self.overrides[tensor_name][0]
|
||||
if producer_type_info != QuantTypeInfo.load_from_dict(overrides):
|
||||
raise ValueError(f"Desired producer quant_type for {tensor_name} doesn't match existing type.")
|
||||
|
||||
if consumer_names:
|
||||
if "convert" not in overrides:
|
||||
overrides["convert"] = {}
|
||||
consumer_type_info.save_to_dict(overrides["convert"])
|
||||
|
||||
convert_dict = overrides["convert"]
|
||||
if consumer_type_info != QuantTypeInfo.load_from_dict(convert_dict):
|
||||
raise ValueError(f"Desired consumer quant_type for {tensor_name} doesn't match existing type.")
|
||||
|
||||
if "recv_nodes" not in convert_dict:
|
||||
convert_dict["recv_nodes"] = set()
|
||||
|
||||
convert_dict["recv_nodes"].update(consumer_names)
|
||||
|
||||
def _check_nodes_are_not_convert_consumers(self, tensor_name: str, node_names: set[str]):
|
||||
"""
|
||||
Returns true if the given nodes do not consume/receive a converted quantization type.
|
||||
|
||||
Params:
|
||||
tensor_name: The name of the tensor to check.
|
||||
node_names: Set of node names that should not be consumers of the converted type.
|
||||
"""
|
||||
if tensor_name not in self.overrides or not self.overrides[tensor_name]:
|
||||
return True
|
||||
|
||||
overrides = self.overrides[tensor_name][0]
|
||||
|
||||
if "convert" not in overrides:
|
||||
return True
|
||||
|
||||
convert_dict = overrides["convert"]
|
||||
|
||||
if "recv_nodes" not in convert_dict:
|
||||
return False
|
||||
|
||||
return not convert_dict["recv_nodes"].intersection(node_names)
|
||||
|
||||
def __is_tensor_quantizable(self, tensor_name):
|
||||
weight = self.initializers.get(tensor_name)
|
||||
if weight is not None:
|
||||
if weight.data_type in (onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16):
|
||||
return True
|
||||
elif tensor_name in self.value_infos:
|
||||
vi = self.value_infos[tensor_name]
|
||||
if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type in (
|
||||
onnx.TensorProto.FLOAT,
|
||||
onnx.TensorProto.FLOAT16,
|
||||
):
|
||||
return True
|
||||
|
||||
return False
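# Illustrative sketch (not part of the original file): the override entry that _update_converted_tensor
# builds for an activation whose producer keeps the default 8-bit type while the consumers listed in
# "recv_nodes" receive a converted 16-bit copy. The tensor and node names are hypothetical, and the
# QuantType import path is an assumption made only to keep the example self-contained.
def _example_converted_tensor_override():
    from onnxruntime.quantization import QuantType  # assumed import path, shown for illustration only

    return {
        "layer1_out": [  # hypothetical activation tensor name
            {
                "quant_type": QuantType.QUInt8,  # type produced by the upstream node
                "convert": {
                    "quant_type": QuantType.QUInt16,  # type seen by the converted consumers
                    "recv_nodes": {"Conv_5", "Add_7"},  # hypothetical consumer node names
                },
            }
        ]
    }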
@@ -0,0 +1,335 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

import logging
from pathlib import Path

import onnx

from ....tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed
from ....tools.remove_initializer_from_input import remove_initializer_from_input
from ...fusions import FusionGelu, FusionLayerNormalization
from ...onnx_model import ONNXModel
from ...quant_utils import save_and_reload_model_with_shape_infer
from .fusion_lpnorm import FusionLpNormalization
from .fusion_spacetodepth import FusionSpaceToDepth
|
||||
|
||||
def qnn_preprocess_model(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
exclude_initializer_from_input: bool = False,
|
||||
fuse_layernorm: bool = False,
|
||||
save_as_external_data: bool = False,
|
||||
all_tensors_to_one_file: bool = False,
|
||||
external_data_location: str | None = None,
|
||||
external_data_size_threshold: int = 1024,
|
||||
external_data_convert_attribute: bool = False,
|
||||
inputs_to_make_channel_last: list[str] | None = None,
|
||||
outputs_to_make_channel_last: list[str] | None = None,
|
||||
dynamic_input_shapes: list[tuple[str, str]] | None = None,
|
||||
) -> bool:
|
||||
"""
|
||||
If necessary, this method creates a new "pre-processed" model in preparation for
|
||||
quantization of a model to be used in QNN EP. Returns true if a new model was created.
|
||||
|
||||
This method performs the following operations:
|
||||
- Fuse Erf sequence into a single Gelu node.
|
||||
- Fuse ReduceL2 sequence into a single LpNormalization node (p == 2).
|
||||
- (Optional) Fuse ReduceMean sequence into a single LayerNormalization node.
|
||||
|
||||
Args:
|
||||
model_input: Path to the input model file or ModelProto.
|
||||
model_output: Path to the output model file, which is only created if this method returns True.
|
||||
exclude_initializer_from_input: A bool specifying whether to exclude initializers from the graph inputs.
|
||||
Defaults to False.
|
||||
fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes.
|
||||
Defaults to False.
|
||||
save_as_external_data: True if output model should be saved with external data. Defaults to false.
|
||||
all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false.
|
||||
If true, save all tensors to one external file specified by external_data_location.
|
||||
If false, save each tensor to a file named with the tensor name.
|
||||
external_data_location: Effective only if save_as_external_data is true. Defaults to None.
|
||||
Specify the external file to which all tensors are saved. Path is relative
|
||||
to the model path. If not specified, the model's name is used.
|
||||
external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024.
|
||||
Tensors with a data size >= external_data_size_threshold are converted to external data.
|
||||
To convert every tensor with raw data to external data, set to 0.
|
||||
external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false.
|
||||
If true, convert all tensors to external data.
|
||||
If false, convert only non-attribute tensors to external data.
|
||||
inputs_to_make_channel_last: List of graph input names to transpose to be "channel-last". For example,
|
||||
if "input0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change input0's
|
||||
shape to (N, D1, D2, ..., Dn, C) and add a transpose node after it.
|
||||
|
||||
Original:
|
||||
input0 (N, C, D1, D2, ..., Dn) --> <Nodes>
|
||||
|
||||
Updated:
|
||||
input0 (N, D1, D2, ..., Dn, C) --> Transpose --> input0_chanfirst (N, C, D1, D2, ..., Dn) --> <Nodes>
|
||||
|
||||
This can potentially improve inference latency for QDQ models running on QNN EP because the
|
||||
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
|
||||
to cancel out.
|
||||
outputs_to_make_channel_last: List of graph output names to transpose to be "channel-last". For example,
|
||||
if "output0" originally has the shape (N, C, D1, D2, ..., Dn), the resulting model will change output0's
|
||||
shape to (N, D1, D2, ..., Dn, C) and add a transpose node before it.
|
||||
|
||||
Original:
|
||||
<Nodes> --> output0 (N, C, D1, D2, ..., Dn)
|
||||
|
||||
Updated:
|
||||
<Nodes> --> output0_chanfirst (N, C, D1, D2, ..., Dn) --> Transpose --> output0 (N, D1, D2, ..., Dn, C)
|
||||
|
||||
This can potentially improve inference latency for QDQ models running on QNN EP because the
|
||||
additional transpose node may allow other transpose nodes inserted during ORT layout transformation
|
||||
to cancel out.
|
||||
dynamic_input_shapes: A list of tuples, each specifying a model input name and its static shape in comma-separated
format, for example: [('input', '1,3,256,256')]. Defaults to None.
|
||||
"""
|
||||
modified = False
|
||||
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input)
|
||||
model = save_and_reload_model_with_shape_infer(model)
|
||||
onnx_model = ONNXModel(model)
|
||||
|
||||
# Optionally, fix the dynamic input shapes.
|
||||
if dynamic_input_shapes:
|
||||
for input_name, input_shape_str in dynamic_input_shapes:
|
||||
input_shape = [int(i) for i in input_shape_str.split(",")]
|
||||
make_input_shape_fixed(onnx_model.graph(), input_name, input_shape)
|
||||
fix_output_shapes(onnx_model.model)
|
||||
modified = True
|
||||
|
||||
# Exclude initializer from input if model.ir_version >= 4
|
||||
if exclude_initializer_from_input:
|
||||
modified |= remove_initializer_from_input(onnx_model.model)
|
||||
|
||||
# Fuse Erf sequence into a single Gelu
|
||||
fusion_gelu = FusionGelu(onnx_model)
|
||||
if fusion_gelu.apply():
|
||||
modified = True
|
||||
|
||||
# Fuse ReduceL2 sequence into a single LpNormalization node with p == 2.
|
||||
fusion_lpnorm = FusionLpNormalization(onnx_model)
|
||||
if fusion_lpnorm.apply():
|
||||
modified = True
|
||||
|
||||
# Fuse Reshape/Transpose sequence into a single SpaceToDepth.
|
||||
fusion_s2d = FusionSpaceToDepth(onnx_model)
|
||||
if fusion_s2d.apply():
|
||||
modified = True
|
||||
|
||||
# Optionally, fuse ReduceMean sequence into a single LayerNormalization node.
|
||||
if fuse_layernorm:
|
||||
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
|
||||
|
||||
# Need opset >= 17 to use LayerNormalization.
|
||||
if onnx_opset.version < 17:
|
||||
logging.warning(
|
||||
"Unable to fuse ReduceMean sequence into a LayerNormalization node. "
|
||||
"ONNX model must use an opset >= 17 in order to use LayerNormalization, "
|
||||
f"but found version {onnx_opset.version}. Please use onnx.version_converter to update your model."
|
||||
)
|
||||
else:
|
||||
fusion_layernorm = FusionLayerNormalization(onnx_model)
|
||||
if fusion_layernorm.apply():
|
||||
modified = True
|
||||
|
||||
# Optionally, transpose inputs and/or outputs to make them "channel-last".
|
||||
if inputs_to_make_channel_last or outputs_to_make_channel_last:
|
||||
transpose_node_prefix = "Transpose_channel_"
|
||||
transpose_node_suffix: int = onnx_model.get_largest_node_name_suffix(transpose_node_prefix) + 1
|
||||
update_io_to_channel_last(
|
||||
onnx_model.model,
|
||||
inputs_to_make_channel_last,
|
||||
outputs_to_make_channel_last,
|
||||
transpose_node_name_prefix=transpose_node_prefix,
|
||||
transpose_node_name_start_suffix=transpose_node_suffix,
|
||||
)
|
||||
modified = True
|
||||
|
||||
# Make sure all nodes have a name.
|
||||
unnamed_node_prefix = "qnn_preproc_node_"
|
||||
available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1
|
||||
for node in onnx_model.model.graph.node:
|
||||
if node.op_type != "Constant" and not node.name:
|
||||
new_node_name = f"{unnamed_node_prefix}{available_suffix!s}"
|
||||
available_suffix += 1
|
||||
node.name = new_node_name
|
||||
modified = True
|
||||
logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.")
|
||||
|
||||
if modified:
|
||||
onnx_model.topological_sort()
|
||||
onnx.save_model(
|
||||
model,
|
||||
model_output,
|
||||
save_as_external_data=save_as_external_data,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
location=external_data_location,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=external_data_convert_attribute,
|
||||
)
|
||||
|
||||
return modified
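# Illustrative usage sketch (not part of the original file). The file names, input name, and shape
# below are hypothetical; only keyword arguments documented above are used.
def _example_qnn_preprocess():
    changed = qnn_preprocess_model(
        "model.onnx",  # hypothetical input model path
        "model.preproc.onnx",  # hypothetical output path, written only if the model was modified
        fuse_layernorm=True,  # fuse ReduceMean sequences when the model opset is >= 17
        inputs_to_make_channel_last=["input0"],  # hypothetical graph input to expose as channel-last
        dynamic_input_shapes=[("input0", "1,3,256,256")],  # pin a dynamic shape before quantization
    )
    return changed  # True only if the pre-processed model was created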
|
||||
|
||||
|
||||
class InputOutputNameMap:
|
||||
def __init__(
|
||||
self,
|
||||
orig_tensor_names: set[str],
|
||||
orig_graph_inputs: dict[str, onnx.ValueInfoProto],
|
||||
orig_graph_outputs: dict[str, onnx.ValueInfoProto],
|
||||
):
|
||||
self.orig_tensor_names = orig_tensor_names
|
||||
self.orig_graph_inputs = orig_graph_inputs
|
||||
self.orig_graph_outputs = orig_graph_outputs
|
||||
self.updated_io_names = {}
|
||||
self.new_value_infos = []
|
||||
|
||||
def get_new_name(self, orig_name: str):
|
||||
if orig_name in self.updated_io_names:
|
||||
return self.updated_io_names[orig_name]
|
||||
|
||||
# Make a new tensor name that is unique among all tensors in the graph.
|
||||
prefix: str = f"{orig_name}_channel_first_"
|
||||
suffix: int = -1
|
||||
for tensor_name in self.orig_tensor_names:
|
||||
if tensor_name.startswith(prefix) and tensor_name[len(prefix) :].isdigit():
|
||||
index = int(tensor_name[len(prefix) :])
|
||||
suffix = max(suffix, index)
|
||||
|
||||
suffix += 1 # This is the first available suffix.
|
||||
new_name = f"{prefix}{suffix!s}"
|
||||
|
||||
# Add new value_info objects for these new tensors.
|
||||
orig_value_info = self.orig_graph_inputs.get(orig_name) or self.orig_graph_outputs[orig_name]
|
||||
value_info_proto = onnx.ValueInfoProto()
|
||||
value_info_proto.CopyFrom(orig_value_info)
|
||||
value_info_proto.name = new_name
|
||||
self.new_value_infos.append(value_info_proto)
|
||||
|
||||
self.updated_io_names[orig_name] = new_name
|
||||
return self.updated_io_names[orig_name]
|
||||
|
||||
|
||||
def update_io_to_channel_last(
|
||||
model: onnx.ModelProto,
|
||||
inputs_to_update: list[str] | None,
|
||||
outputs_to_update: list[str] | None,
|
||||
transpose_node_name_prefix: str = "Transpose_channel_",
|
||||
transpose_node_name_start_suffix: int = 0,
|
||||
):
|
||||
inputs_to_update = set(inputs_to_update or [])
|
||||
outputs_to_update = set(outputs_to_update or [])
|
||||
|
||||
if not inputs_to_update and not outputs_to_update:
|
||||
return
|
||||
|
||||
graph = model.graph
|
||||
orig_graph_inputs = {ginput.name: ginput for ginput in graph.input}
|
||||
orig_graph_outputs = {goutput.name: goutput for goutput in graph.output}
|
||||
|
||||
# Check that the user passed in actual input and output names.
|
||||
for input_name in inputs_to_update:
|
||||
if input_name not in orig_graph_inputs:
|
||||
raise ValueError(f"{input_name} is not a graph input")
|
||||
|
||||
for output_name in outputs_to_update:
|
||||
if output_name not in orig_graph_outputs:
|
||||
raise ValueError(f"{output_name} is not a graph output")
|
||||
|
||||
orig_tensor_names = set()
|
||||
orig_tensor_names.update(set(orig_graph_inputs))
|
||||
orig_tensor_names.update(set(orig_graph_outputs))
|
||||
orig_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
|
||||
|
||||
# Maps original input (or output) name to its updated name used within the graph.
|
||||
io_map = InputOutputNameMap(orig_tensor_names, orig_graph_inputs, orig_graph_outputs)
|
||||
|
||||
# Update each node's inputs/outputs to use the transposed versions.
|
||||
for node in graph.node:
|
||||
for i in range(len(node.input)):
|
||||
if node.input[i] and node.input[i] in inputs_to_update:
|
||||
node.input[i] = io_map.get_new_name(node.input[i])
|
||||
elif node.input[i] and node.input[i] in outputs_to_update:
|
||||
node.input[i] = io_map.get_new_name(node.input[i])
|
||||
|
||||
for i in range(len(node.output)):
|
||||
if node.output[i] in outputs_to_update:
|
||||
node.output[i] = io_map.get_new_name(node.output[i])
|
||||
|
||||
# Update graph inputs to channel-last and insert a Transpose (to channel-first) after each.
|
||||
for g_input_name in inputs_to_update:
|
||||
g_input = orig_graph_inputs[g_input_name]
|
||||
|
||||
if not g_input.type.HasField("tensor_type") or not g_input.type.tensor_type.HasField("shape"):
|
||||
raise ValueError(f"Expected input {g_input.name} to have a tensor_type with a shape")
|
||||
|
||||
input_shape = g_input.type.tensor_type.shape
|
||||
input_rank = len(input_shape.dim)
|
||||
|
||||
if input_rank < 3:
|
||||
raise ValueError(f"Expected input {g_input.name} to be of rank >= 3")
|
||||
|
||||
channel_dim = onnx.TensorShapeProto.Dimension()
|
||||
channel_dim.CopyFrom(input_shape.dim[1])
|
||||
for i in range(1, input_rank - 1):
|
||||
input_shape.dim[i].CopyFrom(input_shape.dim[i + 1])
|
||||
input_shape.dim[input_rank - 1].CopyFrom(channel_dim)
|
||||
|
||||
transpose_perm = list(range(input_rank))
|
||||
for i in range(input_rank):
|
||||
transpose_perm[i] = i if i < 1 else i - 1
|
||||
transpose_perm[1] = input_rank - 1
|
||||
|
||||
transpose_node = onnx.helper.make_node(
|
||||
"Transpose",
|
||||
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
|
||||
inputs=[g_input.name],
|
||||
outputs=[io_map.get_new_name(g_input.name)],
|
||||
perm=transpose_perm,
|
||||
)
|
||||
transpose_node_name_start_suffix += 1
|
||||
|
||||
graph.node.extend([transpose_node])
|
||||
|
||||
# Update graph outputs to channel-last and insert a Transpose (from channel-first) before each.
|
||||
for g_output_name in outputs_to_update:
|
||||
g_output = orig_graph_outputs[g_output_name]
|
||||
if not g_output.type.HasField("tensor_type") or not g_output.type.tensor_type.HasField("shape"):
|
||||
raise ValueError(f"Expected output {g_output.name} to have a tensor_type with a shape")
|
||||
|
||||
output_shape = g_output.type.tensor_type.shape
|
||||
output_rank = len(output_shape.dim)
|
||||
|
||||
if output_rank < 3:
|
||||
raise ValueError(f"Expected output {g_output.name} to be of rank >= 3")
|
||||
|
||||
channel_dim = onnx.TensorShapeProto.Dimension()
|
||||
channel_dim.CopyFrom(output_shape.dim[1])
|
||||
for i in range(1, output_rank - 1):
|
||||
output_shape.dim[i].CopyFrom(output_shape.dim[i + 1])
|
||||
output_shape.dim[output_rank - 1].CopyFrom(channel_dim)
|
||||
|
||||
transpose_perm = list(range(output_rank))
|
||||
for i in range(output_rank):
|
||||
transpose_perm[i] = i if i == 0 else i + 1
|
||||
transpose_perm[output_rank - 1] = 1
|
||||
|
||||
transpose_node = onnx.helper.make_node(
|
||||
"Transpose",
|
||||
name=f"{transpose_node_name_prefix}{transpose_node_name_start_suffix!s}",
|
||||
inputs=[io_map.get_new_name(g_output.name)],
|
||||
outputs=[g_output.name],
|
||||
perm=transpose_perm,
|
||||
)
|
||||
transpose_node_name_start_suffix += 1
|
||||
|
||||
graph.node.extend([transpose_node])
|
||||
|
||||
graph.value_info.extend(io_map.new_value_infos)
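# Worked example (not part of the original file), mirroring the two permutation loops above for a
# rank-4 tensor: a graph input of shape (N, C, H, W) is rewritten to (N, H, W, C) and followed by
# Transpose(perm=[0, 3, 1, 2]) to restore channel-first order for the graph body, while a graph
# output gets Transpose(perm=[0, 2, 3, 1]) inserted before it.
def _example_channel_last_perms(rank: int = 4):
    input_perm = [0, rank - 1, *range(1, rank - 1)]  # e.g. [0, 3, 1, 2] for rank 4
    output_perm = [0, *range(2, rank), 1]  # e.g. [0, 2, 3, 1] for rank 4
    return input_perm, output_perm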
@@ -0,0 +1,406 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

import copy
import logging
from pathlib import Path
from typing import Any

import numpy as np
import onnx

from ...calibrate import CalibrationDataReader, CalibrationMethod
from ...quant_utils import QuantType
from ...quantize import StaticQuantConfig
from ...tensor_quant_overrides import TensorQuantOverridesHelper
from .mixed_precision_overrides_utils import MixedPrecisionTensorQuantOverridesFixer

Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16}
Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8}
Q4_TYPES = {QuantType.QInt4, QuantType.QUInt4}
OP_TYPES_TO_EXCLUDE = {"Cast"}
MODEL_SIZE_THRESHOLD = 2147483648  # Quant model should use external data if >= 2GB
|
||||
|
||||
|
||||
def warn_unable_to_override(
|
||||
node: onnx.NodeProto,
|
||||
what_str: str,
|
||||
tensor_name: str,
|
||||
io_kind: str,
|
||||
):
|
||||
logging.warning(
|
||||
f"Unable to override {what_str} for {node.op_type} node's {io_kind} "
|
||||
"because it has already been overridden! Check the initial quantization overrides provided "
|
||||
"to get_qnn_qdq_config() if the generated QDQ model does not run on QNN EP. "
|
||||
f"Node name: {node.name}, {io_kind} name: {tensor_name}"
|
||||
)
|
||||
|
||||
|
||||
def get_qnn_qdq_config(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
|
||||
activation_type: QuantType = QuantType.QUInt8,
|
||||
weight_type: QuantType = QuantType.QUInt8,
|
||||
per_channel: bool = False,
|
||||
init_overrides: dict[str, list[dict[str, Any]]] | None = None,
|
||||
add_qtype_converts: bool = True,
|
||||
activation_symmetric: bool = False,
|
||||
weight_symmetric: bool | None = None,
|
||||
keep_removable_activations: bool = False,
|
||||
stride: int | None = None,
|
||||
calibration_providers: list[str] | None = None,
|
||||
op_types_to_quantize: list[str] | None = None,
|
||||
nodes_to_exclude: list[str] | None = None,
|
||||
) -> StaticQuantConfig:
|
||||
"""
|
||||
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
|
||||
This is done primarily by setting tensor-level quantization overrides.
|
||||
|
||||
Params:
|
||||
model_input: Path to the input model file or ModelProto.
|
||||
calibration_data_reader: Calibration data reader.
|
||||
calibrate_method: The calibration method. Defaults to MinMax.
|
||||
activation_type: The default activation quantization type. Defaults to QUInt8.
|
||||
weight_type: The default weight quantization type. Defaults to QUInt8.
|
||||
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
|
||||
Defaults to false. Alternatively, use the tensor-level `init_overrides` to select individual operators
|
||||
and their quantization axes.
|
||||
|
||||
If set, the quantization tool uses per-channel quantization for the following operator types and inputs:
|
||||
- Conv:
|
||||
- input[1] on axis 0
|
||||
- input[2] (bias) on axis 0
|
||||
- ConvTranspose:
|
||||
- input[1] on axis 1
|
||||
- input[2] (bias) on axis 0
|
||||
init_overrides: Initial tensor-level quantization overrides. Defaults to None. This function updates a copy
|
||||
of these overrides with any necessary adjustments and includes them in the returned
|
||||
configuration object (i.e., config.extra_options['TensorQuantOverrides']).
|
||||
|
||||
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
|
||||
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
|
||||
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
|
||||
key must be present in the first dictionary for per-channel quantization.
|
||||
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if
`scale` or `zero_point` are also set.
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if
`scale` or `zero_point` are also set. Only valid for initializers.
'rmax' = Float : Override the maximum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if `scale` or `zero_point` are also set.
|
||||
'convert' = Dict : A nested dictionary with the same keys for an activation
|
||||
tensor that should be converted to another quantization type.
|
||||
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
|
||||
other nodes get the original type. If not specified,
|
||||
assume all consumer nodes get the converted type.
|
||||
add_qtype_converts: True if this function should automatically add "convert" entries to the provided
|
||||
`init_overrides` to ensure that operators use valid input/output types (activations only).
|
||||
Ex: if you override the output of an Add to 16-bit, this option ensures that the activation inputs
|
||||
of the Add are also up-converted to 16-bit and that data types for surrounding ops are converted
|
||||
appropriately. Refer to the documentation in mixed_precision_overrides_utils.py for additional details.
|
||||
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
|
||||
the zero-point values are 128 and 32,768, respectively.
|
||||
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
|
||||
Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
|
||||
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
|
||||
be removed, and will be explicitly represented in the QDQ model. If false, these activations
|
||||
are automatically removed if activations are asymmetrically quantized. Keeping these activations
|
||||
is necessary if optimizations or EP transformations will later remove
|
||||
QuantizeLinear/DequantizeLinear operators from the model.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ].
|
||||
op_types_to_quantize: If set to None, all operator types will be quantized except for OP_TYPES_TO_EXCLUDE
|
||||
nodes_to_exclude: List of node names to exclude from quantization. The nodes in this list are excluded from
quantization when it is not None.
|
||||
|
||||
Returns:
|
||||
A StaticQuantConfig object
|
||||
"""
|
||||
if weight_symmetric is None:
|
||||
weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16}
|
||||
|
||||
model = (
|
||||
model_input
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else onnx.load_model(model_input, load_external_data=False)
|
||||
)
|
||||
|
||||
op_types = set()
|
||||
model_has_external_data = False
|
||||
name_to_initializer = {}
|
||||
|
||||
# Build map of initializers (name -> initializer) and
|
||||
# check if the model has external data.
|
||||
for initializer in model.graph.initializer:
|
||||
name_to_initializer[initializer.name] = initializer
|
||||
if onnx.external_data_helper.uses_external_data(initializer):
|
||||
model_has_external_data = True
|
||||
|
||||
overrides_helper = TensorQuantOverridesHelper(copy.deepcopy(init_overrides) if init_overrides else {})
|
||||
|
||||
if not overrides_helper.empty() and add_qtype_converts:
|
||||
# Fix mixed-precision overrides.
|
||||
overrides_fixer = MixedPrecisionTensorQuantOverridesFixer.create_from_model(
|
||||
overrides_helper, model, activation_type
|
||||
)
|
||||
overrides_fixer.apply(activation_type, activation_symmetric)
|
||||
|
||||
# Setup quantization overrides for specific operator types to ensure compatibility with QNN EP.
|
||||
qnn_compat = QnnCompatibilityOverrides(
|
||||
activation_type,
|
||||
weight_type,
|
||||
activation_symmetric,
|
||||
weight_symmetric,
|
||||
per_channel,
|
||||
overrides_helper,
|
||||
name_to_initializer,
|
||||
)
|
||||
|
||||
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
|
||||
nodes_to_exclude_set = set(nodes_to_exclude) if nodes_to_exclude else None
|
||||
|
||||
for node in model.graph.node:
|
||||
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
|
||||
continue
|
||||
if nodes_to_exclude_set and node.name in nodes_to_exclude_set:
|
||||
continue
|
||||
op_types.add(node.op_type)
|
||||
qnn_compat.process_node(node)
|
||||
|
||||
extra_options = {
|
||||
"MinimumRealRange": 0.0001,
|
||||
"DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
|
||||
"QDQKeepRemovableActivations": keep_removable_activations,
|
||||
"TensorQuantOverrides": overrides_helper.get_dict(),
|
||||
"ActivationSymmetric": activation_symmetric,
|
||||
"WeightSymmetric": weight_symmetric,
|
||||
"CalibStridedMinMax": stride,
|
||||
}
|
||||
|
||||
# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
|
||||
# on Q/DQ operators if using 16-bit or 4-bit quantization.
|
||||
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
|
||||
if onnx_opset.version < 21:
|
||||
opset21_types = Q16_TYPES.union(Q4_TYPES)
|
||||
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
|
||||
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
|
||||
extra_options["UseQDQContribOps"] = True
|
||||
|
||||
return StaticQuantConfig(
|
||||
calibration_data_reader,
|
||||
calibrate_method=calibrate_method,
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=(
|
||||
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(OP_TYPES_TO_EXCLUDE))
|
||||
),
|
||||
nodes_to_exclude=nodes_to_exclude,
|
||||
per_channel=per_channel,
|
||||
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
|
||||
calibration_providers=calibration_providers,
|
||||
extra_options=extra_options,
|
||||
)
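# Illustrative usage sketch (not part of the original file). The model path, data reader, and tensor
# name are hypothetical; the override layout follows the docstring above. CalibrationDataReader,
# QuantType, StaticQuantConfig, and get_qnn_qdq_config are names already defined or imported in this module.
def _example_get_qnn_qdq_config(data_reader: CalibrationDataReader) -> StaticQuantConfig:
    init_overrides = {
        "attention_out": [{"quant_type": QuantType.QUInt16}],  # hypothetical tensor promoted to 16 bits
    }
    return get_qnn_qdq_config(
        "model.preproc.onnx",  # hypothetical pre-processed model path
        data_reader,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QUInt8,
        init_overrides=init_overrides,
        add_qtype_converts=True,  # insert "convert" entries so surrounding ops get matching types
    )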
|
||||
|
||||
|
||||
class QnnCompatibilityOverrides:
|
||||
"""
|
||||
Helper that processes nodes to generate quantization overrides that make the resulting QDQ model
|
||||
compatible with QNN EP.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
default_activation_qtype: QuantType,
|
||||
default_weight_qtype: QuantType,
|
||||
activation_symmetric: bool,
|
||||
weight_symmetric: bool,
|
||||
per_channel: bool,
|
||||
overrides: TensorQuantOverridesHelper,
|
||||
initializers: dict[str, onnx.TensorProto],
|
||||
):
|
||||
self.default_activation_qtype = default_activation_qtype
|
||||
self.default_weight_qtype = default_weight_qtype
|
||||
self.activation_symmetric = activation_symmetric
|
||||
self.weight_symmetric = weight_symmetric
|
||||
self.per_channel = per_channel
|
||||
self.overrides = overrides
|
||||
self.initializers = initializers
|
||||
|
||||
self.process_fns = {
|
||||
"MatMul": self._process_matmul,
|
||||
"LayerNormalization": self._process_layernorm,
|
||||
"Sigmoid": self._process_sigmoid,
|
||||
"Tanh": self._process_tanh,
|
||||
}
|
||||
|
||||
def process_node(self, node: onnx.NodeProto):
|
||||
process_fn = self.process_fns.get(node.op_type)
|
||||
|
||||
if process_fn is not None:
|
||||
process_fn(node)
|
||||
|
||||
def _make_static_inputs_use_default_weight_type(self, node: onnx.NodeProto):
|
||||
"""
|
||||
Overrides initializer input(s) to use the default weight type if:
|
||||
- The default weight type is 8-bit
|
||||
- One of the inputs is a 16-bit activation
|
||||
- The other input is an initializer (per-tensor quantized)
|
||||
|
||||
This is necessary because the quantization tool does not assign MatMul or LayerNorm initializer
|
||||
inputs the default weight type. Instead, it assigns the default activation type.
|
||||
"""
|
||||
if self.default_weight_qtype not in Q8_TYPES:
|
||||
return
|
||||
|
||||
input_16bit_act_name = None
|
||||
input_weight_name = None
|
||||
|
||||
# Loop through first 2 inputs to find a 16-bit activation and a (per-tensor) weight.
|
||||
for i in range(2):
|
||||
input_name = node.input[i]
|
||||
if not input_name:
|
||||
continue
|
||||
|
||||
is_weight = input_name in self.initializers
|
||||
qtype_info = self.overrides.get_node_input_qtype_info(
|
||||
input_name,
|
||||
node.name,
|
||||
default_qtype=None if is_weight else self.default_activation_qtype,
|
||||
)
|
||||
|
||||
if qtype_info.axis is not None:
|
||||
return # Don't process MatMul with a per-channel quantized input.
|
||||
|
||||
if (
|
||||
is_weight
|
||||
and qtype_info.quant_type == self.default_weight_qtype
|
||||
and qtype_info.symmetric == self.weight_symmetric
|
||||
):
|
||||
return # Return. Weight is already overridden to use the desired weight type.
|
||||
|
||||
if is_weight:
|
||||
input_weight_name = input_name
|
||||
elif qtype_info.quant_type in Q16_TYPES:
|
||||
input_16bit_act_name = input_name
|
||||
|
||||
# Override initializer input to use the default weight type.
|
||||
if input_16bit_act_name and input_weight_name:
|
||||
did_update = self.overrides.update_tensor_overrides(
|
||||
input_weight_name,
|
||||
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
|
||||
overwrite=False,
|
||||
)
|
||||
|
||||
if not did_update:
|
||||
warn_unable_to_override(node, "quant_type/symmetric", input_weight_name, "input weight")
|
||||
|
||||
def _process_matmul(self, node: onnx.NodeProto):
|
||||
assert node.op_type == "MatMul", f"Expected MatMul, but got {node.op_type}"
|
||||
|
||||
if not self.per_channel:
|
||||
self._make_static_inputs_use_default_weight_type(node)
|
||||
return
|
||||
|
||||
# QNN does not support per-channel MatMul. However, the ORT quantization tool attempts to use per-channel
|
||||
# quantization for MatMul by default *if* the global per_channel setting is enabled. So, we need to
|
||||
# provide explicit per-tensor quantization overrides for MatMul if per_channel is enabled and
|
||||
# the user did not provide any other overrides.
|
||||
for input_name in node.input:
|
||||
is_weight_no_overrides = input_name in self.initializers and input_name not in self.overrides
|
||||
if is_weight_no_overrides:
|
||||
self.overrides.update_tensor_overrides(
|
||||
input_name,
|
||||
{"quant_type": self.default_weight_qtype, "symmetric": self.weight_symmetric},
|
||||
)
|
||||
|
||||
def _process_layernorm(self, node: onnx.NodeProto):
|
||||
assert node.op_type == "LayerNormalization", f"Expected LayerNormalization, but got {node.op_type}"
|
||||
|
||||
if not self.per_channel:
|
||||
self._make_static_inputs_use_default_weight_type(node)
|
||||
return
|
||||
|
||||
has_weight_no_overrides = node.input[1] in self.initializers and node.input[1] not in self.overrides
|
||||
has_bias_no_overrides = (
|
||||
len(node.input) > 2
|
||||
and node.input[2]
|
||||
and node.input[2] in self.initializers
|
||||
and node.input[2] not in self.overrides
|
||||
)
|
||||
|
||||
if has_weight_no_overrides or has_bias_no_overrides:
|
||||
# TODO: Make bias input not per-channel. QNN needs it to be per-tensor, but quantizer
|
||||
# tries to make it per-channel if the weight is also per-channel.
|
||||
raise ValueError(
|
||||
"get_qnn_qdq_config() does not currently support the global per_channel option with LayerNormalization."
|
||||
" Please try using custom overrides that make bias per-tensor quantized."
|
||||
)
|
||||
|
||||
def _process_sigmoid(self, node: onnx.NodeProto):
|
||||
"""
|
||||
Overrides 16-bit Sigmoid's output scale and zero-point as per QNN requirements.
|
||||
"""
|
||||
assert node.op_type == "Sigmoid", f"Expected Sigmoid, but got {node.op_type}"
|
||||
output_type = self.overrides.get_node_output_qtype_info(
|
||||
node.output[0], self.default_activation_qtype
|
||||
).quant_type
|
||||
|
||||
if output_type == QuantType.QUInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 65536.0, dtype=np.float32),
|
||||
"zero_point": np.array(0, dtype=np.uint16),
|
||||
},
|
||||
)
|
||||
elif output_type == QuantType.QInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
|
||||
"zero_point": np.array(0, dtype=np.int16),
|
||||
},
|
||||
)
|
||||
|
||||
def _process_tanh(self, node: onnx.NodeProto):
|
||||
"""
|
||||
Overrides 16-bit Tanh's output scale and zero-point as per QNN requirements.
|
||||
"""
|
||||
assert node.op_type == "Tanh", f"Expected Tanh, but got {node.op_type}"
|
||||
output_type = self.overrides.get_node_output_qtype_info(
|
||||
node.output[0], self.default_activation_qtype
|
||||
).quant_type
|
||||
|
||||
if output_type == QuantType.QUInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
|
||||
"zero_point": np.array(32768, dtype=np.uint16),
|
||||
},
|
||||
)
|
||||
elif output_type == QuantType.QInt16:
|
||||
self.overrides.update_tensor_overrides(
|
||||
node.output[0],
|
||||
{
|
||||
"quant_type": output_type,
|
||||
"scale": np.array(1.0 / 32768.0, dtype=np.float32),
|
||||
"zero_point": np.array(0, dtype=np.int16),
|
||||
},
|
||||
)
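# Worked example (not part of the original file): with the fixed 16-bit parameters set above, a
# quantized value q dequantizes as real = (q - zero_point) * scale. For QUInt16 Tanh this is
# (q - 32768) / 32768, which covers Tanh's (-1, 1) output range; for QUInt16 Sigmoid it is
# q / 65536, which covers Sigmoid's (0, 1) range.
def _example_tanh_uint16_dequant(q: int) -> float:
    return (q - 32768) * (1.0 / 32768.0)  # matches the QUInt16 Tanh scale/zero_point overrides above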
@@ -0,0 +1,4 @@
from .fusion import Fusion  # noqa: F401
from .fusion_gelu import FusionGelu  # noqa: F401
from .fusion_layernorm import FusionLayerNormalization  # noqa: F401
from .replace_upsample_with_resize import ReplaceUpsampleWithResize  # noqa: F401
@@ -0,0 +1,311 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

from collections import deque

import onnx

from ..onnx_model import ONNXModel
|
||||
|
||||
|
||||
class Fusion:
|
||||
"""
|
||||
Base class for fusions.
|
||||
"""
|
||||
|
||||
def __init__(self, model: ONNXModel, fused_op_type: str, search_op_type: str):
|
||||
self.search_op_type: str = search_op_type
|
||||
self.fused_op_type: str = fused_op_type
|
||||
self.model: ONNXModel = model
|
||||
self.nodes_to_remove: list = []
|
||||
self.nodes_to_add: list = []
|
||||
|
||||
self._new_node_name_prefix = self.fused_op_type + "_fused_" + self.search_op_type + "_"
|
||||
self._new_node_name_suffix = None # int|None used to create unique node names for the fused ops.
|
||||
|
||||
def fuse(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
):
|
||||
"""
|
||||
Interface function for derived fusion classes. Tries to fuse a node sequence containing
|
||||
the specified node.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def apply(self) -> bool:
|
||||
"""
|
||||
Apply graph fusion on the entire model graph.
|
||||
"""
|
||||
input_name_to_nodes = self.model.input_name_to_nodes()
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
for node in self.model.nodes():
|
||||
if node.op_type == self.search_op_type:
|
||||
self.fuse(node, input_name_to_nodes, output_name_to_node)
|
||||
|
||||
self.model.remove_nodes(self.nodes_to_remove)
|
||||
self.model.add_nodes(self.nodes_to_add)
|
||||
|
||||
graph_updated = bool(self.nodes_to_remove or self.nodes_to_add)
|
||||
|
||||
if graph_updated:
|
||||
self.model.remove_unused_constant()
|
||||
|
||||
return graph_updated
|
||||
|
||||
def create_unique_node_name(self):
|
||||
prefix = self._new_node_name_prefix
|
||||
|
||||
if self._new_node_name_suffix is None:
|
||||
largest_suffix: int = self.model.get_largest_node_name_suffix(prefix)
|
||||
self._new_node_name_suffix = largest_suffix + 1
|
||||
|
||||
new_name = f"{prefix}{self._new_node_name_suffix!s}"
|
||||
self._new_node_name_suffix += 1
|
||||
|
||||
return new_name
|
||||
|
||||
@staticmethod
|
||||
def is_safe_to_fuse_nodes(
|
||||
nodes_to_remove: list[onnx.NodeProto],
|
||||
keep_outputs: list[str],
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
for node_to_remove in nodes_to_remove:
|
||||
for output_to_remove in node_to_remove.output:
|
||||
if output_to_remove in keep_outputs:
|
||||
continue
|
||||
|
||||
if output_to_remove in input_name_to_nodes:
|
||||
for impacted_node in input_name_to_nodes[output_to_remove]:
|
||||
if impacted_node not in nodes_to_remove:
|
||||
# Not safe to remove nodes since output is used by impacted_node
|
||||
return False
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def get_node_attribute(node: onnx.NodeProto, attribute_name: str):
|
||||
for attr in node.attribute:
|
||||
if attr.name == attribute_name:
|
||||
value = onnx.helper.get_attribute_value(attr)
|
||||
return value
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def input_index(node_output: str, child_node: onnx.NodeProto) -> int:
|
||||
for index, input_name in enumerate(child_node.input):
|
||||
if input_name == node_output:
|
||||
return index
|
||||
return -1
|
||||
|
||||
@staticmethod
|
||||
def tensor_shape_to_list(tensor_type) -> list[int]:
|
||||
shape_list = []
|
||||
for d in tensor_type.shape.dim:
|
||||
if d.HasField("dim_value"):
|
||||
shape_list.append(d.dim_value) # known dimension
|
||||
elif d.HasField("dim_param"):
|
||||
shape_list.append(d.dim_param) # unknown dimension with symbolic name
|
||||
else:
|
||||
shape_list.append("?") # shall not happen
|
||||
return shape_list
|
||||
|
||||
def get_constant_input(self, node: onnx.NodeProto):
|
||||
for i, inp in enumerate(node.input):
|
||||
value = self.model.get_constant_value(inp)
|
||||
if value is not None:
|
||||
return i, value
|
||||
|
||||
return None, None
|
||||
|
||||
def find_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> int:
|
||||
i, value = self.get_constant_input(node)
|
||||
if value is not None and value.size == 1 and abs(value - expected_value) < delta:
|
||||
return i
|
||||
|
||||
return -1
|
||||
|
||||
def has_constant_input(self, node: onnx.NodeProto, expected_value: float, delta: float = 0.000001) -> bool:
|
||||
return self.find_constant_input(node, expected_value, delta) >= 0
|
||||
|
||||
def is_constant_with_specified_rank(self, output_name: str, rank: int) -> bool:
|
||||
value = self.model.get_constant_value(output_name)
|
||||
if value is None:
|
||||
return False # Not an initializer
|
||||
|
||||
if len(value.shape) != rank:
|
||||
return False # Wrong dimensions
|
||||
|
||||
return True
|
||||
|
||||
def match_first_parent(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
parent_op_type: str,
|
||||
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
|
||||
exclude: list[onnx.NodeProto] = [], # noqa: B006
|
||||
) -> tuple[onnx.NodeProto | None, int | None]:
|
||||
"""
|
||||
Find parent node based on constraints on op_type.
|
||||
|
||||
Args:
|
||||
node: current node.
|
||||
parent_op_type (str): constraint of parent node op_type.
|
||||
output_name_to_node (dict): dictionary with output name as key, and node as value.
|
||||
exclude (list): list of nodes that are excluded (not allowed to match as parent).
|
||||
|
||||
Returns:
|
||||
parent: The matched parent node. None if not found.
|
||||
index: The input index of matched parent node. None if not found.
|
||||
"""
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
for i, inp in enumerate(node.input):
|
||||
if inp in output_name_to_node:
|
||||
parent = output_name_to_node[inp]
|
||||
if parent.op_type == parent_op_type and parent not in exclude:
|
||||
return parent, i
|
||||
|
||||
return None, None
|
||||
|
||||
def match_parent(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
parent_op_type: str,
|
||||
input_index: int | None = None,
|
||||
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
|
||||
exclude: list[onnx.NodeProto] = [], # noqa: B006
|
||||
return_indice: list[int] | None = None,
|
||||
) -> onnx.NodeProto | None:
|
||||
"""
|
||||
Find parent node based on constraints on op_type and index.
|
||||
When input_index is None, we will find the first parent node based on constraints,
|
||||
and the corresponding input index will be appended to return_indice.
|
||||
|
||||
Args:
|
||||
node (onnx.NodeProto): current node.
|
||||
parent_op_type (str): constraint of parent node op_type.
|
||||
input_index (int or None): only check the parent given input index of current node.
|
||||
output_name_to_node (dict): dictionary with output name as key, and node as value.
|
||||
exclude (list): list of nodes that are excluded (not allowed to match as parent).
|
||||
return_indice (list): a list to append the input index when input_index is None.
|
||||
|
||||
Returns:
|
||||
parent: The matched parent node.
|
||||
"""
|
||||
assert node is not None
|
||||
assert input_index is None or input_index >= 0
|
||||
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
if input_index is None:
|
||||
parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node, exclude)
|
||||
if return_indice is not None:
|
||||
return_indice.append(index)
|
||||
return parent
|
||||
|
||||
if input_index >= len(node.input):
|
||||
# Input index out of bounds.
|
||||
return None
|
||||
|
||||
parent = self.model.get_parent(node, input_index, output_name_to_node)
|
||||
if parent is not None and parent.op_type == parent_op_type and parent not in exclude:
|
||||
return parent
|
||||
|
||||
return None
|
||||
|
||||
def match_parent_path(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
parent_op_types: list[str],
|
||||
parent_input_index: list[int] | None = None,
|
||||
output_name_to_node: dict[str, onnx.NodeProto] | None = None,
|
||||
return_indice: list[int] | None = None,
|
||||
) -> list[onnx.NodeProto] | None:
|
||||
"""
|
||||
Find a sequence of input edges based on constraints on parent op_type and index.
|
||||
When input_index is None, we will find the first parent node based on constraints,
|
||||
and the corresponding input index will be appended to return_indice.
|
||||
|
||||
Args:
|
||||
node (onnx.NodeProto): current node.
|
||||
parent_op_types (list): constraint of parent node op_type of each input edge.
|
||||
parent_input_index (list): constraint of input index of each input edge. None means no constraint.
|
||||
output_name_to_node (dict): dictionary with output name as key, and node as value.
|
||||
return_indice (list): a list to which the matched input index is appended
when there is no constraint on the input index of an edge.
|
||||
|
||||
Returns:
|
||||
parents: a list of matched parent nodes.
|
||||
"""
|
||||
if parent_input_index is not None:
|
||||
assert len(parent_input_index) == len(parent_op_types)
|
||||
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.model.output_name_to_node()
|
||||
|
||||
current_node = node
|
||||
matched_parents = []
|
||||
for i, op_type in enumerate(parent_op_types):
|
||||
matched_parent = self.match_parent(
|
||||
current_node,
|
||||
op_type,
|
||||
parent_input_index[i] if parent_input_index is not None else None,
|
||||
output_name_to_node,
|
||||
exclude=[],
|
||||
return_indice=return_indice,
|
||||
)
|
||||
if matched_parent is None:
|
||||
return None
|
||||
|
||||
matched_parents.append(matched_parent)
|
||||
current_node = matched_parent
|
||||
|
||||
return matched_parents
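# Illustrative sketch (not part of the original file): walking three input edges upward from `node`,
# requiring the producers to be Mul <- Add <- ReduceMean in that order. The op types and indices are
# hypothetical; a None entry falls back to matching the first parent of that type, and the input
# index actually matched is then appended to return_indice.
#
#   parents = self.match_parent_path(node, ["Mul", "Add", "ReduceMean"], [None, 0, 0])
#   # parents is [mul_node, add_node, reduce_mean_node], or None if any edge fails to match.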
|
||||
|
||||
def match_parent_paths(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
paths: list[tuple[list[str], list[int]]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> tuple[int, list[onnx.NodeProto] | None, list[int] | None]:
|
||||
"""
|
||||
Find a matching parent path to the given node.
|
||||
"""
|
||||
for i, path in enumerate(paths):
|
||||
return_indice = []
|
||||
matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice)
|
||||
if matched:
|
||||
return i, matched, return_indice
|
||||
return -1, None, None
|
||||
|
||||
def find_first_child_by_type(
|
||||
self,
|
||||
node: onnx.NodeProto,
|
||||
child_type: str,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]] | None = None,
|
||||
recursive: bool = True,
|
||||
) -> onnx.NodeProto | None:
|
||||
children = self.model.get_children(node, input_name_to_nodes)
|
||||
dq = deque(children)
|
||||
while len(dq) > 0:
|
||||
current_node = dq.pop()
|
||||
if current_node.op_type == child_type:
|
||||
return current_node
|
||||
|
||||
if recursive:
|
||||
children = self.model.get_children(current_node, input_name_to_nodes)
|
||||
for child in children:
|
||||
dq.appendleft(child)
|
||||
|
||||
return None
|
||||
@@ -0,0 +1,272 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations

import onnx

from ..onnx_model import ONNXModel
from .fusion import Fusion
|
||||
|
||||
|
||||
class FusionGelu(Fusion):
|
||||
def __init__(self, model: ONNXModel):
|
||||
super().__init__(model, "Gelu", "Erf")
|
||||
|
||||
def fuse(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
):
|
||||
"""
|
||||
Interface function that tries to fuse a node sequence containing an Erf node into a single
|
||||
Gelu node.
|
||||
"""
|
||||
if (
|
||||
self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node)
|
||||
or self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node)
|
||||
or self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node)
|
||||
):
|
||||
self.model.set_opset_import("com.microsoft", 1)
|
||||
|
||||
def fuse_1(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
"""
|
||||
This pattern is from a PyTorch model.
|
||||
Fuse Gelu with Erf into one node:
|
||||
Pattern 1:
|
||||
+-------Mul(0.5)---------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Div -----> Erf --> Add --> Mul -->
|
||||
(B=1.4142...) (1)
|
||||
|
||||
Pattern 2:
|
||||
+------------------------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Div -----> Erf --> Add --> Mul -->Mul -->
|
||||
(B=1.4142...) (1) (0.5)
|
||||
|
||||
Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
|
||||
"""
|
||||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return False
|
||||
add_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(add_after_erf, 1):
|
||||
return False
|
||||
|
||||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
|
||||
mul_after_erf = children[0]
|
||||
|
||||
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
|
||||
if div is None:
|
||||
return False
|
||||
|
||||
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
|
||||
return False
|
||||
|
||||
subgraph_input = div.input[0]
|
||||
|
||||
another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0
|
||||
if subgraph_input == mul_after_erf.input[another]: # pattern 2
|
||||
children = input_name_to_nodes[mul_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul_half = children[0]
|
||||
if not self.has_constant_input(mul_half, 0.5):
|
||||
return False
|
||||
subgraph_output = mul_half.output[0]
|
||||
else: # pattern 1
|
||||
mul_half = self.match_parent(mul_after_erf, "Mul", another, output_name_to_node)
|
||||
if mul_half is None:
|
||||
return False
|
||||
|
||||
if not self.has_constant_input(mul_half, 0.5):
|
||||
return False
|
||||
|
||||
if subgraph_input not in mul_half.input:
|
||||
return False
|
||||
|
||||
subgraph_output = mul_after_erf.output[0]
|
||||
|
||||
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half]
|
||||
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node):
|
||||
return False
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = onnx.helper.make_node(
|
||||
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[subgraph_output]
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
return True
|
||||
|
||||
def fuse_2(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
"""
|
||||
This pattern is from a Keras model.
|
||||
Fuse Gelu with Erf into one node:
|
||||
+------------------------------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Div -----> Erf --> Add --> Mul -->Mul
|
||||
(B=1.4142...) (A=1) (A=0.5)
|
||||
|
||||
Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
|
||||
"""
|
||||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return False
|
||||
add_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(add_after_erf, 1):
|
||||
return False
|
||||
|
||||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(mul_after_erf, 0.5):
|
||||
return False
|
||||
|
||||
if mul_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[mul_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul = children[0]
|
||||
|
||||
div = self.match_parent(erf_node, "Div", 0, output_name_to_node)
|
||||
if div is None:
|
||||
return False
|
||||
|
||||
sqrt_node = None
|
||||
if self.find_constant_input(div, 1.4142, delta=0.001) != 1:
|
||||
sqrt_node = self.match_parent(div, "Sqrt", 1, output_name_to_node)
|
||||
if sqrt_node is None:
|
||||
return False
|
||||
if not self.has_constant_input(sqrt_node, 2.0):
|
||||
return False
|
||||
|
||||
subgraph_input = div.input[0]
|
||||
|
||||
if subgraph_input not in mul.input:
|
||||
return False
|
||||
|
||||
subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul]
|
||||
if sqrt_node:
|
||||
subgraph_nodes.append(sqrt_node)
|
||||
|
||||
if not self.is_safe_to_fuse_nodes(subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node):
|
||||
return False
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = onnx.helper.make_node(
|
||||
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[mul.output[0]]
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
return True
|
||||
|
||||
def fuse_3(
|
||||
self,
|
||||
erf_node: onnx.NodeProto,
|
||||
input_name_to_nodes: dict[str, list[onnx.NodeProto]],
|
||||
output_name_to_node: dict[str, onnx.NodeProto],
|
||||
) -> bool:
|
||||
"""
|
||||
This pattern is from a TensorFlow model.
|
||||
Fuse Gelu with Erf into one node:
|
||||
+----------------------------------------------+
|
||||
| |
|
||||
| v
|
||||
[root] --> Mul -----> Erf --> Add --> Mul -->Mul
|
||||
(A=0.7071067690849304) (B=1) (B=0.5)
|
||||
|
||||
Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine.
|
||||
"""
|
||||
|
||||
if erf_node.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[erf_node.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Add":
|
||||
return False
|
||||
add_after_erf = children[0]
|
||||
|
||||
if not self.has_constant_input(add_after_erf, 1):
|
||||
return False
|
||||
|
||||
if add_after_erf.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[add_after_erf.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
mul_half = children[0]
|
||||
|
||||
if not self.has_constant_input(mul_half, 0.5):
|
||||
return False
|
||||
|
||||
first_mul = self.match_parent(erf_node, "Mul", 0, output_name_to_node)
|
||||
if first_mul is None:
|
||||
return False
|
||||
|
||||
i = self.find_constant_input(first_mul, 0.7071067690849304, delta=0.001)
|
||||
if i < 0:
|
||||
return False
|
||||
|
||||
root_input_index = 1 - i
|
||||
subgraph_input = first_mul.input[root_input_index]
|
||||
|
||||
if mul_half.output[0] not in input_name_to_nodes:
|
||||
return False
|
||||
children = input_name_to_nodes[mul_half.output[0]]
|
||||
if len(children) != 1 or children[0].op_type != "Mul":
|
||||
return False
|
||||
last_mul = children[0]
|
||||
|
||||
if not (last_mul.input[0] == subgraph_input or last_mul.input[1] == subgraph_input):
|
||||
return False
|
||||
|
||||
subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul]
|
||||
if not self.is_safe_to_fuse_nodes(
|
||||
subgraph_nodes,
|
||||
[last_mul.output[0]],
|
||||
input_name_to_nodes,
|
||||
output_name_to_node,
|
||||
):
|
||||
return False
|
||||
|
||||
self.nodes_to_remove.extend(subgraph_nodes)
|
||||
fused_node = onnx.helper.make_node(
|
||||
"Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[last_mul.output[0]]
|
||||
)
|
||||
fused_node.domain = "com.microsoft"
|
||||
self.nodes_to_add.append(fused_node)
|
||||
return True
|
||||
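Both fusion patterns above are algebraic spellings of the same function, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))): fuse_2 divides the input by sqrt(2) before Erf, while fuse_3 multiplies by 1/sqrt(2) ≈ 0.7071067690849304. A quick standalone check of that equivalence (illustrative NumPy, not part of the committed file):

import math

import numpy as np

x = np.linspace(-4.0, 4.0, 9, dtype=np.float32)
erf = np.vectorize(math.erf)

# fuse_2 shape of the pattern: x / sqrt(2) feeds Erf
gelu_div = 0.5 * x * (1.0 + erf(x / math.sqrt(2.0)))
# fuse_3 shape of the pattern: x * 0.7071067690849304 feeds Erf
gelu_mul = 0.5 * x * (1.0 + erf(x * 0.7071067690849304))

assert np.allclose(gelu_div, gelu_mul, atol=1e-6)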
@@ -0,0 +1,135 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import onnx

from ..onnx_model import ONNXModel
from .fusion import Fusion


class FusionLayerNormalization(Fusion):
    def __init__(self, model: ONNXModel):
        super().__init__(model, "LayerNormalization", "ReduceMean")

    def fuse(
        self,
        reduce_mean_node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """
        Interface function that tries to fuse a node sequence containing a ReduceMean node into a single
        LayerNormalization node.

              +----------------------+
              |                      |
              |                      v
          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
                     (axis=2 or -1)  |      (Y=2)   (axis=2 or -1)  (E-6 or E-12 or 0)         ^
                                     |                                                         |
                                     +---------------------------------------------------------+

        It also handles cases of duplicated sub nodes exported from older version of PyTorch:

              +----------------------+
              |                      v
              |           +-------> Sub-----------------------------------------------+
              |           |                                                           |
              |           |                                                           v
          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
              |                      ^
              |                      |
              +----------------------+
        """
        children = self.model.get_children(reduce_mean_node, input_name_to_nodes)
        if len(children) == 0 or len(children) > 2:
            return

        root_input = reduce_mean_node.input[0]

        if children[0].op_type != "Sub" or children[0].input[0] != root_input:
            return

        if len(children) == 2:
            if children[1].op_type != "Sub" or children[1].input[0] != root_input:
                return

        div_node = None
        for child in children:
            div_node = self.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
            if div_node is not None:
                break
        if div_node is None:
            return

        path_id, parent_nodes, _ = self.match_parent_paths(
            div_node,
            [
                (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
                (
                    ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
                    [1, 0, 0, 0, 0, 0],
                ),
            ],
            output_name_to_node,
        )
        if path_id < 0:
            return

        sub_node = parent_nodes[-1]
        if sub_node not in children:
            return

        second_add_node = parent_nodes[1]
        i, add_weight = self.get_constant_input(second_add_node)
        if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
            # Skip fusion since epsilon value is not expected.
            return

        pow_node = parent_nodes[3]
        if self.find_constant_input(pow_node, 2.0) != 1:
            return

        mul_node = input_name_to_nodes[div_node.output[0]][0]
        if mul_node.op_type != "Mul":
            return

        last_add_node = input_name_to_nodes[mul_node.output[0]][0]
        if last_add_node.op_type != "Add":
            return

        subgraph_nodes = [reduce_mean_node]
        subgraph_nodes.extend(children)
        subgraph_nodes.extend(parent_nodes[:-1])

        subgraph_nodes.extend([last_add_node, mul_node, div_node])
        if not self.is_safe_to_fuse_nodes(
            subgraph_nodes,
            last_add_node.output,
            input_name_to_nodes,
            output_name_to_node,
        ):
            return

        weight_input = mul_node.input[1 - self.input_index(div_node.output[0], mul_node)]
        if not self.is_constant_with_specified_rank(weight_input, 1):
            return

        bias_input = last_add_node.input[1 - self.input_index(mul_node.output[0], last_add_node)]
        if not self.is_constant_with_specified_rank(bias_input, 1):
            return

        self.nodes_to_remove.extend(subgraph_nodes)

        normalize_node = onnx.helper.make_node(
            "LayerNormalization",
            name=self.create_unique_node_name(),
            inputs=[reduce_mean_node.input[0], weight_input, bias_input],
            outputs=[last_add_node.output[0]],
        )
        normalize_node.attribute.extend([onnx.helper.make_attribute("epsilon", float(add_weight))])
        self.nodes_to_add.append(normalize_node)
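The subgraph matched above is the standard layer normalization over the last axis; a minimal NumPy reference of what the fused LayerNormalization node computes (illustrative sketch, names are not from the file):

import numpy as np

def layer_norm_reference(x: np.ndarray, weight: np.ndarray, bias: np.ndarray, epsilon: float = 1e-6) -> np.ndarray:
    # ReduceMean -> Sub -> Pow(2) -> ReduceMean -> Add(eps) -> Sqrt -> Div -> Mul(weight) -> Add(bias)
    mean = x.mean(axis=-1, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(variance + epsilon) * weight + bias

x = np.random.rand(2, 4, 8).astype(np.float32)
out = layer_norm_reference(x, np.ones(8, np.float32), np.zeros(8, np.float32))
print(out.shape)  # (2, 4, 8)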
@@ -0,0 +1,96 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import numpy as np
import onnx

from ..onnx_model import ONNXModel
from .fusion import Fusion


class ReplaceUpsampleWithResize(Fusion):
    """Replace Upsample with Resize."""

    def __init__(self, model: ONNXModel, opset):
        """Initialize."""
        super().__init__(model, "Resize", "Upsample")
        self.opset = opset

    def fuse(
        self,
        node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ):
        """Replace Upsample with Resize."""
        mode = None
        for attr in node.attribute:
            if attr.name == "mode":
                mode = attr.s.decode("utf-8")
                break

        scales_input = None
        if self.opset > 7:
            scales_input = node.input[1] if len(node.input) > 1 else ""
            resize_inputs = [node.input[0], node.name + "_roi", scales_input]
        else:
            if self.opset == 7:
                for attr in node.attribute:
                    if attr.name == "scales":
                        scales_input = attr.floats
                        break

                scales_input = np.array(list(scales_input), np.float32)
            else:
                h_scale = 1
                w_scale = 1
                for attr in node.attribute:
                    if attr.name == "height_scale":
                        h_scale = attr.float
                    elif attr.name == "width_scale":
                        w_scale = attr.float

                scales_input = np.array([1, 1, h_scale, w_scale], np.float32)

            scales_tensor = onnx.helper.make_tensor(
                name=node.name + "_scales",
                data_type=onnx.TensorProto.FLOAT,
                dims=scales_input.shape,
                vals=scales_input.flatten().tolist(),
            )

            scales_node = onnx.helper.make_node(
                "Constant", inputs=[], outputs=[node.name + "_scales"], value=scales_tensor
            )

            self.nodes_to_add.append(scales_node)

            resize_inputs = [node.input[0], node.name + "_roi", node.name + "_scales"]

        roi_tensor = onnx.helper.make_tensor(
            name=node.name + "_roi",
            data_type=onnx.TensorProto.FLOAT,
            dims=(len(scales_input) * 2,),
            vals=[0] * len(scales_input) + [1] * len(scales_input),
        )

        roi_node = onnx.helper.make_node("Constant", inputs=[], outputs=[node.name + "_roi"], value=roi_tensor)

        resize_node = onnx.helper.make_node(
            op_type="Resize", inputs=resize_inputs, outputs=node.output, mode=mode, nearest_mode="floor"
        )

        self.nodes_to_remove.append(node)
        self.nodes_to_add.append(roi_node)
        self.nodes_to_add.append(resize_node)

    def apply(self) -> bool:
        """Apply."""
        if super().apply():
            self.model.topological_sort()
            return True
        return False
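For reference, the replacement node built above takes three inputs: the data, an roi made of concatenated per-axis starts and ends, and per-axis scales. A minimal standalone sketch of the equivalent Resize for a 2x nearest-neighbor upsample of an NCHW tensor (tensor and node names here are placeholders, not taken from the file):

import numpy as np
import onnx

scales = np.array([1.0, 1.0, 2.0, 2.0], dtype=np.float32)                        # N, C, H, W
roi = np.concatenate([np.zeros(4, np.float32), np.ones(4, np.float32)])          # starts then ends, unused by "nearest"

scales_init = onnx.numpy_helper.from_array(scales, name="X_scales")
roi_init = onnx.numpy_helper.from_array(roi, name="X_roi")
resize_node = onnx.helper.make_node(
    "Resize", inputs=["X", "X_roi", "X_scales"], outputs=["Y"], mode="nearest", nearest_mode="floor"
)
print(onnx.helper.printable_node(resize_node))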
@@ -0,0 +1,239 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

import argparse
import logging
import os

import numpy as np
import numpy.typing as npt
import onnx
from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto

from onnxruntime.capi._pybind_state import quantize_matmul_bnb4

from .onnx_model import ONNXModel
from .quant_utils import attribute_to_kwarg

logger = logging.getLogger(__name__)


class MatMulBnb4Quantizer:
    """Perform 4b quantization of constant MatMul weights using FP4 or NF4 data type"""

    ##################
    # quantization types, must be consistent with native code type
    # Bnb_DataType_t defined in blockwise_quant_block_bnb4.h

    # 4b floating point with bias of 3
    FP4 = 0

    # 4b NormalFloat
    NF4 = 1

    def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to_exclude=None):
        nodes_to_exclude = nodes_to_exclude or []
        assert quant_type in [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4]
        self.model = ONNXModel(model)
        self.quant_type = quant_type
        self.block_size = block_size
        self.nodes_to_exclude = set(nodes_to_exclude)

    @staticmethod
    def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
        for gid in range(len(graph_path) - 1, -1, -1):
            graph = graph_path[gid]
            for tensor in graph.initializer:
                if tensor.name == name:
                    return tensor, graph
        return None, None

    def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray:
        """4b quantize fp32/fp16 weight"""

        if len(fpweight.shape) != 2:
            raise ValueError("Current bnb4 block quantization only supports 2D tensors!")
        # need to copy since the transposed weight still has the original memory layout
        # Linear4bit quantizes its weight data which is the transposed weight
        fpweight_t = fpweight.transpose().copy()

        rows, cols = fpweight.shape
        numel = rows * cols
        block_size = self.block_size
        num_blocks = (numel + block_size - 1) // block_size
        quantized_numel = (numel + 1) // 2

        packed = np.zeros(quantized_numel, dtype="uint8")
        absmax = np.zeros(num_blocks, dtype=fpweight.dtype)
        # block wise quantization, fpweight_t is flattened and divided into blocks
        quantize_matmul_bnb4(packed, fpweight_t, absmax, block_size, self.quant_type, cols, rows)

        return (packed, absmax)

    def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto:
        """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""

        if node.op_type != "MatMul":
            return node  # only care about MatMul for now

        logger.debug(f"start to quantize {node.name} ...")
        if node.name in self.nodes_to_exclude:
            logger.debug(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
            return node

        inputB = node.input[1]  # noqa: N806
        B, Bs_graph = MatMulBnb4Quantizer.__get_initializer(inputB, graph_stack)  # noqa: N806
        if B is None:
            logger.debug("MatMul doesn't have const weight. Skip to quantize")
            return node  # only care about constant weight

        B_array = onnx.numpy_helper.to_array(B)  # noqa: N806
        if len(B_array.shape) != 2:
            logger.debug("MatMul weight is not 2D. Skip to quantize")
            return node  # can only process 2-D matrix

        packed, absmax = self.bnb4_block_quant(B_array)
        B_quant = onnx.numpy_helper.from_array(packed)  # noqa: N806
        B_quant.name = B.name + "_Bnb4"
        for input in Bs_graph.input:
            if input.name == inputB:
                Bs_graph.input.remove(input)
                break

        absmax_tensor = onnx.numpy_helper.from_array(absmax)
        absmax_tensor.name = B.name + "_absmax"

        Bs_graph.initializer.extend([B_quant, absmax_tensor])

        kwargs = {}
        rows, cols = B_array.shape
        kwargs["K"] = rows
        kwargs["N"] = cols
        kwargs["block_size"] = self.block_size
        kwargs["quant_type"] = self.quant_type

        matmul_bnb4_node = onnx.helper.make_node(
            "MatMulBnb4",
            inputs=[node.input[0], B_quant.name, absmax_tensor.name],
            outputs=[node.output[0]],
            name=node.name + "_Bnb4" if node.name else "",
            domain="com.microsoft",
            **kwargs,
        )

        logger.debug(f"complete quantization of {node.name} ...")

        return matmul_bnb4_node

    def _process_subgraph(self, graph_stack: list[GraphProto]):
        new_nodes = []
        graph = graph_stack[-1]

        for node in graph.node:
            graph_attrs = [
                attr
                for attr in node.attribute
                if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
            ]
            if graph_attrs:
                kwargs = {}
                for attr in node.attribute:
                    if attr.type == onnx.AttributeProto.GRAPH:
                        # recursive call to take care of sub-graph
                        graph_stack.append(attr.g)
                        kv = {attr.name: self._process_subgraph(graph_stack)}
                    elif attr.type == onnx.AttributeProto.GRAPHS:
                        value = []
                        for subgraph in attr.graphs:
                            # recursive call to take care of sub-graph
                            graph_stack.append(subgraph)
                            value.extend([self._process_subgraph(graph_stack)])
                        kv = {attr.name: value}
                    else:
                        kv = attribute_to_kwarg(attr)
                    kwargs.update(kv)
                node = onnx.helper.make_node(  # noqa: PLW2901
                    node.op_type, node.input, node.output, name=node.name, **kwargs
                )

            new_nodes.append(self._bnb4_matmul_node_weight(node, graph_stack))

        graph.ClearField("node")
        graph.node.extend(new_nodes)
        graph_stack.pop()
        return graph

    def process(self):
        # use a stack to keep track of sub-graphs
        graph_stack = [self.model.graph()]
        opset_import = self.model.opset_import()

        has_ms_domain = False
        for opset in opset_import:
            if opset.domain == "com.microsoft":
                has_ms_domain = True
        if not has_ms_domain:
            opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])

        self._process_subgraph(graph_stack)
        self.model.clean_initializers()


def parse_args():
    parser = argparse.ArgumentParser(
        description="""Blockwise FP4/NF4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into blocks, where each block is a contiguous
subset inside the flattened transposed weight matrix. Each block is quantized
into a set of 4b integers with an absolute value scaling factor.
"""
    )

    parser.add_argument("--input_model", required=True, help="Path to the input model file")
    parser.add_argument("--output_model", required=True, help="Path to the output model file")
    parser.add_argument(
        "--quant_type",
        required=False,
        default=1,
        choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4],
        help="Quantization data type. 0: FP4, 1: NF4",
    )
    parser.add_argument(
        "--block_size",
        required=False,
        default=64,
        help="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64",
    )
    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)
    parser.add_argument(
        "--nodes_to_exclude",
        nargs="+",
        type=str,
        required=False,
        default=[],
        help="Specify the nodes to be excluded from quantization with node names",
    )

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)

    input_model_path = args.input_model
    output_model_path = args.output_model

    if os.path.exists(output_model_path):
        logger.error(f"file {output_model_path} already exists")
        raise Exception(f"file {output_model_path} already exists")

    model = onnx.load(input_model_path)
    quant = MatMulBnb4Quantizer(model, args.quant_type, args.block_size, nodes_to_exclude=args.nodes_to_exclude)
    quant.process()
    quant.model.save_model_to_file(output_model_path, True)
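Besides the CLI entry point above, the quantizer can be driven directly from Python. A short sketch that mirrors the __main__ block (the file paths and the excluded node name are placeholders, not values from the commit):

import onnx

model = onnx.load("model_fp32.onnx")
quantizer = MatMulBnb4Quantizer(
    model,
    MatMulBnb4Quantizer.NF4,  # quant_type
    64,                       # block_size, matching bnb.nn.Linear4bit
    nodes_to_exclude=["/lm_head/MatMul"],
)
quantizer.process()
quantizer.model.save_model_to_file("model_bnb4.onnx", True)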
File diff suppressed because it is too large
@@ -0,0 +1 @@
from .weight_only import gptq_quantize, rtn_quantize  # noqa: F401
File diff suppressed because it is too large
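The weight_only module re-exported above (its full diff is suppressed by the viewer) provides rtn_quantize and gptq_quantize. As orientation only, asymmetric round-to-nearest group quantization of a weight reduces to something like the following NumPy sketch; the function and variable names here are illustrative and not the module's actual API:

import numpy as np

def rtn_qdq_sketch(w: np.ndarray, num_bits: int = 4, group_size: int = 32) -> np.ndarray:
    """Asymmetric round-to-nearest quantize/dequantize, one scale/zero-point per group."""
    groups = w.reshape(-1, group_size)
    maxq = 2**num_bits - 1
    rmin = groups.min(axis=1, keepdims=True)
    rmax = groups.max(axis=1, keepdims=True)
    scale = np.where(rmax > rmin, (rmax - rmin) / maxq, 1.0)
    zp = np.round(-rmin / scale)
    q = np.clip(np.round(groups / scale) + zp, 0, maxq)
    return (scale * (q - zp)).reshape(w.shape)

w = np.random.randn(64, 32).astype(np.float32)
print(np.abs(rtn_qdq_sketch(w) - w).max())  # per-group quantization error stays small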
@@ -0,0 +1,80 @@
|
||||
#
|
||||
# The implementation of this file is based on:
|
||||
# https://github.com/intel/neural-compressor/tree/master/neural_compressor
|
||||
#
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Helper classes or functions for onnxrt adaptor."""
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger("neural_compressor")
|
||||
|
||||
|
||||
MAXIMUM_PROTOBUF = 2147483648
|
||||
|
||||
|
||||
def simple_progress_bar(total, i):
|
||||
"""Progress bar for cases where tqdm can't be used."""
|
||||
progress = i / total
|
||||
bar_length = 20
|
||||
bar = "#" * int(bar_length * progress)
|
||||
spaces = " " * (bar_length - len(bar))
|
||||
percentage = progress * 100
|
||||
print(f"\rProgress: [{bar}{spaces}] {percentage:.2f}%", end="")
|
||||
|
||||
|
||||
def find_by_name(name, item_list):
|
||||
"""Helper function to find item by name in a list."""
|
||||
items = []
|
||||
for item in item_list:
|
||||
assert hasattr(item, "name"), f"{item} should have a 'name' attribute defined" # pragma: no cover
|
||||
if item.name == name:
|
||||
items.append(item)
|
||||
if len(items) > 0:
|
||||
return items[0]
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def to_numpy(data):
|
||||
"""Convert to numpy ndarrays."""
|
||||
import torch # noqa: PLC0415
|
||||
|
||||
if not isinstance(data, np.ndarray):
|
||||
if not importlib.util.find_spec("torch"):
|
||||
logger.error(
|
||||
"Please install torch to enable subsequent data type check and conversion, "
|
||||
"or reorganize your data format to numpy array."
|
||||
)
|
||||
exit(0)
|
||||
if isinstance(data, torch.Tensor):
|
||||
if data.dtype is torch.bfloat16: # pragma: no cover
|
||||
return data.detach().cpu().to(torch.float32).numpy()
|
||||
if data.dtype is torch.chalf: # pragma: no cover
|
||||
return data.detach().cpu().to(torch.cfloat).numpy()
|
||||
return data.detach().cpu().numpy()
|
||||
else:
|
||||
try:
|
||||
return np.array(data)
|
||||
except Exception:
|
||||
assert False, ( # noqa: B011
|
||||
f"The input data for onnx model is {type(data)}, which is not supported to convert to numpy ndarrays."
|
||||
)
|
||||
else:
|
||||
return data
|
||||
@@ -0,0 +1,932 @@
|
||||
#
|
||||
# The implementation of this file is based on:
|
||||
# https://github.com/intel/neural-compressor/tree/master/neural_compressor
|
||||
#
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modifications:
|
||||
# Add k-quant quantization method.
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""WeightOnly for onnxrt adaptor."""
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import numpy_helper
|
||||
from onnx.helper import np_dtype_to_tensor_dtype
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
from .onnx_model import ONNXModel
|
||||
from .util import simple_progress_bar
|
||||
|
||||
logger = logging.getLogger("neural_compressor")
|
||||
|
||||
|
||||
def make_matmul_weight_only_node(
|
||||
node,
|
||||
weight_shape,
|
||||
num_bits,
|
||||
group_size,
|
||||
k_blocks,
|
||||
q_weight,
|
||||
scale,
|
||||
zero_point,
|
||||
accuracy_level=0,
|
||||
): # pragma: no cover
|
||||
"""Build MatMulNBits node.
|
||||
|
||||
Args:
|
||||
node: original matmul node
|
||||
weight_shape: original weight shape
|
||||
num_bits (int): num_bits
|
||||
group_size (int): how many elements share one scale/zp
|
||||
k_blocks (int): block number
|
||||
q_weight (array): quantized weight
|
||||
scale (array): scale
|
||||
zero_point (array): zero point
|
||||
accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
|
||||
|
||||
Returns:
|
||||
matmul_weight_only_node: MatMulNBits node
|
||||
new_inits: initializers of the new node
|
||||
"""
|
||||
blob_size = group_size * num_bits // 8
|
||||
packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8")
|
||||
q_weight_name = node.input[1] + f"_Q{num_bits!s}G{group_size!s}"
|
||||
input_names = [node.input[0], q_weight_name]
|
||||
new_inits = []
|
||||
kwargs = {}
|
||||
|
||||
op_type = "MatMulNBits"
|
||||
|
||||
# pack quantized weight
|
||||
if num_bits == 4:
|
||||
q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4
|
||||
packed[:, :] = q_weight_pairs[:, :blob_size]
|
||||
elif num_bits == 8:
|
||||
packed = q_weight
|
||||
else:
|
||||
logger.error(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.")
|
||||
|
||||
packed = np.reshape(packed, (-1, k_blocks, blob_size))
|
||||
|
||||
# build scale tensor
|
||||
scale = np.reshape(scale, (-1, k_blocks))
|
||||
assert scale.dtype == np.float32 or scale.dtype == np.float16
|
||||
scale_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + "_scale",
|
||||
data_type=np_dtype_to_tensor_dtype(scale.dtype),
|
||||
dims=scale.shape,
|
||||
vals=scale.tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
input_names.append(scale_tensor.name)
|
||||
new_inits.append(scale_tensor)
|
||||
|
||||
# build zero_point tensor
|
||||
if zero_point is not None:
|
||||
if num_bits == 8:
|
||||
packed_zp = zero_point.astype("uint8")
|
||||
elif num_bits == 4:
|
||||
# For 4-bit case, the default zeros is 0x8. So it is 0x88 = 136 if we fill lower/higher 4 bits with 0x8.
|
||||
packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
|
||||
# create an index array
|
||||
idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
|
||||
# separate odd and even indices
|
||||
even_idx = idx[::2]
|
||||
odd_idx = idx[1::2]
|
||||
# vectorized operation for even and odd indices
|
||||
packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
|
||||
packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)
|
||||
else:
|
||||
raise ValueError(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.")
|
||||
|
||||
packed_zp = np.reshape(packed_zp, (weight_shape[1], -1))
|
||||
zp_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
|
||||
)
|
||||
input_names.append(zp_tensor.name)
|
||||
new_inits.append(zp_tensor)
|
||||
|
||||
# set kwargs
|
||||
kwargs["K"] = weight_shape[0]
|
||||
kwargs["N"] = weight_shape[1]
|
||||
kwargs["bits"] = num_bits
|
||||
kwargs["block_size"] = group_size
|
||||
if accuracy_level > 0:
|
||||
# require onnxruntime > 1.16.3
|
||||
kwargs["accuracy_level"] = accuracy_level
|
||||
|
||||
q_weight_tensor = onnx.helper.make_tensor(
|
||||
name=q_weight_name,
|
||||
data_type=2,
|
||||
dims=packed.shape,
|
||||
vals=packed.tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
new_inits.append(q_weight_tensor)
|
||||
|
||||
matmul_weight_only_node = onnx.helper.make_node(
|
||||
op_type,
|
||||
inputs=input_names,
|
||||
outputs=node.output,
|
||||
name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits),
|
||||
domain="com.microsoft",
|
||||
**kwargs,
|
||||
)
|
||||
return matmul_weight_only_node, new_inits
|
||||
|
||||
|
||||
def quant_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
|
||||
"""Quantize tensor per group.
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
|
||||
scheme (str, optional): quantization scheme. Defaults to "asym".
|
||||
dtype (str, optional): data type. Defaults to "int".
|
||||
ratio (float, optional): percentile of clip. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
output: quantized weight
|
||||
scale: scale
|
||||
zero_point: zero point
|
||||
"""
|
||||
data = np.reshape(data, (-1, group_size))
|
||||
if scheme == "asym" or dtype == "uint":
|
||||
maxq = 2**num_bits - 1
|
||||
minq = 0
|
||||
elif scheme == "sym":
|
||||
maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0
|
||||
minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1
|
||||
|
||||
rmin = np.min(data, axis=1, keepdims=True) * ratio
|
||||
rmax = np.max(data, axis=1, keepdims=True) * ratio
|
||||
if scheme == "sym":
|
||||
max_range = np.maximum(np.abs(rmin), np.abs(rmax))
|
||||
scale = np.ones(rmax.shape)
|
||||
mask = max_range > 0
|
||||
scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq)
|
||||
zero_point = (
|
||||
np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1))
|
||||
)
|
||||
else:
|
||||
scale = np.ones(rmax.shape)
|
||||
scale[rmin != rmax] = np.array(
|
||||
[float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()]
|
||||
)
|
||||
zero_point = (
|
||||
((np.zeros(scale.shape) - rmin) / scale).round()
|
||||
if dtype == "int"
|
||||
else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8")
|
||||
)
|
||||
|
||||
q_weight = np.empty_like(data, dtype=scale.dtype)
|
||||
np.divide(data, scale, out=q_weight)
|
||||
np.add(q_weight, zero_point, out=q_weight)
|
||||
np.round(q_weight, out=q_weight)
|
||||
np.clip(q_weight, minq, maxq, out=q_weight)
|
||||
|
||||
return q_weight, scale, zero_point
|
||||
|
||||
|
||||
def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32):
|
||||
"""Quantize tensor per group based on k quant.
|
||||
|
||||
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
|
||||
|
||||
Returns:
|
||||
output: quantized weight
|
||||
scale: scale
|
||||
zero_point: zero point
|
||||
"""
|
||||
data = np.reshape(data, (-1, group_size)).astype(np.float32) # nb = data.shape[0], (nb, group_size)
|
||||
maxq = 2**num_bits - 1
|
||||
minq = 0
|
||||
sum_x2 = np.sum(data**2, axis=1, keepdims=True) # (nb, 1)
|
||||
av_x = np.sqrt(sum_x2 / group_size) # (nb, 1)
|
||||
weights = np.add(av_x, np.abs(data)) # (nb, group_size)
|
||||
rmin = np.min(data, axis=1, keepdims=True) # (nb, 1)
|
||||
rmax = np.max(data, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_w = np.sum(weights, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_x = np.sum(weights * data, axis=1, keepdims=True) # (nb, group_size)
|
||||
iscale = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
mask = rmin != rmax
|
||||
iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
|
||||
scale = 1 / iscale
|
||||
quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
diff = scale * quant_data + rmin - data # (nb, group_size)
|
||||
best_mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
nstep = 20
|
||||
rdelta = 0.1
|
||||
# nstep * rdelta = -2 * rrmin, maxq - minq = 2**num_bits - 1
|
||||
rrmin = -1
|
||||
for is_ in range(nstep):
|
||||
iscale_new = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
factor = np.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
|
||||
mask = rmin != rmax
|
||||
iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
|
||||
quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
mul_weights_quant_data_new = weights * quant_data_new
|
||||
sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
|
||||
D = np.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806
|
||||
|
||||
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
|
||||
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
|
||||
|
||||
diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
|
||||
mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
|
||||
mad_1 = np.array(mad)
|
||||
best_mad_1 = np.array(best_mad)
|
||||
idx_to_replace = np.where(mad_1 < best_mad_1)[0]
|
||||
quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :]
|
||||
best_mad[idx_to_replace] = mad[idx_to_replace]
|
||||
scale[idx_to_replace] = this_scale[idx_to_replace]
|
||||
rmin[idx_to_replace] = this_min[idx_to_replace]
|
||||
|
||||
zero_point = np.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
|
||||
scale = scale.astype(np.float64)
|
||||
q_weight = np.empty_like(data, dtype=scale.dtype)
|
||||
np.divide(data, scale, out=q_weight)
|
||||
np.add(q_weight, zero_point, out=q_weight)
|
||||
np.round(q_weight, out=q_weight)
|
||||
np.clip(q_weight, minq, maxq, out=q_weight)
|
||||
|
||||
return q_weight, scale, zero_point
|
||||
|
||||
|
||||
def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32):
|
||||
"""Quantize tensor per group based on k quant.
|
||||
|
||||
Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
|
||||
|
||||
Returns:
|
||||
output: quantized weight
|
||||
scale: scale
|
||||
zero_point: zero point
|
||||
"""
|
||||
try:
|
||||
import cupy as cp # noqa: PLC0415
|
||||
import torch # noqa: PLC0415
|
||||
|
||||
if torch.cuda.is_available():
|
||||
data = cp.asarray(data)
|
||||
data = data.reshape((-1, group_size)).astype(cp.float32) # nb = data.shape[0], (nb, group_size)
|
||||
maxq = 2**num_bits - 1
|
||||
minq = 0
|
||||
sum_x2 = cp.sum(data**2, axis=1, keepdims=True) # (nb, 1)
|
||||
av_x = cp.sqrt(sum_x2 / group_size) # (nb, 1)
|
||||
weights = cp.add(av_x, cp.abs(data)) # (nb, group_size)
|
||||
rmin = cp.min(data, axis=1, keepdims=True) # (nb, 1)
|
||||
rmax = cp.max(data, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_w = cp.sum(weights, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_x = cp.sum(weights * data, axis=1, keepdims=True) # (nb, group_size)
|
||||
iscale = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
mask = rmin != rmax
|
||||
iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask])
|
||||
scale = 1 / iscale
|
||||
quant_data = cp.clip(cp.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
diff = scale * quant_data + rmin - data # (nb, group_size)
|
||||
best_mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
nstep = 20
|
||||
rdelta = 0.1
|
||||
rrmin = -1
|
||||
for is_ in range(nstep):
|
||||
iscale_new = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1)
|
||||
factor = cp.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0]
|
||||
mask = rmin != rmax
|
||||
iscale_new[mask] = factor / (rmax[mask] - rmin[mask])
|
||||
quant_data_new = cp.clip(cp.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size)
|
||||
mul_weights_quant_data_new = weights * quant_data_new
|
||||
sum_l = cp.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_l2 = cp.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1)
|
||||
sum_xl = cp.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1)
|
||||
D = cp.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806
|
||||
|
||||
this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1)
|
||||
this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1)
|
||||
|
||||
diff = this_scale * quant_data_new + this_min - data # (nb, group_size)
|
||||
mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1)
|
||||
|
||||
mad_1 = cp.array(mad)
|
||||
best_mad_1 = cp.array(best_mad)
|
||||
idx_to_replace = cp.where(mad_1 < best_mad_1)[0]
|
||||
quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :]
|
||||
best_mad[idx_to_replace] = mad[idx_to_replace]
|
||||
scale[idx_to_replace] = this_scale[idx_to_replace]
|
||||
rmin[idx_to_replace] = this_min[idx_to_replace]
|
||||
|
||||
zero_point = cp.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8")
|
||||
scale = scale.astype(cp.float64)
|
||||
q_weight = cp.empty_like(data, dtype=scale.dtype)
|
||||
cp.divide(data, scale, out=q_weight)
|
||||
cp.add(q_weight, zero_point, out=q_weight)
|
||||
cp.round(q_weight, out=q_weight)
|
||||
cp.clip(q_weight, minq, maxq, out=q_weight)
|
||||
|
||||
return q_weight.get(), scale.get(), zero_point.get()
|
||||
else:
|
||||
logger.warning(
|
||||
"Try to use k-quant quantization on CUDA. However, CUDA is not available."
|
||||
"Fall back to k-quant quantization on CPU."
|
||||
)
|
||||
return quant_tensor_k_quant_cpu(data, num_bits, group_size)
|
||||
except ImportError:
|
||||
logger.info(
|
||||
"Now we are using k-quant quantization on cpu, which is time consuming."
|
||||
"Please consider install cupy to speed up on CUDA. See https://cupy.dev/"
|
||||
"Please also install torch to check CUDA availability."
|
||||
)
|
||||
return quant_tensor_k_quant_cpu(data, num_bits, group_size)
|
||||
|
||||
|
||||
def qdq_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
|
||||
"""Quant dequant tensor per group.
|
||||
|
||||
Args:
|
||||
data : input weight
|
||||
num_bits (int, optional): num_bits. Defaults to 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
|
||||
scheme (str, optional): quantization scheme. Defaults to "asym".
|
||||
dtype (str, optional): data type. Defaults to "int".
|
||||
ratio (float, optional): percentile of clip. Defaults to 1.0.
|
||||
|
||||
Returns:
|
||||
output: quant-dequant weight
|
||||
"""
|
||||
org_shape = data.shape
|
||||
weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio)
|
||||
return np.reshape(scale * (weight - zp), org_shape)
|
||||
|
||||
|
||||
def pad_tensor(weight, group_size, k_blocks):
|
||||
"""Pad tensor rowi so that it can be is divisible by group_size.
|
||||
|
||||
Args:
|
||||
weight (array): weight
|
||||
group_size (int): how many elements share one scale/zp
|
||||
k_blocks (int): the number of block
|
||||
|
||||
Returns:
|
||||
weight: paded weight
|
||||
"""
|
||||
if group_size == -1:
|
||||
return weight
|
||||
|
||||
org_w_shape = weight.shape
|
||||
padded_rows = k_blocks * group_size
|
||||
pad_len = padded_rows - org_w_shape[0]
|
||||
|
||||
if pad_len > 0:
|
||||
weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant")
|
||||
|
||||
return weight
|
||||
|
||||
|
||||
def rtn_quantize(
|
||||
model,
|
||||
weight_config={}, # noqa: B006
|
||||
num_bits=4,
|
||||
group_size=32,
|
||||
scheme="asym",
|
||||
ratios={}, # noqa: B006
|
||||
accuracy_level=0,
|
||||
providers=["CPUExecutionProvider"], # noqa: B006
|
||||
algorithm="k_quant",
|
||||
):
|
||||
"""Quant the model with round to nearst method.
|
||||
|
||||
Args:
|
||||
model (ModelProto or ONNXModel): onnx model
|
||||
weight_config (dict): quantization config
|
||||
For example,
|
||||
weight_config = {
|
||||
'fc2':
|
||||
{
|
||||
'bits': 4,
|
||||
'group_size': 32,
|
||||
'scheme': 'sym',
|
||||
'algorithm': 'RTN'
|
||||
}
|
||||
}
|
||||
num_bits (int, optional): num_bits. Default is 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Default is 32.
|
||||
scheme (str, optional): sym or asym. Defaults to "asym".
|
||||
ratios (dict, optional): percentile of clip. Defaults to {}.
|
||||
accuracy_level (int): accuracy level. Support 0 (unset),1(fp32), 2(fp16), 3(bf16), or 4(int8).
|
||||
providers (list): providers to use
|
||||
|
||||
Returns:
|
||||
model: fake quantized ONNXModel
|
||||
"""
|
||||
model = ONNXModel(model)
|
||||
base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
|
||||
new_nodes = []
|
||||
remove_nodes = []
|
||||
total_num = len([i for i in model.nodes() if i.op_type in ["MatMul"]])
|
||||
curr_id = 0
|
||||
for node in model.nodes():
|
||||
if node.op_type in ["MatMul"]:
|
||||
curr_id += 1
|
||||
simple_progress_bar(total_num, curr_id)
|
||||
if (
|
||||
node.op_type in ["MatMul"]
|
||||
and model.get_initializer(node.input[1]) is not None
|
||||
and weight_config.get(node.name, {}) != "fp32"
|
||||
):
|
||||
weight_tensor = model.get_initializer(node.input[1])
|
||||
weight = numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy()
|
||||
if len(weight.shape) != 2:
|
||||
continue
|
||||
|
||||
dtype = weight.dtype
|
||||
|
||||
if node.name in weight_config:
|
||||
num_bits = weight_config[node.name]["bits"]
|
||||
group_size = weight_config[node.name]["group_size"]
|
||||
scheme = weight_config[node.name]["scheme"]
|
||||
|
||||
org_w_shape = weight.shape # ic, oc
|
||||
group_size = group_size if group_size != -1 else org_w_shape[0]
|
||||
|
||||
k_blocks = (org_w_shape[0] - 1) // group_size + 1
|
||||
init_share_num = model.get_initializer_share_num(node.input[1])
|
||||
|
||||
weight = pad_tensor(weight, group_size, k_blocks)
|
||||
|
||||
satisfy_MatMulNBits_condition = num_bits == 4 or num_bits == 8 # noqa: N806
|
||||
|
||||
if satisfy_MatMulNBits_condition: # pragma: no cover
|
||||
if algorithm == "k_quant":
|
||||
q_weight, scale, zp = quant_tensor_k_quant_cuda(weight.T, num_bits, group_size)
|
||||
else:
|
||||
q_weight, scale, zp = quant_tensor(
|
||||
weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)
|
||||
)
|
||||
|
||||
q_matmul_node, new_inits = make_matmul_weight_only_node(
|
||||
node=node,
|
||||
weight_shape=org_w_shape,
|
||||
num_bits=num_bits,
|
||||
group_size=group_size,
|
||||
k_blocks=k_blocks,
|
||||
q_weight=q_weight.astype("uint8"),
|
||||
scale=scale.astype(dtype),
|
||||
zero_point=zp if scheme == "asym" or algorithm == "k_quant" else None,
|
||||
accuracy_level=accuracy_level,
|
||||
)
|
||||
|
||||
model.add_initializers(new_inits)
|
||||
remove_nodes.append(node)
|
||||
new_nodes.append(q_matmul_node)
|
||||
else:
|
||||
q_weight = qdq_tensor(weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1))
|
||||
q_weight = np.reshape(q_weight, (org_w_shape[1], -1))
|
||||
q_weight = np.transpose(q_weight)
|
||||
q_weight = q_weight[: org_w_shape[0], :].astype(dtype)
|
||||
q_weight_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}",
|
||||
data_type=np_dtype_to_tensor_dtype(dtype),
|
||||
dims=weight.shape,
|
||||
vals=q_weight.tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
model.add_initializer(q_weight_tensor)
|
||||
node.input[1] = q_weight_tensor.name
|
||||
if init_share_num == 1:
|
||||
model.remove_initializer(weight_tensor)
|
||||
|
||||
model.add_nodes(new_nodes)
|
||||
model.remove_nodes(remove_nodes)
|
||||
model.topological_sort()
|
||||
return model
|
||||
|
||||
|
||||
def get_weight_scale(weight, group_size):
|
||||
"""Get the scale of weight."""
|
||||
org_shape = weight.shape
|
||||
weight = np.reshape(weight, (-1, group_size)) if group_size != -1 else weight
|
||||
scale = np.mean(np.reshape(np.abs(weight) / np.max(np.abs(weight), axis=1, keepdims=True), org_shape), axis=0)
|
||||
return scale
|
||||
|
||||
|
||||
def prepare_inputs(model, n_samples, dataloader, providers):
|
||||
"""Prepare inputs for weight only quantization.
|
||||
|
||||
Args:
|
||||
model (ModelProto or ONNXModel): onnx model
|
||||
n_samples (int, optional): calibration sample number. -1 means all samples.
|
||||
dataloader (object): dataloader for calibration.
|
||||
providers (list): providers to use
|
||||
|
||||
Returns:
|
||||
inputs: prepared inputs.
|
||||
so: session options
|
||||
"""
|
||||
from importlib.util import find_spec # noqa: PLC0415
|
||||
|
||||
from .util import to_numpy # noqa: PLC0415
|
||||
|
||||
so = ort.SessionOptions()
|
||||
if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover
|
||||
from onnxruntime_extensions import get_library_path # noqa: PLC0415
|
||||
|
||||
so.register_custom_ops_library(get_library_path())
|
||||
if model.is_large_model:
|
||||
onnx.save_model(
|
||||
model.model,
|
||||
model.model_path + "_augment.onnx",
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=True,
|
||||
convert_attribute=False,
|
||||
)
|
||||
|
||||
session = (
|
||||
ort.InferenceSession(model.model.SerializeToString(), so, providers=providers)
|
||||
if not model.is_large_model
|
||||
else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers)
|
||||
)
|
||||
inputs_names = [i.name for i in session.get_inputs()]
|
||||
del session
|
||||
|
||||
inputs = []
|
||||
for i, data in enumerate(dataloader):
|
||||
if n_samples != -1 and ((i + 1) * dataloader.batch_size) > n_samples:
|
||||
break
|
||||
if len(inputs_names) != 1 or isinstance(data[0], dict):
|
||||
assert len(data[0]) == len(inputs_names), (
|
||||
f"Input number mismatch, require {len(inputs_names)} but get {len(data[0])}"
|
||||
)
|
||||
|
||||
if isinstance(data[0], dict):
|
||||
inputs.append(dict([(name, to_numpy(inp_data)) for name, inp_data in data[0].items()])) # noqa: C404
|
||||
elif isinstance(data[0], np.ndarray): # pragma: no cover
|
||||
inputs.append(dict([(name, inp) for name, inp in zip(inputs_names, [data[0]], strict=False)])) # noqa: C404
|
||||
else: # pragma: no cover
|
||||
inputs.append(dict([(name, to_numpy(inp)) for name, inp in zip(inputs_names, data[0], strict=False)])) # noqa: C404
|
||||
return inputs, so
|
||||
|
||||
|
||||
def gptq(
|
||||
W,
|
||||
H,
|
||||
num_bits=4,
|
||||
group_size=32,
|
||||
scheme="asym",
|
||||
blocksize=128,
|
||||
percdamp=0.01,
|
||||
actorder=False,
|
||||
mse=False,
|
||||
perchannel=True,
|
||||
):
|
||||
"""Quant the weight with GPTQ method.
|
||||
|
||||
Args:
|
||||
W (array): weight.
|
||||
H (array): Hessian matrix.
|
||||
num_bits (int, optional): num_bits. Default is 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Default is 32.
|
||||
scheme (str, optional): sym or asym. Defaults to "asym".
|
||||
blocksize (int, optional): blocksize to quantize weight.
|
||||
percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
|
||||
actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
|
||||
mse (bool, optional): whether get scale and zero point with mse error.
|
||||
perchannel (bool, optional): whether quantize weight per-channel.
|
||||
|
||||
Returns:
|
||||
Q: fake quantized weight
|
||||
"""
|
||||
maxq = 2**num_bits - 1
|
||||
grid = 100
|
||||
maxshrink = 0.8
|
||||
norm = 2.4
|
||||
|
||||
def find_params(weight):
|
||||
org_shape = weight.shape
|
||||
# find zp, scale
|
||||
if not perchannel:
|
||||
weight = np.expand_dims(weight.flatten(), axis=1)
|
||||
tmp = np.zeros(weight.shape[1])
|
||||
xmin = np.minimum(np.min(weight, axis=0), tmp)
|
||||
xmax = np.maximum(np.max(weight, axis=0), tmp)
|
||||
if scheme == "sym":
|
||||
xmax = np.maximum(np.abs(xmin), xmax)
|
||||
tmp = xmin < 0
|
||||
if np.any(tmp):
|
||||
xmin[tmp] = -xmax[tmp]
|
||||
tmp = (xmin == 0) & (xmax == 0)
|
||||
xmin[tmp] = -1
|
||||
xmax[tmp] = +1
|
||||
|
||||
scale = (xmax - xmin) / maxq
|
||||
if scheme == "sym":
|
||||
zero = np.ones(scale.shape) * (maxq + 1) / 2
|
||||
else:
|
||||
zero = np.round(-xmin / scale)
|
||||
if mse:
|
||||
best = np.ones([weight.shape[1]]) * float("inf")
|
||||
for i in range(int(maxshrink * grid)):
|
||||
p = 1 - i / grid
|
||||
xmin1 = p * xmin
|
||||
xmax1 = p * xmax
|
||||
scale1 = (xmax1 - xmin1) / maxq
|
||||
zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero
|
||||
q = np.clip(np.round(weight / scale1) + zero1, 0, maxq)
|
||||
q -= weight
|
||||
q = np.power(np.abs(q), norm)
|
||||
err = np.sum(q, 0)
|
||||
tmp = err < best
|
||||
if np.any(tmp):
|
||||
best[tmp] = err[tmp]
|
||||
scale[tmp] = scale1[tmp]
|
||||
zero[tmp] = zero1[tmp]
|
||||
if not perchannel:
|
||||
tmp = org_shape[1]
|
||||
scale = np.repeat(scale, tmp)
|
||||
zero = np.repeat(zero, tmp)
|
||||
shape = [-1] + [1] * (len(org_shape) - 1)
|
||||
scale = np.reshape(scale, shape)
|
||||
zero = np.reshape(zero, shape)
|
||||
return scale, zero
|
||||
|
||||
shape = W.shape
|
||||
scale, zp = find_params(W)
|
||||
dead = np.diag(H) == 0
|
||||
H[dead, dead] = 1
|
||||
W[dead, :] = 0 # such channel makes no contribution to quantization computation
|
||||
|
||||
# rearrange considering the diag's value
|
||||
if actorder:
|
||||
perm = np.argsort(np.diag(H))[::-1]
|
||||
W = W[perm, :] # noqa: N806
|
||||
H = H[perm, :][:, perm] # noqa: N806
|
||||
Losses = np.zeros_like(W) # noqa: N806
|
||||
Q = np.zeros_like(W) # noqa: N806
|
||||
damp = percdamp * np.mean(np.diag(H))
|
||||
diag = np.arange(shape[0])
|
||||
H[diag, diag] += damp # add a average value of
|
||||
H = np.linalg.cholesky(np.linalg.inv(H)).T # noqa: N806
|
||||
Hinv = H # noqa: N806
|
||||
for i1 in range(0, shape[0], blocksize):
|
||||
i2 = min(i1 + blocksize, shape[0])
|
||||
count = i2 - i1
|
||||
|
||||
W1 = copy.deepcopy(W[i1:i2, :]) # noqa: N806
|
||||
Q1 = np.zeros_like(W1) # noqa: N806
|
||||
Err1 = np.zeros_like(W1) # noqa: N806
|
||||
Losses1 = np.zeros_like(W1) # noqa: N806
|
||||
Hinv1 = Hinv[i1:i2, i1:i2] # noqa: N806
|
||||
|
||||
for i in range(count): # within a block, channel wise
|
||||
w = W1[i, :]
|
||||
d = Hinv1[i, i]
|
||||
|
||||
if group_size != -1:
|
||||
if (i1 + i) % group_size == 0:
|
||||
scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :])
|
||||
|
||||
q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten()
|
||||
Q1[i, :] = q
|
||||
Losses1[i, :] = (w - q) ** 2 / d**2
|
||||
|
||||
err1 = (w - q) / d
|
||||
W1[i:, :] -= np.matmul(np.expand_dims(Hinv1[i:, i], axis=1), np.expand_dims(err1, axis=0))
|
||||
Err1[i, :] = err1
|
||||
|
||||
Q[i1:i2, :] = Q1
|
||||
Losses[i1:i2, :] = Losses1 / 2
|
||||
|
||||
W[i2:, :] -= np.matmul(Hinv[i2:, i1:i2], Err1)
|
||||
|
||||
if actorder:
|
||||
invperm = np.argsort(perm)
|
||||
Q = Q[invperm, :] # noqa: N806
|
||||
|
||||
Q = np.reshape(Q, W.shape) # noqa: N806
|
||||
del W
|
||||
return Q
|
||||
|
||||
|
||||
def gptq_quantize(
|
||||
model,
|
||||
dataloader,
|
||||
weight_config={}, # noqa: B006
|
||||
num_bits=4,
|
||||
group_size=32,
|
||||
scheme="asym",
|
||||
n_samples=128,
|
||||
percdamp=0.01,
|
||||
blocksize=128,
|
||||
actorder=False,
|
||||
mse=False,
|
||||
perchannel=True,
|
||||
accuracy_level=0,
|
||||
providers=["CPUExecutionProvider"], # noqa: B006
|
||||
):
|
||||
"""Quant the model with GPTQ method.
|
||||
|
||||
Args:
|
||||
model (ModelProto or ONNXModel): onnx model
|
||||
dataloader (object): dataloader for calibration.
|
||||
weight_config (dict): quantization config
|
||||
For example,
|
||||
weight_config = {
|
||||
'fc2':
|
||||
{
|
||||
'bits': 4,
|
||||
'group_size': 32,
|
||||
'scheme': 'sym',
|
||||
'algorithm': 'GPTQ'
|
||||
}
|
||||
}
|
||||
num_bits (int, optional): num_bits. Default is 4.
|
||||
group_size (int, optional): how many elements share one scale/zp. Default is 32.
|
||||
scheme (str, optional): sym or asym. Defaults to "asym".
|
||||
n_samples (int, optional): calibration sample number.
|
||||
percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
|
||||
blocksize (int, optional): blocksize to quantize weight.
|
||||
actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
|
||||
mse (bool, optional): whether get scale and zero point with mse error.
|
||||
perchannel (bool, optional): whether quantize weight per-channel.
|
||||
accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
|
||||
providers (list): providers to use
|
||||
|
||||
Returns:
|
||||
model: fake quantized ONNXModel
|
||||
"""
|
||||
model = ONNXModel(model)
|
||||
base_dir = os.path.dirname(model.model_path) if model.model_path is not None else ""
|
||||
|
||||
inputs, so = prepare_inputs(model, n_samples, dataloader, providers)
|
||||
del dataloader
|
||||
org_output = copy.deepcopy(model.model.graph.output)
|
||||
model.remove_tensors_from_outputs([i.name for i in org_output])
|
||||
output_names = []
|
||||
for node in model.nodes():
|
||||
if (
|
||||
node.op_type in ["MatMul"]
|
||||
and weight_config.get(node.name, {}) != "fp32"
|
||||
and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ"
|
||||
):
|
||||
output_names.append(node.input[0])
|
||||
output_names = list(set(output_names))
|
||||
model.add_tensors_to_outputs(output_names)
|
||||
if model.is_large_model:
|
||||
onnx.save_model(
|
||||
model.model,
|
||||
model.model_path + "_augment.onnx",
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=True,
|
||||
convert_attribute=False,
|
||||
)
|
||||
|
||||
session = (
|
||||
ort.InferenceSession(model.model.SerializeToString(), so, providers=providers)
|
||||
if not model.is_large_model
|
||||
else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers)
|
||||
)
|
||||
|
||||
for idx, input_name in enumerate(output_names):
|
||||
simple_progress_bar(len(output_names), idx + 1)
|
||||
node_list = []
|
||||
weights = []
|
||||
|
||||
for node in model.input_name_to_nodes[input_name]:
|
||||
if (
|
||||
node.op_type in ["MatMul"]
|
||||
and weight_config.get(node.name, {}) != "fp32"
|
||||
and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ"
|
||||
and model.get_initializer(node.input[1]) is not None
|
||||
):
|
||||
weight = numpy_helper.to_array(
|
||||
model.get_initializer(model.get_node(node.name).input[1]), base_dir
|
||||
).copy()
|
||||
if len(weight.shape) != 2:
|
||||
continue
|
||||
|
||||
weights.append(weight)
|
||||
node_list.append(model.get_node(node.name))
|
||||
|
||||
if len(weights) == 0:
|
||||
continue
|
||||
|
||||
Hs = [np.zeros((i.shape[0], i.shape[0])) for i in weights] # noqa: N806
|
||||
nsamples = 0
|
||||
for data in inputs:
|
||||
inp = session.run([input_name], data)[0]
|
||||
tmp = inp.shape[0]
|
||||
inp = np.reshape(inp, (-1, inp.shape[-1]))
|
||||
Hs = [i * (nsamples / (nsamples + tmp)) for i in Hs] # noqa: N806
|
||||
nsamples += tmp
|
||||
inp = np.sqrt(2 / nsamples) * inp
|
||||
Hs = [i + np.matmul(inp.T, inp) for i in Hs] # noqa: N806
|
||||
|
||||
for (
|
||||
node,
|
||||
weight,
|
||||
H, # noqa: N806
|
||||
) in zip(node_list, weights, Hs, strict=False):
|
||||
if node.name in weight_config:
|
||||
num_bits = weight_config[node.name]["bits"]
|
||||
group_size = weight_config[node.name]["group_size"]
|
||||
scheme = weight_config[node.name]["scheme"]
|
||||
group_size = group_size if group_size != -1 else weight.shape[0]
|
||||
dtype = weight.dtype
|
||||
|
||||
q_weight = gptq(
|
||||
weight,
|
||||
H,
|
||||
num_bits=num_bits,
|
||||
group_size=group_size,
|
||||
scheme=scheme,
|
||||
blocksize=blocksize,
|
||||
percdamp=percdamp,
|
||||
actorder=actorder,
|
||||
mse=mse,
|
||||
perchannel=perchannel,
|
||||
)
|
||||
|
||||
weight_tensor = model.get_initializer(node.input[1])
|
||||
init_share_num = model.get_initializer_share_num(node.input[1])
|
||||
|
||||
satisfy_MatMulNBits_condition = num_bits == 4 # noqa: N806
|
||||
|
||||
if satisfy_MatMulNBits_condition: # pragma: no cover
|
||||
org_shape = weight.shape
|
||||
k_blocks = (org_shape[0] + group_size - 1) // group_size
|
||||
q_weight = pad_tensor(q_weight, group_size, k_blocks)
|
||||
q_weight, scale, zp = quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint")
|
||||
q_matmul_node, new_inits = make_matmul_weight_only_node(
|
||||
node=node,
|
||||
weight_shape=org_shape,
|
||||
num_bits=num_bits,
|
||||
group_size=group_size,
|
||||
k_blocks=k_blocks,
|
||||
q_weight=q_weight.astype("uint8"),
|
||||
scale=scale.astype(dtype),
|
||||
zero_point=zp if scheme == "asym" else None,
|
||||
accuracy_level=accuracy_level,
|
||||
)
|
||||
|
||||
model.add_initializers(new_inits)
|
||||
model.remove_node(node)
|
||||
model.add_node(q_matmul_node)
|
||||
else:
|
||||
q_weight_tensor = onnx.helper.make_tensor(
|
||||
name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}",
|
||||
data_type=np_dtype_to_tensor_dtype(dtype),
|
||||
dims=q_weight.shape,
|
||||
vals=q_weight.astype(dtype).tobytes(),
|
||||
raw=True,
|
||||
)
|
||||
model.add_initializer(q_weight_tensor)
|
||||
node.input[1] = q_weight_tensor.name
|
||||
if init_share_num == 1:
|
||||
model.remove_initializer(weight_tensor)
|
||||
|
||||
model.remove_tensors_from_outputs(output_names)
|
||||
model.model.graph.output.MergeFrom(org_output)
|
||||
|
||||
model.topological_sort()
|
||||
|
||||
# reload external data to prevent external data file path errors
|
||||
if model.is_large_model:
|
||||
from onnx.external_data_helper import load_external_data_for_model # noqa: PLC0415
|
||||
|
||||
load_external_data_for_model(model.model, os.path.split(model.model_path)[0])
|
||||
|
||||
return model
|
||||
@@ -0,0 +1,600 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from pathlib import Path
|
||||
|
||||
import onnx
|
||||
import onnx.helper as onnx_helper
|
||||
import onnx.numpy_helper as onnx_numpy_helper
|
||||
from onnx.onnx_pb import ModelProto
|
||||
|
||||
from .quant_utils import attribute_to_kwarg, find_by_name
|
||||
|
||||
|
||||
def _clean_initializers_helper(graph, model):
|
||||
"""Clean unused initializers from graph.
|
||||
|
||||
Returns:
|
||||
A cleaned graph without unused initializers
|
||||
A list of tensor names that are not produced by this graph or its subgraphs
|
||||
"""
|
||||
requesting_tensor_names = set()
|
||||
requesting_tensor_names.update(input_name for node in graph.node for input_name in node.input if input_name)
|
||||
requesting_tensor_names.update(g_out.name for g_out in graph.output if g_out.name)
|
||||
|
||||
new_nodes = []
|
||||
for node in graph.node:
|
||||
new_node = node
|
||||
graph_attrs = [
|
||||
attr
|
||||
for attr in node.attribute
|
||||
if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS
|
||||
]
|
||||
if graph_attrs:
|
||||
kwargs = {}
|
||||
for attr in node.attribute:
|
||||
new_attribute = {}
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
(
|
||||
cleaned_sub_graph,
|
||||
sub_requesting_tensor_names,
|
||||
) = _clean_initializers_helper(attr.g, model)
|
||||
new_attribute = {attr.name: cleaned_sub_graph}
|
||||
requesting_tensor_names.update(sub_requesting_tensor_names)
|
||||
elif attr.type == onnx.AttributeProto.GRAPHS:
|
||||
cleaned_graphs = []
|
||||
for subgraph in attr.graphs:
|
||||
(
|
||||
cleaned_sub_graph,
|
||||
sub_requesting_tensor_names,
|
||||
) = _clean_initializers_helper(subgraph, model)
|
||||
cleaned_graphs.append(cleaned_sub_graph)
|
||||
requesting_tensor_names.update(sub_requesting_tensor_names)
|
||||
new_attribute = {attr.name: cleaned_graphs}
|
||||
else:
|
||||
new_attribute = attribute_to_kwarg(attr)
|
||||
kwargs.update(new_attribute)
|
||||
new_node = onnx_helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
|
||||
new_nodes.append(new_node)
|
||||
|
||||
graph.ClearField("node")
|
||||
graph.node.extend(new_nodes)
|
||||
|
||||
requesting_tensor_names.difference_update(output for node in graph.node for output in node.output)
|
||||
|
||||
unused_initializer = []
|
||||
for initializer in graph.initializer:
|
||||
if initializer.name in requesting_tensor_names:
|
||||
requesting_tensor_names.remove(initializer.name)
|
||||
else:
|
||||
# mark it for removal; removing it here directly would cause misbehavior
|
||||
unused_initializer.append(initializer)
|
||||
|
||||
name_to_input = {input.name: input for input in graph.input}
|
||||
for initializer in unused_initializer:
|
||||
graph.initializer.remove(initializer)
|
||||
if initializer.name in name_to_input:
|
||||
try:
|
||||
graph.input.remove(name_to_input[initializer.name])
|
||||
except StopIteration:
|
||||
if model.ir_version < 4:
|
||||
print(f"Warning: invalid weight name {initializer.name} found in the graph (not a graph input)")
|
||||
|
||||
requesting_tensor_names.difference_update(input.name for input in graph.input)
|
||||
|
||||
return graph, requesting_tensor_names
|
||||
|
||||
|
||||
class ONNXModel:
|
||||
def __init__(self, model: ModelProto):
|
||||
self.model = model
|
||||
|
||||
def nodes(self):
|
||||
return self.model.graph.node
|
||||
|
||||
def initializer(self):
|
||||
return self.model.graph.initializer
|
||||
|
||||
def initializer_extend(self, inits):
|
||||
if len(inits) == 0:
|
||||
raise ValueError("Can add an empty list.")
|
||||
for init in self.initializer():
|
||||
self._check_init(init, "gain")
|
||||
for init in inits:
|
||||
self._check_init(init)
|
||||
self.model.graph.initializer.append(init)
|
||||
|
||||
def graph(self):
|
||||
return self.model.graph
|
||||
|
||||
def ir_version(self):
|
||||
return self.model.ir_version
|
||||
|
||||
def opset_import(self):
|
||||
return self.model.opset_import
|
||||
|
||||
def set_opset_import(self, domain, version):
|
||||
for opset in self.model.opset_import:
|
||||
if opset.domain == domain:
|
||||
opset.version = version
|
||||
return
|
||||
|
||||
self.model.opset_import.extend([onnx_helper.make_opsetid(domain, version)])
|
||||
|
||||
def remove_node(self, node):
|
||||
if node in self.model.graph.node:
|
||||
self.model.graph.node.remove(node)
|
||||
|
||||
def remove_nodes(self, nodes_to_remove):
|
||||
for node in nodes_to_remove:
|
||||
self.remove_node(node)
|
||||
|
||||
def add_node(self, node):
|
||||
self.model.graph.node.extend([self._check_node(node)])
|
||||
|
||||
def add_nodes(self, nodes_to_add):
|
||||
for node in nodes_to_add:
|
||||
self.add_node(node)
|
||||
|
||||
def add_initializer(self, tensor):
|
||||
if find_by_name(tensor.name, self.model.graph.initializer) is None:
|
||||
self._check_init(tensor)
|
||||
self.model.graph.initializer.extend([tensor])
|
||||
|
||||
def get_initializer(self, name):
|
||||
for tensor in self.model.graph.initializer:
|
||||
if tensor.name == name:
|
||||
return tensor
|
||||
return None
|
||||
|
||||
def find_graph_input(self, input_name):
|
||||
for input in self.model.graph.input:
|
||||
if input.name == input_name:
|
||||
return input
|
||||
return None
|
||||
|
||||
def find_graph_output(self, output_name):
|
||||
for output in self.model.graph.output:
|
||||
if output.name == output_name:
|
||||
return output
|
||||
return None
|
||||
|
||||
def get_tensor_type(self, tensor_name: str):
|
||||
tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}
|
||||
|
||||
if tensor_name in tensor_type_map:
|
||||
return tensor_type_map[tensor_name].tensor_type
|
||||
|
||||
g_input = self.find_graph_input(tensor_name)
|
||||
if g_input:
|
||||
return g_input.type.tensor_type
|
||||
|
||||
g_output = self.find_graph_output(tensor_name)
|
||||
if g_output:
|
||||
return g_output.type.tensor_type
|
||||
|
||||
return None
|
||||
|
||||
def get_constant_value(self, output_name):
|
||||
for node in self.model.graph.node:
|
||||
if node.op_type == "Constant":
|
||||
if node.output[0] == output_name:
|
||||
for attr in node.attribute:
|
||||
if attr.name == "value":
|
||||
return onnx_numpy_helper.to_array(attr.t)
|
||||
|
||||
# Fallback to initializer since constant folding may have been applied.
|
||||
initializer = self.get_initializer(output_name)
|
||||
if initializer is not None:
|
||||
return onnx_numpy_helper.to_array(initializer)
|
||||
|
||||
return None
|
||||
|
||||
def get_initializer_name_set(self):
|
||||
return {initializer.name for initializer in self.model.graph.initializer}
|
||||
|
||||
def remove_initializer(self, tensor):
|
||||
if tensor in self.model.graph.initializer:
|
||||
self.model.graph.initializer.remove(tensor)
|
||||
for input in self.model.graph.input:
|
||||
if input.name == tensor.name:
|
||||
self.model.graph.input.remove(input)
|
||||
break
|
||||
|
||||
def remove_initializers(self, init_to_remove):
|
||||
for initializer in init_to_remove:
|
||||
self.remove_initializer(initializer)
|
||||
|
||||
def get_non_initializer_inputs(self):
|
||||
initializer_names = self.get_initializer_name_set()
|
||||
non_initializer_inputs = set()
|
||||
for input in self.model.graph.input:
|
||||
if input.name not in initializer_names:
|
||||
non_initializer_inputs.add(input.name)
|
||||
return non_initializer_inputs
|
||||
|
||||
def input_name_to_nodes(self):
|
||||
input_name_to_nodes = {}
|
||||
for node in self.model.graph.node:
|
||||
for input_name in node.input:
|
||||
if input_name: # Could be empty when it is optional
|
||||
if input_name not in input_name_to_nodes:
|
||||
input_name_to_nodes[input_name] = [node]
|
||||
else:
|
||||
input_name_to_nodes[input_name].append(node)
|
||||
return input_name_to_nodes
|
||||
|
||||
def output_name_to_node(self):
|
||||
output_name_to_node = {}
|
||||
for node in self.model.graph.node:
|
||||
for output_name in node.output:
|
||||
if output_name: # Could be empty when it is optional
|
||||
output_name_to_node[output_name] = node
|
||||
return output_name_to_node
|
||||
|
||||
def get_children(self, node, input_name_to_nodes=None):
|
||||
if input_name_to_nodes is None:
|
||||
input_name_to_nodes = self.input_name_to_nodes()
|
||||
|
||||
children = []
|
||||
for output in node.output:
|
||||
if output in input_name_to_nodes:
|
||||
for node in input_name_to_nodes[output]:
|
||||
children.append(node) # noqa: PERF402
|
||||
return children
|
||||
|
||||
def get_parents(self, node, output_name_to_node=None):
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.output_name_to_node()
|
||||
|
||||
parents = []
|
||||
for input in node.input:
|
||||
if input in output_name_to_node:
|
||||
parents.append(output_name_to_node[input])
|
||||
return parents
|
||||
|
||||
def get_parent(self, node, idx, output_name_to_node=None):
|
||||
if output_name_to_node is None:
|
||||
output_name_to_node = self.output_name_to_node()
|
||||
|
||||
if len(node.input) <= idx:
|
||||
return None
|
||||
|
||||
input = node.input[idx]
|
||||
if input not in output_name_to_node:
|
||||
return None
|
||||
|
||||
return output_name_to_node[input]
|
||||
|
||||
def find_node_by_name(self, node_name, new_nodes_list, graph):
|
||||
"""Find out if a node exists in a graph or a node is in the
|
||||
new set of nodes created during quantization.
|
||||
|
||||
Returns:
|
||||
The node found or None.
|
||||
"""
|
||||
graph_nodes_list = list(graph.node)  # copy of the node list (the nodes themselves are not copied)
|
||||
graph_nodes_list.extend(new_nodes_list)
|
||||
node = find_by_name(node_name, graph_nodes_list)
|
||||
return node
|
||||
|
||||
def get_largest_node_name_suffix(self, node_name_prefix):
|
||||
"""
|
||||
Gets the largest node name (int) suffix for all node names that begin with `node_name_prefix`.
|
||||
Example: for nodes my_prefix_0 and my_prefix_3, this method returns 3.
|
||||
"""
|
||||
suffix = -1
|
||||
|
||||
for node in self.model.graph.node:
|
||||
if node.name and node.name.startswith(node_name_prefix):
|
||||
try:
|
||||
index = int(node.name[len(node_name_prefix) :])
|
||||
suffix = max(index, suffix)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return suffix
|
||||
|
||||
def get_largest_initializer_name_suffix(self, initializer_name_prefix):
|
||||
"""
|
||||
Gets the largest initializer name integer suffix for all initializer names that begin
|
||||
with `initializer_name_prefix`. This can be used to create unique initializer names.
|
||||
|
||||
Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if
|
||||
`initializer_name_prefix` is 'my_weight_'.
|
||||
"""
|
||||
suffix = -1
|
||||
|
||||
for initializer in self.model.graph.initializer:
|
||||
if initializer.name.startswith(initializer_name_prefix):
|
||||
try:
|
||||
index = int(initializer.name[len(initializer_name_prefix) :])
|
||||
suffix = max(index, suffix)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return suffix
|
||||
|
||||
def find_nodes_by_initializer(self, graph, initializer):
|
||||
"""
|
||||
Find all nodes with given initializer as an input.
|
||||
"""
|
||||
nodes = []
|
||||
for node in graph.node:
|
||||
for node_input in node.input:
|
||||
if node_input == initializer.name:
|
||||
nodes.append(node)
|
||||
return nodes
|
||||
|
||||
@staticmethod
|
||||
def __get_initializer(name, graph_path):
|
||||
for gid in range(len(graph_path) - 1, -1, -1):
|
||||
graph = graph_path[gid]
|
||||
for tensor in graph.initializer:
|
||||
if tensor.name == name:
|
||||
return tensor, graph
|
||||
return None, None
|
||||
|
||||
@staticmethod
|
||||
def __replace_gemm_with_matmul(graph_path):
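# Recursively rewrites Gemm nodes with alpha == 1.0, beta == 1.0 and transA == 0 into
# MatMul (plus an Add for the bias when present). If transB == 1, the B initializer is
# transposed in place when possible; otherwise a Transpose node is inserted. Attribute
# types 5 and 10 are GRAPH and GRAPHS subgraph attributes, which are processed recursively.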
|
||||
new_nodes = []
|
||||
graph = graph_path[-1]
|
||||
for node in graph.node:
|
||||
graph_attrs = [attr for attr in node.attribute if attr.type == 5 or attr.type == 10]
|
||||
if graph_attrs:
|
||||
kwargs = {}
|
||||
for attr in node.attribute:
|
||||
if attr.type == 5:
|
||||
graph_path.append(attr.g)
|
||||
kv = {attr.name: ONNXModel.__replace_gemm_with_matmul(graph_path)}
|
||||
elif attr.type == 10:
|
||||
value = []
|
||||
for subgraph in attr.graphs:
|
||||
graph_path.append(subgraph)
|
||||
value.extend([ONNXModel.__replace_gemm_with_matmul(graph_path)])
|
||||
kv = {attr.name: value}
|
||||
else:
|
||||
kv = attribute_to_kwarg(attr)
|
||||
kwargs.update(kv)
|
||||
node = onnx_helper.make_node( # noqa: PLW2901
|
||||
node.op_type, node.input, node.output, name=node.name, **kwargs
|
||||
)
|
||||
|
||||
if node.op_type == "Gemm":
|
||||
alpha = 1.0
|
||||
beta = 1.0
|
||||
transA = 0 # noqa: N806
|
||||
transB = 0 # noqa: N806
|
||||
for attr in node.attribute:
|
||||
if attr.name == "alpha":
|
||||
alpha = onnx_helper.get_attribute_value(attr)
|
||||
elif attr.name == "beta":
|
||||
beta = onnx_helper.get_attribute_value(attr)
|
||||
elif attr.name == "transA":
|
||||
transA = onnx_helper.get_attribute_value(attr) # noqa: N806
|
||||
elif attr.name == "transB":
|
||||
transB = onnx_helper.get_attribute_value(attr) # noqa: N806
|
||||
if alpha == 1.0 and beta == 1.0 and transA == 0:
|
||||
inputB = node.input[1] # noqa: N806
|
||||
if transB == 1:
|
||||
B, Bs_graph = ONNXModel.__get_initializer(node.input[1], graph_path) # noqa: N806
|
||||
if B:
|
||||
# assume B is not used by any other node
|
||||
B_array = onnx_numpy_helper.to_array(B) # noqa: N806
|
||||
B_trans = onnx_numpy_helper.from_array(B_array.T) # noqa: N806
|
||||
B_trans.name = B.name
|
||||
Bs_graph.initializer.remove(B)
|
||||
for input in Bs_graph.input:
|
||||
if input.name == inputB:
|
||||
Bs_graph.input.remove(input)
|
||||
break
|
||||
Bs_graph.initializer.extend([B_trans])
|
||||
else:
|
||||
inputB += "_Transposed" # noqa: N806
|
||||
transpose_node = onnx_helper.make_node(
|
||||
"Transpose",
|
||||
inputs=[node.input[1]],
|
||||
outputs=[inputB],
|
||||
name=node.name + "_Transpose" if node.name else "",
|
||||
)
|
||||
new_nodes.append(transpose_node)
|
||||
|
||||
matmul_node = onnx_helper.make_node(
|
||||
"MatMul",
|
||||
inputs=[node.input[0], inputB],
|
||||
outputs=[node.output[0] + ("_MatMul" if len(node.input) > 2 else "")],
|
||||
name=node.name + "_MatMul" if node.name else "",
|
||||
)
|
||||
new_nodes.append(matmul_node)
|
||||
|
||||
if len(node.input) > 2:
|
||||
add_node = onnx_helper.make_node(
|
||||
"Add",
|
||||
inputs=[node.output[0] + "_MatMul", node.input[2]],
|
||||
outputs=node.output,
|
||||
name=node.name + "_Add" if node.name else "",
|
||||
)
|
||||
new_nodes.append(add_node)
|
||||
|
||||
# unsupported
|
||||
else:
|
||||
new_nodes.append(node)
|
||||
|
||||
# not GEMM
|
||||
else:
|
||||
new_nodes.append(node)
|
||||
|
||||
graph.ClearField("node")
|
||||
graph.node.extend(new_nodes)
|
||||
graph_path.pop()
|
||||
return graph
|
||||
|
||||
def replace_gemm_with_matmul(self):
|
||||
graph_path = [self.graph()]
|
||||
ONNXModel.__replace_gemm_with_matmul(graph_path)
|
||||
|
||||
def save_model_to_file(self, output_path, use_external_data_format=False):
|
||||
"""
|
||||
Save the model to a file, optionally using the external data format, which is needed when the model size exceeds 2GB.
|
||||
"""
|
||||
self.topological_sort()
|
||||
if use_external_data_format:
|
||||
onnx.external_data_helper.convert_model_to_external_data(
|
||||
self.model,
|
||||
all_tensors_to_one_file=True,
|
||||
location=Path(output_path).name + ".data",
|
||||
convert_attribute=True,
|
||||
)
|
||||
for init in self.model.graph.initializer:
|
||||
self._check_init(init, "end")
|
||||
onnx.save_model(self.model, output_path)
|
||||
|
||||
@staticmethod
|
||||
def replace_node_input(node, old_input_name, new_input_name):
|
||||
assert isinstance(old_input_name, str) and isinstance(new_input_name, str)
|
||||
for j in range(len(node.input)):
|
||||
if node.input[j] == old_input_name:
|
||||
node.input[j] = new_input_name
|
||||
|
||||
def replace_input_of_all_nodes(self, old_input_name, new_input_name):
|
||||
for node in self.model.graph.node:
|
||||
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
|
||||
|
||||
def replace_input_of_nodes(self, old_input_name, new_input_name, node_names_set):
|
||||
for node in self.model.graph.node:
|
||||
if node.name in node_names_set:
|
||||
ONNXModel.replace_node_input(node, old_input_name, new_input_name)
|
||||
|
||||
@staticmethod
|
||||
def replace_node_output(node, old_output_name, new_output_name):
|
||||
assert isinstance(old_output_name, str) and isinstance(new_output_name, str)
|
||||
for j in range(len(node.output)):
|
||||
if node.output[j] == old_output_name:
|
||||
node.output[j] = new_output_name
|
||||
|
||||
def replace_output_of_all_nodes(self, old_output_name, new_output_name):
|
||||
for node in self.model.graph.node:
|
||||
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
|
||||
|
||||
def replace_output_of_nodes(self, old_output_name, new_output_name, node_names_set):
|
||||
for node in self.model.graph.node:
|
||||
if node.name in node_names_set:
|
||||
ONNXModel.replace_node_output(node, old_output_name, new_output_name)
|
||||
|
||||
def remove_unused_constant(self):
|
||||
input_name_to_nodes = self.input_name_to_nodes()
|
||||
|
||||
# remove unused constant
|
||||
unused_nodes = []
|
||||
nodes = self.nodes()
|
||||
for node in nodes:
|
||||
if (
|
||||
node.op_type == "Constant"
|
||||
and not self.is_graph_output(node.output[0])
|
||||
and node.output[0] not in input_name_to_nodes
|
||||
):
|
||||
unused_nodes.append(node)
|
||||
|
||||
self.remove_nodes(unused_nodes)
|
||||
|
||||
unused_weights = []
|
||||
for w in self.initializer():
|
||||
if w.name not in input_name_to_nodes and not self.is_graph_output(w.name):
|
||||
unused_weights.append(w)
|
||||
# Remove from graph.input
|
||||
for graph_input in self.graph().input:
|
||||
if graph_input.name == w.name:
|
||||
self.graph().input.remove(graph_input)
|
||||
|
||||
self.remove_initializers(unused_weights)
|
||||
|
||||
def is_graph_output(self, output_name):
|
||||
return any(output.name == output_name for output in self.model.graph.output)
|
||||
|
||||
def is_graph_input(self, tensor_name: str) -> bool:
|
||||
return any(input.name == tensor_name for input in self.model.graph.input)
|
||||
|
||||
# TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model
|
||||
# Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release
|
||||
def topological_sort(self):
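# Kahn-style topological sort: count unresolved inputs per node, resolve initializers
# and graph inputs first, then repeatedly emit nodes whose dependency count drops to zero.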
|
||||
deps_count = [0] * len(self.nodes()) # dependency count of each node
|
||||
deps_to_nodes = {} # input name to node indices
|
||||
sorted_nodes = [] # initialize sorted_nodes
|
||||
for node_idx, node in enumerate(self.nodes()):
|
||||
# CANNOT use len(node.input) directly because input can be optional
|
||||
deps_count[node_idx] = sum(1 for _ in node.input if _)
|
||||
if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
|
||||
sorted_nodes.append(self.nodes()[node_idx])
|
||||
continue
|
||||
|
||||
for input_name in node.input:
|
||||
if not input_name:
|
||||
continue
|
||||
if input_name not in deps_to_nodes:
|
||||
deps_to_nodes[input_name] = [node_idx]
|
||||
else:
|
||||
deps_to_nodes[input_name].append(node_idx)
|
||||
|
||||
initializer_names = [init.name for init in self.initializer()]
|
||||
graph_input_names = [input.name for input in self.model.graph.input]
|
||||
input_names = initializer_names + graph_input_names
|
||||
input_names.sort()
|
||||
prev_input_name = None
|
||||
for input_name in input_names:
|
||||
if prev_input_name == input_name:
|
||||
continue
|
||||
|
||||
prev_input_name = input_name
|
||||
if input_name in deps_to_nodes:
|
||||
for node_idx in deps_to_nodes[input_name]:
|
||||
deps_count[node_idx] = deps_count[node_idx] - 1
|
||||
if deps_count[node_idx] == 0:
|
||||
sorted_nodes.append(self.nodes()[node_idx])
|
||||
|
||||
start = 0
|
||||
end = len(sorted_nodes)
|
||||
|
||||
while start < end:
|
||||
for output in sorted_nodes[start].output:
|
||||
if output in deps_to_nodes:
|
||||
for node_idx in deps_to_nodes[output]:
|
||||
deps_count[node_idx] = deps_count[node_idx] - 1
|
||||
if deps_count[node_idx] == 0:
|
||||
sorted_nodes.append(self.nodes()[node_idx])
|
||||
end = end + 1
|
||||
start = start + 1
|
||||
|
||||
assert end == len(self.graph().node), "Graph is not a DAG"
|
||||
self.graph().ClearField("node")
|
||||
self.graph().node.extend(sorted_nodes)
|
||||
|
||||
def clean_initializers(self):
|
||||
return _clean_initializers_helper(self.graph(), self.model)
|
||||
|
||||
def _check_init(self, init, test=None):
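# In FLOAT8E4M3FN a byte whose low 7 bits are all set (0x7F) encodes NaN, so reject
# initializers whose raw data contains such values.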
|
||||
if init.data_type == onnx.TensorProto.FLOAT8E4M3FN:
|
||||
if init.HasField("raw_data"):
|
||||
b = list(init.raw_data)
|
||||
if any((i & 127) == 127 for i in b):
|
||||
raise ValueError(f"Initializer {init.name!r} has nan.")
|
||||
return init
|
||||
|
||||
def _check_node(self, node):
|
||||
"""
|
||||
Quantization to float 8 does not use a quantized bias but a float 16 bias.
This function checks that DequantizeLinear is not used to dequantize
from a floating-point type (float 16/32/64 or bfloat 16).
|
||||
"""
|
||||
if node.op_type == "DequantizeLinear":
|
||||
zero_point = node.input[2]
|
||||
init = self.get_initializer(zero_point)
|
||||
dtype = init.data_type
|
||||
if dtype in {
|
||||
onnx.TensorProto.FLOAT16,
|
||||
onnx.TensorProto.FLOAT,
|
||||
onnx.TensorProto.DOUBLE,
|
||||
onnx.TensorProto.BFLOAT16,
|
||||
}:
|
||||
raise RuntimeError(f"Unsupported DequantizeLinear operator, dequantization from {dtype}.")
|
||||
return node
|
||||
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
|
||||
# from .base_operator import QuantOperatorBase
|
||||
# from .matmul import MatMulInteger
|
||||
@@ -0,0 +1,119 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QLinearActivation(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def QuantizeClipRelu(self): # noqa: N802
|
||||
node = self.node
|
||||
assert node.op_type == "Relu" or node.op_type == "Clip"
|
||||
|
||||
# When mode is QLinearOps, the output quantization params are calculated based on outputs from
|
||||
# activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
|
||||
# If input to this node is not quantized then keep this node
|
||||
# If the activation is symmetric, do not quantize the op and simply return
|
||||
if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
|
||||
return super().quantize()
|
||||
|
||||
quantized_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_value
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
if node.op_type == "Relu" or node.op_type == "Clip":
|
||||
self.QuantizeClipRelu()
|
||||
return
|
||||
|
||||
nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
|
||||
sigmoid_nnapi_mode = (
|
||||
node.op_type == "Sigmoid"
|
||||
and nnapi_sigmoid_option in self.quantizer.extra_options
|
||||
and self.quantizer.extra_options[nnapi_sigmoid_option]
|
||||
)
|
||||
use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
|
||||
use_zeropoint = 0 if sigmoid_nnapi_mode else None
|
||||
|
||||
# No assert on op_type as it is controlled by registry
|
||||
# only try to quantize when given quantization parameters for it
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_activation_name = ""
|
||||
if node.name:
|
||||
qlinear_activation_name = node.name + "_quant"
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qlinear_activation_inputs = [
|
||||
quantized_input_names[0],
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
]
|
||||
|
||||
qlinear_activation_node = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
qlinear_activation_inputs,
|
||||
[qlinear_activation_output],
|
||||
qlinear_activation_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_activation_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
nodes.append(qlinear_activation_node)
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQRemovableActivation(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
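# In QDQ mode a removable activation (e.g. Relu/Clip) whose input is already quantized
# can be bypassed entirely by merging its output with the upstream producer; otherwise
# its input (and output) tensors are quantized as usual.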
|
||||
|
||||
# If input to this node is not quantized then keep this node
|
||||
if not self.quantizer.is_tensor_quantized(node.input[0]):
|
||||
return
|
||||
|
||||
if (
|
||||
not self.quantizer.is_activation_symmetric
|
||||
and not self.quantizer.qdq_keep_removable_activations
|
||||
and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
|
||||
):
|
||||
self.quantizer.remove_node(self.node)
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_activation_tensor(node.output[0])
|
||||
@@ -0,0 +1,18 @@
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
# Use the quantized tensor as input without DQ.
|
||||
class QArgMax(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
|
||||
if quantized_input_value is None:
|
||||
self.quantizer.new_nodes += [node]
|
||||
return
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
self.quantizer.new_nodes += [node]
|
||||
@@ -0,0 +1,73 @@
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto # noqa: F401
|
||||
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantize Attention
|
||||
"""
|
||||
|
||||
|
||||
class AttentionQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
return self.quantizer.should_quantize_node(self.node)
|
||||
|
||||
def quantize(self):
|
||||
"""
|
||||
Converts this Attention node into a QAttention node (with quantized activation and
weight inputs) and appends the new nodes to the quantizer's new node list.
|
||||
"""
|
||||
node = self.node
|
||||
assert node.op_type == "Attention"
|
||||
|
||||
# TODO This is a temporary fix to stop exporting QAttention with qkv_hidden_sizes
|
||||
# attribute. This needs to be removed once the QAttention for varied q,k,v sizes
|
||||
# is implemented
|
||||
for attr in node.attribute:
|
||||
if attr.name == "qkv_hidden_sizes":
|
||||
return super().quantize()
|
||||
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qattention_name = "" if not node.name else node.name + "_quant"
|
||||
|
||||
inputs = []
|
||||
inputs.extend(quantized_input_names)
|
||||
inputs.extend([node.input[2]])
|
||||
inputs.extend(scale_names)
|
||||
inputs.extend([node.input[3] if len(node.input) > 3 else ""])
|
||||
inputs.extend(zero_point_names)
|
||||
inputs.extend([node.input[4] if len(node.input) > 4 else ""])
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
|
||||
nodes.append(qattention_node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,26 @@
|
||||
class QuantOperatorBase:
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
self.quantizer = onnx_quantizer
|
||||
self.node = onnx_node
|
||||
|
||||
def should_quantize(self):
|
||||
if not self.quantizer.should_quantize_node(self.node):
|
||||
return False
|
||||
|
||||
return self.quantizer.is_float_tensor(self.node.input[0])
|
||||
|
||||
def quantize(self):
|
||||
"""
|
||||
Given a node which does not support quantization, this method checks whether any of its
inputs is quantized and, if so, adds a DequantizeLinear node to bring that input back to FP32.
The dequantize nodes (if any) and the original node are appended to the quantizer's new node list.
|
||||
"""
|
||||
for _, node_input in enumerate(self.node.input):
|
||||
dequantize_node = self.quantizer._dequantize_value(node_input)
|
||||
if dequantize_node is not None:
|
||||
self.quantizer.new_nodes.append(dequantize_node)
|
||||
|
||||
# Append the original node
|
||||
self.quantizer.new_nodes.append(self.node)
|
||||
@@ -0,0 +1,72 @@
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto # noqa: F401
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QLinearBinaryOp(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0, 1])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_binary_math_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qlinear_binary_math_inputs = []
|
||||
# Input 0
|
||||
qlinear_binary_math_inputs.append(quantized_input_names[0])
|
||||
qlinear_binary_math_inputs.append(scale_names[0])
|
||||
qlinear_binary_math_inputs.append(zero_point_names[0])
|
||||
# Input 1
|
||||
qlinear_binary_math_inputs.append(quantized_input_names[1])
|
||||
qlinear_binary_math_inputs.append(scale_names[1])
|
||||
qlinear_binary_math_inputs.append(zero_point_names[1])
|
||||
|
||||
# Output
|
||||
qlinear_binary_math_inputs.append(output_scale_name)
|
||||
qlinear_binary_math_inputs.append(output_zp_name)
|
||||
|
||||
qlinear_binary_math_node = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
qlinear_binary_math_inputs,
|
||||
[qlinear_binary_math_output],
|
||||
qlinear_binary_math_name,
|
||||
**kwargs,
|
||||
)
|
||||
nodes.append(qlinear_binary_math_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_binary_math_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,62 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import ( # noqa: F401
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
ms_domain,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase # noqa: F401
|
||||
|
||||
|
||||
class QLinearConcat(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
q_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
|
||||
if not data_found or q_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
quantized_input_value.value_type,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
qnode_name = node.name + "_quant" if node.name else ""
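# QLinearConcat inputs: the output scale and zero point come first, followed by a
# (quantized input, scale, zero point) triplet for every original input.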
|
||||
|
||||
qlconcat_inputs = [output_scale_name, output_zp_name]
|
||||
for i in range(len(q_input_names)):
|
||||
qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
|
||||
qlconcat_node = onnx.helper.make_node(
|
||||
"QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
|
||||
)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
self.quantizer.new_nodes += [qlconcat_node]
|
||||
@@ -0,0 +1,260 @@
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import (
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
find_by_name,
|
||||
get_mul_node,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class ConvInteger(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def add_bias(self, nodes, scaled_output):
|
||||
"""
|
||||
Handles the bias add for the current Conv node by appending a "Reshape" node on the bias
and an "Add" node that writes to the original Conv output.
parameter nodes: list that the new Reshape/Add nodes are appended to
parameter scaled_output: output of the quantized conv without bias
|
||||
"""
|
||||
node = self.node
|
||||
model = self.quantizer.model
|
||||
# Add tensors for the shape to be reshaped to
|
||||
weight = find_by_name(node.input[1], model.initializer())
|
||||
if weight is None:
|
||||
raise ValueError(f"Expected {node.input[1]} to be an initializer")
|
||||
|
||||
# Add reshape for correct broadcast
|
||||
output = node.output[0]
|
||||
reshape_input_data = node.input[2] # bias of Conv
|
||||
reshape_input_shape = output + "_bias_reshape_shape"
|
||||
reshape_output = output + "_bias_reshape_output"
|
||||
|
||||
shape = np.ones((len(weight.dims)), dtype=np.int64)
|
||||
shape[1] = -1
|
||||
init_shape = onnx.helper.make_tensor(
|
||||
reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
|
||||
)
|
||||
model.add_initializer(init_shape)
|
||||
|
||||
reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
|
||||
nodes.append(reshape_node)
|
||||
|
||||
# Add an Add operation for bias
|
||||
add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
|
||||
nodes.append(add_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Conv"
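# ConvInteger path: run the integer convolution, cast the int32 result to float, multiply
# by (input_scale * weight_scale) to dequantize, then add the float bias if present.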
|
||||
# Get quantized tensors for both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
conv_integer_output = node.output[0] + "_output_quantized"
|
||||
conv_integer_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
conv_integer_node = onnx.helper.make_node(
|
||||
"ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
|
||||
)
|
||||
nodes.append(conv_integer_node)
|
||||
|
||||
# Add cast operation to cast convInteger output to float.
|
||||
onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
|
||||
cast_op_output = conv_integer_output + "_cast_output"
|
||||
cast_node = onnx.helper.make_node(
|
||||
"Cast",
|
||||
[conv_integer_output],
|
||||
[cast_op_output],
|
||||
conv_integer_output + "_cast",
|
||||
to=onnx_type, # TODO: FLOAT or FLOAT16
|
||||
)
|
||||
nodes.append(cast_node)
|
||||
|
||||
# Add mul operation to multiply scales of two inputs.
|
||||
assert len(scale_names) == 2
|
||||
if conv_integer_name:
|
||||
scales_mul_op = conv_integer_name + "_scales_mul"
|
||||
else:
|
||||
scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
|
||||
|
||||
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
|
||||
if scales_mul_node is None:
|
||||
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
|
||||
nodes.append(scales_mul_node)
|
||||
|
||||
scales_mul_op_output = scales_mul_node.output[0]
|
||||
|
||||
has_bias = len(node.input) == 3
|
||||
scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"
|
||||
|
||||
# Add mul operation to multiply mul_scales_op result with output of ConvInteger
|
||||
# and make the output of this node the same as output of original conv node.
|
||||
output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
|
||||
nodes.append(
|
||||
get_mul_node(
|
||||
[cast_op_output, scales_mul_op_output],
|
||||
scaled_output_name,
|
||||
output_scale_mul_op,
|
||||
)
|
||||
)
|
||||
|
||||
if has_bias:
|
||||
self.add_bias(nodes, scaled_output_name)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QLinearConv(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Conv"
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1],
|
||||
onnx_proto.TensorProto.INT8,
|
||||
0, # self.quantizer.weight_qType?
|
||||
)
|
||||
quantized_input_names.append(quant_weight_tuple[0])
|
||||
zero_point_names.append(quant_weight_tuple[1])
|
||||
scale_names.append(quant_weight_tuple[2])
|
||||
else:
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
quantized_bias_name = ""
|
||||
bias_present = False
|
||||
if len(node.input) == 3:
|
||||
if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
|
||||
raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
|
||||
quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
|
||||
bias_present = True
|
||||
|
||||
qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_conv_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
qlinear_conv_inputs = []
|
||||
# Input 0
|
||||
qlinear_conv_inputs.append(quantized_input_names[0])
|
||||
qlinear_conv_inputs.append(scale_names[0])
|
||||
qlinear_conv_inputs.append(zero_point_names[0])
|
||||
# Input 1
|
||||
qlinear_conv_inputs.append(quantized_input_names[1])
|
||||
qlinear_conv_inputs.append(scale_names[1])
|
||||
qlinear_conv_inputs.append(zero_point_names[1])
|
||||
|
||||
# Output
|
||||
qlinear_conv_inputs.append(output_scale_name)
|
||||
qlinear_conv_inputs.append(output_zp_name)
|
||||
|
||||
if bias_present:
|
||||
qlinear_conv_inputs.append(quantized_bias_name)
|
||||
|
||||
qlinear_conv_node = onnx.helper.make_node(
|
||||
"QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
|
||||
)
|
||||
nodes.append(qlinear_conv_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_conv_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQConv(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Conv" or node.op_type == "ConvTranspose"
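# QDQ flow: insert Q/DQ pairs around the activation input/output, quantize the weight
# per-channel when supported (axis 0 for Conv, 1 for ConvTranspose), and quantize the bias.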
|
||||
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_activation_tensor(node.output[0])
|
||||
|
||||
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
|
||||
node.input[1], default_axis=0 if node.op_type == "Conv" else 1
|
||||
)
|
||||
if is_weight_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
|
||||
else:
|
||||
self.quantizer.quantize_weight_tensor(node.input[1])
|
||||
|
||||
if len(node.input) == 3:
|
||||
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
|
||||
@@ -0,0 +1,78 @@
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
# For operators that support 8-bit operations directly and whose output can reuse
# input[0]'s type, zero point and scale; for example, Transpose, Reshape, etc.
|
||||
class Direct8BitOp(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
if not self.quantizer.force_quantize_no_input_check:
|
||||
# Keep backward compatibility
|
||||
# Quantize when input[0] is quantized already. Otherwise keep it.
|
||||
quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
|
||||
if quantized_input_value is None:
|
||||
self.quantizer.new_nodes += [node]
|
||||
return
|
||||
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
quantized_input_value.value_type,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
node.output[0] = quantized_output_value.q_name
|
||||
self.quantizer.new_nodes += [node]
|
||||
|
||||
else:
|
||||
# Force-quantize these ops if possible; use the exclude-node list if this is not what you want
|
||||
if not self.quantizer.is_valid_quantize_weight(node.input[0]):
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_names[0]
|
||||
node.output[0] = quantized_output_value.q_name
|
||||
nodes.append(node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQDirect8BitOp(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
if self.quantizer.force_quantize_no_input_check:
|
||||
self.quantizer.quantize_activation_tensor(self.node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
|
||||
elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
|
||||
@@ -0,0 +1,121 @@
|
||||
import logging
|
||||
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto # noqa: F401
|
||||
|
||||
from ..quant_utils import attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.
|
||||
|
||||
This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
|
||||
weight inputs associated with the node to uint8.
|
||||
"""
|
||||
|
||||
|
||||
class EmbedLayerNormalizationQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
return self.quantizer.should_quantize_node(self.node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "EmbedLayerNormalization"
|
||||
|
||||
if len(node.output) > 2:
|
||||
logging.info(f"Quantization is not applied to {node.name} since it has 3 outputs")
|
||||
return super().quantize()
|
||||
|
||||
"""
|
||||
Pre-quantization EmbedLayerNorm inputs:
|
||||
[0] input_ids (int32)
|
||||
[1] segment_ids (int32)
|
||||
[2] word_embedding (float32)
|
||||
[3] position_embedding (float32)
|
||||
[4] segment_embedding (float32)
|
||||
[5] gamma (float32)
|
||||
[6] beta (float32)
|
||||
[7] mask (int32) (optional)
|
||||
"""
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qembed_layer_norm_name = "" if not node.name else node.name + "_quant"
|
||||
|
||||
"""
|
||||
Quantized Input Tensor List
|
||||
[0] input_ids (int32)
|
||||
[1] segment_ids (int32)
|
||||
[2] word_embedding (uint8)
|
||||
[3] position_embedding (uint8)
|
||||
[4] segment_embedding (uint8)
|
||||
[5] gamma (uint8)
|
||||
[6] beta (uint8)
|
||||
[7] mask (int32) (optional)
|
||||
[8] word_embedding_scale (float)
|
||||
[9] position_embedding_scale (float)
|
||||
[10] segment_embedding_scale (float)
|
||||
[11] gamma_scale (float)
|
||||
[12] beta_scale (float)
|
||||
[13] word_embedding_zero_point (uint8)
|
||||
[14] position_embedding_zero_point (uint8)
|
||||
[15] segment_embedding_zero_point (uint8)
|
||||
[16] gamma_zero_point (uint8)
|
||||
[17] beta_zero_point (uint8)
|
||||
"""
|
||||
inputs = []
|
||||
# 'input_ids'
|
||||
inputs.extend([node.input[0]])
|
||||
# 'segment_ids'
|
||||
inputs.extend([node.input[1]])
|
||||
# 'word_embedding_quant'
|
||||
inputs.extend([quantized_input_names[0]])
|
||||
# 'position_embedding_quant'
|
||||
inputs.extend([quantized_input_names[1]])
|
||||
# 'segment_embedding_quant'
|
||||
inputs.extend([quantized_input_names[2]])
|
||||
# 'gamma_quant'
|
||||
inputs.extend([quantized_input_names[3]])
|
||||
# 'beta_quant'
|
||||
inputs.extend([quantized_input_names[4]])
|
||||
# 'mask' (optional)
|
||||
inputs.extend([node.input[7] if len(node.input) > 7 else ""])
|
||||
|
||||
# Add all scales:
|
||||
inputs.extend([scale_names[0]])
|
||||
inputs.extend([scale_names[1]])
|
||||
inputs.extend([scale_names[2]])
|
||||
inputs.extend([scale_names[3]])
|
||||
inputs.extend([scale_names[4]])
|
||||
|
||||
# Add all zero points:
|
||||
inputs.extend([zero_point_names[0]])
|
||||
inputs.extend([zero_point_names[1]])
|
||||
inputs.extend([zero_point_names[2]])
|
||||
inputs.extend([zero_point_names[3]])
|
||||
inputs.extend([zero_point_names[4]])
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qembed_layer_norm_node = onnx.helper.make_node(
|
||||
"QEmbedLayerNormalization",
|
||||
inputs,
|
||||
node.output,
|
||||
qembed_layer_norm_name,
|
||||
**kwargs,
|
||||
)
|
||||
nodes.append(qembed_layer_norm_node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,64 @@
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
"""
|
||||
Quantize Gather
|
||||
"""
|
||||
|
||||
|
||||
class GatherQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
if not self.quantizer.should_quantize_node(self.node):
|
||||
return False
|
||||
|
||||
return self.quantizer.is_valid_quantize_weight(self.node.input[0])
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gather"
|
||||
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
gather_new_output,
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
node.output[0] = gather_new_output
|
||||
node.input[0] = quantized_input_names[0]
|
||||
nodes.append(node)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQGather(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gather" or node.op_type == "GatherElements"
|
||||
|
||||
if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
|
||||
elif self.quantizer.is_tensor_quantized(node.input[0]):
|
||||
self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
|
||||
@@ -0,0 +1,62 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QGlobalAveragePool(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "GlobalAveragePool"
|
||||
|
||||
# If input to this node is not quantized then keep this node.
|
||||
if node.input[0] not in self.quantizer.quantized_value_map:
|
||||
return super().quantize()
|
||||
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
|
||||
# Create an entry for output quantized value.
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
(
|
||||
data_found,
|
||||
output_scale_name_from_parameter,
|
||||
output_zp_name_from_parameter,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
# Just use the input scale and zero point if parameters for the output are not specified.
|
||||
output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
|
||||
output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
kwargs["channels_last"] = 0
|
||||
qnode_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[
|
||||
quantized_input_value.q_name,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[quantized_output_value.q_name],
|
||||
qnode_name,
|
||||
**kwargs,
|
||||
)
|
||||
self.quantizer.new_nodes += [qnode]
|
||||
@@ -0,0 +1,172 @@
|
||||
import logging
|
||||
|
||||
import numpy as np # noqa: F401
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import (
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
find_by_name, # noqa: F401
|
||||
get_mul_node, # noqa: F401
|
||||
ms_domain,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase # noqa: F401
|
||||
from .matmul import QOpMatMul
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
def is_B_transposed(gemm_node): # noqa: N802
|
||||
transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] # noqa: N806
|
||||
if transB_attribute:
|
||||
return onnx.helper.get_attribute_value(transB_attribute[0]) > 0
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def get_beta(gemm_node):
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
|
||||
if beta_attribute:
|
||||
return onnx.helper.get_attribute_value(beta_attribute[0])
|
||||
|
||||
return 1.0
|
||||
|
||||
|
||||
def set_default_beta(gemm_node):
|
||||
beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
|
||||
if beta_attribute:
|
||||
beta_attribute[0].f = 1.0
|
||||
|
||||
return 1.0
|
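# Worked note on beta handling (a hedged reading, assuming quantize_bias_tensor applies the beta
# value it receives): QDQGemm below passes get_beta(node) into bias quantization and then calls
# set_default_beta(node) to reset the attribute to 1.0, so that a factor such as beta = 0.5 ends up
# baked into the quantized bias once instead of being applied again by the Gemm kernel.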
||||
|
||||
|
||||
class QLinearGemm(QOpMatMul):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gemm"
|
||||
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1],
|
||||
self.quantizer.weight_qType,
|
||||
0 if is_B_transposed(node) else 1,
|
||||
)
|
||||
quantized_input_names.append(quant_weight_tuple[0])
|
||||
zero_point_names.append(quant_weight_tuple[1])
|
||||
scale_names.append(quant_weight_tuple[2])
|
||||
else:
|
||||
# Get quantized values from both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
quantized_bias_name = ""
|
||||
if len(node.input) == 3:
|
||||
if not self.quantizer.is_input_a_initializer(node.input[2]):
|
||||
return super().quantize()
|
||||
|
||||
# Note: if the quantized type is float 8, the bias is converted into float 16.
|
||||
# cublasLtMatMul only supports (b)float16 or float32 bias.
|
||||
quantized_bias_name = self.quantizer.quantize_bias_static(
|
||||
node.input[2], node.input[0], node.input[1], get_beta(self.node)
|
||||
)
|
||||
|
||||
qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qgemm_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
if attribute.name != "beta":
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
# generate input
|
||||
qgemm_inputs = []
|
||||
for i in range(2):
|
||||
qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])
|
||||
|
||||
qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])
|
||||
|
||||
qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
|
||||
nodes.append(qgemm_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qgemm_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
node_type=node.op_type,
|
||||
node_qtype=self.quantizer.weight_qType,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQGemm(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Gemm"
|
||||
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
self.quantizer.quantize_activation_tensor(node.output[0])
|
||||
|
||||
is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
|
||||
node.input[1], default_axis=0 if is_B_transposed(node) else 1
|
||||
)
|
||||
if is_weight_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
|
||||
else:
|
||||
self.quantizer.quantize_weight_tensor(node.input[1])
|
||||
|
||||
if len(node.input) == 3:
|
||||
if self.quantizer.is_input_a_initializer(node.input[2]):
|
||||
self.quantizer.quantize_bias_tensor(
|
||||
node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
|
||||
)
|
||||
set_default_beta(self.node)
|
||||
else:
|
||||
logging.warning(
|
||||
f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
|
||||
)
|
||||
@@ -0,0 +1,121 @@
|
||||
import numpy
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain # noqa: F401
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
"""
|
||||
Quantize LSTM
|
||||
"""
|
||||
|
||||
|
||||
class LSTMQuant(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
"""
|
||||
parameter node: LSTM node.
|
||||
parameter new_nodes_list: List of new nodes created before processing this node.
|
||||
return: a list of nodes in topological order that represents the quantized LSTM node.
|
||||
"""
|
||||
node = self.node
|
||||
assert node.op_type == "LSTM"
|
||||
|
||||
if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
|
||||
node.input[2]
|
||||
):
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
model = self.quantizer.model
|
||||
W = model.get_initializer(node.input[1]) # noqa: N806
|
||||
R = model.get_initializer(node.input[2]) # noqa: N806
|
||||
|
||||
if len(W.dims) != 3 or len(R.dims) != 3:
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
[W_num_dir, W_4_hidden_size, W_input_size] = W.dims # noqa: N806
|
||||
[R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims # noqa: N806
|
||||
|
||||
if self.quantizer.is_per_channel():
|
||||
del W.dims[0]
|
||||
del R.dims[0]
|
||||
W.dims[0] = W_num_dir * W_4_hidden_size
|
||||
R.dims[0] = R_num_dir * R_4_hidden_size
|
||||
|
||||
quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[1],
|
||||
onnx_proto.TensorProto.INT8,
|
||||
0, # self.quantizer.weight_qType?
|
||||
)
|
||||
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
|
||||
node.input[2],
|
||||
onnx_proto.TensorProto.INT8,
|
||||
0, # self.quantizer.weight_qType?
|
||||
)
|
||||
|
||||
W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806
|
||||
R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) # noqa: N806
|
||||
|
||||
W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) # noqa: N806
|
||||
R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) # noqa: N806
|
||||
|
||||
W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) # noqa: N806
|
||||
R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) # noqa: N806
|
||||
|
||||
W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) # noqa: N806
|
||||
R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) # noqa: N806
|
||||
|
||||
W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
|
||||
R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806
|
||||
|
||||
model.remove_initializers([W_quant_weight, R_quant_weight])
|
||||
model.add_initializer(W_quant_transposed)
|
||||
model.add_initializer(R_quant_transposed)
|
||||
|
||||
W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) # noqa: N806
|
||||
R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) # noqa: N806
|
||||
W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) # noqa: N806
|
||||
R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) # noqa: N806
|
||||
|
||||
if self.quantizer.is_per_channel():
|
||||
W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
|
||||
R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
|
||||
W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
|
||||
R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]
|
||||
|
||||
inputs = []
|
||||
input_len = len(node.input)
|
||||
inputs.extend([node.input[0]])
|
||||
inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
|
||||
inputs.extend([node.input[3] if input_len > 3 else ""])
|
||||
inputs.extend([node.input[4] if input_len > 4 else ""])
|
||||
inputs.extend([node.input[5] if input_len > 5 else ""])
|
||||
inputs.extend([node.input[6] if input_len > 6 else ""])
|
||||
inputs.extend([node.input[7] if input_len > 7 else ""])
|
||||
inputs.extend(
|
||||
[
|
||||
quant_input_weight_tuple[2],
|
||||
quant_input_weight_tuple[1],
|
||||
quant_recurrent_weight_tuple[2],
|
||||
quant_recurrent_weight_tuple[1],
|
||||
]
|
||||
)
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
if attribute.name == "layout":
|
||||
continue
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
quant_lstm_name = "" if not node.name else node.name + "_quant"
|
||||
quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
|
||||
self.quantizer.new_nodes.append(quant_lstm_node)
|
||||
|
||||
dequantize_node = self.quantizer._dequantize_value(node.input[0])
|
||||
if dequantize_node is not None:
|
||||
self.quantizer.new_nodes.append(dequantize_node)
|
||||
@@ -0,0 +1,231 @@
|
||||
import itertools
|
||||
import logging
|
||||
|
||||
import onnx
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QOpMatMul(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def should_quantize(self):
|
||||
if not self.quantizer.should_quantize_node(self.node):
|
||||
logging.debug(f"Ignore MatMul {self.node.name}]")
|
||||
return False
|
||||
|
||||
if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
|
||||
not self.quantizer.is_float_tensor(self.node.input[0])
|
||||
):
|
||||
logging.info(f"Ignore MatMul due to non float inputs {self.node.name}]")
|
||||
return False
|
||||
|
||||
# do not quantize non-constant B matrices for matmul
|
||||
if self.quantizer.q_matmul_const_b_only:
|
||||
if not self.quantizer.find_initializer_in_path(self.node.input[1]):
|
||||
logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
"""
|
||||
Used when quantize mode is QuantizationMode.IntegerOps.
|
||||
"""
|
||||
|
||||
|
||||
class MatMulInteger(QOpMatMul):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MatMul"
|
||||
# Get quantized values from both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
nodes.extend(nodes_weight)
|
||||
|
||||
matmul_integer_output = node.output[0] + "_output_quantized"
|
||||
matmul_integer_name = node.name + "_quant" if node.name else ""
|
||||
matmul_integer_node = onnx.helper.make_node(
|
||||
"MatMulInteger",
|
||||
quantized_input_names + zero_point_names,
|
||||
[matmul_integer_output],
|
||||
matmul_integer_name,
|
||||
)
|
||||
nodes.append(matmul_integer_node)
|
||||
|
||||
# Add cast operation to cast matmulInteger output to float.
|
||||
cast_op_output = matmul_integer_output + "_cast_output"
|
||||
otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
|
||||
cast_node = onnx.helper.make_node(
|
||||
"Cast",
|
||||
[matmul_integer_output],
|
||||
[cast_op_output],
|
||||
matmul_integer_output + "_cast",
|
||||
to=otype,
|
||||
)
|
||||
nodes.append(cast_node)
|
||||
|
||||
# Add mul operation to multiply scales of two inputs.
|
||||
assert len(scale_names) == 2
|
||||
scales_mul_op = (
|
||||
matmul_integer_name + "_scales_mul"
|
||||
if matmul_integer_name
|
||||
else scale_names[0] + "_" + scale_names[1] + "_mul"
|
||||
)
|
||||
|
||||
scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
|
||||
if scales_mul_node is None:
|
||||
scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
|
||||
nodes.append(scales_mul_node)
|
||||
|
||||
scales_mul_op_output = scales_mul_node.output[0]
|
||||
|
||||
# Add mul operation to multiply mul_scales_op result with output of MatMulInteger
|
||||
# and make the output of this node the same as output of original matmul node.
|
||||
output_scale_mul_op = ""
|
||||
if matmul_integer_name:
|
||||
output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
|
||||
nodes.append(
|
||||
get_mul_node(
|
||||
[cast_op_output, scales_mul_op_output],
|
||||
node.output[0],
|
||||
output_scale_mul_op,
|
||||
)
|
||||
)
|
||||
self.quantizer.new_nodes += nodes
|
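# Sketch of the arithmetic the MatMulInteger path above rebuilds (derived from the code, not a new claim):
# MatMulInteger subtracts the zero points internally, so its int32 output is (A_q - a_zp) @ (B_q - b_zp).
# Since A ~ a_scale * (A_q - a_zp) and B ~ b_scale * (B_q - b_zp), the float product is recovered by
# Cast(int32 -> float) followed by Mul with (a_scale * b_scale), which is exactly the Cast/Mul chain built above.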
||||
|
||||
|
||||
"""
|
||||
Used when quantize mode is QuantizationMode.QLinearOps
|
||||
"""
|
||||
|
||||
|
||||
class QLinearMatMul(QOpMatMul):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MatMul"
|
||||
# Get quantized values from both the activation (input[0]) and the weight (input[1])
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
(
|
||||
quantized_input_names_weight,
|
||||
zero_point_names_weight,
|
||||
scale_names_weight,
|
||||
nodes_weight,
|
||||
) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
|
||||
quantized_input_names.extend(quantized_input_names_weight)
|
||||
zero_point_names.extend(zero_point_names_weight)
|
||||
scale_names.extend(scale_names_weight)
|
||||
|
||||
nodes.extend(nodes_weight)
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_matmul_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
qlinear_matmul_inputs = []
|
||||
# Input 0
|
||||
qlinear_matmul_inputs.append(quantized_input_names[0])
|
||||
qlinear_matmul_inputs.append(scale_names[0])
|
||||
qlinear_matmul_inputs.append(zero_point_names[0])
|
||||
# Input 1
|
||||
qlinear_matmul_inputs.append(quantized_input_names[1])
|
||||
qlinear_matmul_inputs.append(scale_names[1])
|
||||
qlinear_matmul_inputs.append(zero_point_names[1])
|
||||
# Output quantization parameter
|
||||
qlinear_matmul_inputs.append(output_scale_name)
|
||||
qlinear_matmul_inputs.append(output_zp_name)
|
||||
|
||||
domain = (
|
||||
"com.microsoft"
|
||||
if self.quantizer.weight_qType
|
||||
in {
|
||||
onnx_proto.TensorProto.FLOAT8E4M3FN,
|
||||
onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
|
||||
onnx_proto.TensorProto.FLOAT8E5M2,
|
||||
onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
|
||||
}
|
||||
else ""
|
||||
)
|
||||
qlinear_matmul_node = onnx.helper.make_node(
|
||||
"QLinearMatMul",
|
||||
qlinear_matmul_inputs,
|
||||
[qlinear_matmul_output],
|
||||
qlinear_matmul_name,
|
||||
domain=domain,
|
||||
)
|
||||
nodes.append(qlinear_matmul_node)
|
||||
|
||||
# Create an entry for this quantized value
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_matmul_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQMatMul(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MatMul"
|
||||
|
||||
if self.disable_qdq_for_node_output:
|
||||
nodes_to_iterate = node.input
|
||||
else:
|
||||
nodes_to_iterate = itertools.chain(node.input, node.output)
|
||||
|
||||
for tensor_name in nodes_to_iterate:
|
||||
if find_by_name(tensor_name, self.quantizer.model.initializer()):
|
||||
is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
|
||||
tensor_name, default_axis=1, op_type=node.op_type
|
||||
)
|
||||
if is_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
|
||||
else:
|
||||
self.quantizer.quantize_weight_tensor(tensor_name)
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(tensor_name)
|
||||
@@ -0,0 +1,34 @@
|
||||
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
|
||||
|
||||
class QMaxPool(Direct8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MaxPool"
|
||||
|
||||
# if version is less than 12, go to normal quantize.
|
||||
if self.quantizer.opset_version < 12:
|
||||
super(Direct8BitOp, self).quantize()
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
|
||||
|
||||
class QDQMaxPool(QDQDirect8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "MaxPool"
|
||||
|
||||
# if version is less than 12, just no change
|
||||
if self.quantizer.opset_version < 12:
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
@@ -0,0 +1,40 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QDQNormalization(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}
|
||||
|
||||
# Input
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
|
||||
# Scale
|
||||
scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
|
||||
scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
|
||||
node.input[1], default_axis=1, op_type=node.op_type
|
||||
)
|
||||
|
||||
if scale_is_per_channel:
|
||||
self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
|
||||
elif scale_is_initializer:
|
||||
self.quantizer.quantize_weight_tensor(node.input[1])
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(node.input[1])
|
||||
|
||||
# Bias
|
||||
if len(node.input) > 2 and node.input[2]:
|
||||
self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
|
||||
|
||||
# Output
|
||||
if not self.disable_qdq_for_node_output:
|
||||
for output_name in node.output:
|
||||
self.quantizer.quantize_activation_tensor(output_name)
|
||||
@@ -0,0 +1,172 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import (
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
QuantizedValue,
|
||||
QuantizedValueType,
|
||||
attribute_to_kwarg,
|
||||
quantize_nparray,
|
||||
)
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QPad(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Pad"
|
||||
|
||||
# Only after version 11, it has the optional constant_value
|
||||
# If input[0] is not quantized, do not quantize this node
|
||||
if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
|
||||
super().quantize()
|
||||
return
|
||||
quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kv = attribute_to_kwarg(attribute)
|
||||
kwargs.update(kv)
|
||||
|
||||
if "mode" not in kwargs or kwargs["mode"] == b"constant":
|
||||
if len(node.input) > 2 and node.input[2] != "": # There is 3rd input 'constant_value'
|
||||
zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
|
||||
scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
|
||||
if zp_tensor is None or scale_tensor is None:
|
||||
super().quantize()
|
||||
return
|
||||
|
||||
padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
|
||||
if padding_constant_initializer is not None:
|
||||
zp_array = onnx.numpy_helper.to_array(zp_tensor)
|
||||
zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
|
||||
scale_array = onnx.numpy_helper.to_array(scale_tensor)
|
||||
scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
|
||||
padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
|
||||
quantized_padding_constant_array = quantize_nparray(
|
||||
self.quantizer.activation_qType,
|
||||
padding_constant_array,
|
||||
scale_value,
|
||||
zp_value,
|
||||
)
|
||||
quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
|
||||
quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
|
||||
quantized_padding_constant_array,
|
||||
quantized_padding_constant_name,
|
||||
)
|
||||
# Assume this padding constant initializer is only used by this node
|
||||
self.quantizer.model.remove_initializer(padding_constant_initializer)
|
||||
self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
|
||||
node.input[2] = quantized_padding_constant_name
|
||||
else:
|
||||
# TODO: check quantize_inputs after sub graph is supported
|
||||
pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
|
||||
node,
|
||||
2,
|
||||
self.quantizer.activation_qType,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
initial_type=scale_tensor.data_type,
|
||||
)
|
||||
self.quantizer.new_nodes.extend(pad_value_qnodes)
|
||||
node.input[2] = pad_value_qnodes[0].output[0]
|
||||
else:
|
||||
# In quantized format, the `zero` before quantization is mapped
|
||||
# to quantized_input_value.zp_name. Thus, padding 0 in the
|
||||
# original tensor should become padding the zero point in the quantized
|
||||
# tensor.
|
||||
if len(node.input) == 2:
|
||||
# Feed quantization's zero point to padding node.
|
||||
node.input.append(quantized_input_value.zp_name)
|
||||
else:
|
||||
# Assign quantization's zero point to padding node.
|
||||
assert node.input[2] == ""
|
||||
node.input[2] = quantized_input_value.zp_name
|
||||
|
||||
# Create an entry for output quantized value
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
|
||||
quantized_input_value.scale_name,
|
||||
quantized_input_value.zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
node.input[0] = quantized_input_value.q_name
|
||||
node.output[0] = quantized_output_value.q_name
|
||||
self.quantizer.new_nodes += [node]
|
||||
|
||||
|
||||
class QDQPad(QDQOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
|
||||
"""
|
||||
Returns the Pad's constant padding value. Returns `None` if the padding value is
|
||||
not constant (i.e., comes from a dynamic input).
|
||||
"""
|
||||
const_val = None
|
||||
onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
|
||||
if onnx_tensor_type is None:
|
||||
return None
|
||||
|
||||
np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
|
||||
if self.quantizer.opset_version < 11:
|
||||
const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
|
||||
elif len(self.node.input) >= 3 and self.node.input[2]:
|
||||
const_val = self.quantizer.model.get_constant_value(self.node.input[2])
|
||||
else:
|
||||
const_val = np.array(0, dtype=np_dtype)
|
||||
|
||||
return const_val
|
||||
|
||||
def _should_quantize_output_same_as_input(self) -> bool:
|
||||
"""
|
||||
Returns true if Pad's output should use the same quantization parameters as input[0]
|
||||
"""
|
||||
attrs_dict = {}
|
||||
for attribute in self.node.attribute:
|
||||
kv = attribute_to_kwarg(attribute)
|
||||
attrs_dict.update(kv)
|
||||
|
||||
pad_mode = attrs_dict.get("mode", b"constant")
|
||||
if pad_mode in (b"reflect", b"edge", b"wrap"):
|
||||
# These modes pad the output with a value that already exists in the input.
|
||||
# So, we can quantize the output the same as the input.
|
||||
return True
|
||||
|
||||
# For 'constant' mode, if padding with 0, we can also quantize the output the same as the input
|
||||
# because our quantization floating-point range always includes 0.
|
||||
if pad_mode == b"constant":
|
||||
pad_val = self._get_pad_const_val(attrs_dict)
|
||||
if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
|
||||
return float(pad_val.item()) == 0
|
||||
|
||||
return False
|
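# Worked check of the constant-0 case above (hypothetical numbers): with affine quantization
# q = round(x / scale) + zero_point, e.g. scale = 0.02 and zero_point = 10, a pad value of
# x = 0.0 maps to q = round(0.0 / 0.02) + 10 = 10 == zero_point for any scale, because the
# calibrated range always includes 0. Padding zeros in float space is therefore the same as
# padding the zero point in quantized space, so the output can reuse input[0]'s scale/zp.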
||||
|
||||
def quantize(self):
|
||||
assert self.node.op_type == "Pad"
|
||||
|
||||
for input_name in self.node.input:
|
||||
if input_name:
|
||||
self.quantizer.quantize_activation_tensor(input_name)
|
||||
|
||||
if not self.disable_qdq_for_node_output:
|
||||
if self._should_quantize_output_same_as_input():
|
||||
self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
|
||||
else:
|
||||
self.quantizer.quantize_activation_tensor(self.node.output[0])
|
||||
@@ -0,0 +1,67 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QLinearPool(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
# only try to quantize when given quantization parameters for it
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
|
||||
# get quantized input tensor names, quantize input if needed
|
||||
(
|
||||
quantized_input_names,
|
||||
input_zero_point_names,
|
||||
input_scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value.
|
||||
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_output_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
# Create qlinear pool node for given type (AveragePool, etc)
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
qlinear_node_name = node.name + "_quant" if node.name else ""
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[
|
||||
quantized_input_names[0],
|
||||
input_scale_names[0],
|
||||
input_zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[qlinear_output_name],
|
||||
qlinear_node_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# add all newly created nodes
|
||||
nodes.append(qnode)
|
||||
self.quantizer.new_nodes += nodes
|
||||
@@ -0,0 +1,22 @@
|
||||
import itertools
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray # noqa: F401
|
||||
from .base_operator import QuantOperatorBase # noqa: F401
|
||||
|
||||
|
||||
class QDQOperatorBase:
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
self.quantizer = onnx_quantizer
|
||||
self.node = onnx_node
|
||||
self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
|
||||
if self.disable_qdq_for_node_output:
|
||||
tensors_to_quantize = node.input
|
||||
else:
|
||||
tensors_to_quantize = itertools.chain(node.input, node.output)
|
||||
|
||||
for tensor_name in tensors_to_quantize:
|
||||
self.quantizer.quantize_activation_tensor(tensor_name)
|
||||
@@ -0,0 +1,34 @@
|
||||
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
|
||||
|
||||
class QResize(Direct8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Resize"
|
||||
|
||||
# if version is less than 11, go to normal quantize.
|
||||
if self.quantizer.opset_version < 11:
|
||||
super(Direct8BitOp, self).quantize()
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
|
||||
|
||||
class QDQResize(QDQDirect8BitOp):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Resize"
|
||||
|
||||
# if version is less than 11, just keep this node
|
||||
if self.quantizer.opset_version < 11:
|
||||
return
|
||||
|
||||
# Direct 8bits op
|
||||
return super().quantize()
|
||||
@@ -0,0 +1,74 @@
|
||||
import onnx
|
||||
import onnx.helper
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
|
||||
|
||||
class QLinearSoftmax(QuantOperatorBase):
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
# Set fixed limits for the softmax output scale and zero point, because the softmax output is always in [0, 1]
|
||||
if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
|
||||
out_scale = 1 / 256.0
|
||||
out_zero_point = 0
|
||||
else:
|
||||
out_scale = 1 / 256.0
|
||||
out_zero_point = -128
|
||||
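# Worked check of the fixed parameters above (a sketch): with scale = 1/256, a uint8 value
# q in [0, 255] dequantizes to q / 256, i.e. [0, 255/256], which covers the softmax range;
# for int8, zero_point = -128 gives (q - (-128)) / 256 for q in [-128, 127], the same interval.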
# only try to quantize when given quantization parameters for it
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)
|
||||
|
||||
# get quantized input tensor names, quantize input if needed
|
||||
(
|
||||
quantized_input_names,
|
||||
input_zero_point_names,
|
||||
input_scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
|
||||
if not data_found or quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
# Create an entry for output quantized value.
|
||||
qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
quantized_output_value = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_output_name,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
|
||||
|
||||
# Create qlinear softmax node for given type
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
# make QLinearSoftmax use the real opset_version; its default SinceVersion would be 1
|
||||
kwargs["opset"] = self.quantizer.opset_version
|
||||
qlinear_node_name = node.name + "_quant" if node.name else ""
|
||||
qnode = onnx.helper.make_node(
|
||||
"QLinear" + node.op_type,
|
||||
[
|
||||
quantized_input_names[0],
|
||||
input_scale_names[0],
|
||||
input_zero_point_names[0],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
],
|
||||
[qlinear_output_name],
|
||||
qlinear_node_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# add all newly created nodes
|
||||
nodes.append(qnode)
|
||||
self.quantizer.new_nodes += nodes
|
||||
return None
|
||||
@@ -0,0 +1,63 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QSplit(QuantOperatorBase):
|
||||
def __init__(self, onnx_quantizer, onnx_node):
|
||||
super().__init__(onnx_quantizer, onnx_node)
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
(
|
||||
quantized_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [0])
|
||||
if quantized_input_names is None:
|
||||
return super().quantize()
|
||||
|
||||
quantized_node_name = ""
|
||||
if node.name:
|
||||
quantized_node_name = node.name + "_quant"
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
|
||||
# Output just derive the scale/zero from input
|
||||
quantized_output_names = []
|
||||
for output_name in node.output:
|
||||
quantized_output_name = output_name + "quantized"
|
||||
quantized_output_names.append(quantized_output_name)
|
||||
q_output = QuantizedValue(
|
||||
output_name,
|
||||
quantized_output_name,
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[output_name] = q_output
|
||||
|
||||
if len(node.input) > 1:
|
||||
quantized_input_names.extend(node.input[1:])
|
||||
quantized_node = onnx.helper.make_node(
|
||||
node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
|
||||
)
|
||||
|
||||
nodes.append(quantized_node)
|
||||
self.quantizer.new_nodes += nodes
|
||||
|
||||
|
||||
class QDQSplit(QDQOperatorBase):
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Split"
|
||||
|
||||
if not self.quantizer.is_tensor_quantized(node.input[0]):
|
||||
self.quantizer.quantize_activation_tensor(node.input[0])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
for output in node.output:
|
||||
self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)
|
||||
@@ -0,0 +1,87 @@
|
||||
import onnx
|
||||
|
||||
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
|
||||
from .base_operator import QuantOperatorBase
|
||||
from .qdq_base_operator import QDQOperatorBase
|
||||
|
||||
|
||||
class QLinearWhere(QuantOperatorBase):
|
||||
def should_quantize(self):
|
||||
return True
|
||||
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Where"
|
||||
if not self.quantizer.force_quantize_no_input_check:
|
||||
self.quantizer.new_nodes += [node]
|
||||
return
|
||||
(
|
||||
data_found,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
_,
|
||||
_,
|
||||
) = self.quantizer._get_quantization_params(node.output[0])
|
||||
(
|
||||
q_input_names,
|
||||
zero_point_names,
|
||||
scale_names,
|
||||
nodes,
|
||||
) = self.quantizer.quantize_activation(node, [1, 2])
|
||||
if not data_found or q_input_names is None:
|
||||
return super().quantize()
|
||||
qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
|
||||
qlinear_output_name = node.name + "_quant" if node.name else ""
|
||||
|
||||
q_output = QuantizedValue(
|
||||
node.output[0],
|
||||
qlinear_output,
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
QuantizedValueType.Input,
|
||||
)
|
||||
self.quantizer.quantized_value_map[node.output[0]] = q_output
|
||||
|
||||
kwargs = {}
|
||||
for attribute in node.attribute:
|
||||
kwargs.update(attribute_to_kwarg(attribute))
|
||||
kwargs["domain"] = ms_domain
|
||||
|
||||
qlwhere_inputs = [
|
||||
node.input[0],
|
||||
q_input_names[0],
|
||||
scale_names[0],
|
||||
zero_point_names[0],
|
||||
q_input_names[1],
|
||||
scale_names[1],
|
||||
zero_point_names[1],
|
||||
output_scale_name,
|
||||
output_zp_name,
|
||||
]
|
||||
qlwhere_node = onnx.helper.make_node(
|
||||
"QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_output_name, **kwargs
|
||||
)
|
||||
|
||||
self.quantizer.new_nodes += nodes
|
||||
self.quantizer.new_nodes += [qlwhere_node]
|
||||
|
||||
|
||||
class QDQWhere(QDQOperatorBase):
|
||||
def quantize(self):
|
||||
node = self.node
|
||||
assert node.op_type == "Where"
|
||||
if self.quantizer.force_quantize_no_input_check:
|
||||
if not self.quantizer.is_tensor_quantized(node.input[1]):
|
||||
self.quantizer.quantize_activation_tensor(node.input[1])
|
||||
if not self.quantizer.is_tensor_quantized(node.input[2]):
|
||||
self.quantizer.quantize_activation_tensor(node.input[2])
|
||||
if not self.disable_qdq_for_node_output:
|
||||
for output in node.output:
|
||||
self.quantizer.quantize_activation_tensor(output)
|
||||
elif (
|
||||
self.quantizer.is_tensor_quantized(node.input[1])
|
||||
and self.quantizer.is_tensor_quantized(node.input[2])
|
||||
and not self.disable_qdq_for_node_output
|
||||
):
|
||||
for output in node.output:
|
||||
self.quantizer.quantize_activation_tensor(output)
|
||||
@@ -0,0 +1,141 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from .shape_inference import quant_pre_process
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="""Model optimizer and shape inferencer, in preparation for quantization,
|
||||
Consists of three optional steps:
|
||||
1. Symbolic shape inference (best for transformer models).
|
||||
2. Model optimization.
|
||||
3. ONNX shape inference.
|
||||
|
||||
Model quantization with QDQ format, i.e. inserting QuantizeLinear/DeQuantizeLinear on
|
||||
the tensor, requires tensor shape information to perform its best. Currently, shape inferencing
|
||||
works best with optimized model. As a result, it is highly recommended to run quantization
|
||||
on optimized model with shape information. This is the tool for optimization and shape
|
||||
inferencing.
|
||||
|
||||
Essentially this tool performs the following three (skippable) steps:
|
||||
|
||||
1. Symbolic shape inference.
|
||||
2. Model optimization
|
||||
3. ONNX shape inference"""
|
||||
)
|
||||
|
||||
parser.add_argument("--input", required=True, help="Path to the input model file")
|
||||
parser.add_argument("--output", required=True, help="Path to the output model file")
|
||||
parser.add_argument(
|
||||
"--skip_optimization",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Skip model optimization step if true. It's a known issue that ORT"
|
||||
" optimization has difficulty with model size greater than 2GB, rerun with"
|
||||
" this option to get around this issue.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip_onnx_shape",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Skip ONNX shape inference. Symbolic shape inference is most effective"
|
||||
" with transformer based models. Skipping all shape inferences may"
|
||||
" reduce the effectiveness of quantization, as a tensor with unknown"
|
||||
" shape can not be quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip_symbolic_shape",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Skip symbolic shape inference. Symbolic shape inference is most"
|
||||
" effective with transformer based models. Skipping all shape"
|
||||
" inferences may reduce the effectiveness of quantization, as a tensor"
|
||||
" with unknown shape can not be quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--auto_merge",
|
||||
help="Automatically merge symbolic dims when confliction happens",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--int_max",
|
||||
help="maximum value for integer to be treated as boundless for ops like slice",
|
||||
type=int,
|
||||
default=2**31 - 1,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--guess_output_rank",
|
||||
help="guess output rank to be the same as input 0 for unknown ops",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed",
|
||||
type=int,
|
||||
default=0,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_as_external_data",
|
||||
help="Saving an ONNX model to external data",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--all_tensors_to_one_file",
|
||||
help="Saving all the external data to one file",
|
||||
action="store_true",
|
||||
default=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--external_data_location",
|
||||
help="The file location to save the external file",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--external_data_size_threshold",
|
||||
help="The size threshold for external data",
|
||||
type=int,
|
||||
default=1024,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
if args.skip_optimization and args.skip_onnx_shape and args.skip_symbolic_shape:
|
||||
logger.error("Skipping all three steps, nothing to be done. Quitting...")
|
||||
sys.exit()
|
||||
|
||||
if (not args.skip_optimization) and args.save_as_external_data:
|
||||
logger.error("ORT model optimization does not support external data yet!")
|
||||
sys.exit()
|
||||
|
||||
logger.info("input model: %s", args.input)
|
||||
logger.info("output model: %s", args.output)
|
||||
quant_pre_process(
|
||||
args.input,
|
||||
args.output,
|
||||
args.skip_optimization,
|
||||
args.skip_onnx_shape,
|
||||
args.skip_symbolic_shape,
|
||||
args.auto_merge,
|
||||
args.int_max,
|
||||
args.guess_output_rank,
|
||||
args.verbose,
|
||||
args.save_as_external_data,
|
||||
args.all_tensors_to_one_file,
|
||||
args.external_data_location,
|
||||
args.external_data_size_threshold,
|
||||
)
|
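# A minimal programmatic sketch of the same preprocessing flow (paths are hypothetical, and it
# assumes this module is importable as onnxruntime.quantization.shape_inference):
#
#     from onnxruntime.quantization.shape_inference import quant_pre_process
#
#     quant_pre_process("model.onnx", "model_preprocessed.onnx")  # defaults leave all three steps enabled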
||||
@@ -0,0 +1,389 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
"""Utilities to run a given ONNX model, while saving input/output tensors of
|
||||
eligible operator nodes.
|
||||
|
||||
A use case is to debug quantization induced accuracy drop. An AI engineer can
|
||||
run the original float32 model and the quantized model with the same inputs,
|
||||
then compare the corresponding activations between the two models to find
|
||||
where the divergence is.
|
||||
|
||||
Example Usage:
|
||||
|
||||
```python
|
||||
class ExampleDataReader(CalibrationDataReader):
|
||||
def __init__(self):
|
||||
...
|
||||
def get_next(self):
|
||||
...
|
||||
|
||||
input_data_reader = ExampleDataReader()
|
||||
|
||||
augmented_model_path = str(Path(self._tmp_model_dir.name).joinpath("augmented_model.onnx"))
|
||||
modify_model_output_intermediate_tensors(path_to_onnx_model, augmented_model_path)
|
||||
|
||||
tensor_dict = collect_activations(augmented_model_path, input_data_reader)
|
||||
```
|
||||
|
||||
`tensor_dict` points to a dictionary where the keys are tensor names and each value
|
||||
is a list of tensors, one from each model run
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
from collections.abc import Callable, Sequence
|
||||
from pathlib import Path
|
||||
|
||||
import numpy
|
||||
import onnx
|
||||
from onnx import helper, numpy_helper
|
||||
|
||||
import onnxruntime
|
||||
|
||||
from .calibrate import CalibraterBase, CalibrationDataReader
|
||||
from .onnx_model import ONNXModel
|
||||
from .quant_utils import (
|
||||
DEQUANT_OP_NAME,
|
||||
DEQUANT_OUTPUT_SUFFIX,
|
||||
QUANT_INPUT_SUFFIX,
|
||||
TENSOR_NAME_QUANT_SUFFIX,
|
||||
find_by_name,
|
||||
load_model_with_shape_infer,
|
||||
)
|
||||
|
||||
_TENSOR_SAVE_POSTFIX = "_ReshapedSavedOutput"
|
||||
_TENSOR_SAVE_POSTFIX_LEN = len(_TENSOR_SAVE_POSTFIX)
|
||||
|
||||
|
||||
def modify_model_output_intermediate_tensors(
|
||||
input_model_path: str | Path,
|
||||
output_model_path: str | Path,
|
||||
op_types_for_saving: Sequence[str] | None = None,
|
||||
save_as_external_data: bool = False,
|
||||
) -> None:
|
||||
"""Augment a given ONNX model to save node input/output tensors.
|
||||
|
||||
Add all input/output tensors of operator nodes to model outputs
|
||||
so that their values can be retrieved for debugging purposes.
|
||||
|
||||
Args:
|
||||
input_model_path: the path to load the model.
|
||||
output_model_path: the path to save the augmented model.
|
||||
op_types_for_saving: Operator types for which the
|
||||
input/output should be saved. By default, all
|
||||
float32/float16 tensors are saved.
|
||||
|
||||
Returns:
|
||||
None. The augmented model is saved to `output_model_path`.
|
||||
"""
|
||||
|
||||
if op_types_for_saving is None:
|
||||
op_types_for_saving = []
|
||||
saver = CalibraterBase(input_model_path, op_types_to_calibrate=op_types_for_saving)
|
||||
model_to_augment = saver.model
|
||||
tensors, value_infos = saver.select_tensors_to_calibrate(model_to_augment)
|
||||
reshape_shape_name = "LinearReshape_" + str(time.time())
|
||||
reshape_shape = numpy_helper.from_array(numpy.array([-1], dtype=numpy.int64), reshape_shape_name)
|
||||
model_to_augment.graph.initializer.append(reshape_shape)
|
||||
|
||||
for tensor_name in tensors:
|
||||
reshape_output = tensor_name + _TENSOR_SAVE_POSTFIX
|
||||
reshape_node = onnx.helper.make_node(
|
||||
"Reshape",
|
||||
inputs=[tensor_name, reshape_shape_name],
|
||||
outputs=[reshape_output],
|
||||
name=reshape_output,
|
||||
)
|
||||
model_to_augment.graph.node.append(reshape_node)
|
||||
reshape_output_value_info = helper.make_tensor_value_info(
|
||||
reshape_output, value_infos[tensor_name].type.tensor_type.elem_type, [-1]
|
||||
)
|
||||
model_to_augment.graph.output.append(reshape_output_value_info)
|
||||
|
||||
onnx.save(
|
||||
model_to_augment,
|
||||
output_model_path,
|
||||
save_as_external_data=save_as_external_data,
|
||||
)
|
||||
|
||||
|
||||
def collect_activations(
|
||||
augmented_model: str,
|
||||
input_reader: CalibrationDataReader,
|
||||
session_options=None,
|
||||
execution_providers: Sequence[str] | None = None,
|
||||
) -> dict[str, list[numpy.ndarray]]:
|
||||
"""Run augmented model and collect activations tensors.
|
||||
|
||||
Args:
|
||||
augmented_model: Path to the augmented model created by modify_model_output_intermediate_tensors().
|
||||
input_reader: Logic for reading input for the model; the augmented model has the same
|
||||
inputs as the original model.
|
||||
session_options: Optional OnnxRuntime session options for controlling model run.
|
||||
By default, graph optimization is turned off.
|
||||
execution_providers: Collection of execution providers for running the model.
|
||||
Only CPU EP is used by default.
|
||||
|
||||
Returns:
|
||||
A dictionary where the key is the tensor name and the value is a list of tensors, one from each batch.
|
||||
"""
|
||||
|
||||
if session_options is None:
|
||||
session_options = onnxruntime.SessionOptions()
|
||||
session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
|
||||
if execution_providers is None:
|
||||
execution_providers = ["CPUExecutionProvider"]
|
||||
|
||||
inference_session = onnxruntime.InferenceSession(
|
||||
augmented_model,
|
||||
sess_options=session_options,
|
||||
providers=execution_providers,
|
||||
)
|
||||
|
||||
intermediate_outputs = []
|
||||
for input_d in input_reader:
|
||||
intermediate_outputs.append(inference_session.run(None, input_d))
|
||||
if not intermediate_outputs:
|
||||
raise RuntimeError("No data is collected while running augmented model!")
|
||||
|
||||
output_dict = {}
|
||||
output_info = inference_session.get_outputs()
|
||||
for batch in intermediate_outputs:
|
||||
for output, output_data in zip(output_info, batch, strict=False):
|
||||
if output.name.endswith(_TENSOR_SAVE_POSTFIX):
|
||||
output_name = output.name[:-_TENSOR_SAVE_POSTFIX_LEN]
|
||||
output_dict.setdefault(output_name, []).append(output_data)
|
||||
|
||||
return output_dict
|
||||
|
||||
|
||||
_POST_QDQ_POSTFIX1 = DEQUANT_OUTPUT_SUFFIX + "_1"
|
||||
|
||||
|
||||
def _add_pre_post_qdq_pair(
|
||||
qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]],
|
||||
activation_name: str,
|
||||
pre_qdq_tensors: Sequence[numpy.ndarray] | None,
|
||||
post_qdq_tensors: Sequence[numpy.ndarray] | None,
|
||||
) -> None:
|
||||
if post_qdq_tensors is not None and pre_qdq_tensors is not None:
|
||||
qdq_cmp[activation_name] = {}
|
||||
qdq_cmp[activation_name]["pre_qdq"] = pre_qdq_tensors
|
||||
qdq_cmp[activation_name]["post_qdq"] = post_qdq_tensors
|
||||
|
||||
|
||||
def create_activation_matching(
|
||||
qdq_activations: dict[str, Sequence[numpy.ndarray]],
|
||||
float_activations: dict[str, Sequence[numpy.ndarray]] | None = None,
|
||||
) -> dict[str, dict[str, Sequence[numpy.ndarray]]]:
|
||||
"""Comparing activation values to help debugging accuracy loss due to quantization.
|
||||
|
||||
This function takes saved activations from the QDQ model and (optionally) the
|
||||
floating-point model, and provides a data structure for comparing:
|
||||
* from the qdq model, activation values before and after QDQ operation
|
||||
* across both models, activations from the original model vs the corresponding
|
||||
activations in the QDQ model
|
||||
|
||||
Args:
|
||||
qdq_activations: Output of `collect_activations`. This must be from a quantized
|
||||
model with QDQ format.
|
||||
float_activations: Output of `collect_activations`. This must be from the
|
||||
floating-point model.
|
||||
|
||||
Returns:
|
||||
Dict for comparing pre and post quantized activation tensors. E.g.
|
||||
```
|
||||
qdq_cmp = cmp_qdq_input_output(qdq_activations)
|
||||
print(qdq_cmp['activation1']['pre_qdq'][0])
|
||||
print(qdq_cmp['activation1']['post_qdq'][0])
|
||||
|
||||
|
||||
qdq_cmp = cmp_qdq_input_output(qdq_activations, float_activations)
|
||||
print(qdq_cmp['activation1']['float'][0])
|
||||
print(qdq_cmp['activation1']['pre_qdq'][0])
|
||||
print(qdq_cmp['activation1']['post_qdq'][0])
|
||||
```
|
||||
"""
|
||||
|
||||
qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]] = {}
|
||||
for tensor_name, tensors in qdq_activations.items():
|
||||
if tensor_name.endswith(QUANT_INPUT_SUFFIX):
|
||||
pre_name = tensor_name[: -len(QUANT_INPUT_SUFFIX)]
|
||||
post_qdq_tensors = qdq_activations.get(pre_name)
|
||||
pre_qdq_tensors = tensors
|
||||
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
|
||||
elif tensor_name.endswith(DEQUANT_OUTPUT_SUFFIX):
|
||||
pre_name = tensor_name[: -len(DEQUANT_OUTPUT_SUFFIX)]
|
||||
pre_qdq_tensors = qdq_activations.get(pre_name)
|
||||
post_qdq_tensors = tensors
|
||||
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
|
||||
elif tensor_name.endswith(_POST_QDQ_POSTFIX1):
|
||||
pre_name = tensor_name[: -len(_POST_QDQ_POSTFIX1)]
|
||||
pre_qdq_tensors = qdq_activations.get(pre_name)
|
||||
post_qdq_tensors = tensors
|
||||
_add_pre_post_qdq_pair(qdq_cmp, pre_name, pre_qdq_tensors, post_qdq_tensors)
|
||||
|
||||
if not float_activations:
|
||||
return qdq_cmp
|
||||
|
||||
for act_name, act_values in qdq_cmp.items():
|
||||
float_acts = float_activations.get(act_name)
|
||||
if float_acts is not None:
|
||||
act_values["float"] = float_acts
|
||||
|
||||
return qdq_cmp
|
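# A hedged usage sketch of the matching dict returned above (variable names are hypothetical):
#
#     cmp = create_activation_matching(qdq_activations, float_activations)
#     for name, tensors in cmp.items():
#         snr = compute_signal_to_quantization_noice_ratio(tensors["float"], tensors["post_qdq"])
#         print(name, snr)
#
# compute_signal_to_quantization_noice_ratio is defined later in this module; the "float" key is
# only present when float_activations was passed in.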
||||
|
||||
|
||||
def _run_dequantize_linear(
|
||||
weight_tensor: numpy.ndarray, weight_scale: numpy.ndarray, weight_zp: numpy.ndarray, channel_axis: int
|
||||
) -> numpy.ndarray | None:
|
||||
assert weight_scale.shape == weight_zp.shape
|
||||
if weight_zp.size == 1:
|
||||
return (weight_tensor - weight_zp) * weight_scale
|
||||
|
||||
assert weight_zp.ndim == 1
|
||||
reshape_dims = list(weight_tensor.shape)  # copy, so the original shape is not modified
|
||||
reshape_dims[channel_axis] = 1 # only one per channel for reshape
|
||||
channel_count = weight_tensor.shape[channel_axis]
|
||||
dequantized_weights = None
|
||||
for i in range(channel_count):
|
||||
per_channel_data = weight_tensor.take(i, channel_axis)
|
||||
dequantized_per_channel_data = (per_channel_data - weight_zp[i]) * weight_scale[i]
|
||||
if i == 0:
|
||||
dequantized_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
|
||||
else:
|
||||
channel_weights = numpy.asarray(dequantized_per_channel_data).reshape(reshape_dims)
|
||||
dequantized_weights = numpy.concatenate((dequantized_weights, channel_weights), channel_axis)
|
||||
|
||||
if dequantized_weights is None:
|
||||
return None
|
||||
|
||||
dequantized_weights = dequantized_weights.reshape(weight_tensor.shape)
|
||||
return dequantized_weights
|
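# Worked example of the per-channel path above (hypothetical numbers): for a (2, 3) weight with
# channel_axis=0, scale=[0.1, 0.2] and zp=[0, 1], row 0 dequantizes as (row0 - 0) * 0.1 and
# row 1 as (row1 - 1) * 0.2; each row is reshaped to (1, 3) and the rows are concatenated back
# along axis 0, matching per-channel DequantizeLinear semantics y = (x - zero_point) * scale.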
||||
|
||||
|
||||
def create_weight_matching(float_model_path: str, qdq_model_path: str) -> dict[str, dict[str, numpy.ndarray]]:
|
||||
"""Comparing weight values to help debugging accuracy loss due to quantization.
|
||||
|
||||
This function takes the float model and the QDQ model, and provides a data structure for comparing
|
||||
their corresponding weights to locate quantization errors.
|
||||
|
||||
Args:
|
||||
float_model_path: Path to the floating-point model.
|
||||
qdq_model_path: Path to the QDQ model.
|
||||
|
||||
Returns:
|
||||
Dict for comparing weight tensors. E.g.
|
||||
```
|
||||
qdq_weight_cmp = create_weight_matching(float_model, qdq_model)
|
||||
print(qdq_weight_cmp['activation1']['float'])
|
||||
print(qdq_weight_cmp['activation1']['dequantized'])
|
||||
```
|
||||
"""
|
||||
float_onnx_model = ONNXModel(load_model_with_shape_infer(Path(float_model_path)))
|
||||
qdq_onnx_model = ONNXModel(load_model_with_shape_infer(Path(qdq_model_path)))
|
||||
|
||||
matched_weights: dict[str, dict[str, numpy.ndarray]] = {}
|
||||
initializers = qdq_onnx_model.initializer()
|
||||
for node in qdq_onnx_model.nodes():
|
||||
if node.op_type != DEQUANT_OP_NAME:
|
||||
continue # Only care about DQ node
|
||||
weight_name: str = node.input[0]
|
||||
weight_values = find_by_name(weight_name, initializers)
|
||||
if not weight_values:
|
||||
continue # Only care about DQ node with const inputs
|
||||
if not weight_name.endswith(TENSOR_NAME_QUANT_SUFFIX):
|
||||
logging.error(f"Model Error in '{qdq_model_path}': Dequantized tensor name '{weight_name}' not recognized!")
|
||||
continue
|
||||
|
||||
axis = -1
|
||||
for attr in node.attribute:
|
||||
if attr.name == "axis":
|
||||
axis = attr.i
|
||||
|
||||
weight_tensor = numpy_helper.to_array(weight_values)
|
||||
weight_scale = numpy_helper.to_array(find_by_name(node.input[1], initializers))
|
||||
if len(node.input) > 2:
|
||||
weight_zp = numpy_helper.to_array(find_by_name(node.input[2], initializers))
|
||||
else:
|
||||
weight_zp = numpy.zeros(weight_scale.shape, dtype=numpy.int32)
|
||||
|
||||
# Perform dequantization:
|
||||
if weight_scale.size == weight_zp.size == 1:
|
||||
# Avoids confusion between a scalar and a tensor of one element.
|
||||
weight_scale = weight_scale.reshape(())
|
||||
weight_zp = weight_zp.reshape(())
|
||||
if weight_scale.shape != weight_zp.shape:
|
||||
raise RuntimeError(
|
||||
f"scale and zero_point must have the same shape but {weight_scale.shape} != {weight_zp.shape}"
|
||||
)
|
||||
weight_quant = _run_dequantize_linear(weight_tensor, weight_scale, weight_zp, channel_axis=axis)
|
||||
weight_name = weight_name[: -len(TENSOR_NAME_QUANT_SUFFIX)]
|
||||
if weight_quant is None:
|
||||
logging.error(f"Model Error in '{qdq_model_path}': '{weight_name}' per-channel quantization on 0 channel")
|
||||
continue
|
||||
|
||||
float_values = find_by_name(weight_name, float_onnx_model.initializer())
|
||||
if not float_values:
|
||||
logging.error(f"Model Error in '{float_model_path}': weight tensor '{weight_name}' not found!")
|
||||
continue
|
||||
weight_float = numpy_helper.to_array(float_values)
|
||||
matched_weights[weight_name] = {"float": weight_float, "dequantized": weight_quant}
|
||||
|
||||
return matched_weights
|
||||
|
||||
|
||||
def compute_signal_to_quantization_noice_ratio(
|
||||
x: Sequence[numpy.ndarray] | numpy.ndarray, y: Sequence[numpy.ndarray] | numpy.ndarray
|
||||
) -> float:
|
||||
if isinstance(x, numpy.ndarray):
|
||||
xlist = [x]
|
||||
else:
|
||||
xlist = x
|
||||
if isinstance(y, numpy.ndarray):
|
||||
ylist = [y]
|
||||
else:
|
||||
ylist = y
|
||||
if len(xlist) != len(ylist):
|
||||
raise RuntimeError("Unequal number of tensors to compare!")
|
||||
|
||||
left = numpy.concatenate(xlist).flatten()
|
||||
right = numpy.concatenate(ylist).flatten()
|
||||
|
||||
epsilon = numpy.finfo("float").eps
|
||||
tensor_norm = max(numpy.linalg.norm(left), epsilon)
|
||||
diff_norm = max(numpy.linalg.norm(left - right), epsilon)
|
||||
res = tensor_norm / diff_norm
|
||||
return 20 * math.log10(res)
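# Worked example (illustrative, not part of the original source): for x = [1.0, 2.0, 3.0] and
# y = [1.1, 1.9, 3.05], ||x|| ~= 3.742 and ||x - y|| = 0.15, so the SQNR is
# 20 * log10(3.742 / 0.15) ~= 27.9 dB. Larger values mean y is closer to the reference x.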
|
||||
|
||||
|
||||
def compute_weight_error(
|
||||
weights_match: dict[str, dict[str, numpy.ndarray]],
|
||||
err_func: Callable[[numpy.ndarray, numpy.ndarray], float] = compute_signal_to_quantization_noice_ratio,
|
||||
) -> dict[str, float]:
|
||||
result: dict[str, float] = {}
|
||||
for weight_name, weight_match in weights_match.items():
|
||||
result[weight_name] = err_func(weight_match["float"], weight_match["dequantized"])
|
||||
return result
|
||||
|
||||
|
||||
def compute_activation_error(
|
||||
activations_match: dict[str, dict[str, Sequence[numpy.ndarray]]],
|
||||
err_func: Callable[
|
||||
[Sequence[numpy.ndarray], Sequence[numpy.ndarray]], float
|
||||
] = compute_signal_to_quantization_noice_ratio,
|
||||
) -> dict[str, dict[str, float]]:
|
||||
result: dict[str, dict[str, float]] = {}
|
||||
for name, match in activations_match.items():
|
||||
err_result: dict[str, float] = {}
|
||||
err_result["qdq_err"] = err_func(match["pre_qdq"], match["post_qdq"])
|
||||
float_activation = match.get("float")  # 'float' is only present when float activations were collected
|
||||
if float_activation:
|
||||
err_result["xmodel_err"] = err_func(float_activation, match["post_qdq"])
|
||||
result[name] = err_result
|
||||
return result
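# Illustrative note, not part of the original source: for each activation name the result holds
# 'qdq_err' (SQNR between the pre-QDQ and post-QDQ tensors) and, when float-model activations
# were collected, 'xmodel_err' (SQNR between the float activation and the post-QDQ tensor).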
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,953 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import tempfile
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import onnx
|
||||
|
||||
from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
|
||||
from .onnx_quantizer import ONNXQuantizer
|
||||
from .qdq_quantizer import QDQQuantizer
|
||||
from .quant_utils import (
|
||||
MODEL_SIZE_THRESHOLD,
|
||||
QuantFormat,
|
||||
QuantizationMode,
|
||||
QuantType,
|
||||
load_model_with_shape_infer,
|
||||
model_has_pre_process_metadata,
|
||||
save_and_reload_model_with_shape_infer,
|
||||
update_opset_version,
|
||||
)
|
||||
from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry
|
||||
from .tensor_quant_overrides import TensorQuantOverridesHelper
|
||||
|
||||
|
||||
class QuantConfig:
|
||||
def __init__(
|
||||
self,
|
||||
activation_type=QuantType.QUInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
):
|
||||
"""
|
||||
This is the base class for both static and dynamic quantization configuration.
|
||||
Args:
|
||||
activation_type:
|
||||
quantization data type of activation. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
weight_type:
|
||||
quantization data type of weight. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
op_types_to_quantize:
|
||||
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
|
||||
It quantizes all supported operators by default.
|
||||
nodes_to_quantize:
|
||||
List of node names to quantize. When this list is not None, only the nodes in this list
|
||||
are quantized.
|
||||
example:
|
||||
[
|
||||
'Conv__224',
|
||||
'Conv__252'
|
||||
]
|
||||
nodes_to_exclude:
|
||||
List of node names to exclude. The nodes in this list will be excluded from quantization
|
||||
when it is not None.
|
||||
per_channel: quantize weights per channel
|
||||
reduce_range:
|
||||
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
|
||||
especially for per-channel mode
|
||||
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
"""
|
||||
|
||||
nodes_to_exclude = nodes_to_exclude or []
|
||||
nodes_to_quantize = nodes_to_quantize or []
|
||||
op_types_to_quantize = op_types_to_quantize or []
|
||||
self.op_types_to_quantize = op_types_to_quantize
|
||||
self.per_channel = per_channel
|
||||
self.reduce_range = reduce_range
|
||||
self.weight_type = weight_type
|
||||
self.activation_type = activation_type
|
||||
self.nodes_to_quantize = nodes_to_quantize
|
||||
self.nodes_to_exclude = nodes_to_exclude
|
||||
self.use_external_data_format = use_external_data_format
|
||||
|
||||
|
||||
class StaticQuantConfig(QuantConfig):
|
||||
def __init__(
|
||||
self,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
activation_type=QuantType.QInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
calibration_providers=None,
|
||||
extra_options=None,
|
||||
):
|
||||
"""
|
||||
This is the derived class for static quantization configuration.
|
||||
|
||||
Args:
|
||||
calibration_data_reader:
|
||||
a calibration data reader. It enumerates calibration data and generates inputs for the original model.
|
||||
calibrate_method:
|
||||
Current calibration methods supported are MinMax, Entropy and Percentile.
|
||||
quant_format: QuantFormat{QOperator, QDQ}.
|
||||
QOperator format quantizes the model with quantized operators directly.
|
||||
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ].
|
||||
extra_options:
|
||||
key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
|
||||
Dynamic mode is currently supported. Will support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
|
||||
AddQDQPairToWeight = True/False :
|
||||
Default is False which quantizes floating-point weight and feeds it to solely inserted
|
||||
DeQuantizeLinear node. If True, it remains floating-point weight and inserts both
|
||||
QuantizeLinear/DeQuantizeLinear nodes to weight.
|
||||
OpTypesToExcludeOutputQuantization = list of op type :
|
||||
Default is []. If any op type is specified, it won't quantize the output of ops with this
|
||||
specific op types.
|
||||
DedicatedQDQPair = True/False :
|
||||
Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
|
||||
inputs. If True, it will create identical and dedicated QDQ pair for each node.
|
||||
QDQOpTypePerChannelSupportToAxis = dictionary :
|
||||
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
|
||||
effective only when per channel quantization is supported and per_channel is True. If specific
|
||||
op type supports per channel quantization but not explicitly specified with channel axis,
|
||||
default channel axis will be used.
|
||||
CalibTensorRangeSymmetric = True/False :
|
||||
Default is False. If enabled, the final range of tensor during calibration will be explicitly
|
||||
set to symmetric to central point "0".
|
||||
CalibMovingAverage = True/False :
|
||||
Default is False. If enabled, the moving average of the minimum and maximum values will be
|
||||
computed when the calibration method selected is MinMax.
|
||||
CalibMovingAverageConstant = float :
|
||||
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
|
||||
minimum and maximum values. Effective only when the calibration method selected is MinMax and
|
||||
when CalibMovingAverage is set to True.
|
||||
QuantizeBias = True/False :
|
||||
Default is True which quantizes floating-point biases and it solely inserts
|
||||
a DeQuantizeLinear node. If False, it remains floating-point bias and does not insert
|
||||
any quantization nodes associated with biases.
|
||||
This extra option is only effective when quant_format is QuantFormat.QDQ.
|
||||
SmoothQuant = True/False :
|
||||
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
|
||||
fake input channel quantization.
|
||||
SmoothQuantAlpha = float :
|
||||
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
|
||||
and activation quantization. A larger alpha value could be used on models with more significant
|
||||
activation outliers to migrate more quantization difficulty to weights.
|
||||
SmoothQuantFolding = True/False :
|
||||
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
|
||||
SmoothQuant will be folded into the previous op if the previous op is foldable.
|
||||
UseQDQContribOps = True/False :
|
||||
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
|
||||
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
|
||||
contrib op implementations. The contrib op implementations may support features not standardized
|
||||
into the ONNX specification (e.g., 16-bit quantization types).
|
||||
MinimumRealRange = float|None :
|
||||
Default is None. If set to a floating-point value, the calculation of the quantization parameters
|
||||
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)
|
||||
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
|
||||
necessary for EPs like QNN that require a minimum floating-point range when determining
|
||||
quantization parameters.
|
||||
TensorQuantOverrides = dictionary :
|
||||
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
|
||||
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
|
||||
per-channel quantization, the list contains a dictionary for each channel in the tensor.
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'rmax' = Float : Override the maximum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'rmin' = Float : Override the minimum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
QDQKeepRemovableActivations = True/False:
|
||||
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
|
||||
will be explicitly represented in the QDQ model. If false, these activations are automatically
|
||||
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
|
||||
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
|
||||
operators from the model.
|
||||
QDQDisableWeightAdjustForInt32Bias = True/False:
|
||||
Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
|
||||
has a scale (input_scale * weight_scale) that is too small.
|
||||
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
|
||||
Raises:
|
||||
ValueError: Raise ValueError if execution provider is unknown
|
||||
"""
|
||||
|
||||
super().__init__(
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=op_types_to_quantize,
|
||||
nodes_to_quantize=nodes_to_quantize,
|
||||
nodes_to_exclude=nodes_to_exclude,
|
||||
per_channel=per_channel,
|
||||
reduce_range=reduce_range,
|
||||
use_external_data_format=use_external_data_format,
|
||||
)
|
||||
self.calibration_data_reader = calibration_data_reader
|
||||
self.calibrate_method = calibrate_method
|
||||
self.quant_format = quant_format
|
||||
self.calibration_providers = calibration_providers
|
||||
self.extra_options = extra_options or {}
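# Illustrative usage sketch, not part of the original source (MyDataReader is a hypothetical
# CalibrationDataReader implementation):
#
#   config = StaticQuantConfig(
#       calibration_data_reader=MyDataReader(),
#       quant_format=QuantFormat.QDQ,
#       activation_type=QuantType.QUInt8,
#       weight_type=QuantType.QInt8,
#       per_channel=True,
#       extra_options={"ActivationSymmetric": False, "WeightSymmetric": True},
#   )
#   quantize("model_fp32.onnx", "model_int8.onnx", config)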
|
||||
|
||||
|
||||
def get_qdq_config(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
calibrate_args: dict[str, Any] | None = None,
|
||||
activation_type=QuantType.QUInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
activation_symmetric: bool = False,
|
||||
weight_symmetric: bool | None = None,
|
||||
per_channel: bool = False,
|
||||
reduce_range: bool = False,
|
||||
keep_removable_activations: bool = False,
|
||||
min_real_range: float | None = None,
|
||||
tensor_quant_overrides: dict[str, list[dict[str, Any]]] | None = None,
|
||||
calibration_providers: list[str] | None = None,
|
||||
op_types_to_quantize: list[str] | None = None,
|
||||
nodes_to_exclude: list[str] | Callable[[onnx.ModelProto, onnx.NodeProto], bool] | None = None,
|
||||
extra_options: dict | None = None,
|
||||
) -> StaticQuantConfig:
|
||||
"""
|
||||
Returns a configuration suitable for quantizing the entire model to integer precision.
|
||||
|
||||
Params:
|
||||
model_input: Path to the input model file or ModelProto.
|
||||
calibration_data_reader: Calibration data reader.
|
||||
calibrate_method: The calibration method. Defaults to MinMax.
|
||||
activation_type: The default activation quantization type. Defaults to QUInt8.
|
||||
weight_type: The default weight quantization type. Defaults to QInt8.
|
||||
activation_symmetric: True if activations should be quantized symmetrically (i.e., rmax == -rmin) by default.
|
||||
Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
|
||||
the zero-point values are 127 and 32,767, respectively.
|
||||
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
|
||||
Defaults to None. If set to None, weight_symmetric is assumed true if a weight's quant type is a signed int.
|
||||
per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
|
||||
Defaults to false. Alternatively, use the tensor-level `tensor_quant_overrides` to select individual operators
|
||||
and their quantization axes.
|
||||
reduce_range: quantize weights with 1 less bit of precision (e.g., 7 bits for QInt8). Defaults to false.
|
||||
May improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode.
|
||||
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
|
||||
be removed, and will be explicitly represented in the QDQ model. If false, these activations
|
||||
are automatically removed if activations are asymmetrically quantized. Keeping these activations
|
||||
is necessary if optimizations or EP transformations will later remove
|
||||
QuantizeLinear/DequantizeLinear operators from the model.
|
||||
min_real_range: Default is None. If set to a floating-point value, the calculation of the quantization parameters
|
||||
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
|
||||
is less than the specified minimum range, rmax will be set to rmin + min_real_range.
|
||||
tensor_quant_overrides: tensor-level quantization overrides. Defaults to None.
|
||||
The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
|
||||
contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
|
||||
each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
|
||||
key must be present in the first dictionary for per-channel quantization.
|
||||
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'axis' = Int : The per-channel axis. Must be present for per-channel weights.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
|
||||
set `scale` or `zero_point`. Only valid for initializers.
|
||||
'rmax' = Float : Override the maximum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'rmin' = Float : Override the minimum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'convert' = Dict : A nested dictionary with the same keys for an activation
|
||||
tensor that should be converted to another quantization type.
|
||||
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
|
||||
other nodes get the original type. If not specified,
|
||||
assume all consumer nodes get the converted type.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ].
|
||||
op_types_to_quantize: List of operator types to quantize. If None, all operators other than Cast, DequantizeLinear,
|
||||
and QuantizeLinear are quantized.
|
||||
nodes_to_exclude: List of node names to exclude from quantization. Alternatively, can provide a function that
|
||||
accepts an onnx.ModelProto and onnx.NodeProto as arguments and returns true if the given onnx.NodeProto
|
||||
should be excluded from quantization.
|
||||
extra_options: Additional options specified as string key/value pairs. Refer to the documentation for
|
||||
`quantize_static` for valid keys and values.
|
||||
|
||||
Returns:
|
||||
A StaticQuantConfig object
|
||||
"""
|
||||
q16_types = {QuantType.QInt16, QuantType.QUInt16}
|
||||
q4_types = {QuantType.QInt4, QuantType.QUInt4}
|
||||
op_types_to_exclude = {"Cast", "DequantizeLinear", "QuantizeLinear"}
|
||||
|
||||
model = (
|
||||
model_input
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else onnx.load_model(model_input, load_external_data=False)
|
||||
)
|
||||
|
||||
op_types = set()
|
||||
model_has_external_data = False
|
||||
overrides_helper = TensorQuantOverridesHelper(
|
||||
copy.deepcopy(tensor_quant_overrides) if tensor_quant_overrides else {}
|
||||
)
|
||||
|
||||
# check if the model has external data.
|
||||
for initializer in model.graph.initializer:
|
||||
if onnx.external_data_helper.uses_external_data(initializer):
|
||||
model_has_external_data = True
|
||||
|
||||
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
|
||||
nodes_to_exclude_set = set(nodes_to_exclude) if isinstance(nodes_to_exclude, list) else set()
|
||||
|
||||
# Iterate through nodes to get all operator types in the model and
|
||||
# call user's function to filter out nodes from quantization.
|
||||
for node in model.graph.node:
|
||||
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
|
||||
continue
|
||||
if node.name in nodes_to_exclude_set:
|
||||
continue
|
||||
if callable(nodes_to_exclude) and nodes_to_exclude(model, node):
|
||||
nodes_to_exclude_set.add(node.name)
|
||||
else:
|
||||
op_types.add(node.op_type)
|
||||
|
||||
final_extra_options = {
|
||||
"MinimumRealRange": min_real_range,
|
||||
"QDQKeepRemovableActivations": keep_removable_activations,
|
||||
"ActivationSymmetric": activation_symmetric,
|
||||
"WeightSymmetric": weight_symmetric,
|
||||
"ForceQuantizeNoInputCheck": True,
|
||||
"TensorQuantOverrides": overrides_helper.get_dict(),
|
||||
}
|
||||
|
||||
# Pass along known calibration options
|
||||
if calibrate_args:
|
||||
calib_extra_options_keys = [
|
||||
("symmetric", "CalibTensorRangeSymmetric"),
|
||||
("moving_average", "CalibMovingAverage"),
|
||||
("averaging_constant", "CalibMovingAverageConstant"),
|
||||
("max_intermediate_outputs", "CalibMaxIntermediateOutputs"),
|
||||
("percentile", "CalibPercentile"),
|
||||
]
|
||||
calib_extra_options = {
|
||||
key: calibrate_args.get(name) for (name, key) in calib_extra_options_keys if name in calibrate_args
|
||||
}
|
||||
final_extra_options.update(calib_extra_options)
|
||||
|
||||
# ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain
|
||||
# on Q/DQ operators if using 16-bit or 4-bit quantization.
|
||||
onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx")
|
||||
if onnx_opset.version < 21:
|
||||
opset21_types = q16_types.union(q4_types)
|
||||
overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types())
|
||||
if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types:
|
||||
final_extra_options["UseQDQContribOps"] = True
|
||||
|
||||
# Allow user's extra_options to override our final_extra_options.
|
||||
if extra_options:
|
||||
final_extra_options.update(extra_options)
|
||||
|
||||
return StaticQuantConfig(
|
||||
calibration_data_reader,
|
||||
calibrate_method=calibrate_method,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=(
|
||||
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(op_types_to_exclude))
|
||||
),
|
||||
nodes_to_exclude=list(nodes_to_exclude_set),
|
||||
per_channel=per_channel,
|
||||
reduce_range=reduce_range,
|
||||
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
|
||||
calibration_providers=calibration_providers,
|
||||
extra_options=final_extra_options,
|
||||
)
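# Illustrative usage sketch, not part of the original source (MyDataReader is a hypothetical
# CalibrationDataReader implementation):
#
#   qdq_config = get_qdq_config(
#       "model_fp32.onnx",
#       MyDataReader(),
#       activation_type=QuantType.QUInt16,
#       weight_type=QuantType.QInt8,
#       per_channel=True,
#   )
#   quantize("model_fp32.onnx", "model_qdq.onnx", qdq_config)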
|
||||
|
||||
|
||||
class DynamicQuantConfig(QuantConfig):
|
||||
def __init__(
|
||||
self,
|
||||
weight_type=QuantType.QInt8,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
extra_options=None,
|
||||
):
|
||||
"""
|
||||
This is the class for dynamic quantization configuration.
|
||||
|
||||
Args:
|
||||
extra_options: key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False :
|
||||
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
|
||||
support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
|
||||
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
|
||||
|
||||
Raises:
|
||||
ValueError: Raise ValueError if execution provider is unknown
|
||||
"""
|
||||
super().__init__(
|
||||
op_types_to_quantize=op_types_to_quantize,
|
||||
per_channel=per_channel,
|
||||
reduce_range=reduce_range,
|
||||
weight_type=weight_type,
|
||||
nodes_to_quantize=nodes_to_quantize,
|
||||
nodes_to_exclude=nodes_to_exclude,
|
||||
use_external_data_format=use_external_data_format,
|
||||
)
|
||||
self.extra_options = extra_options or {}
|
||||
|
||||
|
||||
def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
|
||||
if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
|
||||
raise ValueError(
|
||||
"ONNXRuntime quantization doesn't support data format:"
|
||||
"activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8"
|
||||
)
|
||||
if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
|
||||
raise ValueError(
|
||||
f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
|
||||
"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
|
||||
)
|
||||
|
||||
if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:
|
||||
raise ValueError(
|
||||
"ONNXRuntime quantization doesn't support data format: activation_type=QuantType.QFLOAT8E4M3FN, "
|
||||
f"weight_type={weight_type}!=QuantType.QFLOAT8E4M3FN"
|
||||
)
|
||||
|
||||
q16_types = [QuantType.QInt16, QuantType.QUInt16]
|
||||
|
||||
if (activation_type in q16_types or weight_type in q16_types) and quant_format != QuantFormat.QDQ:
|
||||
raise ValueError("Only QuantFormat.QDQ supports 16-bit quantization types.")
|
||||
|
||||
if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
|
||||
logging.warning(
|
||||
"Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
|
||||
"Or it will lead to bad performance on x64."
|
||||
)
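# Illustrative note, not part of the original source: for example,
# check_static_quant_arguments(QuantFormat.QDQ, QuantType.QInt16, QuantType.QInt8) is accepted,
# while the same types with QuantFormat.QOperator raise a ValueError, because 16-bit
# quantization types are only supported in the QDQ format.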
|
||||
|
||||
|
||||
def quantize_static(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
calibration_data_reader: CalibrationDataReader,
|
||||
quant_format=QuantFormat.QDQ,
|
||||
op_types_to_quantize=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
activation_type=QuantType.QInt8,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
use_external_data_format=False,
|
||||
calibrate_method=CalibrationMethod.MinMax,
|
||||
calibration_providers=None,
|
||||
extra_options=None,
|
||||
):
|
||||
"""
|
||||
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
|
||||
It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and weight_type
|
||||
= QuantType.QInt8. If model is targeted to GPU/TRT, symmetric activation and weight are required. If model is
|
||||
targeted to CPU, asymmetric activation and symmetric weight are recommended for balance of performance and
|
||||
accuracy.
|
||||
|
||||
Args:
|
||||
|
||||
model_input: file path of model or ModelProto to quantize
|
||||
model_output: file path of quantized model
|
||||
calibration_data_reader: a calibration data reader. It
|
||||
enumerates calibration data and generates inputs for the
|
||||
original model.
|
||||
quant_format: QuantFormat{QOperator, QDQ}.
|
||||
QOperator format quantizes the model with quantized operators directly.
|
||||
QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
|
||||
activation_type:
|
||||
quantization data type of activation. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
calibrate_method:
|
||||
Current calibration methods supported are MinMax and Entropy.
|
||||
Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
|
||||
op_types_to_quantize:
|
||||
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
|
||||
It quantizes all supported operators by default.
|
||||
per_channel: quantize weights per channel
|
||||
reduce_range:
|
||||
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
|
||||
especially for per-channel mode
|
||||
weight_type:
|
||||
quantization data type of weight. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
nodes_to_quantize:
|
||||
List of node names to quantize. When this list is not None, only the nodes in this list
|
||||
are quantized.
|
||||
example:
|
||||
[
|
||||
'Conv__224',
|
||||
'Conv__252'
|
||||
]
|
||||
nodes_to_exclude:
|
||||
List of node names to exclude. The nodes in this list will be excluded from quantization
|
||||
when it is not None.
|
||||
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
|
||||
[ "CPUExecutionProvider" ]
|
||||
extra_options:
|
||||
key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
|
||||
Dynamic mode is currently supported. Will support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also, the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is False for static mode. If enabled, only MatMul with const B will be quantized.
|
||||
AddQDQPairToWeight = True/False :
|
||||
Default is False which quantizes floating-point weight and feeds it to solely inserted
|
||||
DeQuantizeLinear node. If True, it remains floating-point weight and inserts both
|
||||
QuantizeLinear/DeQuantizeLinear nodes to weight.
|
||||
OpTypesToExcludeOutputQuantization = list of op type :
|
||||
Default is []. If any op type is specified, it won't quantize the output of ops with this
|
||||
specific op types.
|
||||
DedicatedQDQPair = True/False :
|
||||
Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
|
||||
inputs. If True, it will create identical and dedicated QDQ pair for each node.
|
||||
QDQOpTypePerChannelSupportToAxis = dictionary :
|
||||
Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
|
||||
effective only when per channel quantization is supported and per_channel is True. If specific
|
||||
op type supports per channel quantization but not explicitly specified with channel axis,
|
||||
default channel axis will be used.
|
||||
CalibTensorRangeSymmetric = True/False :
|
||||
Default is False. If enabled, the final range of tensor during calibration will be explicitly
|
||||
set to symmetric to central point "0".
|
||||
CalibStridedMinMax = Optional[int] :
|
||||
Default is None. If set to an integer, during calculation of the min-max, only stride amount of
|
||||
data will be used and then all results will be merged in the end.
|
||||
CalibMovingAverage = True/False :
|
||||
Default is False. If enabled, the moving average of the minimum and maximum values will be
|
||||
computed when the calibration method selected is MinMax.
|
||||
CalibMovingAverageConstant = float :
|
||||
Default is 0.01. Constant smoothing factor to use when computing the moving average of the
|
||||
minimum and maximum values. Effective only when the calibration method selected is MinMax and
|
||||
when CalibMovingAverage is set to True.
|
||||
CalibMaxIntermediateOutputs = Optional[int] :
|
||||
Default is None. If set to an integer, during calculation of the min-max range of the tensors
|
||||
it will load at most that number of outputs before computing and merging the range. This will
|
||||
produce the same result as computing with None, but is more memory efficient.
|
||||
SmoothQuant = True/False :
|
||||
Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
|
||||
fake input channel quantization.
|
||||
SmoothQuantAlpha = float :
|
||||
Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
|
||||
and activation quantization. A larger alpha value could be used on models with more significant
|
||||
activation outliers to migrate more quantization difficulty to weights.
|
||||
SmoothQuantFolding = True/False :
|
||||
Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
|
||||
SmoothQuant will be folded into the previous op if the previous op is foldable.
|
||||
UseQDQContribOps = True/False :
|
||||
Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
|
||||
`com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
|
||||
contrib op implementations. The contrib op implementations may support features not standardized
|
||||
into the ONNX specification (e.g., 16-bit quantization types).
|
||||
MinimumRealRange = float|None :
|
||||
Default is None. If set to a floating-point value, the calculation of the quantization parameters
|
||||
(i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
|
||||
is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
|
||||
necessary for EPs like QNN that require a minimum floating-point range when determining
|
||||
quantization parameters.
|
||||
TensorQuantOverrides = dictionary :
|
||||
Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
|
||||
list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
|
||||
per-channel quantization, the list contains a dictionary for each channel in the tensor.
|
||||
Each dictionary contains optional overrides with the following keys and values.
|
||||
'quant_type' = QuantType : The tensor's quantization data type.
|
||||
'scale' = Float : The scale value to use. Must also specify `zero_point` if set.
|
||||
'zero_point' = Int : The zero-point value to use. Must also specify `scale` if set.
|
||||
'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also
|
||||
set `scale` or `zero_point`.
|
||||
'rmax' = Float : Override the maximum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
'rmin' = Float : Override the minimum real tensor value in calibration data.
|
||||
Invalid if also set `scale` or `zero_point`.
|
||||
QDQKeepRemovableActivations = True/False:
|
||||
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
|
||||
will be explicitly represented in the QDQ model. If false, these activations are automatically
|
||||
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
|
||||
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
|
||||
operators from the model.
|
||||
QDQDisableWeightAdjustForInt32Bias = True/False:
|
||||
Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
|
||||
has a scale (input_scale * weight_scale) that is too small.
|
||||
"""
|
||||
if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
|
||||
if calibrate_method != CalibrationMethod.Distribution:
|
||||
raise ValueError("Only Distribution calibration method is supported for float quantization.")
|
||||
|
||||
extra_options = extra_options or {}
|
||||
nodes_to_exclude = nodes_to_exclude or []
|
||||
nodes_to_quantize = nodes_to_quantize or []
|
||||
op_types_to_quantize = op_types_to_quantize or []
|
||||
mode = QuantizationMode.QLinearOps
|
||||
|
||||
if not op_types_to_quantize:
|
||||
q_linear_ops = list(QLinearOpsRegistry.keys())
|
||||
qdq_ops = list(QDQRegistry.keys())
|
||||
op_types_to_quantize = list(set(q_linear_ops + qdq_ops))
|
||||
|
||||
model = (
|
||||
save_and_reload_model_with_shape_infer(model_input)
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else load_model_with_shape_infer(Path(model_input))
|
||||
)
|
||||
|
||||
pre_processed: bool = model_has_pre_process_metadata(model)
|
||||
if not pre_processed:
|
||||
logging.warning(
|
||||
"Please consider to run pre-processing before quantization. Refer to example: "
|
||||
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
|
||||
"/cpu/ReadMe.md "
|
||||
)
|
||||
|
||||
calib_extra_options_keys = [
|
||||
("CalibTensorRangeSymmetric", "symmetric"),
|
||||
("CalibMovingAverage", "moving_average"),
|
||||
("CalibMovingAverageConstant", "averaging_constant"),
|
||||
("CalibMaxIntermediateOutputs", "max_intermediate_outputs"),
|
||||
("CalibPercentile", "percentile"),
|
||||
]
|
||||
calib_extra_options = {
|
||||
key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
|
||||
}
|
||||
|
||||
if extra_options.get("SmoothQuant", False):
|
||||
import importlib # noqa: PLC0415
|
||||
|
||||
try:
|
||||
importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
|
||||
except Exception as e:
|
||||
logging.error(f"{e}.")
|
||||
raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e
|
||||
|
||||
from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant # noqa: PLC0415
|
||||
|
||||
def inc_dataloader():
|
||||
data_reader = copy.deepcopy(calibration_data_reader)
|
||||
for data in data_reader:
|
||||
yield data, None
|
||||
|
||||
orig_nodes = [i.name for i in model.graph.node]
|
||||
dataloader = inc_dataloader()
|
||||
sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
|
||||
del dataloader
|
||||
model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
|
||||
sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
|
||||
model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
|
||||
model.save(model_input)
|
||||
nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
|
||||
model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration
|
||||
|
||||
updated_model = update_opset_version(model, weight_type)
|
||||
is_model_updated = updated_model is not model
|
||||
if is_model_updated:
|
||||
model = updated_model
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
|
||||
if is_model_updated:
|
||||
# Update model_input and avoid using the original one
|
||||
model_input = copy.deepcopy(model)
|
||||
|
||||
if isinstance(model_input, onnx.ModelProto):
|
||||
output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
|
||||
onnx.save_model(
|
||||
model_input,
|
||||
output_path,
|
||||
save_as_external_data=True,
|
||||
)
|
||||
model_input = output_path
|
||||
|
||||
calibrator = create_calibrator(
|
||||
Path(model_input),
|
||||
op_types_to_quantize,
|
||||
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
|
||||
calibrate_method=calibrate_method,
|
||||
use_external_data_format=use_external_data_format,
|
||||
providers=calibration_providers,
|
||||
extra_options=calib_extra_options,
|
||||
)
|
||||
|
||||
stride = extra_options.get("CalibStridedMinMax", None)
|
||||
if stride:
|
||||
total_data_size = len(calibration_data_reader)
|
||||
if total_data_size % stride != 0:
|
||||
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")
|
||||
|
||||
for start in range(0, total_data_size, stride):
|
||||
end_index = start + stride
|
||||
calibration_data_reader.set_range(start_index=start, end_index=end_index)
|
||||
calibrator.collect_data(calibration_data_reader)
|
||||
else:
|
||||
calibrator.collect_data(calibration_data_reader)
|
||||
tensors_range = calibrator.compute_data()
|
||||
if not isinstance(tensors_range, TensorsData):
|
||||
raise TypeError(
|
||||
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
|
||||
)
|
||||
del calibrator
|
||||
|
||||
check_static_quant_arguments(quant_format, activation_type, weight_type)
|
||||
|
||||
if quant_format is QuantFormat.QOperator:
|
||||
quantizer = ONNXQuantizer(
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
mode,
|
||||
True, # static
|
||||
weight_type,
|
||||
activation_type,
|
||||
tensors_range,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options,
|
||||
)
|
||||
else:
|
||||
quantizer = QDQQuantizer(
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
weight_type,
|
||||
activation_type,
|
||||
tensors_range,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options,
|
||||
)
|
||||
|
||||
quantizer.quantize_model()
|
||||
quantizer.model.save_model_to_file(model_output, use_external_data_format)
|
||||
if not pre_processed:
|
||||
logging.warning(
|
||||
"Please consider pre-processing before quantization. See "
|
||||
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
|
||||
"/cpu/ReadMe.md "
|
||||
)
|
||||
|
||||
if extra_options.get("SmoothQuant", False):
|
||||
sq_path.cleanup()
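# Illustrative usage sketch, not part of the original source (MyDataReader is a hypothetical
# CalibrationDataReader implementation):
#
#   quantize_static(
#       "model_fp32.onnx",
#       "model_qdq.onnx",
#       MyDataReader(),
#       quant_format=QuantFormat.QDQ,
#       activation_type=QuantType.QUInt8,
#       weight_type=QuantType.QInt8,
#   )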
|
||||
|
||||
|
||||
def quantize_dynamic(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
op_types_to_quantize=None,
|
||||
per_channel=False,
|
||||
reduce_range=False,
|
||||
weight_type=QuantType.QInt8,
|
||||
nodes_to_quantize=None,
|
||||
nodes_to_exclude=None,
|
||||
use_external_data_format=False,
|
||||
extra_options=None,
|
||||
):
|
||||
"""Given an onnx model, create a quantized onnx model and save it into a file
|
||||
|
||||
Args:
|
||||
model_input: file path of model or ModelProto to quantize
|
||||
model_output: file path of quantized model
|
||||
op_types_to_quantize:
|
||||
specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
|
||||
It quantizes all supported operators by default.
|
||||
per_channel: quantize weights per channel
|
||||
reduce_range:
|
||||
quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
|
||||
especially for per-channel mode
|
||||
weight_type:
|
||||
quantization data type of weight. Please refer to
|
||||
https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
|
||||
nodes_to_quantize:
|
||||
List of node names to quantize. When this list is not None, only the nodes in this list
|
||||
are quantized.
|
||||
example:
|
||||
[
|
||||
'Conv__224',
|
||||
'Conv__252'
|
||||
]
|
||||
nodes_to_exclude:
|
||||
List of node names to exclude. The nodes in this list will be excluded from quantization
|
||||
when it is not None.
|
||||
use_external_data_format: option used for large size (>2GB) model. Set to False by default.
|
||||
extra_options:
|
||||
key value pair dictionary for various options in different case. Current used:
|
||||
extra.Sigmoid.nnapi = True/False (Default is False)
|
||||
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
|
||||
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
|
||||
EnableSubgraph = True/False :
|
||||
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
|
||||
support more in the future.
|
||||
ForceQuantizeNoInputCheck = True/False :
|
||||
By default, some latent operators like maxpool, transpose, do not quantize if their input is not
|
||||
quantized already. Set to True to force such operators to always quantize their input and so generate
|
||||
quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
|
||||
MatMulConstBOnly = True/False:
|
||||
Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
|
||||
"""
|
||||
extra_options = extra_options or {}
|
||||
nodes_to_exclude = nodes_to_exclude or []
|
||||
nodes_to_quantize = nodes_to_quantize or []
|
||||
op_types_to_quantize = op_types_to_quantize or []
|
||||
|
||||
mode = QuantizationMode.IntegerOps
|
||||
|
||||
if not op_types_to_quantize:
|
||||
op_types_to_quantize = list(IntegerOpsRegistry.keys())
|
||||
|
||||
model = (
|
||||
save_and_reload_model_with_shape_infer(model_input)
|
||||
if isinstance(model_input, onnx.ModelProto)
|
||||
else load_model_with_shape_infer(Path(model_input))
|
||||
)
|
||||
|
||||
pre_processed: bool = model_has_pre_process_metadata(model)
|
||||
if not pre_processed:
|
||||
logging.warning(
|
||||
"Please consider to run pre-processing before quantization. Refer to example: "
|
||||
"https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification"
|
||||
"/cpu/ReadMe.md "
|
||||
)
|
||||
|
||||
if "MatMulConstBOnly" not in extra_options:
|
||||
extra_options["MatMulConstBOnly"] = True
|
||||
|
||||
model = update_opset_version(model, weight_type)
|
||||
|
||||
quantizer = ONNXQuantizer(
|
||||
model,
|
||||
per_channel,
|
||||
reduce_range,
|
||||
mode,
|
||||
False, # static
|
||||
weight_type,
|
||||
QuantType.QUInt8, # dynamic activation only supports uint8
|
||||
None,
|
||||
nodes_to_quantize,
|
||||
nodes_to_exclude,
|
||||
op_types_to_quantize,
|
||||
extra_options,
|
||||
)
|
||||
|
||||
quantizer.quantize_model()
|
||||
quantizer.model.save_model_to_file(model_output, use_external_data_format)
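# Illustrative usage sketch, not part of the original source:
#
#   quantize_dynamic("model_fp32.onnx", "model_int8.onnx", weight_type=QuantType.QInt8,
#                    per_channel=False, extra_options={"MatMulConstBOnly": True})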
|
||||
|
||||
|
||||
def quantize(
|
||||
model_input: str | Path | onnx.ModelProto,
|
||||
model_output: str | Path,
|
||||
quant_config: QuantConfig,
|
||||
):
|
||||
"""Quantize a model with QuantConfig.
|
||||
|
||||
Args:
|
||||
model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize.
|
||||
model_output (str | Path): Path to save the quantized model.
|
||||
quant_config (QuantConfig | WeightOnlyQuantConfig): Quantization Configuration.
|
||||
"""
|
||||
if isinstance(quant_config, StaticQuantConfig):
|
||||
quantize_static(
|
||||
model_input,
|
||||
model_output,
|
||||
quant_config.calibration_data_reader,
|
||||
calibrate_method=quant_config.calibrate_method,
|
||||
quant_format=quant_config.quant_format,
|
||||
activation_type=quant_config.activation_type,
|
||||
weight_type=quant_config.weight_type,
|
||||
op_types_to_quantize=quant_config.op_types_to_quantize,
|
||||
nodes_to_quantize=quant_config.nodes_to_quantize,
|
||||
nodes_to_exclude=quant_config.nodes_to_exclude,
|
||||
per_channel=quant_config.per_channel,
|
||||
reduce_range=quant_config.reduce_range,
|
||||
use_external_data_format=quant_config.use_external_data_format,
|
||||
calibration_providers=quant_config.calibration_providers,
|
||||
extra_options=quant_config.extra_options,
|
||||
)
|
||||
|
||||
elif isinstance(quant_config, DynamicQuantConfig):
|
||||
quantize_dynamic(
|
||||
model_input,
|
||||
model_output,
|
||||
weight_type=quant_config.weight_type,
|
||||
op_types_to_quantize=quant_config.op_types_to_quantize,
|
||||
nodes_to_quantize=quant_config.nodes_to_quantize,
|
||||
nodes_to_exclude=quant_config.nodes_to_exclude,
|
||||
per_channel=quant_config.per_channel,
|
||||
reduce_range=quant_config.reduce_range,
|
||||
use_external_data_format=quant_config.use_external_data_format,
|
||||
extra_options=quant_config.extra_options,
|
||||
)
|
||||
else:
|
||||
# training package doesn't have quantize_matmul_4bits, avoid global import
|
||||
from .matmul_nbits_quantizer import MatMulNBitsQuantizer, WeightOnlyQuantConfig # noqa: PLC0415
|
||||
|
||||
if isinstance(quant_config, WeightOnlyQuantConfig):
|
||||
model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load(model_input)
|
||||
quant = MatMulNBitsQuantizer(model, algo_config=quant_config)
|
||||
quant.process()
|
||||
quant.model.save_model_to_file(model_output, True)
|
||||
else:
|
||||
raise TypeError(
|
||||
"Invalid quantization config type, it must be either StaticQuantConfig, "
|
||||
"DynamicQuantConfig, or WeightOnlyQuantConfig."
|
||||
)
|
||||
@@ -0,0 +1,109 @@
|
||||
from .operators.activation import QDQRemovableActivation, QLinearActivation
|
||||
from .operators.argmax import QArgMax
|
||||
from .operators.attention import AttentionQuant
|
||||
from .operators.base_operator import QuantOperatorBase
|
||||
from .operators.binary_op import QLinearBinaryOp
|
||||
from .operators.concat import QLinearConcat
|
||||
from .operators.conv import ConvInteger, QDQConv, QLinearConv
|
||||
from .operators.direct_q8 import Direct8BitOp, QDQDirect8BitOp
|
||||
from .operators.embed_layernorm import EmbedLayerNormalizationQuant
|
||||
from .operators.gather import GatherQuant, QDQGather
|
||||
from .operators.gavgpool import QGlobalAveragePool
|
||||
from .operators.gemm import QDQGemm, QLinearGemm
|
||||
from .operators.lstm import LSTMQuant
|
||||
from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul
|
||||
from .operators.maxpool import QDQMaxPool, QMaxPool
|
||||
from .operators.norm import QDQNormalization
|
||||
from .operators.pad import QDQPad, QPad
|
||||
from .operators.pooling import QLinearPool
|
||||
from .operators.qdq_base_operator import QDQOperatorBase
|
||||
from .operators.resize import QDQResize, QResize
|
||||
from .operators.softmax import QLinearSoftmax
|
||||
from .operators.split import QDQSplit, QSplit
|
||||
from .operators.where import QDQWhere, QLinearWhere
|
||||
from .quant_utils import QuantizationMode
|
||||
|
||||
CommonOpsRegistry = {
|
||||
"Gather": GatherQuant,
|
||||
"Transpose": Direct8BitOp,
|
||||
"EmbedLayerNormalization": EmbedLayerNormalizationQuant,
|
||||
}
|
||||
|
||||
IntegerOpsRegistry = {
|
||||
"Conv": ConvInteger,
|
||||
"MatMul": MatMulInteger,
|
||||
"Attention": AttentionQuant,
|
||||
"LSTM": LSTMQuant,
|
||||
}
|
||||
IntegerOpsRegistry.update(CommonOpsRegistry)
|
||||
|
||||
QLinearOpsRegistry = {
|
||||
"ArgMax": QArgMax,
|
||||
"Conv": QLinearConv,
|
||||
"Gemm": QLinearGemm,
|
||||
"MatMul": QLinearMatMul,
|
||||
"Add": QLinearBinaryOp,
|
||||
"Mul": QLinearBinaryOp,
|
||||
"Relu": QLinearActivation,
|
||||
"Clip": QLinearActivation,
|
||||
"LeakyRelu": QLinearActivation,
|
||||
"Sigmoid": QLinearActivation,
|
||||
"MaxPool": QMaxPool,
|
||||
"GlobalAveragePool": QGlobalAveragePool,
|
||||
"Split": QSplit,
|
||||
"Pad": QPad,
|
||||
"Reshape": Direct8BitOp,
|
||||
"Squeeze": Direct8BitOp,
|
||||
"Unsqueeze": Direct8BitOp,
|
||||
"Resize": QResize,
|
||||
"AveragePool": QLinearPool,
|
||||
"Concat": QLinearConcat,
|
||||
"Softmax": QLinearSoftmax,
|
||||
"Where": QLinearWhere,
|
||||
}
|
||||
QLinearOpsRegistry.update(CommonOpsRegistry)
|
||||
|
||||
QDQRegistry = {
|
||||
"Conv": QDQConv,
|
||||
"ConvTranspose": QDQConv,
|
||||
"Gemm": QDQGemm,
|
||||
"Clip": QDQRemovableActivation,
|
||||
"Relu": QDQRemovableActivation,
|
||||
"Reshape": QDQDirect8BitOp,
|
||||
"Transpose": QDQDirect8BitOp,
|
||||
"Squeeze": QDQDirect8BitOp,
|
||||
"Unsqueeze": QDQDirect8BitOp,
|
||||
"Resize": QDQResize,
|
||||
"MaxPool": QDQMaxPool,
|
||||
"AveragePool": QDQDirect8BitOp,
|
||||
"Slice": QDQDirect8BitOp,
|
||||
"Pad": QDQPad,
|
||||
"MatMul": QDQMatMul,
|
||||
"Split": QDQSplit,
|
||||
"Gather": QDQGather,
|
||||
"GatherElements": QDQGather,
|
||||
"Where": QDQWhere,
|
||||
"InstanceNormalization": QDQNormalization,
|
||||
"LayerNormalization": QDQNormalization,
|
||||
"BatchNormalization": QDQNormalization,
|
||||
"TopK": QDQDirect8BitOp,
|
||||
}
|
||||
|
||||
|
||||
def CreateDefaultOpQuantizer(onnx_quantizer, node): # noqa: N802
|
||||
return QuantOperatorBase(onnx_quantizer, node)
|
||||
|
||||
|
||||
def CreateOpQuantizer(onnx_quantizer, node): # noqa: N802
|
||||
registry = IntegerOpsRegistry if onnx_quantizer.mode == QuantizationMode.IntegerOps else QLinearOpsRegistry
|
||||
if node.op_type in registry:
|
||||
op_quantizer = registry[node.op_type](onnx_quantizer, node)
|
||||
if op_quantizer.should_quantize():
|
||||
return op_quantizer
|
||||
return QuantOperatorBase(onnx_quantizer, node)
|
||||
|
||||
|
||||
def CreateQDQQuantizer(onnx_quantizer, node): # noqa: N802
|
||||
if node.op_type in QDQRegistry:
|
||||
return QDQRegistry[node.op_type](onnx_quantizer, node)
|
||||
return QDQOperatorBase(onnx_quantizer, node)
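# Illustrative check (not part of the original module), using the registries defined
# above: they are plain dicts keyed by ONNX op type, so CreateOpQuantizer and
# CreateQDQQuantizer reduce to a dict lookup with a base-class fallback.
print("Conv" in QLinearOpsRegistry)   # True  -> QLinearConv handles QOperator-format Conv
print("Conv" in QDQRegistry)          # True  -> QDQConv handles QDQ-format Conv
print("Conv" in IntegerOpsRegistry)   # True  -> ConvInteger handles dynamic (IntegerOps) Conv
print("Softmax" in QDQRegistry)       # False -> CreateQDQQuantizer falls back to QDQOperatorBase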
|
||||
@@ -0,0 +1,209 @@
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft, Intel Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
import onnx
|
||||
|
||||
import onnxruntime
|
||||
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference
|
||||
from onnxruntime.transformers.onnx_utils import extract_raw_data_from_model, has_external_data
|
||||
|
||||
from .fusions import ReplaceUpsampleWithResize
|
||||
from .onnx_model import ONNXModel
|
||||
from .quant_utils import add_pre_process_metadata, save_and_reload_model_with_shape_infer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def quant_pre_process(
|
||||
input_model: str | Path | onnx.ModelProto | None = None,
|
||||
output_model_path: str | Path | None = None,
|
||||
skip_optimization: bool = False,
|
||||
skip_onnx_shape: bool = False,
|
||||
skip_symbolic_shape: bool = False,
|
||||
auto_merge: bool = False,
|
||||
int_max: int = 2**31 - 1,
|
||||
guess_output_rank: bool = False,
|
||||
verbose: int = 0,
|
||||
save_as_external_data: bool = False,
|
||||
all_tensors_to_one_file: bool = False,
|
||||
external_data_location: str | None = None,
|
||||
external_data_size_threshold: int = 1024,
|
||||
**deprecated_kwargs,
|
||||
) -> None:
|
||||
"""Shape inference and model optimization, in preparation for quantization.
|
||||
|
||||
Args:
|
||||
input_model: Path to the input model file or ModelProto
|
||||
output_model_path: Path to the output model file
|
||||
skip_optimization: Skip model optimization step if true. This may result in ONNX shape
|
||||
inference failure for some models.
|
||||
skip_onnx_shape: Skip ONNX shape inference. Symbolic shape inference is most effective
|
||||
with transformer based models. Skipping all shape inferences may
|
||||
reduce the effectiveness of quantization, as a tensor with unknown
|
||||
shape can not be quantized.
|
||||
skip_symbolic_shape: Skip symbolic shape inference. Symbolic shape inference is most
|
||||
effective with transformer based models. Skipping all shape
|
||||
inferences may reduce the effectiveness of quantization, as a tensor
|
||||
with unknown shape can not be quantized.
|
||||
auto_merge: For symbolic shape inference, automatically merge symbolic dims when
|
||||
conflict happens.
|
||||
int_max: For symbolic shape inference, specify the maximum value for integer to be
|
||||
treated as boundless for ops like slice
|
||||
guess_output_rank: Guess output rank to be the same as input 0 for unknown ops
|
||||
verbose: Logs detailed info of inference, 0: turn off, 1: warnings, 3: detailed
|
||||
save_as_external_data: Saving an ONNX model to external data
|
||||
all_tensors_to_one_file: Saving all the external data to one file
|
||||
external_data_location: The file location to save the external file
|
||||
external_data_size_threshold: The size threshold for external data
|
||||
"""
|
||||
|
||||
if input_model is None:
|
||||
input_model = deprecated_kwargs.pop("input_model_path", None)
|
||||
assert input_model is not None
|
||||
|
||||
assert output_model_path is not None, "output_model_path is required."
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="pre.quant.") as quant_tmp_dir:
|
||||
temp_path = Path(quant_tmp_dir)
|
||||
model = None
|
||||
|
||||
if not skip_symbolic_shape:
|
||||
logger.info("Performing symbolic shape inference...")
|
||||
loaded_model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model)
|
||||
model = SymbolicShapeInference.infer_shapes(
|
||||
loaded_model,
|
||||
int_max,
|
||||
auto_merge,
|
||||
guess_output_rank,
|
||||
verbose,
|
||||
)
|
||||
|
||||
# Since Upsample is deprecated after opset v10, and the model's opset will
|
||||
# be upgraded to at least v11 during quantization, we need to replace Upsample
|
||||
# with Resize first to avoid generating an invalid model.
|
||||
if model:
|
||||
ai_onnx_domain = [opset for opset in model.opset_import if not opset.domain or opset.domain == "ai.onnx"]
|
||||
if len(ai_onnx_domain) == 1:
|
||||
opset_version = ai_onnx_domain[0].version
|
||||
if opset_version < 10:
|
||||
ReplaceUpsampleWithResize(ONNXModel(model), opset_version).apply()
|
||||
model.opset_import.remove(ai_onnx_domain[0])
|
||||
opset_version = 11
|
||||
model.opset_import.extend([onnx.helper.make_opsetid("", opset_version)])
|
||||
model = onnx.version_converter.convert_version(model, opset_version)
|
||||
model = save_and_reload_model_with_shape_infer(model)
|
||||
|
||||
if not skip_optimization:
|
||||
# Use ORT optimizers (native code) to optimize model
|
||||
if not skip_symbolic_shape:
|
||||
# Need to save the inferenced model to file so as to run the optimizer
|
||||
input_model = str(temp_path / "symbolic_shape_inferred.onnx")
|
||||
if save_as_external_data:
|
||||
onnx.save_model(
|
||||
model,
|
||||
input_model,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
else:
|
||||
onnx.save(model, input_model)
|
||||
model = None
|
||||
|
||||
opt_model_path = str(temp_path / "optimized.onnx")
|
||||
try:
|
||||
sess_option = onnxruntime.SessionOptions()
|
||||
sess_option.optimized_model_filepath = opt_model_path
|
||||
sess_option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
|
||||
# For large model, extract external data from model and add to session options
|
||||
if isinstance(input_model, onnx.ModelProto):
|
||||
if has_external_data(input_model):
|
||||
raise ValueError(
|
||||
"ModelProto has external data not loaded into memory, ORT cannot create session. "
|
||||
"Please load external data before calling this function. "
|
||||
"See https://onnx.ai/onnx/repo-docs/ExternalData.html for more information."
|
||||
)
|
||||
external_names, external_values = extract_raw_data_from_model(input_model)
|
||||
sess_option.add_external_initializers(list(external_names), list(external_values))
|
||||
input_model = input_model.SerializeToString()
|
||||
# the saved optimized model otherwise points to the original external data file name
|
||||
# which is not available relative to the optimized model file
|
||||
elif skip_symbolic_shape and save_as_external_data:
|
||||
sess_option.add_session_config_entry(
|
||||
"session.optimized_model_external_initializers_file_name", "optimized.onnx.data"
|
||||
)
|
||||
|
||||
sess = onnxruntime.InferenceSession(input_model, sess_option, providers=["CPUExecutionProvider"])
|
||||
# Close the session to avoid the cleanup error on Windows for temp folders
|
||||
# https://github.com/microsoft/onnxruntime/issues/17627
|
||||
del sess
|
||||
except Exception:
|
||||
logger.error(
|
||||
"ONNX Runtime Model Optimization Failed! Consider rerun with option `--skip_optimization'."
|
||||
)
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
input_model = opt_model_path
|
||||
|
||||
if not skip_onnx_shape:
|
||||
# ONNX shape inference.
|
||||
# According to docs, infer_shapes_path should be used for 2G+ models.
|
||||
# If skip_optimization is specified, we could be dealing with a
# large model, so to be on the safe side, save the model to a file first.
|
||||
if model is not None:
|
||||
input_model = str(temp_path / "symbolic_shape_inferred.onnx")
|
||||
if save_as_external_data:
|
||||
onnx.save_model(
|
||||
model,
|
||||
input_model,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
else:
|
||||
onnx.save(model, input_model)
|
||||
model = None
|
||||
|
||||
if isinstance(input_model, onnx.ModelProto):
|
||||
input_model = str(Path(quant_tmp_dir) / "model_input.onnx")
|
||||
onnx.save_model(
|
||||
model,
|
||||
input_model,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
|
||||
inferred_model_path = str(temp_path / "onnx_shape_inferred.onnx")
|
||||
onnx.shape_inference.infer_shapes_path(input_model, inferred_model_path)
|
||||
model = onnx.load(inferred_model_path)
|
||||
|
||||
if model is None:
|
||||
model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model)
|
||||
|
||||
add_pre_process_metadata(model)
|
||||
|
||||
if save_as_external_data:
|
||||
onnx.save_model(
|
||||
model,
|
||||
output_model_path,
|
||||
save_as_external_data=True,
|
||||
all_tensors_to_one_file=all_tensors_to_one_file,
|
||||
location=external_data_location,
|
||||
size_threshold=external_data_size_threshold,
|
||||
convert_attribute=False,
|
||||
)
|
||||
else:
|
||||
onnx.save(model, output_model_path)
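# Illustrative usage sketch (not part of the original module; file names are
# placeholders): the typical pre-processing call made before static quantization,
# running symbolic shape inference, ORT graph optimization, and ONNX shape inference
# with the defaults documented above.
if __name__ == "__main__":
    quant_pre_process(
        input_model="model.onnx",
        output_model_path="model.preprocessed.onnx",
    )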
|
||||
@@ -0,0 +1,256 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import onnx
|
||||
|
||||
import onnxruntime
|
||||
from onnxruntime.quantization import QuantFormat, QuantType, StaticQuantConfig, quantize
|
||||
from onnxruntime.quantization.calibrate import CalibrationDataReader, CalibrationMethod
|
||||
|
||||
|
||||
class OnnxModelCalibrationDataReader(CalibrationDataReader):
|
||||
def __init__(self, model_path):
|
||||
self.model_dir = os.path.dirname(model_path)
|
||||
data_dirs = [
|
||||
os.path.join(self.model_dir, a) for a in os.listdir(self.model_dir) if a.startswith("test_data_set_")
|
||||
]
|
||||
model_inputs = onnxruntime.InferenceSession(model_path).get_inputs()
|
||||
name2tensors = []
|
||||
for data_dir in data_dirs:
|
||||
name2tensor = {}
|
||||
data_paths = [os.path.join(data_dir, a) for a in sorted(os.listdir(data_dir))]
|
||||
data_ndarrays = [self.read_onnx_pb_data(data_path) for data_path in data_paths]
|
||||
for model_input, data_ndarray in zip(model_inputs, data_ndarrays, strict=False):
|
||||
name2tensor[model_input.name] = data_ndarray
|
||||
name2tensors.append(name2tensor)
|
||||
assert len(name2tensors) == len(data_dirs)
|
||||
assert len(name2tensors[0]) == len(model_inputs)
|
||||
|
||||
self.calibration_data = iter(name2tensors)
|
||||
|
||||
def get_next(self) -> dict:
|
||||
"""generate the input data dict for ONNXinferenceSession run"""
|
||||
return next(self.calibration_data, None)
|
||||
|
||||
def read_onnx_pb_data(self, file_pb):
|
||||
tensor = onnx.TensorProto()
|
||||
with open(file_pb, "rb") as f:
|
||||
tensor.ParseFromString(f.read())
|
||||
ret = onnx.numpy_helper.to_array(tensor)
|
||||
return ret
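# Expected on-disk layout consumed by OnnxModelCalibrationDataReader above (names are
# illustrative; each .pb file holds one serialized onnx.TensorProto, and files are
# matched to the model inputs in sorted order):
#
#   <model_dir>/
#       model.onnx
#       test_data_set_0/
#           input_0.pb
#           input_1.pb
#       test_data_set_1/
#           ...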
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(description="The arguments for static quantization")
|
||||
parser.add_argument("-i", "--input_model_path", required=True, help="Path to the input onnx model")
|
||||
parser.add_argument(
|
||||
"-o", "--output_quantized_model_path", required=True, help="Path to the output quantized onnx model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--activation_type",
|
||||
choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
|
||||
default="quint8",
|
||||
help="Activation quantization type used",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--weight_type",
|
||||
choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
|
||||
default="qint8",
|
||||
help="Weight quantization type used",
|
||||
)
|
||||
parser.add_argument("--enable_subgraph", action="store_true", help="If set, subgraph will be quantized.")
|
||||
parser.add_argument(
|
||||
"--force_quantize_no_input_check",
|
||||
action="store_true",
|
||||
help="By default, some latent operators like maxpool, transpose, do not quantize if their input is not"
|
||||
" quantized already. Setting to True to force such operator always quantize input and so generate"
|
||||
" quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--matmul_const_b_only",
|
||||
action="store_true",
|
||||
help="If set, only MatMul with const B will be quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--add_qdq_pair_to_weight",
|
||||
action="store_true",
|
||||
help="If set, it remains floating-point weight and inserts both QuantizeLinear/DeQuantizeLinear"
|
||||
" nodes to weight.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dedicated_qdq_pair",
|
||||
action="store_true",
|
||||
help="If set, it will create identical and dedicated QDQ pair for each node.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--op_types_to_exclude_output_quantization",
|
||||
nargs="+",
|
||||
default=[],
|
||||
help="If any op type is specified, it won't quantize the output of ops with this specific op types.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--calibration_method",
|
||||
default="minmax",
|
||||
choices=["minmax", "entropy", "percentile", "distribution"],
|
||||
help="Calibration method used",
|
||||
)
|
||||
parser.add_argument("--quant_format", default="qdq", choices=["qdq", "qoperator"], help="Quantization format used")
|
||||
parser.add_argument(
|
||||
"--calib_tensor_range_symmetric",
|
||||
action="store_true",
|
||||
help="If enabled, the final range of tensor during calibration will be explicitly"
|
||||
" set to symmetric to central point 0",
|
||||
)
|
||||
# TODO: --calib_strided_minmax"
|
||||
# TODO: --calib_moving_average_constant"
|
||||
# TODO: --calib_max_intermediate_outputs"
|
||||
parser.add_argument(
|
||||
"--calib_moving_average",
|
||||
action="store_true",
|
||||
help="If enabled, the moving average of"
|
||||
" the minimum and maximum values will be computed when the calibration method selected is MinMax.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable_quantize_bias",
|
||||
action="store_true",
|
||||
help="Whether to quantize floating-point biases by solely inserting a DeQuantizeLinear node"
|
||||
" If not set, it remains floating-point bias and does not insert any quantization nodes"
|
||||
" associated with biases.",
|
||||
)
|
||||
|
||||
# TODO: Add arguments related to Smooth Quant
|
||||
|
||||
parser.add_argument(
|
||||
"--use_qdq_contrib_ops",
|
||||
action="store_true",
|
||||
help="If set, the inserted QuantizeLinear and DequantizeLinear ops will have the com.microsoft domain,"
|
||||
" which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear contrib op implementations.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--minimum_real_range",
|
||||
type=float,
|
||||
default=0.0001,
|
||||
help="If set to a floating-point value, the calculation of the quantization parameters"
|
||||
" (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)"
|
||||
" is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is"
|
||||
" necessary for EPs like QNN that require a minimum floating-point range when determining "
|
||||
" quantization parameters.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--qdq_keep_removable_activations",
|
||||
action="store_true",
|
||||
help="If set, removable activations (e.g., Clip or Relu) will not be removed,"
|
||||
" and will be explicitly represented in the QDQ model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--qdq_disable_weight_adjust_for_int32_bias",
|
||||
action="store_true",
|
||||
help="If set, QDQ quantizer will not adjust the weight's scale when the bias"
|
||||
" has a scale (input_scale * weight_scale) that is too small.",
|
||||
)
|
||||
parser.add_argument("--per_channel", action="store_true", help="Whether using per-channel quantization")
|
||||
parser.add_argument(
|
||||
"--nodes_to_quantize",
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="List of nodes names to quantize. When this list is not None only the nodes in this list are quantized.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--nodes_to_exclude",
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="List of nodes names to exclude. The nodes in this list will be excluded from quantization when it is not None.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--op_per_channel_axis",
|
||||
nargs=2,
|
||||
action="append",
|
||||
metavar=("OP_TYPE", "PER_CHANNEL_AXIS"),
|
||||
default=[],
|
||||
help="Set channel axis for specific op type, for example: --op_per_channel_axis MatMul 1, and it's"
|
||||
" effective only when per channel quantization is supported and per_channel is True. If specific"
|
||||
" op type supports per channel quantization but not explicitly specified with channel axis,"
|
||||
" default channel axis will be used.",
|
||||
)
|
||||
parser.add_argument("--tensor_quant_overrides", help="Set the json file for tensor quantization overrides.")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def get_tensor_quant_overrides(file):
|
||||
# TODO: Enhance the function to handle more real cases of json file
|
||||
if not file:
|
||||
return {}
|
||||
with open(file) as f:
|
||||
quant_override_dict = json.load(f)
|
||||
for tensor in quant_override_dict:
|
||||
for enc_dict in quant_override_dict[tensor]:
|
||||
enc_dict["scale"] = np.array(enc_dict["scale"], dtype=np.float32)
|
||||
enc_dict["zero_point"] = np.array(enc_dict["zero_point"])
|
||||
return quant_override_dict
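# Illustrative override file content accepted by get_tensor_quant_overrides above
# (the tensor name and values are placeholders): a JSON object mapping each tensor
# name to a list of override dicts; 'scale' is converted to a float32 ndarray and
# 'zero_point' to an integer ndarray.
#
#   {
#       "conv1_weight": [
#           {"scale": 0.0123, "zero_point": 0}
#       ]
#   }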
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_arguments()
|
||||
data_reader = OnnxModelCalibrationDataReader(model_path=args.input_model_path)
|
||||
arg2quant_type = {
|
||||
"qint8": QuantType.QInt8,
|
||||
"quint8": QuantType.QUInt8,
|
||||
"qint16": QuantType.QInt16,
|
||||
"quint16": QuantType.QUInt16,
|
||||
"qint4": QuantType.QInt4,
|
||||
"quint4": QuantType.QUInt4,
|
||||
"qfloat8e4m3fn": QuantType.QFLOAT8E4M3FN,
|
||||
}
|
||||
activation_type = arg2quant_type[args.activation_type]
|
||||
weight_type = arg2quant_type[args.weight_type]
|
||||
qdq_op_type_per_channel_support_to_axis = dict(args.op_per_channel_axis)
|
||||
extra_options = {
|
||||
"EnableSubgraph": args.enable_subgraph,
|
||||
"ForceQuantizeNoInputCheck": args.force_quantize_no_input_check,
|
||||
"MatMulConstBOnly": args.matmul_const_b_only,
|
||||
"AddQDQPairToWeight": args.add_qdq_pair_to_weight,
|
||||
"OpTypesToExcludeOutputQuantization": args.op_types_to_exclude_output_quantization,
|
||||
"DedicatedQDQPair": args.dedicated_qdq_pair,
|
||||
"QDQOpTypePerChannelSupportToAxis": qdq_op_type_per_channel_support_to_axis,
|
||||
"CalibTensorRangeSymmetric": args.calib_tensor_range_symmetric,
|
||||
"CalibMovingAverage": args.calib_moving_average,
|
||||
"QuantizeBias": not args.disable_quantize_bias,
|
||||
"UseQDQContribOps": args.use_qdq_contrib_ops,
|
||||
"MinimumRealRange": args.minimum_real_range,
|
||||
"QDQKeepRemovableActivations": args.qdq_keep_removable_activations,
|
||||
"QDQDisableWeightAdjustForInt32Bias": args.qdq_disable_weight_adjust_for_int32_bias,
|
||||
# Load json file for encoding override
|
||||
"TensorQuantOverrides": get_tensor_quant_overrides(args.tensor_quant_overrides),
|
||||
}
|
||||
arg2calib_method = {
|
||||
"minmax": CalibrationMethod.MinMax,
|
||||
"entropy": CalibrationMethod.Entropy,
|
||||
"percentile": CalibrationMethod.Percentile,
|
||||
"distribution": CalibrationMethod.Distribution,
|
||||
}
|
||||
arg2quant_format = {
|
||||
"qdq": QuantFormat.QDQ,
|
||||
"qoperator": QuantFormat.QOperator,
|
||||
}
|
||||
sqc = StaticQuantConfig(
|
||||
calibration_data_reader=data_reader,
|
||||
calibrate_method=arg2calib_method[args.calibration_method],
|
||||
quant_format=arg2quant_format[args.quant_format],
|
||||
activation_type=activation_type,
|
||||
weight_type=weight_type,
|
||||
op_types_to_quantize=None,
|
||||
nodes_to_quantize=args.nodes_to_quantize,
|
||||
nodes_to_exclude=args.nodes_to_exclude,
|
||||
per_channel=args.per_channel,
|
||||
reduce_range=False,
|
||||
use_external_data_format=False,
|
||||
calibration_providers=None, # Use CPUExecutionProvider
|
||||
extra_options=extra_options,
|
||||
)
|
||||
quantize(model_input=args.input_model_path, model_output=args.output_quantized_model_path, quant_config=sqc)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
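# Illustrative invocation of this runner (the script and model paths are placeholders;
# the model's directory must contain test_data_set_* folders for calibration, as
# described above):
#
#   python static_quantize_runner.py -i model.onnx -o model.quant.onnx \
#       --quant_format qdq --activation_type quint8 --weight_type qint8 --per_channel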
|
||||
@@ -0,0 +1,520 @@
|
||||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections.abc import MutableMapping
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import onnx
|
||||
|
||||
from .quant_utils import QuantType
|
||||
|
||||
|
||||
@dataclass
|
||||
class QuantTypeInfo: # noqa: PLW1641
|
||||
"""
|
||||
The quantization type information for a tensor override.
|
||||
"""
|
||||
|
||||
quant_type: QuantType
|
||||
symmetric: bool | None = None # If None, assumes default is used.
|
||||
reduce_range: bool | None = None # If None, assumes default is used.
|
||||
axis: int | None = None # If None, assumes per-tensor quantization
|
||||
|
||||
def __eq__(self, other: object):
|
||||
if isinstance(other, QuantTypeInfo):
|
||||
return (
|
||||
self.quant_type == other.quant_type
|
||||
and (self.symmetric is None or other.symmetric is None or self.symmetric == other.symmetric)
|
||||
and (self.reduce_range is None or other.reduce_range is None or self.reduce_range == other.reduce_range)
|
||||
and (self.axis == other.axis)
|
||||
)
|
||||
return NotImplemented
|
||||
|
||||
@staticmethod
|
||||
def load_from_dict(
|
||||
raw_dict: dict[str, Any],
|
||||
default_qtype: QuantType | None = None,
|
||||
default_symmetric: bool | None = None,
|
||||
default_reduce_range: bool | None = None,
|
||||
) -> QuantTypeInfo:
|
||||
return QuantTypeInfo(
|
||||
raw_dict.get("quant_type", default_qtype),
|
||||
raw_dict.get("symmetric", default_symmetric),
|
||||
raw_dict.get("reduce_range", default_reduce_range),
|
||||
raw_dict.get("axis"),
|
||||
)
|
||||
|
||||
def save_to_dict(self, raw_dict: dict[str, Any]):
|
||||
raw_dict["quant_type"] = self.quant_type
|
||||
if self.symmetric is not None:
|
||||
raw_dict["symmetric"] = self.symmetric
|
||||
if self.reduce_range is not None:
|
||||
raw_dict["reduce_range"] = self.reduce_range
|
||||
if self.axis is not None:
|
||||
raw_dict["axis"] = self.axis
|
||||
|
||||
|
||||
class TensorQuantOverridesHelper(MutableMapping):
|
||||
"""
|
||||
Utility wrapper over the tensor quantization overrides passed via extra_options.
|
||||
"""
|
||||
|
||||
def __init__(self, raw_overrides: dict[str, list[dict[str, Any]]]):
|
||||
self.overrides = raw_overrides
|
||||
self.quant_types = None
|
||||
self.keys_unsupported_with_scale_zp = {"symmetric", "reduce_range", "rmax", "rmin"}
|
||||
|
||||
def has_per_tensor_overrides(self, tensor_name: str) -> bool:
|
||||
overrides_list = self.overrides.get(tensor_name)
|
||||
return overrides_list and "axis" not in overrides_list[0]
|
||||
|
||||
def has_per_channel_overrides(self, tensor_name: str) -> bool:
|
||||
overrides_list = self.overrides.get(tensor_name)
|
||||
return overrides_list and "axis" in overrides_list[0]
|
||||
|
||||
def overrides_scale_zp(self, tensor_name: str) -> bool:
|
||||
overrides_list = self.overrides.get(tensor_name)
|
||||
return overrides_list and ("scale" in overrides_list[0]) and ("zero_point" in overrides_list[0])
|
||||
|
||||
def get_per_tensor_overrides(
|
||||
self,
|
||||
tensor_name: str,
|
||||
default_val: dict[str, Any] | None = None,
|
||||
) -> dict[str, Any] | None:
|
||||
default_list_val = [default_val] if default_val is not None else None
|
||||
overrides_list = self.overrides.get(tensor_name, default_list_val)
|
||||
if overrides_list and "axis" in overrides_list[0]:
|
||||
raise ValueError(
|
||||
f"Expected tensor '{tensor_name}' to use per-tensor quantization overrides, "
|
||||
f"but found per-channel overrides."
|
||||
)
|
||||
|
||||
return overrides_list[0] if overrides_list else None
|
||||
|
||||
def get_per_channel_overrides(
|
||||
self,
|
||||
tensor_name: str,
|
||||
default_val: list[dict[str, Any]] | None = None,
|
||||
) -> list[dict[str, Any]] | None:
|
||||
overrides_list = self.overrides.get(tensor_name, default_val)
|
||||
|
||||
if not overrides_list:
|
||||
return None
|
||||
|
||||
if "axis" not in overrides_list[0]:
|
||||
raise ValueError(
|
||||
f"Expected tensor '{tensor_name}' to have per-channel quantization overrides (axis value is missing).",
|
||||
)
|
||||
|
||||
return overrides_list
|
||||
|
||||
def get_quant_types(self) -> set[QuantType]:
|
||||
if self.quant_types is not None:
|
||||
return self.quant_types
|
||||
|
||||
self.quant_types = set()
|
||||
|
||||
if self.overrides:
|
||||
for quant_overrides_list in self.overrides.values():
|
||||
for quant_overrides in quant_overrides_list:
|
||||
if "quant_type" in quant_overrides:
|
||||
self.quant_types.add(quant_overrides["quant_type"])
|
||||
|
||||
if "convert" in quant_overrides and "quant_type" in quant_overrides["convert"]:
|
||||
self.quant_types.add(quant_overrides["convert"]["quant_type"])
|
||||
|
||||
return self.quant_types
|
||||
|
||||
def _is_valid_per_tensor(
|
||||
self,
|
||||
initializers,
|
||||
default_activation_qtype,
|
||||
tensor_name: str,
|
||||
quant_overrides: dict[str, Any],
|
||||
) -> tuple[bool, str | None]:
|
||||
if not isinstance(quant_overrides, dict):
|
||||
return (
|
||||
False,
|
||||
f"Tensor quantization overrides for '{tensor_name}' are not in a dict",
|
||||
)
|
||||
|
||||
is_initializer = tensor_name in initializers
|
||||
|
||||
quant_type = quant_overrides.get("quant_type")
|
||||
if quant_type:
|
||||
self.quant_types.add(quant_type)
|
||||
|
||||
has_scale = "scale" in quant_overrides
|
||||
has_zero_point = "zero_point" in quant_overrides
|
||||
|
||||
if (has_scale and not has_zero_point) or (has_zero_point and not has_scale):
|
||||
return (
|
||||
False,
|
||||
"Must provide both 'scale' and 'zero_point' if one of the overrides is provided",
|
||||
)
|
||||
|
||||
if has_scale:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
|
||||
)
|
||||
|
||||
if "reduce_range" in quant_overrides and not is_initializer:
|
||||
return (
|
||||
False,
|
||||
f"Option 'reduce_range' is only supported for initializers, not for activation {tensor_name}",
|
||||
)
|
||||
|
||||
if "convert" in quant_overrides:
|
||||
if is_initializer:
|
||||
return False, "Cannot use 'convert' override for initializers"
|
||||
|
||||
if "quant_type" not in quant_overrides["convert"]:
|
||||
return False, f"'convert' options (tensor '{tensor_name}') must specify a 'quant_type'"
|
||||
|
||||
if "reduce_range" in quant_overrides["convert"]:
|
||||
return (
|
||||
False,
|
||||
f"Option 'reduce_range' is only supported for initializers, not for activation {tensor_name}",
|
||||
)
|
||||
|
||||
convert_quant_type = quant_overrides["convert"]["quant_type"]
|
||||
original_quant_type = quant_type if quant_type is not None else default_activation_qtype
|
||||
if convert_quant_type == original_quant_type:
|
||||
return (
|
||||
False,
|
||||
f"'convert' quant_type must differ from original quant_type (tensor '{tensor_name}')",
|
||||
)
|
||||
|
||||
convert_has_scale = "scale" in quant_overrides["convert"]
|
||||
convert_has_zero_point = "zero_point" in quant_overrides["convert"]
|
||||
|
||||
if (convert_has_scale and not convert_has_zero_point) or (convert_has_zero_point and not convert_has_scale):
|
||||
return (
|
||||
False,
|
||||
f"Must provide both 'scale' and 'zero_point' if one of the overrides is provided (tensor '{tensor_name}')",
|
||||
)
|
||||
|
||||
if convert_has_scale:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides["convert"]))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point' "
|
||||
f"(tensor '{tensor_name}')",
|
||||
)
|
||||
|
||||
self.quant_types.add(convert_quant_type)
|
||||
|
||||
return True, None
|
||||
|
||||
def _is_valid_per_channel(
|
||||
self,
|
||||
initializers,
|
||||
tensor_name: str,
|
||||
quant_overrides_list: list[dict[str, Any]],
|
||||
) -> tuple[bool, str | None]:
|
||||
is_initializer = tensor_name in initializers
|
||||
|
||||
if not is_initializer:
|
||||
return (
|
||||
False,
|
||||
f"Tensor '{tensor_name}' has per-channel overrides, but is not an initializer",
|
||||
)
|
||||
|
||||
axis = quant_overrides_list[0].get("axis")
|
||||
|
||||
if axis is None:
|
||||
return (
|
||||
False,
|
||||
f"Per-channel overrides for tensor {tensor_name} is missing an 'axis' value in "
|
||||
"the first channel dictionary.",
|
||||
)
|
||||
|
||||
weight_shape = list(initializers[tensor_name].dims)
|
||||
weight_rank = len(weight_shape)
|
||||
norm_axis = axis
|
||||
if norm_axis < 0:
|
||||
norm_axis += weight_rank
|
||||
|
||||
if norm_axis < 0 or norm_axis >= len(weight_shape):
|
||||
return (
|
||||
False,
|
||||
f"Axis override value is out-of-bounds for tensor {tensor_name} (rank {len(weight_shape)})",
|
||||
)
|
||||
|
||||
if len(quant_overrides_list) > 1 and len(quant_overrides_list) != weight_shape[norm_axis]:
|
||||
return (
|
||||
False,
|
||||
f"Incorrect number of channel overrides for tensor {tensor_name} (axis {axis}), "
|
||||
f"expected {weight_shape[axis]}, but found {len(quant_overrides_list)}.",
|
||||
)
|
||||
|
||||
if "convert" in quant_overrides_list[0]:
|
||||
return False, f"Cannot use 'convert' override for initializers, such as {tensor_name}."
|
||||
|
||||
quant_type = quant_overrides_list[0].get("quant_type")
|
||||
if quant_type:
|
||||
self.quant_types.add(quant_type)
|
||||
|
||||
symmetric = quant_overrides_list[0].get("symmetric")
|
||||
reduce_range = quant_overrides_list[0].get("reduce_range")
|
||||
|
||||
has_scale = "scale" in quant_overrides_list[0]
|
||||
has_zero_point = "zero_point" in quant_overrides_list[0]
|
||||
has_scale_zp = has_scale and has_zero_point
|
||||
|
||||
if (has_scale and not has_zero_point) or (has_zero_point and not has_scale):
|
||||
return (
|
||||
False,
|
||||
"Must provide both 'scale' and 'zero_point' if one of the overrides is provided",
|
||||
)
|
||||
|
||||
if has_scale_zp:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides_list[0]))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
|
||||
)
|
||||
|
||||
has_rmin = "rmin" in quant_overrides_list[0]
|
||||
has_rmax = "rmax" in quant_overrides_list[0]
|
||||
has_rmin_rmax = has_rmin and has_rmax
|
||||
if (has_rmin and not has_rmax) or (not has_rmin and has_rmax):
|
||||
return (
|
||||
False,
|
||||
"Must provide both 'rmin' and 'rmax' if one is provided",
|
||||
)
|
||||
|
||||
for index, quant_overrides in enumerate(quant_overrides_list[1:]):
|
||||
if not isinstance(quant_overrides, dict):
|
||||
return (
|
||||
False,
|
||||
f"Tensor quantization overrides at index {index} for '{tensor_name}' are not in a dict",
|
||||
)
|
||||
|
||||
if "convert" in quant_overrides:
|
||||
return False, f"Cannot use 'convert' override for initializers, such as {tensor_name}."
|
||||
|
||||
# For per-channel quantization, all channels must use the same quantization type, axis, symmetric
|
||||
# and reduce_range values. And, if specified, they must be present in the first channel dict
|
||||
# (i.e., quant_overrides_list[0]).
|
||||
if "quant_type" in quant_overrides and quant_type != quant_overrides["quant_type"]:
|
||||
return (
|
||||
False,
|
||||
"Channel quantization types for tensor '{tensor_name}' do not match at index {index}.",
|
||||
)
|
||||
if "axis" in quant_overrides and axis != quant_overrides["axis"] and norm_axis != quant_overrides["axis"]:
|
||||
return (
|
||||
False,
|
||||
"Channel axis for tensor '{tensor_name}' does not match at index {index}.",
|
||||
)
|
||||
if "symmetric" in quant_overrides and symmetric != quant_overrides["symmetric"]:
|
||||
return (
|
||||
False,
|
||||
"Channel symmetric value for tensor '{tensor_name}' does not match at index {index}.",
|
||||
)
|
||||
if "reduce_range" in quant_overrides and reduce_range != quant_overrides["reduce_range"]:
|
||||
return (
|
||||
False,
|
||||
"Channel reduce_range value for tensor '{tensor_name}' does not match at index {index}.",
|
||||
)
|
||||
|
||||
# If override scale/zp, must do so for all channels.
|
||||
chan_has_scale_zp = "scale" in quant_overrides and "zero_point" in quant_overrides
|
||||
|
||||
if has_scale_zp and not chan_has_scale_zp:
|
||||
return (
|
||||
False,
|
||||
"Per-channel overrides that specify scale/zero_point must do so for all channels, "
|
||||
f"but tensor '{tensor_name}' is missing them at index {index}.",
|
||||
)
|
||||
|
||||
if chan_has_scale_zp:
|
||||
keys = self.keys_unsupported_with_scale_zp.intersection(set(quant_overrides))
|
||||
if keys:
|
||||
return (
|
||||
False,
|
||||
f"Tensor override option(s) [{', '.join(keys)}] are invalid with 'scale' and 'zero_point'",
|
||||
)
|
||||
|
||||
# If override rmin/rmax, must do so for all channels.
|
||||
chan_has_rmin_rmax = "rmin" in quant_overrides and "rmax" in quant_overrides
|
||||
if has_rmin_rmax and not chan_has_rmin_rmax:
|
||||
return (
|
||||
False,
|
||||
"Per-channel overrides that specify rmin/rmax must do so for all channels, "
|
||||
f"but tensor '{tensor_name}' is missing them at index {index}.",
|
||||
)
|
||||
|
||||
return True, None
|
||||
|
||||
def is_valid(
|
||||
self,
|
||||
initializers: dict[str, onnx.TensorProto],
|
||||
activation_names: set[str],
|
||||
default_activation_qtype,
|
||||
) -> tuple[bool, str | None]:
|
||||
self.quant_types = set()
|
||||
|
||||
# Validate that compatible/valid overrides are provided.
|
||||
if self.overrides:
|
||||
for tensor_name, quant_overrides_list in self.overrides.items():
|
||||
if tensor_name not in initializers and tensor_name not in activation_names:
|
||||
return False, f"Tensor '{tensor_name}' in TensorQuantOverrides is not present in the model"
|
||||
|
||||
if not isinstance(quant_overrides_list, list):
|
||||
return False, f"Tensor quantization overrides for '{tensor_name}' are not in a list"
|
||||
|
||||
if not quant_overrides_list:
|
||||
continue
|
||||
|
||||
if not isinstance(quant_overrides_list[0], dict):
|
||||
return False, f"Tensor quantization overrides at index 0 for '{tensor_name}' are not in a dict"
|
||||
|
||||
if not quant_overrides_list[0]:
|
||||
continue
|
||||
|
||||
axis = quant_overrides_list[0].get("axis")
|
||||
is_per_channel = len(quant_overrides_list) > 1 or axis is not None
|
||||
|
||||
if is_per_channel:
|
||||
return self._is_valid_per_channel(initializers, tensor_name, quant_overrides_list)
|
||||
|
||||
return self._is_valid_per_tensor(
|
||||
initializers, default_activation_qtype, tensor_name, quant_overrides_list[0]
|
||||
)
|
||||
|
||||
return True, None
|
||||
|
||||
def update_tensor_overrides(
|
||||
self,
|
||||
tensor_name: str,
|
||||
new_vals: dict[str, Any],
|
||||
channels: list[int] | None = None,
|
||||
overwrite: bool = True,
|
||||
) -> bool:
|
||||
if not new_vals:
|
||||
return False
|
||||
|
||||
channels = set(channels) if channels is not None else None
|
||||
have_overrides = self.overrides.get(tensor_name)
|
||||
|
||||
# If `overwrite` is False, check if we would overwrite anything.
|
||||
do_update = True
|
||||
if not overwrite and have_overrides:
|
||||
for channel, overrides in enumerate(self.overrides[tensor_name]):
|
||||
if channels is not None and channel not in channels:
|
||||
continue
|
||||
if set(new_vals).intersection(set(overrides)):
|
||||
do_update = False
|
||||
break
|
||||
|
||||
# Do the update if `overwrite` is True or if nothing is overwritten (do not want partial overwrites).
|
||||
if do_update:
|
||||
if not have_overrides:
|
||||
self.overrides[tensor_name] = [{}]
|
||||
|
||||
for channel, overrides in enumerate(self.overrides[tensor_name]):
|
||||
if channels is not None and channel not in channels:
|
||||
continue
|
||||
overrides.update(new_vals)
|
||||
|
||||
return do_update
|
||||
|
||||
def get_node_output_qtype_info(
|
||||
self,
|
||||
output_name: str,
|
||||
default_qtype: QuantType | None,
|
||||
default_symmetric: bool | None = None,
|
||||
) -> QuantTypeInfo:
|
||||
# Outputs are activations, which do not support 'reduce_range' or 'axis'
|
||||
if output_name not in self.overrides:
|
||||
return QuantTypeInfo(default_qtype, default_symmetric)
|
||||
|
||||
tensor_overrides = self.overrides[output_name][0]
|
||||
|
||||
return QuantTypeInfo(
|
||||
tensor_overrides.get("quant_type", default_qtype),
|
||||
tensor_overrides.get("symmetric", default_symmetric),
|
||||
)
|
||||
|
||||
def get_node_input_qtype_info(
|
||||
self,
|
||||
input_name: str,
|
||||
node_name: str,
|
||||
default_qtype: QuantType | None,
|
||||
default_symmetric: bool | None = None,
|
||||
default_reduce_range: bool | None = None,
|
||||
) -> QuantTypeInfo:
|
||||
if input_name not in self.overrides or not self.overrides[input_name]:
|
||||
return QuantTypeInfo(default_qtype, default_symmetric, default_reduce_range)
|
||||
|
||||
# Get the first overrides dict in the list. This works for both per-tensor and per-channel
|
||||
# quantization because all channels must use the same quant type.
|
||||
tensor_overrides = self.overrides[input_name][0]
|
||||
producer_type = tensor_overrides.get("quant_type", default_qtype)
|
||||
|
||||
if "convert" not in tensor_overrides:
|
||||
return QuantTypeInfo(
|
||||
producer_type,
|
||||
tensor_overrides.get("symmetric", default_symmetric),
|
||||
tensor_overrides.get("reduce_range", default_reduce_range),
|
||||
tensor_overrides.get("axis"),
|
||||
)
|
||||
|
||||
# This tensor is converted. Check if the node gets the original qtype or the converted qtype.
|
||||
convert_dict = tensor_overrides["convert"]
|
||||
qtype_info = QuantTypeInfo(
|
||||
producer_type,
|
||||
convert_dict.get("symmetric", default_symmetric),
|
||||
# Converted tensors are not initializers, so do not have 'axis' or 'reduce_range'.
|
||||
)
|
||||
|
||||
# Check if all nodes receive the converted type (i.e., recv_nodes is None) or this node
|
||||
# is in the list of consumers (recv_nodes).
|
||||
if ("recv_nodes" not in convert_dict) or (node_name in convert_dict["recv_nodes"]):
|
||||
qtype_info.quant_type = convert_dict["quant_type"]
|
||||
|
||||
return qtype_info
|
||||
|
||||
def pprint_str(self, indent=None) -> str:
|
||||
return json.dumps(self.overrides, default=str, indent=indent)
|
||||
|
||||
def empty(self) -> bool:
|
||||
return not self.overrides
|
||||
|
||||
def get_dict(self) -> dict[str, list[dict[str, Any]]]:
|
||||
return self.overrides
|
||||
|
||||
# Required implementations of abstract methods in collections.abc.MutableMapping
|
||||
# so that this class can be used like a dict.
|
||||
def __setitem__(self, key: str, value: list[dict]):
|
||||
self.overrides[key] = value
|
||||
|
||||
def __getitem__(self, key: str) -> list[dict]:
|
||||
return self.overrides[key]
|
||||
|
||||
def __delitem__(self, key: str):
|
||||
del self.overrides[key]
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.overrides)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.overrides)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return str(self.overrides)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"{super().__repr__()}, TensorQuantOverridesHelper({self.overrides})"