diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
index 7c18b5f35..572e1f010 100644
--- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
+++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
@@ -33,7 +33,7 @@
 from torch.nn import functional
 from torch.utils import data
 
-from onnx_neural_compressor import data_reader, logger, utility
+from onnx_neural_compressor import data_reader
 from onnx_neural_compressor.quantization import config, matmul_nbits_quantizer, tuning
 
 logging.basicConfig(
@@ -315,10 +315,6 @@ def rewind(self):
 
 
 if __name__ == "__main__":
-    utility.set_workspace(args.workspace)
-    if not os.path.exists(args.workspace):
-        os.mkdir(args.workspace)
-
     if args.benchmark:
         if args.mode == "performance":
             benchmark(args.model_path)
@@ -331,23 +327,11 @@ def rewind(self):
         model_name = "model.onnx"  # require optimum >= 1.14.0
         model_path = os.path.join(args.model_path, model_name)
 
-        # do graph optimization
-        logger.info("Start graph optimization...")
-        sess_options = ort.SessionOptions()
-        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
-        sess_options.optimized_model_filepath = os.path.join(args.workspace, "Optimized_model.onnx")
-        sess_options.add_session_config_entry(
-            "session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data"
-        )
-        sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024")
-        sess = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"])
-        logger.info("Graph optimization done.")
-
         best_model = None
         if args.algorithm.upper() == "RTN":
             algo_config = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig()
             quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
-                sess_options.optimized_model_filepath,
+                model_path,
                 n_bits=4,
                 block_size=32,
                 is_symmetric=True,
@@ -362,7 +346,7 @@ def rewind(self):
                 calibration_data_reader=calibration_data_reader, enable_mse_search=False
             )
             quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
-                sess_options.optimized_model_filepath,
+                model_path,
                 n_bits=4,
                 block_size=32,
                 is_symmetric=True,
@@ -377,7 +361,7 @@ def rewind(self):
                 calibration_data_reader=calibration_data_reader,
             )
             quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
-                sess_options.optimized_model_filepath,
+                model_path,
                 n_bits=4,
                 block_size=32,
                 is_symmetric=False,
diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py
index fc004cc05..095897b49 100644
--- a/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py
+++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrate.py
@@ -278,7 +278,7 @@ def _collect_data(inputs):
                     node_name = name_to_node[node_output_names[output_idx]]
                     if node_output_names[output_idx] not in name_to_calibrator:
                         calib_method = (
-                            q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else 0
+                            q_config[node_name]["calibrate_method"] if q_config and node_name in q_config else "MinMax"
                         )
                         assert calib_method in calibrator.CALIBRATOR, "Calibration method {} is not registered.".format(
                             calib_method
@@ -389,7 +389,7 @@ def get_weight_tensors_calib_range(self):
                     os.path.dirname(self.model_wrapper.model_path) if self.model_wrapper.model_path is not None else ""
                 ),
             )
-        _calibrator = calibrator.CALIBRATOR[0]()  # use minmax method to calibrate initializer tensors
+        _calibrator = calibrator.CALIBRATOR["MinMax"]()  # use minmax method to calibrate initializer tensors
         if initializer_tensor.flatten().size > 0:
             _calibrator.collect(initializer_tensor)
             weight_tensors_calib_range[initializer_tensor_name] = [list(_calibrator.calib_range)]
diff --git a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py
index 97506b0d2..abef2d323 100644
--- a/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py
+++ b/onnx_neural_compressor/algorithms/post_training_quant/calibrator.py
@@ -69,7 +69,7 @@ def calib_range(self):
         return self._calib_min, self._calib_max
 
 
-@calib_registry(calib_method=0)
+@calib_registry(calib_method="MinMax")
 class MinMaxCalibrator(CalibratorBase):
     """MinMax calibrator class."""
 
@@ -109,7 +109,7 @@ def method_name(self):
         return "MinMax"
 
 
-@calib_registry(calib_method=2)
+@calib_registry(calib_method="Percentile")
 class PercentileCalibrator(CalibratorBase):
     """Percentile calibrator class.
 
@@ -163,7 +163,7 @@ def method_name(self):
         return "Percentile"
 
 
-@calib_registry(calib_method=1)
+@calib_registry(calib_method="Entropy")
 class EntropyCalibrator(CalibratorBase):
     """Entropy calibrator class.
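Note on the calibrator changes above: the registry is now keyed by method-name strings ("MinMax", "Entropy", "Percentile") instead of bare integers, so config values, registry keys, and the strings in error messages all line up. A minimal sketch of the pattern (the registry, decorator, and calibrator below are simplified stand-ins for illustration, not the real calibrator.py code):

    import numpy as np

    CALIBRATOR = {}

    def calib_registry(calib_method):
        def decorator(cls):
            CALIBRATOR[calib_method] = cls  # key by name, not by magic integer
            return cls
        return decorator

    @calib_registry(calib_method="MinMax")
    class MinMaxCalibrator:
        def __init__(self):
            self._calib_min = None
            self._calib_max = None

        def collect(self, datas):
            # running min/max over every tensor seen so far
            lo, hi = float(np.min(datas)), float(np.max(datas))
            self._calib_min = lo if self._calib_min is None else min(self._calib_min, lo)
            self._calib_max = hi if self._calib_max is None else max(self._calib_max, hi)

        @property
        def calib_range(self):
            return self._calib_min, self._calib_max

    # the lookup now reads as a name, matching q_config["calibrate_method"]
    _calibrator = CALIBRATOR["MinMax"]()
    _calibrator.collect(np.array([-1.5, 0.0, 3.2]))
    print(_calibrator.calib_range)  # (-1.5, 3.2)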
diff --git a/onnx_neural_compressor/algorithms/utility.py b/onnx_neural_compressor/algorithms/utility.py
index c326b0775..d802dc04d 100644
--- a/onnx_neural_compressor/algorithms/utility.py
+++ b/onnx_neural_compressor/algorithms/utility.py
@@ -147,7 +147,7 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False, sym=False):  # noqa: N802
 
 def quantize_nparray(dtype, arr, scale, zero_point, low=None, high=None):
     """Quantize numpy array."""
-    q_weight = np.empty_like(np.asarray(arr), dtype=scale.dtype)
+    q_weight = np.empty_like(np.asarray(arr), dtype=np.asarray(scale).dtype)
     np.divide(arr, scale, out=q_weight)
     np.add(q_weight, zero_point, out=q_weight)
     np.round(q_weight, out=q_weight)
@@ -340,9 +340,8 @@ def make_matmul_weight_only_node(
         op_type = "MatMulNBits"
 
         # pack quantized weight
-        for i in range(q_weight.shape[0]):
-            for k in range(0, group_size, 2):
-                packed[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4
+        q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4
+        packed[:, :] = q_weight_pairs[:, :blob_size]
         packed = np.reshape(packed, (-1, k_blocks, blob_size))
 
         # build scale tensor
@@ -363,15 +362,14 @@ def make_matmul_weight_only_node(
                 packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8")
             else:
                 packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
-                for i in range(zero_point.shape[0] // k_blocks):
-                    for j in range(k_blocks):
-                        idx = i * k_blocks + j
-                        zp = zero_point[idx]
-                        packed_zp[idx // 2] = (
-                            ((packed_zp[idx // 2] & 0x0F) | (zp << 4))
-                            if (idx & 1)
-                            else ((packed_zp[idx // 2] & 0xF0) | zp)
-                        )
+                # create an index array
+                idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
+                # separate odd and even indices
+                even_idx = idx[::2]
+                odd_idx = idx[1::2]
+                # vectorized operation for even and odd indices
+                packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
+                packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)
 
         zp_tensor = onnx.helper.make_tensor(
             name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
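For context on the utility.py rewrite above: the per-element Python loops are replaced by NumPy slicing that packs two 4-bit values per byte, with even columns in the low nibble and odd columns in the high nibble. A standalone check of the equivalence; the shapes and names here (rows, group_size, blob_size) are illustrative assumptions of this sketch:

    import numpy as np

    rng = np.random.default_rng(0)
    rows, group_size = 8, 32
    blob_size = group_size // 2
    # 4-bit quantized weights, one value per element in [0, 16)
    q_weight = rng.integers(0, 16, size=(rows, group_size), dtype=np.uint8)

    # old: element-wise double loop
    packed_loop = np.zeros((rows, blob_size), dtype=np.uint8)
    for i in range(rows):
        for k in range(0, group_size, 2):
            packed_loop[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4

    # new: one vectorized slice over the whole matrix
    packed_vec = q_weight[:, ::2] | q_weight[:, 1::2] << 4

    assert np.array_equal(packed_loop, packed_vec[:, :blob_size])

The zero-point packing below it follows the same idea: even/odd index arrays select which nibble of each output byte to fill, replacing the nested loop one byte at a time.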
diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py
index 36efa17fc..c1661f85e 100644
--- a/onnx_neural_compressor/onnx_model.py
+++ b/onnx_neural_compressor/onnx_model.py
@@ -35,14 +35,12 @@ def __init__(self, model, **kwargs):
             model (str or ModelProto): path to onnx model or loaded ModelProto model object.
         """
 
         self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False)
-        self._model_path = None if not isinstance(model, str) else model
 
         self.check_is_large_model()
         if self._is_large_model and self._model_path is None and not kwargs.get("ignore_warning", False):
             logger.warning("Model size > 2GB. Please use model path instead of onnx model object to quantize")
 
         if self._is_large_model and isinstance(model, str) and kwargs.get("load_external_data", True):
-            onnx.external_data_helper.load_external_data_for_model(self.model, os.path.dirname(self._model_path))
 
         self._config = None
diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py
index fa29a6a9c..bcb08d64c 100644
--- a/onnx_neural_compressor/quantization/algorithm_entry.py
+++ b/onnx_neural_compressor/quantization/algorithm_entry.py
@@ -206,6 +206,7 @@ def smooth_quant_entry(
             pathlib.Path(tmp_dir).joinpath("smooth.onnx").as_posix(),
             quant_config,
             calibration_data_reader,
+            model_output,
         )
 
     return q_model
diff --git a/onnx_neural_compressor/quantization/config.py b/onnx_neural_compressor/quantization/config.py
index d7392c399..6137c29fd 100644
--- a/onnx_neural_compressor/quantization/config.py
+++ b/onnx_neural_compressor/quantization/config.py
@@ -660,7 +660,7 @@ class OperatorConfig:
     def __post_init__(self):
         self.weight_type = getattr(self.weight_type, "tensor_type", self.weight_type)
         self.activation_type = getattr(self.activation_type, "tensor_type", self.activation_type)
-        self.calibrate_method = getattr(self.calibrate_method, "value", self.calibrate_method)
+        self.calibrate_method = getattr(self.calibrate_method, "name", self.calibrate_method)
 
     def __getitem__(self, key):
         return getattr(self, key)
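The config.py change above is the hinge for the string keys: OperatorConfig now normalizes an enum to its .name instead of its .value, and plain strings pass through untouched. A small illustration; the enum below is a stand-in modeled on CalibrationMethod, and its member values are assumptions of this sketch:

    import enum

    class CalibrationMethod(enum.Enum):
        MinMax = 0
        Entropy = 1
        Percentile = 2

    method = CalibrationMethod.MinMax
    print(getattr(method, "value", method))  # 0        -- old integer registry key
    print(getattr(method, "name", method))   # 'MinMax' -- new string registry key

    # a plain string has no .name attribute, so getattr returns it unchanged
    print(getattr("MinMax", "name", "MinMax"))  # 'MinMax'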
"session.optimized_model_external_initializers_file_name", "opt.onnx_data" + ) + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_min_size_in_bytes", "1024" + ) session = ort.InferenceSession(model, sess_options) model = sess_options.optimized_model_filepath del session + logger.info("Graph optimization done.") logger.info(f"start to quantize model with {self.algorithm} algorithm...") if self.algorithm == "RTN": diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index 9a1c7bda9..9fb3dfd41 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -36,6 +36,12 @@ def quantize( sess_options = ort.SessionOptions() sess_options.graph_optimization_level = optimization_level sess_options.optimized_model_filepath = pathlib.Path(tmp_dir).joinpath("opt.onnx").as_posix() + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "opt.onnx_data" + ) + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_min_size_in_bytes", "1024" + ) session = ort.InferenceSession(model_input, sess_options) del session model_input = sess_options.optimized_model_filepath diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index 100d8c3b3..4cc0030b9 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -496,6 +496,10 @@ def autotune( sess_options = ort.SessionOptions() sess_options.graph_optimization_level = optimization_level sess_options.optimized_model_filepath = pathlib.Path(tmp_folder.name).joinpath("opt.onnx").as_posix() + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", "opt.onnx_data" + ) + sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") session = ort.InferenceSession(model_input, sess_options) model_input = sess_options.optimized_model_filepath del session diff --git a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index 7e14d83d7..7988cd3f6 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -134,6 +134,7 @@ def test_rtn_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel = quant.model @@ -145,6 +146,7 @@ def test_rtn_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel_lwq = quant.model @@ -183,6 +185,7 @@ def test_gptq_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel = quant.model @@ -196,6 +199,7 @@ def test_gptq_layer_wise_with_ort_like_api(self): quant = matmul_4bits_quantizer.MatMul4BitsQuantizer( copy.deepcopy(self.llama), algo_config=algo_config, + optimization_level=ort.GraphOptimizationLevel.ORT_DISABLE_ALL, ) quant.process() qmodel_lwq = quant.model diff --git 
diff --git a/test/quantization/post_training_quant/test_operators.py b/test/quantization/post_training_quant/test_operators.py
index 9345305e8..45c189328 100644
--- a/test/quantization/post_training_quant/test_operators.py
+++ b/test/quantization/post_training_quant/test_operators.py
@@ -68,7 +68,7 @@ class TestQuantizer(unittest.TestCase):
         "per_channel": False,
         "weight_sym": True,
         "activation_sym": False,
-        "calibrate_method": quantization.CalibrationMethod.MinMax,
+        "calibrate_method": "MinMax",
     }
 
     @classmethod
@@ -622,7 +622,7 @@ def test_conv(self):
             "C": [np.uint8(10.0), np.float32(0)],
             "D": [np.uint8(10.0), np.float32(0)],
         }
-        quantizable_op_types = [op]
+        quantizable_op_types = ["Conv"]
         q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
         self.assertEqual(
             collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
@@ -653,7 +653,7 @@ def test_matmul(self):
             "B": [np.uint8(10.0), np.float32(0)],
             "C": [np.uint8(10.0), np.float32(0)],
         }
-        quantizable_op_types = ["Matmul"]
+        quantizable_op_types = ["MatMul"]
         q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
         self.assertEqual(
             collections.Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1
diff --git a/test/quantization/post_training_quant/test_post_training_quant.py b/test/quantization/post_training_quant/test_post_training_quant.py
index 89be504b7..2720ff69d 100644
--- a/test/quantization/post_training_quant/test_post_training_quant.py
+++ b/test/quantization/post_training_quant/test_post_training_quant.py
@@ -71,7 +71,7 @@ def _count_op_num(model, optype):
     return num
 
 
-class TestStaticQuant(unittest.TestCase):
+class TestPostTrainingQuant(unittest.TestCase):
 
     @classmethod
     def setUpClass(self):
@@ -86,6 +86,7 @@ def setUpClass(self):
     def tearDownClass(self):
         shutil.rmtree("./model", ignore_errors=True)
         os.remove("quant.onnx")
+        os.remove("quant.onnx_data")
 
     def test_static_quant(self):
         cfg = config.StaticQuantConfig(
@@ -93,6 +94,7 @@ def test_static_quant(self):
             weight_type=quantization.QuantType.QInt8,
             per_channel=True,
             quant_last_matmul=True,
+            calibrate_method=quantization.CalibrationMethod.Entropy,
             extra_options={"WeightSymmetric": True, "ActivationSymmetric": False},
             execution_provider="CPUExecutionProvider",
         )
@@ -103,6 +105,7 @@ def test_static_quant(self):
         cfg = config.StaticQuantConfig(
             calibration_data_reader=self.data_reader,
             weight_type=quantization.QuantType.QInt8,
+            calibrate_method=quantization.CalibrationMethod.Percentile,
             per_channel=True,
             quant_last_matmul=False,
             extra_options={"WeightSymmetric": True, "ActivationSymmetric": False},
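The new test coverage above exercises non-default calibration methods through the public config API. A usage sketch of the same surface; the reader class is a hypothetical stand-in, and its input name and shape are assumptions of this sketch:

    import numpy as np

    from onnx_neural_compressor import data_reader, quantization
    from onnx_neural_compressor.quantization import config

    class DummyDataReader(data_reader.CalibrationDataReader):
        """Hypothetical one-batch reader, for illustration only."""

        def __init__(self):
            self._done = False

        def get_next(self):
            if self._done:
                return None
            self._done = True
            return {"input": np.zeros((1, 8), dtype=np.float32)}  # assumed input name/shape

        def rewind(self):
            self._done = False

    cfg = config.StaticQuantConfig(
        calibration_data_reader=DummyDataReader(),
        weight_type=quantization.QuantType.QInt8,
        per_channel=True,
        calibrate_method=quantization.CalibrationMethod.Entropy,
        execution_provider="CPUExecutionProvider",
    )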
diff --git a/test/quantization/post_training_quant/test_quant_utils.py b/test/quantization/post_training_quant/test_quant_utils.py
index 6fce47d7c..e98c6104d 100644
--- a/test/quantization/post_training_quant/test_quant_utils.py
+++ b/test/quantization/post_training_quant/test_quant_utils.py
@@ -15,10 +15,6 @@ def test_pad_tensor(self):
         pad_data = quant_utils.pad_tensor(data, group_size, k_blocks)
         self.assertEqual(pad_data.shape, (k_blocks * group_size, 32))
 
-    def test_4bit_quant_tensor(self):
-        data = np.random.random((100, 32))
-        q_data, scale, zp = quant_utils.quant_tensor(data)
-
     def test_quant_dequant_data(self):
         data = np.random.random((100, 32))
         qrange = quant_utils.get_qmin_qmax_for_qType(
@@ -34,7 +30,6 @@ def test_quant_dequant_data(self):
 
         _, _, zero_point, scale, quantized_data = quant_utils.quantize_data(
             data=data,
-            quantize_range=qrange,
             qType=onnx.TensorProto.UINT8,
             sym=True,
         )
@@ -48,7 +43,6 @@ def test_quant_dequant_data(self):
 
         _, _, zero_point, scale, quantized_data = quant_utils.quantize_data_per_channel(
             data=data,
-            quantize_range=qrange,
             qType=onnx.TensorProto.UINT8,
             sym=True,
             axis=1,
diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py
index 46b7dce75..39c09bbf0 100644
--- a/test/quantization/test_config.py
+++ b/test/quantization/test_config.py
@@ -179,11 +179,11 @@ def test_static_quant_config(self):
             elif idx in [1, 5]:
                 self.assertFalse(configs_mapping["Matmul"]["per_channel"])
             if idx < 4:
-                self.assertEqual(configs_mapping["add"]["calibrate_method"], 0)
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
             else:
                 self.assertFalse("add" in configs_mapping)
             if idx in [0, 1]:
-                self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], 0)
+                self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], "MinMax")
         self.assertLess(idx, 16)
 
         for execution_provider in ["TensorrtExecutionProvider"]:
@@ -215,9 +215,9 @@ def test_static_quant_config(self):
             configs_mapping = quant_config.to_config_mapping(model_info=model_info)
             if "Matmul" in configs_mapping:
                 self.assertFalse(configs_mapping["Matmul"]["per_channel"])
-                self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], 0)
+                self.assertEqual(configs_mapping["Matmul"]["calibrate_method"], "MinMax")
             if "add" in configs_mapping:
-                self.assertEqual(configs_mapping["add"]["calibrate_method"], 0)
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
         self.assertLess(idx, 16)
 
         for execution_provider in ["TensorrtExecutionProvider"]:
@@ -236,8 +236,8 @@ def test_static_quant_config(self):
             elif idx in [1, 5]:
                 self.assertFalse(configs_mapping["Matmul"]["per_channel"])
             if "add" in configs_mapping:
-                self.assertEqual(configs_mapping["add"]["calibrate_method"], 0)
-                self.assertEqual(configs_mapping["add"]["calibrate_method"], 0)
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
+                self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
                 self.assertTrue(configs_mapping["add"]["weight_sym"])
                 self.assertTrue(configs_mapping["add"]["activation_sym"])
             if "Matmul" in configs_mapping:
@@ -261,7 +261,7 @@ def test_static_custom_quant_config(self):
                 self.assertTrue(configs_mapping["Matmul"]["per_channel"])
             elif idx == 1:
                 self.assertFalse(configs_mapping["Matmul"]["per_channel"])
-            self.assertEqual(configs_mapping["add"]["calibrate_method"], 0)
+            self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
         self.assertLess(idx, 2)
 
 
@@ -295,7 +295,7 @@ def test_static_custom_quant_config(self):
             model_info = quant_config.get_model_info(model=self.simple_onnx_model)
             configs_mapping = quant_config.to_config_mapping(model_info=model_info)
             self.assertFalse(configs_mapping["Matmul"]["per_channel"])
-            self.assertEqual(configs_mapping["add"]["calibrate_method"], 0)
+            self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
         self.assertLess(idx, 4)
 
         for execution_provider in ["TensorrtExecutionProvider"]:
@@ -314,7 +314,7 @@ def test_static_custom_quant_config(self):
                 self.assertTrue(configs_mapping["Matmul"]["per_channel"])
             elif idx == 1:
                 self.assertFalse(configs_mapping["Matmul"]["per_channel"])
-            self.assertEqual(configs_mapping["add"]["calibrate_method"], 0)
+            self.assertEqual(configs_mapping["add"]["calibrate_method"], "MinMax")
             self.assertTrue(configs_mapping["add"]["weight_sym"])
             self.assertTrue(configs_mapping["add"]["activation_sym"])
             self.assertTrue(configs_mapping["Matmul"]["weight_sym"])