From c692f0b3d47c2c5f0812fc9ed616c4c3e73aece3 Mon Sep 17 00:00:00 2001
From: Kelvin Choi
Date: Mon, 2 Dec 2024 18:27:52 +0900
Subject: [PATCH] [GPU] Enable output transposed gemm for onednn (#27568)

### Details:
- *The oneDNN gemm primitive requires the non-transposed output shape together with the transposed order info, but the current implementation passes the transposed output shape itself. It needs to use the non-transposed output shape instead.*

### Tickets:
- *155222*
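For illustration (not part of the change): a minimal standalone sketch of the two index mappings involved. The function names are hypothetical; only the loop bodies mirror the patched code. Input dims are *gathered* through the transpose order, while output dims, which arrive already transposed, must be *scattered* back through the order (the inverse permutation) to recover the non-transposed shape oneDNN expects.

```cpp
#include <cstdint>
#include <vector>

// Gather: transposed[i] = original[order[i]] -- what the inputs need.
std::vector<int64_t> apply_order(const std::vector<int64_t>& dims,
                                 const std::vector<size_t>& order) {
    std::vector<int64_t> out(dims.size());
    for (size_t i = 0; i < dims.size(); ++i)
        out[i] = dims[order[i]];
    return out;
}

// Scatter: non_transposed[order[i]] = transposed[i] -- the inverse mapping,
// which this patch now applies to the output dims.
std::vector<int64_t> undo_order(const std::vector<int64_t>& dims,
                                const std::vector<size_t>& order) {
    std::vector<int64_t> out(dims.size());
    for (size_t i = 0; i < dims.size(); ++i)
        out[order[i]] = dims[i];
    return out;
}

// e.g. with order {0, 1, 3, 2}:
//   apply_order({2, 4, 3, 16}, order) == {2, 4, 16, 3}
//   undo_order({2, 4, 16, 3}, order)  == {2, 4, 3, 16}
```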
---
 .../src/graph/impls/onednn/gemm_onednn.cpp    | 11 ++-
 .../transpose_matmul_fusion.cpp               | 92 +++++++++++++++++++
 2 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp
index 767128a5be2950..c4c27161b89fe4 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp
@@ -186,8 +186,15 @@ struct gemm_onednn : typed_primitive_onednn_impl<gemm> {
         if (ret) {
             tag = convert_data_format(transposed_format);
             dnnl::memory::dims original_dims = dims;
-            for (size_t i = 0; i < original_dims.size(); ++i) {
-                dims[i] = original_dims[order[i]];
+            if (is_input) {
+                for (size_t i = 0; i < original_dims.size(); ++i) {
+                    dims[i] = original_dims[order[i]];
+                }
+            } else {
+                // Get non-transposed dims for output dims
+                for (size_t i = 0; i < original_dims.size(); ++i) {
+                    dims[order[i]] = original_dims[i];
+                }
             }
         } else {
             std::ostringstream ostream;
diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
index b55c9e00bdab64..cc28dbab3660b9 100644
--- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
@@ -96,3 +96,95 @@ TEST_P(TransposeMatMulFusionOnGPU, CompareWithRefs){
     run();
 };
 } // namespace
+
+
+//=================================================================================
+// Transpose + MatMul + Transpose pattern fusion (TransposeMatMulTransposeMatcher)
+//=================================================================================
+namespace ov {
+namespace test {
+
+using MatMulTransposeFusionParams = std::tuple<ov::PartialShape,   // input A shapes
+                                               ov::PartialShape,   // input B shapes
+                                               ov::PartialShape>;  // input C shapes
+class MatMulTransposeFusionOnGPU: public testing::WithParamInterface<MatMulTransposeFusionParams>,
+                                  virtual public ov::test::SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<MatMulTransposeFusionParams> obj) {
+        ov::PartialShape input0;
+        ov::PartialShape input1;
+        ov::PartialShape input2;
+
+        std::tie(input0, input1, input2) = obj.param;
+
+        std::ostringstream result;
+        result << "device=(" << std::string(utils::DEVICE_GPU) << ")_";
+        result << ov::test::utils::partialShape2str({input0}) << "_";
+        result << ov::test::utils::partialShape2str({input1}) << "_";
+        result << ov::test::utils::partialShape2str({input2}) << "_";
+        return result.str();
+    }
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_GPU;
+
+        ov::PartialShape shape1;
+        ov::PartialShape shape2;
+        ov::PartialShape shape3;
+
+        std::tie(shape1, shape2, shape3) = GetParam();
+
+        InputShape input_shape1 = {shape1, {shape1.get_shape()}};
+        InputShape input_shape2 = {shape2, {shape2.get_shape()}};
+        InputShape input_shape3 = {shape3, {shape3.get_shape()}};
+        init_input_shapes({input_shape1, input_shape2, input_shape3});
+
+        const auto param1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape1);
+        const auto param2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape2);
+        const auto param3 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape3);
+
+        auto input2_shape = shape2.get_shape();
+
+        // input0
+        const auto input0_order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {1, 0, 2, 3});
+        const auto input0_transpose = std::make_shared<ov::op::v1::Transpose>(param1, input0_order);
+        const auto input0_shape_pattern = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, input2_shape);
+        const auto input0_reshape = std::make_shared<ov::op::v1::Reshape>(input0_transpose, input0_shape_pattern, false);
+
+        // input1
+        const auto input1_order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {0, 1, 3, 2});
+        const auto input1_transpose = std::make_shared<ov::op::v1::Transpose>(param2, input1_order);
+
+        // matmul & softmax
+        const auto matmul1 = std::make_shared<ov::op::v0::MatMul>(input0_reshape, input1_transpose, false, false);
+        const auto softmax = std::make_shared<ov::op::v8::Softmax>(matmul1, -1);
+
+        // input3
+        const auto input3_transpose = std::make_shared<ov::op::v1::Transpose>(param3, input0_order);
+        const auto input3_shape_pattern = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, input2_shape);
+        const auto input3_reshape = std::make_shared<ov::op::v1::Reshape>(input3_transpose, input3_shape_pattern, false);
+
+        // target matmul
+        const auto matmul2 = std::make_shared<ov::op::v0::MatMul>(softmax, input3_reshape, false, false);
+        const auto order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {2, 0, 1, 3});
+        const auto transpose = std::make_shared<ov::op::v1::Transpose>(matmul2, order);
+
+        function = std::make_shared<ov::Model>(transpose, ov::ParameterVector{param1, param2, param3});
+    }
+};
+
+
+} // namespace test
+} // namespace ov
+
+
+namespace {
+INSTANTIATE_TEST_SUITE_P(smoke_MatMulTransposeFusion, MatMulTransposeFusionOnGPU,
+                         ::testing::Values(
+                             MatMulTransposeFusionParams({3, 8, 16, 1}, {2, 4, 3, 16}, {3, 8, 16, 1})),
+                         MatMulTransposeFusionOnGPU::getTestCaseName);
+
+TEST_P(MatMulTransposeFusionOnGPU, CompareWithRefs){
+    run();
+};
+} // namespace
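For context (not part of the patch), a minimal sketch of the kind of graph this change enables on the oneDNN path: a MatMul whose result feeds a Transpose, compiled for the GPU plugin. The shapes and the `main` wrapper are illustrative only; the API calls are standard OpenVINO 2.0.

```cpp
#include <openvino/openvino.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/op/transpose.hpp>

int main() {
    // MatMul {2,4,3,16} x {2,4,16,3} -> {2,4,3,3}, then transpose the result.
    auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{2, 4, 3, 16});
    auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::Shape{2, 4, 16, 3});
    auto matmul = std::make_shared<ov::op::v0::MatMul>(a, b, false, false);

    // A trailing transpose of the MatMul result: the kind of pattern the
    // TransposeMatMulTransposeMatcher referenced in the test can fold into
    // the gemm as a transposed output order instead of a separate kernel.
    auto order = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {2, 0, 1, 3});
    auto transpose = std::make_shared<ov::op::v1::Transpose>(matmul, order);

    auto model = std::make_shared<ov::Model>(transpose, ov::ParameterVector{a, b});

    ov::Core core;
    auto compiled = core.compile_model(model, "GPU");  // exercises the onednn gemm path on supported hardware
    return 0;
}
```

The new cases can be run in isolation with the GPU functional test binary, e.g. `./ov_gpu_func_tests --gtest_filter='*smoke_MatMulTransposeFusion*'` (the binary name may vary by build configuration).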