[ARM][FORK] Revert ACL stateless feature
alvoron committed Dec 23, 2024
1 parent 8e1a5fa commit 73ed350
Showing 25 changed files with 1,084 additions and 1,105 deletions.
2 changes: 1 addition & 1 deletion .github/automation/build_acl.sh
@@ -28,7 +28,7 @@ source ${SCRIPT_DIR}/common_aarch64.sh

ACL_CONFIG=${ACL_CONFIG:-"Release"}
ACL_ROOT_DIR=${ACL_ROOT_DIR:-"${PWD}/ComputeLibrary"}
-ACL_VERSION=${ACL_VERSION:-v24.11.1}
+ACL_VERSION=${ACL_VERSION:-v24.09}
ACL_ARCH=${ACL_ARCH:-"armv8.2-a"}
ACL_REPO="https://github.com/ARM-software/ComputeLibrary.git"

2 changes: 1 addition & 1 deletion README.md
@@ -173,7 +173,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
machine learning applications and provides AArch64 optimized implementations
of core functions. This functionality currently requires that ACL is downloaded
and built separately. See [Build from Source] section of the Developer Guide for
-details. oneDNN only supports Compute Library versions 24.11.1 or later.
+details. oneDNN only supports Compute Library versions 24.09 or later.

[Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary

2 changes: 1 addition & 1 deletion cmake/ACL.cmake
@@ -31,7 +31,7 @@ endif()

find_package(ACL REQUIRED)

-set(ACL_MINIMUM_VERSION "24.11.1")
+set(ACL_MINIMUM_VERSION "24.09")

if(ACL_FOUND)
file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
19 changes: 0 additions & 19 deletions src/common/memory_tracking.hpp
@@ -200,9 +200,6 @@ enum {
    key_conv_gemm_zp_src_comp,
    key_conv_int_dat_in_acc_dt,
    key_conv_padded_bias,
-    key_conv_permuted_inputs,
-    key_conv_permuted_outputs,
-    key_conv_permuted_weights,
    key_conv_rtus_space,
    key_conv_store_wsp,
    key_conv_tails,
@@ -225,20 +222,10 @@
key_eltwise_src,
key_fusion_forward_scratchpad,
key_fusion_inout_buffer,
key_gemm_asm_tmp_buffer,
key_gemm_tmp_buffer,
key_gemm_blocked_a,
key_gemm_blocked_b,
key_gemm_accumulator,
key_gemm_interleaved_lhs,
key_gemm_mm_result_s32,
key_gemm_mm_signed_a,
key_gemm_mm_signed_output,
key_gemm_output,
key_gemm_pretranspose,
key_gemm_pretranspose_b,
key_gemm_pretransposed_rhs,
key_gemm_transposed_1xwrhs,
key_generic_acc,
key_gnorm_cvt,
key_gnorm_reduction,
@@ -311,15 +298,9 @@
key_softmax_interim_store,
key_sum_reduction,
key_sum_srcs_cvt,
key_wino_transformed_weights,
key_wino_U,
key_wino_V,
key_wino_M,
key_wino_workspace,
key_decompression_scales,
key_decompression_zero_points,
key_src_quantized,
key_src_dequantized_scales,
// These two keys should always be the last ones,
// even though they are not in alphabetical order
key_nested,
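The keys deleted above all come from oneDNN's scratchpad key enum: each temporary buffer a primitive may need is identified by a unique key, booked while the primitive is created, and resolved to a concrete pointer at execution time. A minimal, self-contained sketch of that booking pattern (the `scratchpad_registry_t` class and the sizes are illustrative stand-ins, not oneDNN's actual `memory_tracking` API):

```cpp
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

// Illustrative stand-in for oneDNN's registrar/grantor pair: primitives book
// named slots up front, then fetch the corresponding pointers when they run.
enum scratchpad_key_t { key_gemm_pretranspose, key_conv_padded_bias };

class scratchpad_registry_t {
public:
    // Reserve `bytes` for `key`; offsets are assigned contiguously.
    void book(scratchpad_key_t key, std::size_t bytes) {
        offsets_[key] = total_;
        total_ += bytes;
    }
    std::size_t total() const { return total_; }
    std::size_t offset(scratchpad_key_t key) const { return offsets_.at(key); }

private:
    std::map<scratchpad_key_t, std::size_t> offsets_;
    std::size_t total_ = 0;
};

int main() {
    scratchpad_registry_t registry;
    registry.book(key_gemm_pretranspose, 1024); // e.g. pretransposed weights
    std::vector<std::uint8_t> arena(registry.total());
    std::uint8_t *buf = arena.data() + registry.offset(key_gemm_pretranspose);
    (void)buf; // a kernel would write its temporary data here
    return 0;
}
```

The stateless ACL integration needed many such keys because ACL's experimental operators hand their workspace requirements back to the caller; the stateful code being restored lets ACL allocate that memory internally, so these keys can be dropped.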
2 changes: 2 additions & 0 deletions src/cpu/acl/acl_batch_normalization.hpp
@@ -258,6 +258,8 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
        CHECK(r->configure(pd()->abp, pd()));
        mapper.add(this, std::move(r));

+        CHECK(pd()->post_ops.create_resource(engine, mapper));
+
        return status::success;
    }

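The `create_resource` call added here is part of the stateful design this commit returns to: mutable state (configured Compute Library operators and their tensors) lives in a "resource" object owned by a per-stream mapper rather than in the primitive itself, and fused post-ops register their own resources recursively. A rough, self-contained sketch of the mapper idea (types and names here are hypothetical simplifications of oneDNN's `resource_mapper_t`):

```cpp
#include <map>
#include <memory>

// Base class for per-primitive mutable state.
struct resource_t {
    virtual ~resource_t() = default;
};

// State a batch-norm primitive might park in its resource (placeholder).
struct acl_bnorm_resource_t : resource_t {
    int configured_acl_objects = 0;
};

// Looks up a primitive's resource by the primitive's address, keeping the
// primitive object itself immutable and shareable between threads/streams.
class resource_mapper_t {
public:
    void add(const void *primitive, std::unique_ptr<resource_t> r) {
        map_[primitive] = std::move(r);
    }
    template <typename T>
    T *get(const void *primitive) const {
        return static_cast<T *>(map_.at(primitive).get());
    }

private:
    std::map<const void *, std::unique_ptr<resource_t>> map_;
};
```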
204 changes: 15 additions & 189 deletions src/cpu/acl/acl_binary.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright 2022, 2024 Arm Ltd. and affiliates
+ * Copyright 2022 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,198 +16,32 @@

#include "acl_binary.hpp"

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuAdd.h"
#include "arm_compute/runtime/experimental/operators/CpuElementwise.h"
#include "arm_compute/runtime/experimental/operators/CpuMul.h"
#include "arm_compute/runtime/experimental/operators/CpuSub.h"

namespace dnnl {
namespace impl {
namespace cpu {
namespace acl {

-status_t acl_binary_t::pd_t::init(engine_t *engine) {
-    using namespace acl_utils;
-
-    // Only support f16/f32/s32 for now
-    data_type_t ddt = dst_md(0)->data_type;
-    if (!utils::one_of(ddt, data_type::f16, data_type::f32, data_type::s32))
-        return status::unimplemented;
-
-    // Only support src and dst all matching for now
-    if (ddt != src_md(0)->data_type || src_md(1)->data_type != ddt)
-        return status::unimplemented;
-
-    // Sets the memory format of dst from any to src_md(0) blocking desc
-    CHECK(set_default_params());
-
-    if (!attr()->has_default_values()) return status::unimplemented;
-
-    asp_.alg = desc()->alg_kind;
-
-    // All the algorithms we support
-    if (!utils::one_of(asp_.alg, alg_kind::binary_add, alg_kind::binary_sub,
-                alg_kind::binary_mul, alg_kind::binary_div,
-                alg_kind::binary_max, alg_kind::binary_min))
-        return status::unimplemented;
-
-    // s32 div in ACL does not round as oneDNN expects
-    if (ddt == data_type::s32 && asp_.alg == alg_kind::binary_div)
-        return status::unimplemented;
-
-    // ACL pointwise arithmetic operators assume that the innermost
-    // dimensions are dense for src0, src1 and dst. Reordering the
-    // logical dimensions by stride does this (if reordered_dims >= 1 )
-    // and also makes memory accesses contiguous in ACL (without any
-    // data reordering).
-    memory_desc_t src_d0_permed, src_d1_permed, dst_d_permed;
-    int reordered_dims = reorder_dimensions_by_stride(
-            {&src_d0_permed, &src_d1_permed, &dst_d_permed},
-            {src_md(0), src_md(1), dst_md()});
-    if (reordered_dims < 1) return status::unimplemented;
-
-    // Create ACL tensor infos with permuted descs
-    CHECK(tensor_info(asp_.src0_info, src_d0_permed));
-    CHECK(tensor_info(asp_.src1_info, src_d1_permed));
-    CHECK(tensor_info(asp_.dst_info, dst_d_permed));
-
-    // In this case ACL tries to treat src0 and src1 as a 1D array, but
-    // fails because the strides aren't equal. TODO: remove when fixed
-    // in ACL
-    if (asp_.alg == alg_kind::binary_add
-            && asp_.src0_info.tensor_shape() == asp_.src1_info.tensor_shape()
-            && asp_.src0_info.strides_in_bytes()
-                    != asp_.src1_info.strides_in_bytes()) {
-        return status::unimplemented;
-    }
-
-    // This forces ACL not to parallelise with small workloads, this is
-    // a temporary fix and should be removed in future versions (TODO)
-    memory_desc_wrapper dst_d(dst_md());
-    if (dst_d.nelems() < 40000) {
-        size_t acl_y_axis_i = 1;
-        CHECK(insert_singleton_dimension(asp_.src0_info, acl_y_axis_i));
-        CHECK(insert_singleton_dimension(asp_.src1_info, acl_y_axis_i));
-        CHECK(insert_singleton_dimension(asp_.dst_info, acl_y_axis_i));
-    }
-
-    // Call operator specific validate function to check support
-    ACL_CHECK_VALID(validate(asp_));
-
-    return status::success;
-}
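The stride-reordering step above deserves a closer look: ACL requires the innermost dimension to be dense, and permuting the logical dimensions so strides descend achieves that without moving any data. A simplified single-tensor illustration (the real `reorder_dimensions_by_stride` permutes several `memory_desc_t`s consistently; `order_dims_by_stride` below is a hypothetical reduction of the idea):

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Return a permutation of dimension indices ordered by descending stride,
// so the last (innermost) dimension ends up with the smallest stride.
std::vector<int> order_dims_by_stride(const std::vector<std::int64_t> &strides) {
    std::vector<int> perm(strides.size());
    std::iota(perm.begin(), perm.end(), 0);
    std::sort(perm.begin(), perm.end(),
            [&](int a, int b) { return strides[a] > strides[b]; });
    return perm; // e.g. column-major strides {1, 8} -> perm {1, 0}
}
```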

-arm_compute::Status acl_binary_t::pd_t::validate(const acl_binary_conf_t &asp) {
-    switch (asp.alg) {
-        case alg_kind::binary_add:
-            return arm_compute::experimental::op::CpuAdd::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-        case alg_kind::binary_sub:
-            return arm_compute::experimental::op::CpuSub::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-        case alg_kind::binary_div:
-            return arm_compute::experimental::op::CpuElementwiseDivision::
-                    validate(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-        case alg_kind::binary_mul:
-            return arm_compute::experimental::op::CpuMul::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info, 1.0f,
-                    arm_compute::ConvertPolicy::SATURATE,
-                    arm_compute::RoundingPolicy::TO_ZERO);
-        case alg_kind::binary_min:
-            return arm_compute::experimental::op::CpuElementwiseMin::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info);
-        case alg_kind::binary_max:
-            return arm_compute::experimental::op::CpuElementwiseMax::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info);
-        default:
-            return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
-                    "unsupported alg_kind");
-    }
-}

-status_t acl_binary_t::init(engine_t *engine) {
-    auto asp = pd()->asp_;
-
-    switch (asp.alg) {
-        case alg_kind::binary_add: {
-            auto add_op
-                    = std::make_unique<arm_compute::experimental::op::CpuAdd>();
-            add_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-            binary_op_ = std::move(add_op);
-            break;
-        }
-        case alg_kind::binary_sub: {
-            auto sub_op
-                    = std::make_unique<arm_compute::experimental::op::CpuSub>();
-            sub_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-            binary_op_ = std::move(sub_op);
-            break;
-        }
-        case alg_kind::binary_div: {
-            auto div_op = std::make_unique<
-                    arm_compute::experimental::op::CpuElementwiseDivision>();
-            div_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-            binary_op_ = std::move(div_op);
-            break;
-        }
-        case alg_kind::binary_mul: {
-            auto mul_op
-                    = std::make_unique<arm_compute::experimental::op::CpuMul>();
-            mul_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    1.0f, arm_compute::ConvertPolicy::SATURATE,
-                    arm_compute::RoundingPolicy::TO_ZERO);
-            binary_op_ = std::move(mul_op);
-            break;
-        }
-        case alg_kind::binary_min: {
-            auto min_op = std::make_unique<
-                    arm_compute::experimental::op::CpuElementwiseMin>();
-            min_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-            binary_op_ = std::move(min_op);
-            break;
-        }
-        case alg_kind::binary_max: {
-            auto max_op = std::make_unique<
-                    arm_compute::experimental::op::CpuElementwiseMax>();
-            max_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-            binary_op_ = std::move(max_op);
-            break;
-        }
-        default: return status::runtime_error;
-    }
-
-    return status::success;
-}
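Taken together, the two deleted functions implement ACL's stateless contract: `validate` asks whether a configuration is supported, `configure` fixes the operator against `TensorInfo` metadata only, and buffers are supplied later per run. A condensed sketch of that flow, restating the removed code (it assumes ACL 24.11+ experimental operators; `stateless_add` and the single shared `TensorInfo` are simplifications for illustration):

```cpp
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuAdd.h"

// Configure once against metadata, then bind buffers per call via a pack.
void stateless_add(arm_compute::TensorInfo info, void *a, void *b, void *c) {
    using arm_compute::experimental::op::CpuAdd;
    const auto policy = arm_compute::ConvertPolicy::SATURATE;

    // Static validate: no operator instance or memory is needed yet.
    if (CpuAdd::validate(&info, &info, &info, policy).error_code()
            != arm_compute::ErrorCode::OK)
        return;

    CpuAdd add;
    add.configure(&info, &info, &info, policy);

    // Wrap caller-owned buffers; import_memory does not copy.
    arm_compute::Tensor ta, tb, tc;
    ta.allocator()->init(info); ta.allocator()->import_memory(a);
    tb.allocator()->init(info); tb.allocator()->import_memory(b);
    tc.allocator()->init(info); tc.allocator()->import_memory(c);

    // Buffers travel with the call, not with the operator.
    arm_compute::ITensorPack pack {
            {arm_compute::TensorType::ACL_SRC_0, &ta},
            {arm_compute::TensorType::ACL_SRC_1, &tb},
            {arm_compute::TensorType::ACL_DST, &tc}};
    add.run(pack);
}
```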

status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx, const void *src0,
        const void *src1, void *dst) const {

-    auto asp = pd()->asp_;
+    // Lock here is needed because resource_mapper does not support
+    // concurrent multithreaded access.
+    std::lock_guard<std::mutex> _lock {this->mtx};

-    arm_compute::Tensor src0_tensor;
-    arm_compute::Tensor src1_tensor;
-    arm_compute::Tensor dst_tensor;
+    // Retrieve primitive resource and configured Compute Library objects
+    acl_binary_obj_t &acl_obj = ctx.get_resource_mapper()
+                                        ->get<acl_binary_resource_t>(this)
+                                        ->get_acl_obj();

-    src0_tensor.allocator()->init(asp.src0_info);
-    src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
-    src1_tensor.allocator()->init(asp.src1_info);
-    src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
-    dst_tensor.allocator()->init(asp.dst_info);
-    dst_tensor.allocator()->import_memory(dst);
+    acl_obj.src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
+    acl_obj.src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
+    acl_obj.dst_tensor.allocator()->import_memory(dst);

-    arm_compute::ITensorPack run_pack {
-            {arm_compute::TensorType::ACL_SRC_0, &src0_tensor},
-            {arm_compute::TensorType::ACL_SRC_1, &src1_tensor},
-            {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+    acl_obj.binary_op->run();

-    binary_op_->run(run_pack);
+    acl_obj.src0_tensor.allocator()->free();
+    acl_obj.src1_tensor.allocator()->free();
+    acl_obj.dst_tensor.allocator()->free();

    return status::success;
}
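The practical difference between the two versions shows up under concurrency: the restored stateful path above serializes every execution behind `this->mtx` because the tensors live in the shared resource, whereas the deleted stateless path binds buffers through a per-call `ITensorPack`. Continuing the sketch after the previous function, and assuming (as the stateless design intends) that `run()` is safe to call concurrently when each caller passes its own pack:

```cpp
#include <thread>

// Two threads share one configured operator; only the packs differ.
// `add`, `pack0` and `pack1` are assumed to be set up as in the sketch above.
void run_concurrently(arm_compute::experimental::op::CpuAdd &add,
        arm_compute::ITensorPack &pack0, arm_compute::ITensorPack &pack1) {
    std::thread t0([&] { add.run(pack0); });
    std::thread t1([&] { add.run(pack1); });
    t0.join();
    t1.join();
}
```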
Expand All @@ -221,14 +55,6 @@ status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx) const {
    return execute_forward(ctx, src0, src1, dst);
}

-status_t acl_binary_t::execute(const exec_ctx_t &ctx) const {
-    return execute_forward(ctx);
-}
-
-const acl_binary_t::pd_t *acl_binary_t::pd() const {
-    return static_cast<const pd_t *>(primitive_t::pd().get());
-}
-
} // namespace acl
} // namespace cpu
} // namespace impl