From e562a4d3d7563ce3d8df134ed9ac09b88b432d1a Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Fri, 20 Dec 2024 12:37:47 -0800 Subject: [PATCH 01/14] xe: conv_v2: update README --- src/gpu/intel/jit/v2/conv/README.md | 12 +++++------- src/gpu/intel/jit/v2/conv/planner/planner.cpp | 7 ++++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/README.md b/src/gpu/intel/jit/v2/conv/README.md index 8e52c9d0663..8d9637d0f91 100644 --- a/src/gpu/intel/jit/v2/conv/README.md +++ b/src/gpu/intel/jit/v2/conv/README.md @@ -11,21 +11,19 @@ This is a new convolution implementation for GPU which aims to solve two issues: ### How to build and test ```bash -# 1. Build with OpenCL GPU runtime -cmake . -Bbuild -DONEDNN_GPU_RUNTIME=OCL -DONEDNN_DEV_MODE=ON -DDNNL_GPU_CONV_PLANNER=ON -DONEDNN_BUILD_GRAPH=OFF +# 1. Build with OpenCL GPU runtime with experimental support to enable v2 convolution +cmake . -Bbuild -DONEDNN_GPU_RUNTIME=OCL -DONEDNN_EXPERIMENTAL=ON -DONEDNN_BUILD_GRAPH=OFF make -C build -j `nproc` benchdnn gpu_conv_planner # 2. Test -export enable_conv_v2=1 -./build/tests/benchdnn/benchdnn -v5 --engine=gpu --mode=F --conv --dir=FWD_I --batch=shapes_resnet_50_v1_5 +./build/tests/benchdnn/benchdnn -v5 --engine=gpu --mode=F --conv --impl=v2 --dir=FWD_I --batch=shapes_resnet_50_v1_5 ... run: --mode=F --conv --engine=gpu --dir=FWD_I ic64ih56oc64oh56kh3ph1n"resnet_50_v1_5:res2a_branch2b*3" perf,gpu,jit:ir_v2,"resnet_50_v1_5:res2a_branch2b*3",--mode=F --conv --engine=gpu --dir=FWD_I ic64ih56oc64oh56kh3ph1n"resnet_50_v1_5:res2a_branch2b*3",0.451478,155.925,0.10656,4236.84,0.107055,4217.25 # 3. 
Set kernel descriptor from environment -export enable_conv_v2=1 export desc="--prop fwd --src axb:f32 --wei axcb:f32 --dst axb:f32 --hw xehpc --fma mad --simd 16 --regs 128 --iter ic16mb16oc32 --tg ow4oc4 --loop-desc kw,kh,kd,ic --load a:2d,b:2d --store c:2d" -./build/tests/benchdnn/benchdnn -v5 --engine=gpu --mode=F --conv --dir=FWD_I --dt=f32 mb128ic256ih56oc64oh56kh1ph0 +./build/tests/benchdnn/benchdnn -v5 --engine=gpu --mode=F --conv --impl=v2 --dir=FWD_I --dt=f32 mb128ic256ih56oc64oh56kh1ph0 ... perf,gpu,jit:ir_v2,,--mode=F --conv --engine=gpu --dir=FWD_I mb128ic256ih56oc64oh56kh1ph0,13.1533,158.426,1.124,11702.3,1.13858,11552.4 ``` @@ -44,7 +42,7 @@ event of changes in the kernel generation or of adding new features, use the snippet below to overwrite the kernel registry in oneDNN. ```bash -export enable_conv_v2=1 +export ONEDNN_EXPERIMENTAL_GPU_CONV_V2=1 export ONEDNN_GPU_CONV_PLAN_REGISTRY_PATH=plan_registry_data.txt ./build/src/gpu/intel/jit/v2/conv/planner/gpu_conv_planner --auto-search cp ${ONEDNN_GPU_CONV_PLAN_REGISTRY_PATH}.cpp /path/to/onednn/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp diff --git a/src/gpu/intel/jit/v2/conv/planner/planner.cpp b/src/gpu/intel/jit/v2/conv/planner/planner.cpp index 9743dd41ce7..36644582ad5 100644 --- a/src/gpu/intel/jit/v2/conv/planner/planner.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/planner.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -104,10 +104,11 @@ void init_params( params.mode = planner_mode_t::trace; } // Check if conv v2 is enabled. 
- bool enable_conv_v2 = gpu_utils::dev_getenv("enable_conv_v2", false); + bool enable_conv_v2 + = gpu_utils::dev_getenv("ONEDNN_EXPERIMENTAL_GPU_CONV_V2", false); if (!enable_conv_v2) { std::cout << "Error: conv_v2 is not enabled, set " - "enable_conv_v2=1 in environment." + "ONEDNN_EXPERIMENTAL_GPU_CONV_V2=1 in environment." << std::endl; exit(1); } From 50043977b2894d85bdc130984cfbfdf8936cbb72 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Fri, 20 Dec 2024 14:46:36 -0800 Subject: [PATCH 02/14] xe: jit: utils: extend hex (de)serialize functions --- src/gpu/intel/jit/utils/utils.hpp | 28 ++++++++++++++++++---------- src/gpu/intel/serialization.hpp | 4 +++- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/gpu/intel/jit/utils/utils.hpp b/src/gpu/intel/jit/utils/utils.hpp index 4cd13ec5792..3e3a8c699ac 100644 --- a/src/gpu/intel/jit/utils/utils.hpp +++ b/src/gpu/intel/jit/utils/utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1344,26 +1344,34 @@ void stringify_to_cpp_file(const std::string &file_name, const std::string &var_name, const std::vector &namespaces, const std::vector &lines); -template -std::string serialize_to_hex(const T &t) { +inline std::string data_to_hex(const std::vector &data) { std::ostringstream oss; - serialized_data_t s; - s.append(t); - for (uint8_t d : s.get_data()) { + for (auto v : data) { oss << std::uppercase << std::hex << std::setw(2) << std::setfill('0') - << (int)d; + << (int)v; } return oss.str(); } -template -void deserialize_from_hex(T &t, const std::string &s_hex) { +inline std::vector hex_to_data(const std::string &s_hex) { std::vector data; for (size_t i = 0; i < s_hex.size(); i += 2) { data.push_back(static_cast( std::stoi(s_hex.substr(i, 2), nullptr, 16))); } - auto s = serialized_t::from_data(std::move(data)); + return data; +} + +template +std::string serialize_to_hex(const T &t) { + serialized_data_t s; + s.append(t); + return data_to_hex(s.get_data()); +} + +template +void deserialize_from_hex(T &t, const std::string &s_hex) { + auto s = serialized_t::from_data(hex_to_data(s_hex)); deserializer_t d(s); d.pop(t); } diff --git a/src/gpu/intel/serialization.hpp b/src/gpu/intel/serialization.hpp index a4c5f8b77b4..28257ea26d9 100644 --- a/src/gpu/intel/serialization.hpp +++ b/src/gpu/intel/serialization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -239,6 +239,8 @@ struct deserializer_t { } } + bool empty() const { return idx >= s.get_data().size(); } + size_t idx; const serialized_data_t &s; }; From 36cd02e9c178cc03e16bfe70843168827ad5710c Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Fri, 27 Dec 2024 13:04:44 -0800 Subject: [PATCH 03/14] xe: jit: utils: introduce parse result --- src/gpu/intel/jit/utils/utils.hpp | 33 ++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/gpu/intel/jit/utils/utils.hpp b/src/gpu/intel/jit/utils/utils.hpp index 3e3a8c699ac..93b08afdf9a 100644 --- a/src/gpu/intel/jit/utils/utils.hpp +++ b/src/gpu/intel/jit/utils/utils.hpp @@ -1088,6 +1088,24 @@ T parse(const std::string &s) { return t; } +class parse_result_t { +public: + const std::unordered_map &args() const { + return args_; + } + void set_arg(const std::string &name, const std::string &value) { + args_[name] = value; + } + bool is_set(const std::string &name) const { return args_.count(name) > 0; } + const std::string &arg_value(const std::string &name) const { + ir_assert(is_set(name)) << "Argument is not set: " << name; + return args_.at(name); + } + +private: + std::unordered_map args_; +}; + template class parse_iface_t { public: @@ -1172,11 +1190,13 @@ class parse_iface_t { } } - void parse(std::istream &in, T &parent) const { + void parse(std::istream &in, T &parent, + parse_result_t *result = nullptr) const { parent = T(); if (relaxed_) { - parse_relaxed(in, parent); + parse_relaxed(in, parent, result); } else { + ir_assert(!result); for (auto &e : entries_) { if (!e.name.empty()) { stream_match(in, e.name); @@ -1188,9 +1208,10 @@ class parse_iface_t { if (post_parse_func_) post_parse_func_(parent); } - void parse(const std::string &s, T &parent) const { + void parse(const std::string &s, T &parent, + parse_result_t *result = nullptr) const { std::istringstream iss(s); - parse(iss, parent); + parse(iss, parent, result); } int size() const { return 
static_cast(entries_.size()); } @@ -1219,7 +1240,8 @@ class parse_iface_t { return -1; } - void parse_relaxed(std::istream &in, T &parent) const { + void parse_relaxed(std::istream &in, T &parent, + parse_result_t *result = nullptr) const { std::vector seen(entries_.size()); while (true) { std::string name; @@ -1235,6 +1257,7 @@ class parse_iface_t { std::istringstream iss(value); entries_[idx].parse(iss, parent); seen[idx] = true; + if (result) result->set_arg(name, value); } for (size_t i = 0; i < entries_.size(); i++) { if (entries_[i].required && !seen[i]) { From eb336ee0eb6cea5b30c0064b1928b03b40723ba5 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Mon, 6 Jan 2025 14:40:11 -0800 Subject: [PATCH 04/14] xpu: ocl, sycl: profiler: add per-kernel time query --- src/common/c_types_map.hpp | 4 +++- src/xpu/ocl/stream_profiler.cpp | 15 +++++++++++++-- src/xpu/sycl/stream_profiler.cpp | 13 ++++++++++++- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/common/c_types_map.hpp b/src/common/c_types_map.hpp index b22a63ba2dd..d16bd814008 100644 --- a/src/common/c_types_map.hpp +++ b/src/common/c_types_map.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * Copyright 2024 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -260,6 +260,8 @@ const profiling_data_kind_t internal_only_start = (profiling_data_kind_t)(1 << 8); const profiling_data_kind_t cycles = (profiling_data_kind_t)(internal_only_start + 1); +const profiling_data_kind_t time_per_kernel + = (profiling_data_kind_t)(internal_only_start + 2); } // namespace profiling_data_kind using format_tag_t = dnnl_format_tag_t; diff --git a/src/xpu/ocl/stream_profiler.cpp b/src/xpu/ocl/stream_profiler.cpp index 7e067715574..de82b412483 100644 --- a/src/xpu/ocl/stream_profiler.cpp +++ b/src/xpu/ocl/stream_profiler.cpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,12 @@ namespace ocl { status_t stream_profiler_t::get_info(profiling_data_kind_t data_kind, int *num_entries, uint64_t *data) const { if (!num_entries) return status::invalid_arguments; + bool is_per_kernel = (data_kind == profiling_data_kind::time_per_kernel); if (!data) { + if (is_per_kernel) { + *num_entries = (int)events_.size(); + return status::success; + } std::unordered_set seen; for (auto &ev : events_) seen.insert(ev.stamp); @@ -44,8 +49,8 @@ status_t stream_profiler_t::get_info(profiling_data_kind_t data_kind, } std::map stamp2entry; + int idx = 0; for (auto &ev : events_) { - auto &entry = stamp2entry[ev.stamp]; const xpu::ocl::event_t &ocl_event = *utils::downcast(ev.event.get()); cl_ulong beg, end; @@ -54,6 +59,11 @@ status_t stream_profiler_t::get_info(profiling_data_kind_t data_kind, CL_PROFILING_COMMAND_START, sizeof(beg), &beg, nullptr)); OCL_CHECK(clGetEventProfilingInfo(ocl_event[0].get(), CL_PROFILING_COMMAND_END, sizeof(end), &end, nullptr)); + if (is_per_kernel) { + data[idx++] = static_cast(end - beg); + continue; + } + auto &entry = stamp2entry[ev.stamp]; entry.min_nsec = std::min(entry.min_nsec, beg); entry.max_nsec = std::max(entry.max_nsec, end); const auto *gpu_stream @@ -61,6 +71,7 @@ status_t stream_profiler_t::get_info(profiling_data_kind_t data_kind, entry.freq += gpu_stream->get_freq(*ev.event); entry.kernel_count++; } + if (is_per_kernel) return status::success; return xpu::stream_profiler_t::get_info_impl(stamp2entry, data_kind, data); } diff --git a/src/xpu/sycl/stream_profiler.cpp b/src/xpu/sycl/stream_profiler.cpp index 6fdc1db8607..3d05dc47d6f 100644 --- a/src/xpu/sycl/stream_profiler.cpp +++ 
b/src/xpu/sycl/stream_profiler.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,12 @@ status_t stream_profiler_t::get_info(profiling_data_kind_t data_kind, int *num_entries, uint64_t *data) const { using namespace ::sycl::info; if (!num_entries) return status::invalid_arguments; + bool is_per_kernel = (data_kind == profiling_data_kind::time_per_kernel); if (!data) { + if (is_per_kernel) { + *num_entries = (int)events_.size(); + return status::success; + } std::unordered_set seen; for (auto &ev : events_) seen.insert(ev.stamp); @@ -42,6 +47,7 @@ status_t stream_profiler_t::get_info(profiling_data_kind_t data_kind, } std::map stamp2entry; + int idx = 0; for (auto &ev : events_) { const xpu::sycl::event_t &sycl_event = *utils::downcast(ev.event.get()); @@ -51,11 +57,16 @@ status_t stream_profiler_t::get_info(profiling_data_kind_t data_kind, .get_profiling_info(); auto end = sycl_event[0] .get_profiling_info(); + if (is_per_kernel) { + data[idx++] = static_cast(end - beg); + continue; + } auto &entry = stamp2entry[ev.stamp]; entry.min_nsec = std::min(entry.min_nsec, beg); entry.max_nsec = std::max(entry.max_nsec, end); entry.kernel_count++; } + if (is_per_kernel) return status::success; return xpu::stream_profiler_t::get_info_impl(stamp2entry, data_kind, data); } From 3dcb49c13eb91bf1a99e8cbbba85e784f247e3a1 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Mon, 23 Dec 2024 14:48:23 -0800 Subject: [PATCH 05/14] xe: conv_v2: remove unused code --- src/gpu/intel/jit/v2/conv/planner/search.cpp | 42 +------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/planner/search.cpp b/src/gpu/intel/jit/v2/conv/planner/search.cpp index 
900cddf45a3..80d45c4796b 100644 --- a/src/gpu/intel/jit/v2/conv/planner/search.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/search.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,46 +41,6 @@ namespace v2 { namespace conv { namespace planner { -class search_iterator_t { -public: - int add(const std::vector &key_values) { - int key = (int)values_.size(); - values_.push_back(key_values); - idxs_.push_back(0); - if (key == 0) { - idxs_[0] = -1; - total_ = 1; - } - total_ *= (int)key_values.size(); - return key; - } - - int nkeys() const { return (int)values_.size(); } - - bool has_next() const { return idx_ + 1 < total_; } - - void next() { - ir_assert(has_next()); - int carry = 1; - for (int j = 0; j < nkeys(); j++) { - int new_idx = idxs_[j] + carry; - int bound = (int)values_[j].size(); - idxs_[j] = new_idx % bound; - carry = new_idx / bound; - if (carry == 0) break; - } - idx_++; - } - - int operator()(int key) const { return values_[key][idxs_[key]]; } - -private: - int idx_ = -1; - int total_ = 0; - std::vector> values_; - std::vector idxs_; -}; - // Flags specifying blocking restrictions for a convolution dimension. 
enum class tile_flags_t : uint32_t { undef = 0, From eba3852510fae4d7d4b9bd2a385649f0d3becdf5 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Mon, 6 Jan 2025 14:53:24 -0800 Subject: [PATCH 06/14] xe: conv_v2: fix BWD_D data type check --- src/gpu/intel/jit/v2/conv/gen_convolution.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpu/intel/jit/v2/conv/gen_convolution.cpp b/src/gpu/intel/jit/v2/conv/gen_convolution.cpp index 303c0f4ac9b..0bbb53215a6 100644 --- a/src/gpu/intel/jit/v2/conv/gen_convolution.cpp +++ b/src/gpu/intel/jit/v2/conv/gen_convolution.cpp @@ -77,7 +77,8 @@ class gen_convolution_t { return false; // Mixed types are not supported for backward by data. if (pd->is_bwd_d() - && pd->dst_md()->data_type != pd->diff_src_md()->data_type) { + && pd->diff_dst_md()->data_type + != pd->diff_src_md()->data_type) { return false; } From ef22d54e3fba07f9592b09ee1b9a5cedc40a5dbd Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Mon, 6 Jan 2025 14:53:13 -0800 Subject: [PATCH 07/14] xe: conv_v2: introduce bench_time_t --- src/gpu/intel/jit/v2/conv/bench_data.cpp | 14 +++++---- src/gpu/intel/jit/v2/conv/bench_data.hpp | 36 ++++++++++++++++++++++-- src/gpu/intel/jit/v2/conv/model.cpp | 14 ++++----- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/bench_data.cpp b/src/gpu/intel/jit/v2/conv/bench_data.cpp index 69428b95fb0..d701630856a 100644 --- a/src/gpu/intel/jit/v2/conv/bench_data.cpp +++ b/src/gpu/intel/jit/v2/conv/bench_data.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,8 +30,8 @@ std::string bench_data_t::str() const { std::ostringstream oss; for (int i = 0; i < size(); i++) { if (i > 0) oss << std::endl; - double gops_sec = prbs[i].ops() / times[i]; - oss << "bench," << prbs[i].csv_str() << "," << times[i] << "," + double gops_sec = prbs[i].ops() / times[i].total; + oss << "bench," << prbs[i].csv_str() << "," << times[i].total << "," << gops_sec; } return oss.str(); @@ -62,7 +62,7 @@ std::vector bench_data_set_t::find_best_idxs(int _nbest) const { std::vector cur_times(nprbs, max_time); for (auto &bd : vec_) { for (int i = 0; i < nprbs; i++) { - best_times[i] = std::min(best_times[i], bd.times[i]); + best_times[i] = std::min(best_times[i], bd.times[i].total); } } std::unordered_set best_idxs; @@ -75,7 +75,8 @@ std::vector bench_data_set_t::find_best_idxs(int _nbest) const { double geomean = 1.0; for (int j = 0; j < nprbs; j++) { double ratio = best_times[j] - / (double)std::min(cur_times[j], vec_[i].times[j]); + / (double)std::min( + cur_times[j], vec_[i].times[j].total); geomean *= std::pow(ratio, 1.0 / nprbs); } if (geomean >= best_geomean) { @@ -85,7 +86,8 @@ std::vector bench_data_set_t::find_best_idxs(int _nbest) const { } ir_assert(best_idx != -1); for (int j = 0; j < nprbs; j++) { - cur_times[j] = std::min(cur_times[j], vec_[best_idx].times[j]); + cur_times[j] + = std::min(cur_times[j], vec_[best_idx].times[j].total); } best_idxs.insert(best_idx); } diff --git a/src/gpu/intel/jit/v2/conv/bench_data.hpp b/src/gpu/intel/jit/v2/conv/bench_data.hpp index 7c8bfa33255..6e20f865e0b 100644 --- a/src/gpu/intel/jit/v2/conv/bench_data.hpp +++ b/src/gpu/intel/jit/v2/conv/bench_data.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,12 +29,42 @@ namespace jit { namespace v2 { namespace conv { +// Stores device times for primitive execution. +// Includes: +// - Total time, computed via profiling queries as: +// the end of the last kernel - the start of the first kernel +// - Kernel times: individual kernel times +struct bench_time_t { + uint64_t total = 0; + std::vector kernel_times; + + bench_time_t() = default; + bench_time_t(uint64_t total) : total(total) { + kernel_times.push_back(total); + } + template + bench_time_t(uint64_t total, IteratorT beg, IteratorT end) : total(total) { + kernel_times = std::vector(beg, end); + } + int nkernels() const { return (int)kernel_times.size(); } + + bench_time_t min(const bench_time_t &other) const { + bench_time_t ret = *this; + ret.total = std::min(ret.total, other.total); + for (int i = 0; i < nkernels(); i++) { + ret.kernel_times[i] + = std::min(ret.kernel_times[i], other.kernel_times[i]); + } + return ret; + } +}; + class bench_data_t { public: int id = -1; kernel_desc_t kernel_desc; std::vector prbs; - std::vector times; + std::vector times; bench_data_t() = default; explicit bench_data_t(int id, const kernel_desc_t &kernel_desc) @@ -43,7 +73,7 @@ class bench_data_t { int size() const { return (int)prbs.size(); } explicit operator bool() const { return size() > 0; } - void add(const problem_t &prb, uint64_t time) { + void add(const problem_t &prb, const bench_time_t &time) { prbs.push_back(prb); times.push_back(time); } diff --git a/src/gpu/intel/jit/v2/conv/model.cpp b/src/gpu/intel/jit/v2/conv/model.cpp index 706dfca54c8..862139923ef 100644 --- a/src/gpu/intel/jit/v2/conv/model.cpp +++ b/src/gpu/intel/jit/v2/conv/model.cpp @@ -110,7 +110,7 @@ struct hw_config_t { struct sample_t { problem_t prb; kernel_desc_t kernel_desc; - uint64_t time_ns = 0; + bench_time_t time; hw_config_t hw_cfg; dim_t b, m, n, k; @@ -121,8 +121,8 @@ struct sample_t { sample_t() = default; sample_t(const problem_t &prb, const kernel_desc_t &kernel_desc, - uint64_t 
time_ns = 0) - : prb(prb), kernel_desc(kernel_desc), time_ns(time_ns) { + const bench_time_t &time = bench_time_t()) + : prb(prb), kernel_desc(kernel_desc), time(time) { hw_cfg = hw_config_t( prb.hw(), kernel_desc.fma, kernel_desc.src_tag.type()); auto padded_shape = prb.shape(); @@ -157,7 +157,7 @@ struct sample_t { return ret; } - float to_y() const { return time_ns; } + float to_y() const { return time.total; } float ntgs() const { float ntgs = 1.0f; @@ -176,7 +176,7 @@ struct sample_t { } float eff() const { - float sec = time_ns / 1e9; + float sec = time.total / 1e9; return ops() / 1e9 / sec / hw_cfg.max_gops_per_sec(); } @@ -268,7 +268,7 @@ void model_t::score(const bench_data_t &bd) { vec1d y_test; vec1d y_pred; for (int i = 0; i < bd.size(); i++) { - sample_t s(bd.prbs[i], bd.kernel_desc, bd.times[i]); + sample_t s(bd.prbs[i], bd.kernel_desc, bd.times[i].total); y_test.push_back(s.to_y()); y_pred.push_back(predict(bd.prbs[i], bd.kernel_desc)); } @@ -300,7 +300,7 @@ void to_model_xy(const bench_data_t &bd, vec2d &X, vec1d &y) { X.reserve(bd.size()); y.reserve(bd.size()); for (int i = 0; i < bd.size(); i++) { - sample_t s(bd.prbs[i], bd.kernel_desc, bd.times[i]); + sample_t s(bd.prbs[i], bd.kernel_desc, bd.times[i].total); X.push_back(s.to_x()); y.push_back(s.to_y()); } From 3d612f61ea71298ccb5bb497082052d1b51808f5 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Fri, 27 Dec 2024 13:06:03 -0800 Subject: [PATCH 08/14] xe: conv_v2: handle descriptor defaults in one place --- src/gpu/intel/jit/v2/conv/kernel_desc.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp index 962c00e16ca..e8463e2a6ec 100644 --- a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp +++ b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp @@ -226,6 +226,9 @@ void kernel_desc_t::set(const std::string &s) { } void kernel_desc_t::set_defaults() { + src_tag = 
make_conv_layout_tag(tensor_kind_t::src, src_tag.str()); + wei_tag = make_conv_layout_tag(tensor_kind_t::wei, wei_tag.str()); + dst_tag = make_conv_layout_tag(tensor_kind_t::dst, dst_tag.str()); if (loop_desc.is_empty()) { switch (prop) { case prop_kind::forward_training: @@ -254,6 +257,12 @@ void kernel_desc_t::set_defaults() { reqs.set(pvars::ic, 1); reqs.set(pvars::oc, 1); } + if (prop == prop_kind::backward_data) { + // XXX: No stride support in backward by data yet. + reqs.set(pvars::sw, 1); + reqs.set(pvars::sh, 1); + reqs.set(pvars::sd, 1); + } } void kernel_desc_t::finalize(const prb_reqs_t &final_reqs) { @@ -472,14 +481,8 @@ void kernel_desc_t::init_parse_iface(parse_iface_t *iface) { iface->add(po_entry); #undef PACK - iface->set_post_parse_func([](kernel_desc_t &desc) { - desc.src_tag - = make_conv_layout_tag(tensor_kind_t::src, desc.src_tag.str()); - desc.wei_tag - = make_conv_layout_tag(tensor_kind_t::wei, desc.wei_tag.str()); - desc.dst_tag - = make_conv_layout_tag(tensor_kind_t::dst, desc.dst_tag.str()); - }); + iface->set_post_parse_func( + [](kernel_desc_t &desc) { desc.set_defaults(); }); } arg_helper_t::arg_helper_t(const kernel_desc_t &desc) : desc_(desc) {} From 7fbf4276b7d8a327f5e00d6227b462bd5c1f09e7 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Fri, 27 Dec 2024 13:06:57 -0800 Subject: [PATCH 09/14] xe: conv_v2: update planner logic --- src/gpu/intel/jit/v2/conv/kernel_desc.cpp | 4 +- src/gpu/intel/jit/v2/conv/planner/bench.cpp | 3 +- src/gpu/intel/jit/v2/conv/planner/planner.cpp | 31 ++-- src/gpu/intel/jit/v2/conv/planner/planner.hpp | 19 ++- .../jit/v2/conv/planner/planner_main.cpp | 22 ++- src/gpu/intel/jit/v2/conv/planner/search.cpp | 136 +++++++++++++----- src/gpu/intel/jit/v2/conv/planner/search.hpp | 6 +- 7 files changed, 150 insertions(+), 71 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp index e8463e2a6ec..7c284c2253e 100644 --- 
a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp +++ b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp @@ -437,12 +437,12 @@ void kernel_desc_t::init_parse_iface(parse_iface_t *iface) { iface->add( "regs", "Number of registers (128 or 256).", /*required=*/true); iface->add("iter", "Iteration tile (e.g. mb32ic16oc16).", - /*required=*/true); + /*required=*/false); iface->add("iter_outer", "Outer iteration tile (e.g. mb2).", /*required=*/false); iface->add( - "tg", "Threadgroup tile (e.g. ow4oc4).", /*required=*/true); + "tg", "Threadgroup tile (e.g. ow4oc4).", /*required=*/false); iface->add("loop_desc", "Loop description, variables ordered from innermost to outermost " "(e.g. kw,kh,kd,ic)."); diff --git a/src/gpu/intel/jit/v2/conv/planner/bench.cpp b/src/gpu/intel/jit/v2/conv/planner/bench.cpp index 839cfb2494e..bfcbf456d97 100644 --- a/src/gpu/intel/jit/v2/conv/planner/bench.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/bench.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -614,6 +614,7 @@ bench_data_t bench(const bench_manager_t &bench_mger, bool try_create( const bench_manager_t &bench_mger, const kernel_desc_t &kernel_desc) { + clear_primitive_cache(); bench_input_params_t params(kernel_desc, /*nprbs=*/1); bench_task_t task(generate_problems(params)[0]); auto engine = bench_mger.get_engine(); diff --git a/src/gpu/intel/jit/v2/conv/planner/planner.cpp b/src/gpu/intel/jit/v2/conv/planner/planner.cpp index 36644582ad5..3115584e8ce 100644 --- a/src/gpu/intel/jit/v2/conv/planner/planner.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/planner.cpp @@ -16,6 +16,8 @@ #include "gpu/intel/jit/v2/conv/planner/planner.hpp" +#include "oneapi/dnnl/dnnl_config.h" + #include "gpu/intel/jit/v2/conv/model.hpp" #include "gpu/intel/jit/v2/conv/plan.hpp" #include "gpu/intel/jit/v2/conv/plan_registry.hpp" @@ -32,20 +34,7 @@ namespace v2 { namespace conv { namespace planner { -enum class planner_mode_t { - undef, - trace, - bench, - search, - auto_search, -}; - -struct params_t { - planner_mode_t mode = planner_mode_t::undef; - kernel_desc_t desc; -}; - -static params_t params; +static planner_params_t params; bool find_remove(const char *arg, std::string &s) { auto pos = s.find(arg); @@ -124,12 +113,12 @@ void init_params( break; default: break; } - auto iface = params.desc.parse_iface(); - iface.parse(cmd_args, params.desc); + auto &iface = params.desc.parse_iface(); + iface.parse(cmd_args, params.desc, &params.parse_result); params.desc.set_defaults(); } -void planner_main(int argc, const char **argv) { +void DNNL_API planner_main(int argc, const char **argv) { bench_manager_t bench_mger; init_params(argc, argv, bench_mger); switch (params.mode) { @@ -149,14 +138,10 @@ void planner_main(int argc, const char **argv) { auto model = model_fit(bd); break; } - case planner_mode_t::auto_search: { - plan_registry() = plan_registry_t(); - auto_search(bench_mger); - break; - } + case planner_mode_t::auto_search: case planner_mode_t::search: { plan_registry() =
plan_registry_t(); - search(bench_mger, params.desc); + search(bench_mger, params); break; } default: ir_error_not_expected(); diff --git a/src/gpu/intel/jit/v2/conv/planner/planner.hpp b/src/gpu/intel/jit/v2/conv/planner/planner.hpp index a16c98223b8..a11820761ca 100644 --- a/src/gpu/intel/jit/v2/conv/planner/planner.hpp +++ b/src/gpu/intel/jit/v2/conv/planner/planner.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,8 @@ #ifndef GPU_INTEL_JIT_V2_CONV_PLANNER_PLANNER_HPP #define GPU_INTEL_JIT_V2_CONV_PLANNER_PLANNER_HPP -#include "oneapi/dnnl/dnnl_config.h" +#include "gpu/intel/jit/utils/utils.hpp" +#include "gpu/intel/jit/v2/conv/kernel_desc.hpp" namespace dnnl { namespace impl { @@ -28,7 +29,19 @@ namespace v2 { namespace conv { namespace planner { -void DNNL_API planner_main(int argc, const char **argv); +enum class planner_mode_t { + undef, + trace, + bench, + search, + auto_search, +}; + +struct planner_params_t { + planner_mode_t mode = planner_mode_t::undef; + kernel_desc_t desc; + parse_result_t parse_result; +}; } // namespace planner } // namespace conv diff --git a/src/gpu/intel/jit/v2/conv/planner/planner_main.cpp b/src/gpu/intel/jit/v2/conv/planner/planner_main.cpp index 973ba4c0faf..2bf0788d4bc 100644 --- a/src/gpu/intel/jit/v2/conv/planner/planner_main.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/planner_main.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,25 @@ * limitations under the License. 
*******************************************************************************/ -#include "gpu/intel/jit/v2/conv/planner/planner.hpp" +#include "oneapi/dnnl/dnnl_config.h" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace jit { +namespace v2 { +namespace conv { +namespace planner { +void DNNL_API planner_main(int argc, const char **argv); +} +} // namespace conv +} // namespace v2 +} // namespace jit +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl int main(int argc, const char **argv) { dnnl::impl::gpu::intel::jit::v2::conv::planner::planner_main(argc, argv); diff --git a/src/gpu/intel/jit/v2/conv/planner/search.cpp b/src/gpu/intel/jit/v2/conv/planner/search.cpp index 80d45c4796b..5ac485b9e03 100644 --- a/src/gpu/intel/jit/v2/conv/planner/search.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/search.cpp @@ -269,27 +269,45 @@ class dim_tile_set_t { pvar_map_t> tiles_; }; -std::vector get_tile_schemes(prop_kind_t prop, bool is_dw) { +struct search_params_t { + kernel_desc_t base_desc; + bool is_iter_set = false; + bool is_tg_set = false; + bool is_prefetch_set = false; + + search_params_t( + const kernel_desc_t &_base_desc, const parse_result_t &parse_result) + : base_desc(_base_desc) { + is_iter_set = parse_result.is_set("--iter"); + is_tg_set = parse_result.is_set("--tg"); + is_prefetch_set = parse_result.is_set("--prefetch"); + } + + search_params_t(const planner_params_t &params) + : search_params_t(params.desc, params.parse_result) {} +}; + +std::vector get_tile_schemes(const search_params_t &params) { std::vector schemes; - if (prop == prop_kind::forward) { + if (params.base_desc.prop == prop_kind::forward) { schemes.emplace_back("tg=[ic], iter=[mb,g,oc,ic]"); schemes.emplace_back("tg=[ic], iter=[ow,g,oc,ic]"); schemes.emplace_back("tg=[oc,mb], iter=[mb,g,oc,ic]"); schemes.emplace_back("tg=[oc,mb], iter=[ow,g,oc,ic]"); schemes.emplace_back("tg=[oc,ow], iter=[mb,g,oc,ic]");
schemes.emplace_back("tg=[oc,ow], iter=[ow,g,oc,ic]"); - } else if (prop == prop_kind::backward_data) { + } else if (params.base_desc.prop == prop_kind::backward_data) { schemes.emplace_back("tg=[ic,iw], iter=[mb,g,oc,ic]"); schemes.emplace_back("tg=[ic,mb], iter=[mb,g,oc,ic]"); schemes.emplace_back("tg=[ic,iw], iter=[iw,g,oc,ic]"); - } else if (prop == prop_kind::backward_weights) { + } else if (params.base_desc.prop == prop_kind::backward_weights) { schemes.emplace_back("tg=[oc,ic], iter=[mb,g,oc,ic]"); schemes.emplace_back("tg=[oc,ic], iter=[ow,g,oc,ic]"); } else { ir_error_not_expected(); } for (auto &s : schemes) { - if (is_dw) { + if (params.base_desc.is_dw) { s.unset(pvars::ic); s.unset(pvars::oc); } else { @@ -360,10 +378,8 @@ class kernel_search_manager_t { static const int max_descs = 256; kernel_search_manager_t( - const bench_manager_t &bench_mger, const kernel_desc_t &base_desc) - : bench_mger_(bench_mger), base_desc_(base_desc) { - reset_reqs(base_desc_); - } + const bench_manager_t &bench_mger, const search_params_t &params) + : bench_mger_(bench_mger), params_(params) {} void search() { std::cout << "Starting kernel search" << std::endl; @@ -386,31 +402,25 @@ class kernel_search_manager_t { } private: - static void reset_reqs(kernel_desc_t &kernel_desc) { - if (kernel_desc.prop != prop_kind::backward_data) return; - // XXX: No stride support in backward by data yet.
- kernel_desc.reqs.add(pvars::sw.var() == 1); - kernel_desc.reqs.add(pvars::sh.var() == 1); - kernel_desc.reqs.add(pvars::sd.var() == 1); - } - std::vector gen_desc_groups() const { - std::unordered_map descs; - for (auto &s : get_tile_schemes(base_desc_.prop, base_desc_.is_dw)) { + std::unordered_set seen; + std::vector descs; + for (auto &s : get_tile_schemes(params_)) { dim_tile_set_t tile_set(s); auto tiling_descs = tile_set.create_tiling_descs(); for (auto &td : tiling_descs) { - auto d = base_desc_; - d.thread_group_tile = td.thread_group; - d.iter_tile = td.iter; + auto d = params_.base_desc; + if (!params_.is_tg_set) d.thread_group_tile = td.thread_group; + if (!params_.is_iter_set) d.iter_tile = td.iter; + auto d_key = jit::stringify(d); + if (seen.count(d_key) > 0) continue; + seen.insert(d_key); if (!finalize_conv_desc(d, bench_mger_.hw())) { std::cout << d.brief_str() << ": \033[1;31mFAIL\033[0m" << std::endl; continue; } - auto d_key = jit::stringify(d); - if (descs.find(d_key) != descs.end()) continue; - descs[d_key] = d; + descs.push_back(d); std::cout << d.brief_str() << ": \033[1;32mOK\033[0m" << std::endl; } @@ -418,15 +428,21 @@ class kernel_search_manager_t { ir_info() << "gen_desc_groups(): descs.size() = " << descs.size() << std::endl; std::unordered_map desc_groups; - for (auto &kv : descs) { - auto &d = kv.second; + std::vector prefetch_dists; + if (params_.is_prefetch_set) { + prefetch_dists.push_back(params_.base_desc.prefetch.dist); + } else { + prefetch_dists.push_back(1); + prefetch_dists.push_back(3); + } + for (auto &d : descs) { auto ret = desc_groups.emplace( d.reqs.str(), search_kernel_desc_group_t(d.reqs)); ret.first->second.add_desc(d); - for (int dist : {1, 3}) { + for (int dist : prefetch_dists) { auto _d = d; _d.prefetch = prefetch_desc_t(dist, true, true); - reset_reqs(_d); + _d.reqs = params_.base_desc.reqs; _d.is_finalized = false; if (!finalize_conv_desc(_d, bench_mger_.hw())) { std::cout << d.brief_str() << ": 
\033[1;31mFAIL\033[0m" @@ -484,7 +500,7 @@ class kernel_search_manager_t { } const bench_manager_t &bench_mger_; - kernel_desc_t base_desc_; + search_params_t params_; }; class search_sequence_t { @@ -586,12 +602,35 @@ bench_data_set_t bench_kernel_desc_group(const bench_manager_t &bench_mger, return bd_set; } -void search(const bench_manager_t &bench_mger, const kernel_desc_t &desc) { - kernel_search_manager_t mger(bench_mger, desc); - mger.search(); +std::string merge_cmd_lines(const std::string &recipe_line, + const parse_result_t &cmd_parse_result) { + auto &iface = kernel_desc_t::parse_iface(); + kernel_desc_t recipe_desc; + parse_result_t recipe_parse_result; + iface.parse(recipe_line, recipe_desc, &recipe_parse_result); + bool is_first = true; + std::ostringstream oss; + for (auto &kv : cmd_parse_result.args()) { + auto &name = kv.first; + ; + auto &value = kv.second; + if (!is_first) oss << " "; + oss << name << "=" << value; + is_first = false; + } + for (auto &kv : recipe_parse_result.args()) { + auto &name = kv.first; + auto &value = kv.second; + if (cmd_parse_result.args().count(name) > 0) continue; + if (!is_first) oss << " "; + oss << name << "=" << value; + is_first = false; + } + return oss.str(); } -void auto_search(const bench_manager_t &bench_mger) { +void auto_search( + const bench_manager_t &bench_mger, const planner_params_t &params) { // clang-format off std::vector recipes = { "--hw xehpc --prop fwd --src axb:s8 --wei axcb:s8 --dst axb:s8 --fma dpas --simd 16 --regs 256 --2d 1", @@ -617,18 +656,41 @@ void auto_search(const bench_manager_t &bench_mger) { "--hw xehpc --dw 1 --prop bwd_w --src axb:f32 --wei axcb:f32 --dst axb:f32 --fma mad --simd 32 --regs 128 --align 1", }; // clang-format on + auto &iface = kernel_desc_t::parse_iface(); double t = get_msec(); + std::unordered_set seen; for (const char *_r : recipes) { - auto r = std::string(_r) + " --iter x --tg x"; + std::string line = merge_cmd_lines(_r, params.parse_result); + if
(seen.count(line) > 0) continue; + seen.insert(line); kernel_desc_t desc; - desc.set(r); + parse_result_t parse_result; + iface.parse(line, desc, &parse_result); + //auto r = std::string(_r) + " --iter x --tg x"; + // TODO: Remove. desc.hw = hw_t(bench_mger.get_engine().get()); - search(bench_mger, desc); + kernel_search_manager_t mger( + bench_mger, search_params_t(desc, parse_result)); + mger.search(); } t = get_msec() - t; std::cout << "Kernel search done, took: " << t / 1e3 << " sec" << std::endl; } +void search(const bench_manager_t &bench_mger, const planner_params_t &params) { + switch (params.mode) { + case planner_mode_t::search: { + kernel_search_manager_t mger(bench_mger, search_params_t(params)); + mger.search(); + break; + } + case planner_mode_t::auto_search: + auto_search(bench_mger, params); + break; + default: ir_error_not_expected(); + } +} + } // namespace planner } // namespace conv } // namespace v2 diff --git a/src/gpu/intel/jit/v2/conv/planner/search.hpp b/src/gpu/intel/jit/v2/conv/planner/search.hpp index 70a00892386..3ee39fe26c7 100644 --- a/src/gpu/intel/jit/v2/conv/planner/search.hpp +++ b/src/gpu/intel/jit/v2/conv/planner/search.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ #define GPU_INTEL_JIT_V2_CONV_PLANNER_SEARCH_HPP #include "gpu/intel/jit/v2/conv/planner/bench.hpp" +#include "gpu/intel/jit/v2/conv/planner/planner.hpp" namespace dnnl { namespace impl { @@ -31,8 +32,7 @@ class kernel_desc_t; namespace planner { -void search(const bench_manager_t &bench_mger, const kernel_desc_t &desc); -void auto_search(const bench_manager_t &bench_mger); +void search(const bench_manager_t &bench_mger, const planner_params_t &params); } // namespace planner } // namespace conv From 1aeb36051a054f120ab0322329f5e915f6cd040a Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Mon, 6 Jan 2025 15:13:35 -0800 Subject: [PATCH 10/14] xe: conv_v2: generalize performance modeling --- src/gpu/intel/jit/v2/conv/model.cpp | 69 ++- src/gpu/intel/jit/v2/conv/model.hpp | 36 +- src/gpu/intel/jit/v2/conv/plan_registry.cpp | 8 +- src/gpu/intel/jit/v2/conv/plan_registry.hpp | 12 +- .../intel/jit/v2/conv/plan_registry_data.cpp | 426 +++++++++--------- .../intel/jit/v2/conv/planner/model_fit.cpp | 44 +- src/gpu/intel/jit/v2/conv/planner/search.cpp | 2 +- 7 files changed, 348 insertions(+), 249 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/model.cpp b/src/gpu/intel/jit/v2/conv/model.cpp index 862139923ef..e6b2e766e64 100644 --- a/src/gpu/intel/jit/v2/conv/model.cpp +++ b/src/gpu/intel/jit/v2/conv/model.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -188,6 +188,10 @@ struct sample_t { n = t[pvars::n]; k = t[pvars::k]; } + + static model_kind_t model_kind(const kernel_desc_t &desc) { + return model_kind_t::data_parallel; + } }; float coef_kl(float x, float a, float b) { @@ -229,7 +233,9 @@ float coef_wp(float x, float a, float b) { // and a few extra threadgroups a distinct increase in time is typically // observed. This effect is more pronounced with a smaller number of full // waves. -float model_t::predict(float kl, float waves, const vec1d &coef) { +float predict_data_parallel(const vec1d &x, const vec1d &coef) { + float kl = x[0]; + float waves = x[1]; float waves_frac = waves - (int)waves; float wp = (waves_frac == 0 ? 1 : waves_frac); float wf = std::ceil(waves); @@ -242,11 +248,16 @@ float model_t::predict(float kl, float waves, const vec1d &coef) { return Tw * (wf + wp * coef_wp(wf, a_wp, b_wp)); } +float model_t::predict(model_kind_t kind, const vec1d &x, const vec1d &coef) { + switch (kind) { + case model_kind_t::data_parallel: return predict_data_parallel(x, coef); + default: ir_error_not_expected() << "Unknown kind: " << to_string(kind); + } + return 0; +} + float model_t::predict(const vec1d &x) const { - ir_assert(x.size() == 2); - float kl = x[0]; - float waves = x[1]; - return model_t::predict(kl, waves, coef_); + return predict(kind_, x, coef_); } float model_t::predict(const problem_t &prb, const kernel_desc_t &desc) const { @@ -283,6 +294,14 @@ void model_t::parse(std::istream &in) { deserialize_from_hex(coef_, s_data); } +size_t model_t::coef_count(model_kind_t kind) { + switch (kind) { + case model_kind_t::data_parallel: return 5; + default: ir_error_not_expected() << "Unknown kind: " << to_string(kind); + } + return 0; +} + std::string to_str(const vec1d &x) { std::ostringstream oss; bool is_first = true; @@ -294,7 +313,43 @@ std::string to_str(const vec1d &x) { return oss.str(); } -void to_model_xy(const bench_data_t &bd, vec2d &X, vec1d &y) { +float model_set_t::eff(const problem_t 
&prb, const kernel_desc_t &desc) const { + auto kind = sample_t::model_kind(desc); + for (auto &m : models_) { + if (m.kind() == kind) return m.eff(prb, desc); + } + ir_error_not_expected() << "Matching model not found: " << desc.str(); + return 0; +} + +void model_set_t::stringify(std::ostream &out) const { + serialized_data_t s; + for (auto &m : models_) { + s.append(m.kind()); + for (auto &c : m.coef()) { + s.append(c); + } + } + out << data_to_hex(s.get_data()); +} + +void model_set_t::parse(std::istream &in) { + auto s_data = stream_parse(in); + auto s = serialized_t::from_data(hex_to_data(s_data)); + deserializer_t d(s); + while (!d.empty()) { + auto kind = d.pop(); + size_t coef_count = model_t::coef_count(kind); + vec1d coef(coef_count); + for (size_t i = 0; i < coef_count; i++) { + d.pop(coef[i]); + } + models_.emplace_back(kind, coef); + } +} + +void to_model_data( + model_kind_t kind, const bench_data_t &bd, vec2d &X, vec1d &y) { X.clear(); y.clear(); X.reserve(bd.size()); diff --git a/src/gpu/intel/jit/v2/conv/model.hpp b/src/gpu/intel/jit/v2/conv/model.hpp index 82d8682a51a..87ebdb19f6b 100644 --- a/src/gpu/intel/jit/v2/conv/model.hpp +++ b/src/gpu/intel/jit/v2/conv/model.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,10 +32,23 @@ namespace conv { using vec1d = std::vector; using vec2d = std::vector>; +enum class model_kind_t : uint8_t { + undef = 0, + data_parallel = 1, +}; + +static auto model_kind_names = nstl::to_array({ + make_enum_name(model_kind_t::undef, "undef"), + make_enum_name(model_kind_t::data_parallel, "data_parallel"), +}); +GPU_DEFINE_PARSE_ENUM(model_kind_t, model_kind_names) + class model_t { public: model_t() = default; - model_t(const vec1d &coef) : coef_(coef) {} + model_t(model_kind_t kind, const vec1d &coef) : kind_(kind), coef_(coef) {} + bool is_empty() const { return kind_ == model_kind_t::undef; } + model_kind_t kind() const { return kind_; } const vec1d &coef() const { return coef_; } float predict(const vec1d &x) const; float predict(const problem_t &prb, const kernel_desc_t &desc) const; @@ -44,13 +57,28 @@ class model_t { void stringify(std::ostream &out) const; void parse(std::istream &in); - static float predict(float kl, float waves, const vec1d &coef); + static float predict(model_kind_t kind, const vec1d &x, const vec1d &coef); + static size_t coef_count(model_kind_t kind); private: + model_kind_t kind_; vec1d coef_; }; -void to_model_xy(const bench_data_t &bd, vec2d &X, vec1d &y); +class model_set_t { +public: + model_set_t() = default; + model_set_t(const model_t &model) { models_.push_back(model); } + float eff(const problem_t &prb, const kernel_desc_t &desc) const; + void stringify(std::ostream &out) const; + void parse(std::istream &in); + +private: + std::vector models_; +}; + +void to_model_data( + model_kind_t kind, const bench_data_t &bd, vec2d &X, vec1d &y); void dump_csv(const bench_data_t &bd, const model_t &model); void dump_model_params(const kernel_desc_t &kernel_desc, const model_t &model); diff --git a/src/gpu/intel/jit/v2/conv/plan_registry.cpp b/src/gpu/intel/jit/v2/conv/plan_registry.cpp index dd0e585efff..269e78ca9b1 100644 --- a/src/gpu/intel/jit/v2/conv/plan_registry.cpp +++ 
b/src/gpu/intel/jit/v2/conv/plan_registry.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ kernel_desc_t plan_registry_t::find_best(const problem_t &prb) const { float best_eff = 0; for (auto &e : entries_) { if (!e.desc.can_fit(prb)) continue; - float eff = e.model.eff(prb, e.desc); + float eff = e.model_set.eff(prb, e.desc); if (eff > best_eff) { best_eff = eff; best = e.desc; @@ -86,14 +86,14 @@ void plan_registry_t::entry_t::stringify(std::ostream &out) const { ir_assert(desc.is_finalized) << "Cannot stringify non-finalized descriptor"; jit::stringify(out, desc); out << " model="; - jit::stringify(out, model); + jit::stringify(out, model_set); } void plan_registry_t::entry_t::parse(std::istream &in) { jit::parse(in, desc); desc.is_finalized = true; stream_match(in, "model="); - jit::parse(in, model); + jit::parse(in, model_set); } struct plan_registry_instance_t { diff --git a/src/gpu/intel/jit/v2/conv/plan_registry.hpp b/src/gpu/intel/jit/v2/conv/plan_registry.hpp index 4e64a434c58..8d2110752f5 100644 --- a/src/gpu/intel/jit/v2/conv/plan_registry.hpp +++ b/src/gpu/intel/jit/v2/conv/plan_registry.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,11 +33,11 @@ class plan_registry_t { public: struct entry_t { kernel_desc_t desc; - model_t model; + model_set_t model_set; entry_t() = default; - entry_t(const kernel_desc_t &desc, const model_t &model) - : desc(desc), model(model) {} + entry_t(const kernel_desc_t &desc, const model_set_t &model_set) + : desc(desc), model_set(model_set) {} void stringify(std::ostream &out) const; void parse(std::istream &in); }; @@ -45,9 +45,9 @@ class plan_registry_t { plan_registry_t() = default; plan_registry_t(const char **entries); - void set(const kernel_desc_t &desc, const model_t &model) { + void set(const kernel_desc_t &desc, const model_set_t &model_set) { ir_assert(desc.is_finalized); - entries_.emplace_back(desc, model); + entries_.emplace_back(desc, model_set); } int size() const { return (int)entries_.size(); } kernel_desc_t find_best(const problem_t &prb) const; diff --git a/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp b/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp index 71ab7b31b37..45edc05e377 100644 --- a/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp +++ b/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,218 +25,218 @@ namespace conv { // clang-format off const char** get_plan_registry_entries() { static const char *entries[] = { - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=oc2ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=0500000000000000E0004B4414C4BF3EFEC7B23F0000803FADD1A03C", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=0500000000000000A83F49440168683FFFBFA13F65C6EA40F6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=0500000000000000914D554405161A3FFF4F853F985FC53FF90B5940", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb32oc64 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=05000000000000001FF47A441234E13E04F44A3F62FCB53FF6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=0500000000000000A6FBA84400708D3F5F34D23D65FE8C3F15C0E63E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=oc2ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=0500000000000000600924441234C83EFBA1B03F0000803F17B7D138", 
- "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=05000000000000003B4C434413F4C63EFBEFA33FFD778F3F05E00E3F", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=0500000000000000CA3613441438CE3EFD5F9D3F66CE863F6334B93D", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=oc4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=05000000000000003368314417888D3EFEC77B3F7FB9F640F6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=0500000000000000DDFA00440C20F53EFEC38E3F0000803F2D9A093E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=mb4oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=0500000000000000404F05441260EC3EFD83AB3F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 
model=050000000000000031AF5C44140CC83E05A3643F55811542F6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=05000000000000009CEA3B440658053F043A783F326BEA41FD7F0D40", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=05000000000000000F6D38441828983E043C6C3F349E7140F6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=05000000000000003425F043152CC13EFE8B8A3F0000803F0FEDC93E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=0500000000000000B19027441222DA3EFE6B873FCDFED33F016C913F", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=0500000000000000AA475245FE479C3F04604D3F9A79823F17809F3E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=0500000000000000473F5645FC3FB83F04204C3F0000803F3200513E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic8 
loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=05000000000000002A874B45FF27943F02A84B3F0000803F1200E83E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=0500000000000000DDDA4D45FF4F993F02604D3F0000803F1480C73E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=x loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=050000000000000073435445FFF7A53F02F85A3F34DB813F3380713E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=0500000000000000B6CF4945FFB7903F03504E3F33F3843F1400C03E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=05000000000000006BAB5145FF6FA13F02F04E3F0000803F1400DE3E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=0500000000000000866A544500E89C3F02A0623F0000803F36005B3E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0500000000000000E78FF543175CA53EFDB5833F0000803F124D9C3E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0500000000000000A7AE30441140F33E00B08E3F9A69A13F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 
reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0500000000000000624E34440550083F0108913F0094A33F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=oc4ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0500000000000000D38643441362D13E008C993F0000803FAF8D263B", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=050000000000000018BED844FD3B943F1658AE3E0000803FF1FFF741", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb32oc64 tg=mb2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=05000000000000004017374415C4973E06F2423F16D25541F6FFA741", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0500000000000000B744DA44FF3B8A3F12A0E33E33238B3F3100743E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0500000000000000A2B2664406D41A3FFE5F7F3F342D843F1700E33E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 
ext=out_b1,out_b4 model=050000000000000088D08744FEF3903F0044963F32450740F8FF8441", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=050000000000000004261B440638113F00C1873F9A71C33F0540183F", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=mb8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0500000000000000E2EE244406A40E3FFE6B8C3FCC44A33F2D9A223E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow64 tg=oc4ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0500000000000000F998FA433444493E03C85D3FAA894E40F6FFA741", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow16 tg=oc2ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0500000000000000FB813C44084E003FFFA58E3F9903AD3F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow16 tg=ic16 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=05000000000000006AE592440028803F0600173F9959DD3F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 
reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0500000000000000426608441420BE3E06882F3F6590F63FF3DF5E41", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow16 tg=oc2ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0500000000000000B421404406FC1A3FFE0A8E3F3295B23F0240633F", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=05000000000000002EA8084501A4683FFE8FA43F9A79823FA9D1F03C", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=050000000000000027F6324501E0633FFE6F923F6646963F1300C53E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=050000000000000073790745011C693FFFEF9E3F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=050000000000000053341A450738203F00107F3F9ABA863F3580673E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=05000000000000005988DF4401C0743FFE2FAF3F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=05000000000000009CA8AF440508233F00208D3F0000803F309A043E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=x loop_desc=kw,kh,kd,ic 
ext=out_b1,out_b4 model=050000000000000045ED03451304E43E0160523F0382883FFC5FA23F", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0500000000000000A826ED4402283F3FFE679A3F34DB813F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb16oc32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=0500000000000000BB9F804435A01F3E0054813F67B6833FF93FFE3F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc8ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=050000000000000041CC9E440718223F0154683F0000803F1240E43E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=05000000000000002F42934405A0303F04204C3F0000803F0470243F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=05000000000000006451B844FC0FB63F02F0443F0000803F3600243E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=05000000000000005D46984406701F3F03BC503F334F803F14C0DC3E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 
reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=05000000000000009A1E964400688B3F0450303F9AD2893F1200C53E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=05000000000000008FEBB944FF179A3F0468273F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=0500000000000000D2CF8C4408E0133F04D03C3F4B86843F0680283F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=mb16oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=0500000000000000F80897440830143F0298513FCF628E3F3300333E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=05000000000000003A274944F2E1F83CFF64703FC9264942F6FFA741", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=05000000000000005D394C443510233E00B4583F3B0CF03FF6FFA741", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 
reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=0500000000000000EBF1A144FF9F9E3F0568183F0000803F1000E83E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=050000000000000033BA864406F0303F05203D3F0000803F0280783F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=0500000000000000B3A0324474C8C63D0192573F99B30140F6FFA741", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=0500000000000000BCC68E4402885C3F05782B3F0000803F06800F3F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=05000000000000007500BE44FECFBE3F0528123F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=oc2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=0500000000000000FD0198441300D93E01C8623F99FF9B3F00C08D3F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ow2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=0500000000000000E862A44408D0003F00B0753F0000803F0540543F", - 
"hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb8oc32 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=05000000000000004815DE4403143F3FFE47973F6696C13F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=0500000000000000C187C8440348383FFE1FA13F69F6CD3F1200D43E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=05000000000000009F20AA4408300A3F00A87D3F0000803F08A0173F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=05000000000000005C71DC440358323FFEDBA33FFF2FE93F128D8E3E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=x loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=0500000000000000A673A3441000E83E002C8B3FFF7A833F00B4803F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=0500000000000000F099B64406101B3F0040863FCDECB83F6134C33D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0500000000000000B4235B4405440F3FFE47883F35D1883F5B34F53D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=050000000000000086C1274405DC123F0078993F6616903F2D5A2B3E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=mb4 
loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=05000000000000007CDD15441274D33E0268773F7F0BD63FFEB78B3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=050000000000000072D6C94316048F3E036C643F0086993F0720153F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=ic2iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=05000000000000000A56F04332943F3E02007B3FB41D803F17C0AA3E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=050000000000000066732744063C083FFE878E3FCAAA973F17B7D138", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb32oc32 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=05000000000000006F774F441448AF3E0388613F6834FB3FF6FFA241", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0500000000000000C14A5B4405EC203F00C8763F6868923F1480A93E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc64 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 
model=0500000000000000564314440578263FFD37BB3F0290923FFA3F1B40", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc32 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0500000000000000DC38194406BC1E3FFE17A93F9B28BA3FFD7FAA3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw32oc32 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0500000000000000B5261D441350E33EFE2F8C3F31652C40F81F8341", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw8oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=05000000000000001F6D2A44030C383FFEF39F3F0000803F6134D73D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 tg=ic4iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0500000000000000F2253044161CEA3E00DC8F3F9A27803F17B7D138", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64iw32oc32 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0500000000000000BD8B3C4416E4BD3E0270473FD40AA242F6FFA741", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64iw32oc32 tg=ic4 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=050000000000000042ED5F440E30F93E05FA353FE4F73242F6FFA741", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 
tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0500000000000000C2B9424403B8273F8006863F6564813FFFAFCF3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0500000000000000A2E8DB4403702E3F00E4873F0000803F13C0F03E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0500000000000000655B1A4508C0153FFFF7783F9A79823F16C0873E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0500000000000000B51BB1440420423F00B07A3F0000803F2E5A263E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb8oc32 tg=ic4iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=05000000000000005ED31B4502204C3F00B8633F0000803F5D34FA3D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=05000000000000009E62F84413B0D83E0408463F3442933FFB2FAA3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 tg=ic4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0500000000000000D6651C4501005D3F01C0653F0000803F291A483E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=05000000000000000DDB154504E0453F00E06D3F0000803F128D893E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=050000000000000014F8C14408080F3F00406D3F6842853F06E0183F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 
regs=128 iter=ic32mb8oc16 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=0500000000000000E3FF984409181D3F0148513F672E933F1300ED3E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb32oc8 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=0500000000000000C267AB44F340723DFE4B7A3F6491933FF6FFA741", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=05000000000000001E22924409701A3F0490403F0000803F1100E33E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb16oc8 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=0500000000000000A06184442ED05D3E0018723F4D5D833FFC3FB83F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=05000000000000004A3FAE4408F00D3FFA4FB73FCDE0853F0630193F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=05000000000000007348974413F0E33E0098973F0000803F0660203F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 
model=050000000000000069DE9A440100853F03CC4A3F98ED803F12A0ED3E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=0500000000000000EA958E440610203F0440403F0000803F05D0193F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=05000000000000003267864404D0373F0508373FE5C5873F03A05D3F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=050000000000000039AC8F440150803F0368403F67388C3FFF77943F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0500000000000000897F9B440680373F03B0433FCD8EA03FFC5F933F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw16oc8 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0500000000000000F50D784418E0853E00B8723F35E9A83FF87F1C40", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0500000000000000EE529244FDFF983F0338393F3490843F02E0593F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 
reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=050000000000000002B688440630373F04703D3F00E5883F03E0593F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0500000000000000231DA7440110843FFBFFAC3F00A4843F0BE0093F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=050000000000000034428C4405605C3F02B0433F0041843F02A0623F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic2iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=05000000000000008B6AAC4416809A3E0040723F0178AC3FF6FFA741", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=ic2mb2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=05000000000000003F82B1441060F13EFF0F983F00268D3F1000E83E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic4iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=050000000000000030A0C14406C00B3FFAFFAC3F99739A3FF6FFA741", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=05000000000000005D9998441510BA3E01785D3F0000803FFBDFD63F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0500000000000000E0EAB04408600C3F00A8963F0000803F0880F43E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 
model=05000000000000008F8F9F443260693E00E4823FCC40923FFFAF843F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=mb2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0500000000000000C124B6440780373FFF07873FCCA6863F1300DE3E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=05000000000000001467A9441480B83E00B07A3F344F803F03F0493F", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc32 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0500000000000000D51D8C43F540403D01C0563F99F9334017B7D138", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc32 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=050000000000000035E5084437001F3EFEFF753F66E6A63FF67F2841", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0500000000000000AC38EF437040EA3D0110663F346FC23FC700AA40", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0500000000000000C7D86943F383533C0390403F33755B42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc64 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 
reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4 model=05000000000000009383A643F1FFF741F6FFA7419A6D1942F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc32 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0500000000000000D7348F437880823D04E8473F022C923F17B7D138", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc32 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=050000000000000019D7EF43F603333CFF4B843F9631F93FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc64 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4 model=0500000000000000647D8D43F7A0263D0420423F0000803F17B7D138", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=0500000000000000569FCC4378A08F3D00206F3F00202540F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow64 tg=ic16 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=050000000000000031C70D443680213E01C0653FCD141D40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc64ow32 tg=ic8oc4 
loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=05000000000000006B302B443440393E00A0763F672A823F1500A23E", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=050000000000000094A5D7433380353E03104D3F9889E33FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow16 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=0500000000000000F84D6A43F4403B3DFF7F873F66CAF53FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=050000000000000076F4C44339F0063E0330413FFFEBAA3FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow32 tg=ic8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=05000000000000001B97A643F7FF0F42F6FFA7413313AD40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias 
model=05000000000000008EEFCB437540B33DFF87703FFF879340F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb32oc16 tg=ic2oc4 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0500000000000000CCB2FE44F400443D00407C3F6666D83FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 tg=oc8 loop_desc=mb,ow,oh,od ext=out_b4,bias model=050000000000000082AAE6447880963D02AC563F65B4AC3FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc32 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=050000000000000099FC0545F0005D3D01A0623F65C6E43FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=050000000000000059E0C344F0407C3D00207E3F66FEA93FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow16 tg=ic8 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0500000000000000230D9C44F000583D01646C3F01D8B83FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc16 tg=oc2 loop_desc=mb,ow,oh,od ext=out_b4,bias model=050000000000000011D8AB44F2C0383D00206A3F32E3C03FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=05000000000000009034F9447800803DFF7F823F3273D33FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 tg=ic2 loop_desc=mb,ow,oh,od ext=out_b4,bias model=050000000000000000CCE4447180C83D01885C3FFF3FE73FF8FF8840", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 
reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=0500000000000000A6A88B43F7C01A3D0368453F9A211840F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=05000000000000001B917644F801A23CFE3F7C3FFFE99D42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16mb8oc32 tg=x loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=05000000000000003E380244F603563CFF5F893F34F3A740F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc32 tg=x loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=05000000000000005815AC43F2035B3C030C4C3F67DAA042F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc32 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=05000000000000003D15C843F200713D02104D3F9CB94C40F37FEA40", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic2oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=050000000000000015E48044F821963C00887A3FCB899F42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb16oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=050000000000000002A5B243F5C1AA3C03E84C3F97594341F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 
wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=0500000000000000B3308343F483443C0028673F01809842F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc32ow8 tg=oc8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 model=050000000000000063BB0044F781B33C00B8863F9AA96B40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow32 tg=oc2 loop_desc=mb,ow,oh,od 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0500000000000000B7A19B443600103E0040863F6646B940F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=05000000000000007D537A437140D13D04702E3F3473B63FC7E0C440", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0500000000000000EFB79B43F600173D0040863F99412842F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=050000000000000054405244F880053D0140593FE5603040F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 
dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=050000000000000093739B43F521AA3C01A85F3F9929D73FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic4oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0500000000000000179E0044F281E03C0110703FCDD09741F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0500000000000000D8ABF143F621A03C0342573FFD770F40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic4oc2 loop_desc=mb,ow,oh,od ext=bias model=050000000000000033E02B44F4E1CB3C0018683F64D43C42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc16 loop_desc=mb,ow,oh,od ext=bias model=0500000000000000C23F2D44F860073DFF7B813FCDDCF43FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic8 loop_desc=mb,ow,oh,od ext=bias model=050000000000000009F32944F0E1D53C02C85D3F353C7942F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=oc4 loop_desc=mb,ow,oh,od ext=bias model=05000000000000002FC23044F401C03C0100763FCD3CA440F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb16oc16 tg=ic4 loop_desc=mb,ow,oh,od ext=bias 
model=0500000000000000F0B64844F401C03C01805F3F65E49942F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc2 loop_desc=mb,ow,oh,od ext=bias model=050000000000000069912244F3804B3DFFDF773F33F36740F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic8 loop_desc=mb,ow,oh,od ext=bias model=05000000000000006CC8F643F481C73C0280643F9813E441F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic2oc2 loop_desc=mb,ow,oh,od ext=bias model=0500000000000000CEC02C44F441C13C0268723FFDCB3642F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=05000000000000000553AB450000803F0000803F00300640FFFF843F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=0500000000000000B9ADA5450000803F0000803F6656F73F0200623F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=0500000000000000CE91A9450000803F0000803F99B9E93F0500213F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=05000000000000009B76A8450000803F0000803F9A7902402F007E3E", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=05000000000000004360AE450000803F0000803FCD8CC93F0480283F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 
model=0500000000000000C9D4A5450000803F0000803F6726F13FFF7F783F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=0500000000000000157AAB450000803F0000803F9911B73F3300603E", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=05000000000000009EE0AB450000803F0000803F67EEC83F3600293E", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=0500000000000000A0C914450000803F0000803F34959942F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=0500000000000000699018450000803F0000803FCEE59A42F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=050000000000000058D322450000803F0000803F5F76BF40F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=0500000000000000FE3B1B450000803F0000803FCE8C2F41F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=05000000000000009D3A17450000803F0000803F98A96E415F002041", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=0500000000000000380415450000803F0000803F00128242F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic 
reqs=ic1oc1 ext=out_b1,out_b4 model=05000000000000007EEC16450000803F0000803F64E6AC405A007041", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=0500000000000000CF0218450000803F0000803F3313CD40F83F1341", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=050000000000000042FEDA440000803F0000803F33A33C40FA7F8140", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=05000000000000007B4EDA4400007B3F0000803F65F64D40FC7F0840", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=0500000000000000014BD7440000803F0000803F34D34240FB7F0D40", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=05000000000000003894DE440000853F0000803FCB4C4540F9FF3C40", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=0500000000000000D57BD7440000803F0000803FCDCC1340FCFFC03F", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=05000000000000001CF3D6440000803F0000803FCB9C7040F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=05000000000000006A27DA4400E0863F0000803F9BB98C40F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow2 
loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=0500000000000000EF0CD4440050803F0000803F9B495F40291A5C3E", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000183E1A450000803F0000803FFFB75C41F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000AA431C450000803F0000803FFDFF6541F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000025A16450000803F0000803F99C28F42F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000B63C16450000803F0000803F998B0842F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000014313450000803F0000803F998B8442F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000D2E415450000803F0000803F67167641F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000DD4E12450000803F0000803FCBCC7941F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000955F18450000803F0000803FCCC49440FBFFD93F", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw8 
loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=05000000000000007055E0440000803F0000803F34FB2640F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000BD82EB440000803F0000803F66CE0640F77FA440", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=050000000000000091B1D0440100763F0000803F64F64D40F87FAE40", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=05000000000000009B1BCD440000803F0000803FCACC7640F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000800BDE440080823F0000803FE6690C4005805A3F", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=05000000000000007FEBC744001C823F0000803F99B98C40F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0500000000000000D60AD5440000803F0000803F335311400240633F", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=05000000000000003A92C34400807D3F0000803F660E9140F6FF5540", - "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16mb8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=out_b4,bias model=050000000000000069CC71447940813D01305F3FCC2C87420000803F", - "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16ow8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 
ext=out_b4,bias model=05000000000000005EBA5944F500493DFEFF7F3F341391420000803F", - "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=bias model=050000000000000027096344F980003D0000713F65E65F410000803F", - "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=bias model=05000000000000002EB44044F500493D01005D3FFFBF66420000803F", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=oc2ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01E0004B4414C4BF3EFEC7B23F0000803FADD1A03C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01A83F49440168683FFFBFA13F65C6EA40F6FFA741", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01914D554405161A3FFF4F853F985FC53FF90B5940", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb32oc64 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=011FF47A441234E13E04F44A3F62FCB53FF6FFA741", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01A6FBA84400708D3F5F34D23D65FE8C3F15C0E63E", + "hw=xehpc 
prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=oc2ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01600924441234C83EFBA1B03F0000803F17B7D138", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=013B4C434413F4C63EFBEFA33FFD778F3F05E00E3F", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01CA3613441438CE3EFD5F9D3F66CE863F6334B93D", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=oc4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=013368314417888D3EFEC77B3F7FB9F640F6FFA741", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=01DDFA00440C20F53EFEC38E3F0000803F2D9A093E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=mb4oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=01404F05441260EC3EFD83AB3F0000803F17B7D138", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 
iter=ic64oc64ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=0131AF5C44140CC83E05A3643F55811542F6FFA741", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=019CEA3B440658053F043A783F326BEA41FD7F0D40", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=010F6D38441828983E043C6C3F349E7140F6FFA741", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=013425F043152CC13EFE8B8A3F0000803F0FEDC93E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=01B19027441222DA3EFE6B873FCDFED33F016C913F", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01AA475245FE479C3F04604D3F9A79823F17809F3E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01473F5645FC3FB83F04204C3F0000803F3200513E", + "hw=xehpc 
prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=012A874B45FF27943F02A84B3F0000803F1200E83E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01DDDA4D45FF4F993F02604D3F0000803F1480C73E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=x loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=0173435445FFF7A53F02F85A3F34DB813F3380713E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01B6CF4945FFB7903F03504E3F33F3843F1400C03E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=016BAB5145FF6FA13F02F04E3F0000803F1400DE3E", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01866A544500E89C3F02A0623F0000803F36005B3E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01E78FF543175CA53EFDB5833F0000803F124D9C3E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01A7AE30441140F33E00B08E3F9A69A13F17B7D138", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 
ext=out_b1,out_b4 model=01624E34440550083F0108913F0094A33F17B7D138", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=oc4ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01D38643441362D13E008C993F0000803FAF8D263B", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0118BED844FD3B943F1658AE3E0000803FF1FFF741", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb32oc64 tg=mb2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=014017374415C4973E06F2423F16D25541F6FFA741", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01B744DA44FF3B8A3F12A0E33E33238B3F3100743E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01A2B2664406D41A3FFE5F7F3F342D843F1700E33E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0188D08744FEF3903F0044963F32450740F8FF8441", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 
iter=ic64oc32ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0104261B440638113F00C1873F9A71C33F0540183F", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=mb8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01E2EE244406A40E3FFE6B8C3FCC44A33F2D9A223E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow64 tg=oc4ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01F998FA433444493E03C85D3FAA894E40F6FFA741", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow16 tg=oc2ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01FB813C44084E003FFFA58E3F9903AD3F17B7D138", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow16 tg=ic16 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=016AE592440028803F0600173F9959DD3F17B7D138", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01426608441420BE3E06882F3F6590F63FF3DF5E41", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 
regs=256 iter=ic64oc32ow16 tg=oc2ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01B421404406FC1A3FFE0A8E3F3295B23F0240633F", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=012EA8084501A4683FFE8FA43F9A79823FA9D1F03C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0127F6324501E0633FFE6F923F6646963F1300C53E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0173790745011C693FFFEF9E3F0000803F17B7D138", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0153341A450738203F00107F3F9ABA863F3580673E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=015988DF4401C0743FFE2FAF3F0000803F17B7D138", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=019CA8AF440508233F00208D3F0000803F309A043E", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=x loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0145ED03451304E43E0160523F0382883FFC5FA23F", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=01A826ED4402283F3FFE679A3F34DB813F17B7D138", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb16oc32 tg=x 
loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=01BB9F804435A01F3E0054813F67B6833FF93FFE3F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc8ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=0141CC9E440718223F0154683F0000803F1240E43E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=012F42934405A0303F04204C3F0000803F0470243F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=016451B844FC0FB63F02F0443F0000803F3600243E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=015D46984406701F3F03BC503F334F803F14C0DC3E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=019A1E964400688B3F0450303F9AD2893F1200C53E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=018FEBB944FF179A3F0468273F0000803F17B7D138", + "hw=xehpc prop=fwd src=axb:f32 
wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=01D2CF8C4408E0133F04D03C3F4B86843F0680283F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=mb16oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01F80897440830143F0298513FCF628E3F3300333E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=013A274944F2E1F83CFF64703FC9264942F6FFA741", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=015D394C443510233E00B4583F3B0CF03FF6FFA741", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01EBF1A144FF9F9E3F0568183F0000803F1000E83E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=0133BA864406F0303F05203D3F0000803F0280783F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad 
simd=32 regs=128 iter=ic8oc32ow16 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01B3A0324474C8C63D0192573F99B30140F6FFA741", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01BCC68E4402885C3F05782B3F0000803F06800F3F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=017500BE44FECFBE3F0528123F0000803F17B7D138", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=oc2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01FD0198441300D93E01C8623F99FF9B3F00C08D3F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ow2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01E862A44408D0003F00B0753F0000803F0540543F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb8oc32 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=014815DE4403143F3FFE47973F6696C13F17B7D138", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01C187C8440348383FFE1FA13F69F6CD3F1200D43E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=019F20AA4408300A3F00A87D3F0000803F08A0173F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 
tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=015C71DC440358323FFEDBA33FFF2FE93F128D8E3E", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=x loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01A673A3441000E83E002C8B3FFF7A833F00B4803F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01F099B64406101B3F0040863FCDECB83F6134C33D", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=01B4235B4405440F3FFE47883F35D1883F5B34F53D", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0186C1274405DC123F0078993F6616903F2D5A2B3E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=mb4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=017CDD15441274D33E0268773F7F0BD63FFEB78B3F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0172D6C94316048F3E036C643F0086993F0720153F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=ic2iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=010A56F04332943F3E02007B3FB41D803F17C0AA3E", + "hw=xehpc prop=bwd_d 
src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0166732744063C083FFE878E3FCAAA973F17B7D138", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb32oc32 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=016F774F441448AF3E0388613F6834FB3FF6FFA241", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=01C14A5B4405EC203F00C8763F6868923F1480A93E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc64 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01564314440578263FFD37BB3F0290923FFA3F1B40", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc32 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01DC38194406BC1E3FFE17A93F9B28BA3FFD7FAA3F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw32oc32 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01B5261D441350E33EFE2F8C3F31652C40F81F8341", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw8oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 
model=011F6D2A44030C383FFEF39F3F0000803F6134D73D", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 tg=ic4iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01F2253044161CEA3E00DC8F3F9A27803F17B7D138", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64iw32oc32 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01BD8B3C4416E4BD3E0270473FD40AA242F6FFA741", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64iw32oc32 tg=ic4 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0142ED5F440E30F93E05FA353FE4F73242F6FFA741", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01C2B9424403B8273F8006863F6564813FFFAFCF3F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01A2E8DB4403702E3F00E4873F0000803F13C0F03E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01655B1A4508C0153FFFF7783F9A79823F16C0873E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01B51BB1440420423F00B07A3F0000803F2E5A263E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb8oc32 tg=ic4iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 
model=015ED31B4502204C3F00B8633F0000803F5D34FA3D", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=019E62F84413B0D83E0408463F3442933FFB2FAA3F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 tg=ic4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01D6651C4501005D3F01C0653F0000803F291A483E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=010DDB154504E0453F00E06D3F0000803F128D893E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0114F8C14408080F3F00406D3F6842853F06E0183F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01E3FF984409181D3F0148513F672E933F1300ED3E", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb32oc8 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01C267AB44F340723DFE4B7A3F6491933FF6FFA741", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=011E22924409701A3F0490403F0000803F1100E33E", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb16oc8 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01A06184442ED05D3E0018723F4D5D833FFC3FB83F", + 
"hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=014A3FAE4408F00D3FFA4FB73FCDE0853F0630193F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=017348974413F0E33E0098973F0000803F0660203F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=0169DE9A440100853F03CC4A3F98ED803F12A0ED3E", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01EA958E440610203F0440403F0000803F05D0193F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=013267864404D0373F0508373FE5C5873F03A05D3F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0139AC8F440150803F0368403F67388C3FFF77943F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01897F9B440680373F03B0433FCD8EA03FFC5F933F", + "hw=xehpc 
prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw16oc8 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01F50D784418E0853E00B8723F35E9A83FF87F1C40", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01EE529244FDFF983F0338393F3490843F02E0593F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0102B688440630373F04703D3F00E5883F03E0593F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01231DA7440110843FFBFFAC3F00A4843F0BE0093F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0134428C4405605C3F02B0433F0041843F02A0623F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic2iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=018B6AAC4416809A3E0040723F0178AC3FF6FFA741", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=ic2mb2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=013F82B1441060F13EFF0F983F00268D3F1000E83E", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic4iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 
model=0130A0C14406C00B3FFAFFAC3F99739A3FF6FFA741", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=015D9998441510BA3E01785D3F0000803FFBDFD63F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01E0EAB04408600C3F00A8963F0000803F0880F43E", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=018F8F9F443260693E00E4823FCC40923FFFAF843F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=mb2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01C124B6440780373FFF07873FCCA6863F1300DE3E", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=011467A9441480B83E00B07A3F344F803F03F0493F", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc32 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01D51D8C43F540403D01C0563F99F9334017B7D138", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc32 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0135E5084437001F3EFEFF753F66E6A63FF67F2841", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01AC38EF437040EA3D0110663F346FC23FC700AA40", + "hw=xehpc prop=bwd_w 
src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01C7D86943F383533C0390403F33755B42F6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc64 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4 model=019383A643F1FFF741F6FFA7419A6D1942F6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc32 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01D7348F437880823D04E8473F022C923F17B7D138", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc32 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0119D7EF43F603333CFF4B843F9631F93FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc64 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4 model=01647D8D43F7A0263D0420423F0000803F17B7D138", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=01569FCC4378A08F3D00206F3F00202540F6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow64 tg=ic16 
loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=0131C70D443680213E01C0653FCD141D40F6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc64ow32 tg=ic8oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=016B302B443440393E00A0763F672A823F1500A23E", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=0194A5D7433380353E03104D3F9889E33FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow16 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=01F84D6A43F4403B3DFF7F873F66CAF53FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=0176F4C44339F0063E0330413FFFEBAA3FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow32 tg=ic8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=011B97A643F7FF0F42F6FFA7413313AD40F6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 
wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=018EEFCB437540B33DFF87703FFF879340F6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb32oc16 tg=ic2oc4 loop_desc=mb,ow,oh,od ext=out_b4,bias model=01CCB2FE44F400443D00407C3F6666D83FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 tg=oc8 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0182AAE6447880963D02AC563F65B4AC3FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc32 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=0199FC0545F0005D3D01A0623F65C6E43FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=0159E0C344F0407C3D00207E3F66FEA93FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow16 tg=ic8 loop_desc=mb,ow,oh,od ext=out_b4,bias model=01230D9C44F000583D01646C3F01D8B83FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc16 tg=oc2 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0111D8AB44F2C0383D00206A3F32E3C03FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=019034F9447800803DFF7F823F3273D33FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 tg=ic2 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0100CCE4447180C83D01885C3FFF3FE73FF8FF8840", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad 
simd=16 regs=128 iter=ic8mb8oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=01A6A88B43F7C01A3D0368453F9A211840F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=011B917644F801A23CFE3F7C3FFFE99D42F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16mb8oc32 tg=x loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=013E380244F603563CFF5F893F34F3A740F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc32 tg=x loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=015815AC43F2035B3C030C4C3F67DAA042F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc32 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=013D15C843F200713D02104D3F9CB94C40F37FEA40", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic2oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=0115E48044F821963C00887A3FCB899F42F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb16oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=0102A5B243F5C1AA3C03E84C3F97594341F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 
fma=mad simd=16 regs=128 iter=ic8mb8oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=01B3308343F483443C0028673F01809842F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc32ow8 tg=oc8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 model=0163BB0044F781B33C00B8863F9AA96B40F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow32 tg=oc2 loop_desc=mb,ow,oh,od 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01B7A19B443600103E0040863F6646B940F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=017D537A437140D13D04702E3F3473B63FC7E0C440", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01EFB79B43F600173D0040863F99412842F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0154405244F880053D0140593FE5603040F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 
reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0193739B43F521AA3C01A85F3F9929D73FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic4oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01179E0044F281E03C0110703FCDD09741F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01D8ABF143F621A03C0342573FFD770F40F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic4oc2 loop_desc=mb,ow,oh,od ext=bias model=0133E02B44F4E1CB3C0018683F64D43C42F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc16 loop_desc=mb,ow,oh,od ext=bias model=01C23F2D44F860073DFF7B813FCDDCF43FF6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic8 loop_desc=mb,ow,oh,od ext=bias model=0109F32944F0E1D53C02C85D3F353C7942F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=oc4 loop_desc=mb,ow,oh,od ext=bias model=012FC23044F401C03C0100763FCD3CA440F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb16oc16 tg=ic4 loop_desc=mb,ow,oh,od ext=bias model=01F0B64844F401C03C01805F3F65E49942F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc2 loop_desc=mb,ow,oh,od ext=bias model=0169912244F3804B3DFFDF773F33F36740F6FFA741", + "hw=xehpc 
prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic8 loop_desc=mb,ow,oh,od ext=bias model=016CC8F643F481C73C0280643F9813E441F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic2oc2 loop_desc=mb,ow,oh,od ext=bias model=01CEC02C44F441C13C0268723FFDCB3642F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=010553AB450000803F0000803F00300640FFFF843F", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01B9ADA5450000803F0000803F6656F73F0200623F", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01CE91A9450000803F0000803F99B9E93F0500213F", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=019B76A8450000803F0000803F9A7902402F007E3E", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=014360AE450000803F0000803FCD8CC93F0480283F", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01C9D4A5450000803F0000803F6726F13FFF7F783F", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01157AAB450000803F0000803F9911B73F3300603E", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 
model=019EE0AB450000803F0000803F67EEC83F3600293E", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01A0C914450000803F0000803F34959942F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01699018450000803F0000803FCEE59A42F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=0158D322450000803F0000803F5F76BF40F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01FE3B1B450000803F0000803FCE8C2F41F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=019D3A17450000803F0000803F98A96E415F002041", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01380415450000803F0000803F00128242F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=017EEC16450000803F0000803F64E6AC405A007041", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01CF0218450000803F0000803F3313CD40F83F1341", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=0142FEDA440000803F0000803F33A33C40FA7F8140", + "hw=xehpc prop=fwd dw=1 src=axb:f32 
wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=017B4EDA4400007B3F0000803F65F64D40FC7F0840", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=01014BD7440000803F0000803F34D34240FB7F0D40", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=013894DE440000853F0000803FCB4C4540F9FF3C40", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=01D57BD7440000803F0000803FCDCC1340FCFFC03F", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=011CF3D6440000803F0000803FCB9C7040F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=016A27DA4400E0863F0000803F9BB98C40F6FFA741", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=01EF0CD4440050803F0000803F9B495F40291A5C3E", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01183E1A450000803F0000803FFFB75C41F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01AA431C450000803F0000803FFDFF6541F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 
model=01025A16450000803F0000803F99C28F42F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01B63C16450000803F0000803F998B0842F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01014313450000803F0000803F998B8442F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01D2E415450000803F0000803F67167641F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01DD4E12450000803F0000803FCBCC7941F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01955F18450000803F0000803FCCC49440FBFFD93F", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=017055E0440000803F0000803F34FB2640F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01BD82EB440000803F0000803F66CE0640F77FA440", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0191B1D0440100763F0000803F64F64D40F87FAE40", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=019B1BCD440000803F0000803FCACC7640F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x 
loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01800BDE440080823F0000803FE6690C4005805A3F", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=017FEBC744001C823F0000803F99B98C40F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01D60AD5440000803F0000803F335311400240633F", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=013A92C34400807D3F0000803F660E9140F6FF5540", + "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16mb8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=out_b4,bias model=0169CC71447940813D01305F3FCC2C87420000803F", + "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16ow8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=out_b4,bias model=015EBA5944F500493DFEFF7F3F341391420000803F", + "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=bias model=0127096344F980003D0000713F65E65F410000803F", + "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=bias model=012EB44044F500493D01005D3FFFBF66420000803F", nullptr, }; return entries; diff --git a/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp b/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp index 9692dff6fcd..6e96aad9e0e 100644 --- a/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * 
Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,11 +27,12 @@ namespace planner { namespace { -float r2_score(const vec2d &X, const vec1d &y, const vec1d &coef) { +float r2_score( + model_kind_t kind, const vec2d &X, const vec1d &y, const vec1d &coef) { std::vector y_true, y_pred; for (size_t i = 0; i < X.size(); i++) { y_true.push_back(y[i]); - y_pred.push_back(model_t::predict(X[i][0], X[i][1], coef)); + y_pred.push_back(model_t::predict(kind, X[i], coef)); } float u = 0; float v = 0; @@ -65,6 +66,9 @@ struct model_params_t { float operator()() const { return val; } }; + model_params_t() = default; + model_params_t(model_kind_t kind) : kind(kind) {} + void add(const std::string &name, float val, float lo, float hi) { vec.emplace_back(param_t(name, val, lo, hi)); } @@ -86,6 +90,7 @@ struct model_params_t { return oss.str(); } + model_kind_t kind = model_kind_t::undef; std::vector vec; }; @@ -93,7 +98,7 @@ float r2_score(const vec2d &X, const vec1d &y, const model_params_t &params) { vec1d coef; for (int i = 0; i < params.size(); i++) coef.push_back(params[i].val); - return r2_score(X, y, coef); + return r2_score(params.kind, X, y, coef); } void find_optimal_param( @@ -120,8 +125,8 @@ void find_optimal_param( } // namespace -model_t model_fit(const vec2d &X, const vec1d &y, bool verbose = false) { - model_params_t params; +model_t model_fit_data_parallel(const vec2d &X, const vec1d &y, bool verbose) { + model_params_t params(model_kind_t::data_parallel); // Empirically-based parameter ranges.
params.add("T0", 1000, 1, 100000); params.add("a_kl", 1, 0.0001f, 100); @@ -143,7 +148,17 @@ model_t model_fit(const vec2d &X, const vec1d &y, bool verbose = false) { vec1d coef; for (int i = 0; i < params.size(); i++) coef.push_back(params[i].val); - return model_t(coef); + return model_t(params.kind, coef); +} + +model_t model_fit(model_kind_t kind, const vec2d &X, const vec1d &y, + bool verbose = false) { + switch (kind) { + case model_kind_t::data_parallel: + return model_fit_data_parallel(X, y, verbose); + default: ir_error_not_expected() << "Unknown kind: " << to_string(kind); + } + return model_t(); } model_t model_fit(const bench_data_t &bd) { @@ -152,10 +167,11 @@ model_t model_fit(const bench_data_t &bd) { return model_t(); } // Step 1. Fit model. + model_kind_t kind = model_kind_t::data_parallel; vec2d X; vec1d y; - to_model_xy(bd, X, y); - auto ml_model = model_fit(X, y); + to_model_data(kind, bd, X, y); + auto model = model_fit(kind, X, y); // Step 2. Remove outliers where the fitted model predicts significantly // higher times. 
For example this may happen due to better L1 cache reuse @@ -164,15 +180,15 @@ model_t model_fit(const bench_data_t &bd) { vec2d X_adjusted; vec1d y_adjusted; for (size_t i = 0; i < X.size(); i++) { - float pred = ml_model.predict(X[i]); + float pred = model.predict(X[i]); if ((pred - y[i]) > 0.25 * y[i]) continue; X_adjusted.push_back(X[i]); y_adjusted.push_back(y[i]); } - ml_model = model_fit(X_adjusted, y_adjusted, /*verbose=*/true); - dump_csv(bd, ml_model); - dump_model_params(bd.kernel_desc, ml_model); - return model_t(ml_model); + model = model_fit(kind, X_adjusted, y_adjusted, /*verbose=*/true); + dump_csv(bd, model); + dump_model_params(bd.kernel_desc, model); + return model_t(model); } } // namespace planner diff --git a/src/gpu/intel/jit/v2/conv/planner/search.cpp b/src/gpu/intel/jit/v2/conv/planner/search.cpp index 5ac485b9e03..eb183c5bb76 100644 --- a/src/gpu/intel/jit/v2/conv/planner/search.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/search.cpp @@ -395,7 +395,7 @@ class kernel_search_manager_t { if (!bd_model) continue; auto model = model_fit(bd_model); auto d_ext = try_extensions(bench_mger_, d); - registry.set(d_ext, model); + registry.set(d_ext, model_set_t(model)); } } std::cout << "Kernel search completed" << std::endl; From 5492d310907a44d8c4fe400dec34e6c8d8798b59 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 7 Jan 2025 15:00:12 -0800 Subject: [PATCH 11/14] x64: fix copyrights --- src/cpu/x64/brgemm/brgemm.cpp | 2 +- src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp | 2 +- src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp | 2 +- src/cpu/x64/jit_brdgmm_dw_conv.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpu/x64/brgemm/brgemm.cpp b/src/cpu/x64/brgemm/brgemm.cpp index 17e182032df..44ede79e75f 100644 --- a/src/cpu/x64/brgemm/brgemm.cpp +++ b/src/cpu/x64/brgemm/brgemm.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* 
Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp b/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp index 583c520e697..b3765ee159b 100644 --- a/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp +++ b/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp b/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp index 49d4771a9b8..82406a2d218 100644 --- a/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp +++ b/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/cpu/x64/jit_brdgmm_dw_conv.cpp b/src/cpu/x64/jit_brdgmm_dw_conv.cpp index e391fcaadf7..21c9a0ea599 100644 --- a/src/cpu/x64/jit_brdgmm_dw_conv.cpp +++ b/src/cpu/x64/jit_brdgmm_dw_conv.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From f4dd9a22005cd688aa44ee306eb108ff9721e2f9 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Tue, 7 Jan 2025 14:46:44 -0800 Subject: [PATCH 12/14] xe: jit: add atomic_add for int data type --- src/gpu/intel/jit/codegen/send.hpp | 16 ++++++++++++--- src/gpu/intel/jit/ir/message.cpp | 20 +------------------ src/gpu/intel/jit/ir/message.hpp | 24 +++++++++++++++++++---- src/gpu/intel/jit/v2/conv/kernel_desc.cpp | 3 +-- src/gpu/intel/jit/v2/conv/plan.cpp | 9 +++++++-- src/gpu/intel/jit/v2/ir/bridge.hpp | 3 ++- src/gpu/intel/jit/v2/ir/send.hpp | 11 ++++++++--- 7 files changed, 52 insertions(+), 34 deletions(-) diff --git a/src/gpu/intel/jit/codegen/send.hpp b/src/gpu/intel/jit/codegen/send.hpp index 66968669927..bc7bbc1808d 100644 --- a/src/gpu/intel/jit/codegen/send.hpp +++ b/src/gpu/intel/jit/codegen/send.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -140,7 +140,7 @@ class send_impl_t { host->load(mod, data, spec, base, addr); } else if (send_.is_atomic()) { atomic_helper_t::call( - host, ngen::AtomicOp::fadd, mod, spec, base, addr, data); + host, to_atomic_op(send_.op), mod, spec, base, addr, data); } else if (send_.is_store()) { host->store(mod, spec, base, addr, data); } else { @@ -202,7 +202,7 @@ class send_impl_t { } else if (send_.is_store()) { host->store.ugm(mod, *lsc_spec, host->A64, header, data); } else if (send_.is_atomic()) { - host->atomic.ugm(ngen::AtomicOp::fadd, mod, *lsc_spec, + host->atomic.ugm(to_atomic_op(send_.op), mod, *lsc_spec, to_address_base(send_.address), header, data); } } else { @@ -274,6 +274,16 @@ class send_impl_t { return ngen::AddressBase(); } + static ngen::AtomicOp to_atomic_op(send_op_t op) { + switch (op) { + case send_op_t::atomic_add: return ngen::AtomicOp::add; + case send_op_t::atomic_fadd: return ngen::AtomicOp::fadd; + case send_op_t::atomic_cmpwr: return ngen::AtomicOp::cmpwr; + default: ir_error_not_expected(); + } + return ngen::AtomicOp(); + } + const send_t &send_; }; diff --git a/src/gpu/intel/jit/ir/message.cpp b/src/gpu/intel/jit/ir/message.cpp index a84fe3dbaa6..58969d62aeb 100644 --- a/src/gpu/intel/jit/ir/message.cpp +++ b/src/gpu/intel/jit/ir/message.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,24 +31,6 @@ namespace gpu { namespace intel { namespace jit { -std::ostream &operator<<(std::ostream &out, const send_op_t op) { - const char *s = nullptr; - switch (op) { - case send_op_t::atomic_fadd: s = "atomic_fadd"; break; - case send_op_t::atomic_cmpwr: s = "atomic_cmpwr"; break; - case send_op_t::load: s = "load"; break; - case send_op_t::load_2d: s = "load_2d"; break; - case send_op_t::prefetch: s = "prefetch"; break; - case send_op_t::prefetch_2d: s = "prefetch_2d"; break; - case send_op_t::store: s = "store"; break; - case send_op_t::store_2d: s = "store_2d"; break; - case send_op_t::undef: s = "undef"; break; - default: ir_error_not_expected(); s = "unknown"; - } - - return out << s; -} - stmt_t send_t::create_offset_store(const expr_t &header_buf, const expr_t &mem_buf, const expr_t &_mem_off, bool is_signed_offset) const { diff --git a/src/gpu/intel/jit/ir/message.hpp b/src/gpu/intel/jit/ir/message.hpp index 5d556fce66b..39bcdb2070c 100644 --- a/src/gpu/intel/jit/ir/message.hpp +++ b/src/gpu/intel/jit/ir/message.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,6 +46,7 @@ GPU_DEFINE_PARSE_ENUM(send_kind_t, send_kind_names) // Send operation kind. 
enum class send_op_t { undef, + atomic_add, atomic_fadd, atomic_cmpwr, load, @@ -56,7 +57,19 @@ enum class send_op_t { store_2d, }; -std::ostream &operator<<(std::ostream &out, const send_op_t value); +static auto send_op_names = nstl::to_array({ + make_enum_name(send_op_t::undef, "undef"), + make_enum_name(send_op_t::atomic_add, "atomic_add"), + make_enum_name(send_op_t::atomic_fadd, "atomic_fadd"), + make_enum_name(send_op_t::atomic_cmpwr, "atomic_cmpwr"), + make_enum_name(send_op_t::load, "load"), + make_enum_name(send_op_t::load_2d, "load_2d"), + make_enum_name(send_op_t::prefetch, "prefetch"), + make_enum_name(send_op_t::prefetch_2d, "prefetch_2d"), + make_enum_name(send_op_t::store, "store"), + make_enum_name(send_op_t::store_2d, "store_2d"), +}); +GPU_DEFINE_PARSE_ENUM(send_op_t, send_op_names) // Send address model. enum class send_address_t { @@ -188,7 +201,7 @@ class send_t : public func_impl_t { } std::string str() const override { std::ostringstream oss; - oss << op; + oss << to_string(op); oss << "."; oss << type.str(); if (is_scattered()) oss << "x" << slots; @@ -224,7 +237,10 @@ class send_t : public func_impl_t { return call({mem_buf, mem_off, reg_buf, mask, x, y, pattern}); } - bool is_atomic() const { return op == send_op_t::atomic_fadd; } + bool is_atomic() const { + return utils::one_of(op, send_op_t::atomic_add, send_op_t::atomic_fadd, + send_op_t::atomic_cmpwr); + } bool is_load() const { return op == send_op_t::load; } bool is_load_2d() const { return op == send_op_t::load_2d; } bool is_prefetch() const { return op == send_op_t::prefetch; } diff --git a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp index 7c284c2253e..fb6a71d4c35 100644 --- a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp +++ b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp @@ -575,8 +575,7 @@ tensor_config_t get_tensor_config( send_kind_t kernel_desc_t::access_kind( send_op_t op, tensor_kind_t tensor) const { - if (use_2d_access && tensor != 
tensor_kind_t::undef - && op != send_op_t::atomic_fadd) + if (use_2d_access && tensor != tensor_kind_t::undef && !is_atomic(op)) return send_kind_t::_2d; return send_kind_t::undef; } diff --git a/src/gpu/intel/jit/v2/conv/plan.cpp b/src/gpu/intel/jit/v2/conv/plan.cpp index 18550715cf0..bfbc9c93d85 100644 --- a/src/gpu/intel/jit/v2/conv/plan.cpp +++ b/src/gpu/intel/jit/v2/conv/plan.cpp @@ -651,7 +651,7 @@ class plan_builder_t { const layout_t &bias_reg_layout, const view_t &bias_mem_view, epilogue_store_plan_t &plan, prb_reqs_t &reqs) const { auto params = get_send_params(tensor_kind_t::undef, - is_atomic ? send_op_t::atomic_fadd : send_op_t::store, + is_atomic ? send_op_t::atomic_add : send_op_t::store, bias_mem_view); auto store = try_create_send_plan(__func__, params, bias_mem_view); if (!store) return false; @@ -771,7 +771,7 @@ class plan_builder_t { const view_t &c_mem_view, epilogue_store_plan_t &plan, prb_reqs_t &reqs) const { auto params = get_send_params(tensor_kind_t::c, - is_atomic ? send_op_t::atomic_fadd : send_op_t::store, + is_atomic ? send_op_t::atomic_add : send_op_t::store, c_mem_view); // TODO: Implement fallback from 2D to block/scattered messages to // allow partial use of 2D messages when possible. 
@@ -825,6 +825,11 @@ class plan_builder_t { send_params_t get_send_params(tensor_kind_t abc, send_op_t op, const view_t &view, send_kind_t send_kind = send_kind_t::undef, send_address_t send_address = send_address_t::a64) const { + if (op == send_op_t::atomic_add) { + auto &type = view.type(); + ir_assert(type.is_f32() || type.is_s32()); + if (type.is_f32()) op = send_op_t::atomic_fadd; + } send_params_t params; params.hw = desc_.hw; params.kind = (send_kind != send_kind_t::undef diff --git a/src/gpu/intel/jit/v2/ir/bridge.hpp b/src/gpu/intel/jit/v2/ir/bridge.hpp index f8c3998b69e..4d13e34d199 100644 --- a/src/gpu/intel/jit/v2/ir/bridge.hpp +++ b/src/gpu/intel/jit/v2/ir/bridge.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,6 +47,7 @@ inline jit::send_op_t to_ir(send_op_t op, bool is_2d = false) { switch (op) { #define CASE(name) \ case v2::send_op_t::name: ret = jit::send_op_t::name; break; + CASE(atomic_add); CASE(atomic_fadd); CASE(load); CASE(prefetch); diff --git a/src/gpu/intel/jit/v2/ir/send.hpp b/src/gpu/intel/jit/v2/ir/send.hpp index e0e481b2fba..eb889878911 100644 --- a/src/gpu/intel/jit/v2/ir/send.hpp +++ b/src/gpu/intel/jit/v2/ir/send.hpp @@ -37,6 +37,7 @@ static const int max_slot_size = 8; enum class send_op_t { undef, + atomic_add, atomic_fadd, load, prefetch, @@ -45,6 +46,7 @@ enum class send_op_t { static auto send_op_names = nstl::to_array({ make_enum_name(send_op_t::undef, "undef"), + make_enum_name(send_op_t::atomic_add, "atomic_add"), make_enum_name(send_op_t::atomic_fadd, "atomic_fadd"), make_enum_name(send_op_t::load, "load"), make_enum_name(send_op_t::prefetch, "prefetch"), @@ -53,6 +55,10 @@ static auto send_op_names = nstl::to_array({ GPU_DEFINE_PARSE_ENUM(send_op_t, 
send_op_names) +inline bool is_atomic(send_op_t op) { + return utils::one_of(op, send_op_t::atomic_add, send_op_t::atomic_fadd); +} + enum class send_address_t { undef, a64, @@ -749,7 +755,7 @@ class send_plan_builder_t { int inner_elems = inner_last.elems(); int inner_bytes = type_size * inner_elems; int slot_size = ir_utils::max_pow2_divisor(inner_bytes); - if (params.op == send_op_t::atomic_fadd) slot_size = type_size; + if (is_atomic(params.op)) slot_size = type_size; int grf_size = plan.hw.grf_size(); if (slot_size < grf_size) @@ -900,8 +906,7 @@ class send_plan_builder_t { if (params.kind == send_kind_t::scattered && inner_bytes > max_slot_size * max_slots) break; - if (params.op == send_op_t::atomic_fadd && it.elems() > max_slots) - break; + if (is_atomic(params.op) && it.elems() > max_slots) break; inner_last = it; } return inner_last; From dbc96ca3789215f7350d08bb36fd15b16f00b1c4 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Fri, 20 Dec 2024 14:48:00 -0800 Subject: [PATCH 13/14] xe: conv_v2: enable Stream-K kernels --- src/gpu/intel/jit/v2/conv/bridge.hpp | 1 + src/gpu/intel/jit/v2/conv/kernel_desc.cpp | 71 ++- src/gpu/intel/jit/v2/conv/kernel_desc.hpp | 6 + src/gpu/intel/jit/v2/conv/model.cpp | 331 ++++++++++---- src/gpu/intel/jit/v2/conv/model.hpp | 16 +- src/gpu/intel/jit/v2/conv/plan.cpp | 17 +- src/gpu/intel/jit/v2/conv/plan_registry.cpp | 15 +- .../intel/jit/v2/conv/plan_registry_data.cpp | 424 +++++++++--------- src/gpu/intel/jit/v2/conv/planner/bench.cpp | 58 ++- .../intel/jit/v2/conv/planner/model_fit.cpp | 50 ++- .../intel/jit/v2/conv/planner/model_fit.hpp | 4 +- src/gpu/intel/jit/v2/conv/planner/planner.cpp | 3 +- src/gpu/intel/jit/v2/conv/planner/search.cpp | 37 +- src/gpu/intel/jit/v2/conv/problem.cpp | 16 +- src/gpu/intel/jit/v2/conv/problem.hpp | 5 +- src/gpu/intel/jit/v2/ir/tensor.hpp | 16 +- 16 files changed, 678 insertions(+), 392 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/bridge.hpp 
b/src/gpu/intel/jit/v2/conv/bridge.hpp index dca06c19e6a..7895991a9a6 100644 --- a/src/gpu/intel/jit/v2/conv/bridge.hpp +++ b/src/gpu/intel/jit/v2/conv/bridge.hpp @@ -80,6 +80,7 @@ inline problem_t to_problem( prb.set_wei_tag(wei); prb.set_dst_tag(dst); prb.set_shape(shape); + if (pd->attr()->post_ops_.len() > 0) prb.set_with_post_ops(true); prb.normalize(); return prb; diff --git a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp index fb6a71d4c35..643269ab0ab 100644 --- a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp +++ b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp @@ -213,6 +213,10 @@ bool kernel_desc_t::is_supported() const { ir_check(regs != 0) << "Invalid regs: " << regs; ir_check(is_tg_size_ok(*this)) << "Invalid thread_group_tile: " << thread_group_tile; + if (use_stream_k) { + ir_check(c_type() == accumulator_type(a_type(), b_type())) + << "Output/accumulator types must match for Stream-K"; + } ir_check(is_grf_usage_ok(*this)) << "GRF usage exceeded"; return true; } @@ -221,7 +225,13 @@ void kernel_desc_t::set(const std::string &s) { operator=(kernel_desc_t()); if (s.empty()) return; auto &iface = parse_iface(); - iface.parse(s, *this); + parse_result_t result; + iface.parse(s, *this, &result); + if (!result.is_set("--iter") || !result.is_set("--tg")) { + ir_info() << "Error: missing --iter and/or --tg parameters in kernel " + "descriptor.\n"; + ir_error_not_expected(); + } set_defaults(); } @@ -297,6 +307,10 @@ bool fit_impl(const kernel_desc_t &desc, const problem_t &prb, bool exact) { ir_check(fit_tag(tensor_kind_t::c, desc, prb, exact)); ir_check(prb.is_depthwise() == desc.is_dw) << "Mixing depthwise/non-depthwise descriptor and problem"; + if (desc.use_stream_k) { + ir_check(!prb.with_bias_fwd() && !prb.with_post_ops()) + << "Stream-K is incompatible with post-ops/bias"; + } if (exact) { ir_check(prb.with_bias_bwd_w() == desc.with_bias_bwd_w()) << "Problem and descriptor bias reduction mismatch"; @@ -330,7 +344,14 @@ 
void fit_to_impl(kernel_desc_t &desc, const problem_t &prb) { fit_tag_to(tensor_kind_t::a, desc, prb); fit_tag_to(tensor_kind_t::b, desc, prb); fit_tag_to(tensor_kind_t::c, desc, prb); - desc.bias_type = prb.bias_type(); + if (!prb.bias_type().is_undef()) { + if (desc.use_stream_k) { + auto acc_type = accumulator_type(desc.a_type(), desc.b_type()); + desc.bias_type = acc_type; + } else { + desc.bias_type = prb.bias_type(); + } + } } bool kernel_desc_t::can_fit(const problem_t &prb) const { @@ -382,6 +403,7 @@ std::string kernel_desc_t::brief_str() const { oss << "i_" << iter_tile.str(); oss << "_T_" << thread_group_tile.str(); oss << "_p_" << prefetch.str(); + oss << "_sk_" << (use_stream_k ? "1" : "0"); return oss.str(); } @@ -624,9 +646,10 @@ void kernel_desc_t::init_kernel_iface(kernel_iface_t &kernel_iface) const { dim_t dummy; if (reqs.get_value(e.dim, dummy)) continue; dim_t iter_size = iter_tile.get(e.dim, 1); + dim_t tg_size = thread_group_tile.get(e.dim, 1); + dim_t size = iter_size * tg_size; std::string bound_name = e.dim.str(); - if (iter_size != 1) - bound_name += "_divup_" + std::to_string(iter_size); + if (size != 1) bound_name += "_divup_" + std::to_string(size); kernel_iface.register_arg(bound_name + "_magic", type_t::u64()); } } @@ -683,6 +706,44 @@ bool try_register_internal_arg(kernel_info_t &kernel_info, const expr_t &var, return false; } +dim_t stream_k_thread_groups( + dim_t total_iters, dim_t max_thread_groups_per_wave) { + const dim_t min_iters_per_tg = 2; + dim_t ref_iters = utils::div_up(total_iters, min_iters_per_tg); + return std::min(ref_iters, max_thread_groups_per_wave); +} + +type_t accumulator_type(const type_t &a_type, const type_t &b_type) { + ir_assert(a_type.size() == b_type.size()); + return a_type.is_fp() ? 
type_t::f32() : type_t::s32(); +} + +kernel_desc_t to_stream_k(const kernel_desc_t &desc, bool check_ext) { + if (desc.use_stream_k) return desc; + if (check_ext && !desc.ext.has(extension_kind_t::stream_k)) + return kernel_desc_t(); + if (desc.with_bias_fwd()) return kernel_desc_t(); + + auto sk_desc = desc; + sk_desc.use_stream_k = true; + auto out_kind = pick_c(sk_desc.prop, tensor_kind_t::src, tensor_kind_t::wei, + tensor_kind_t::dst); + auto acc_type = accumulator_type(sk_desc.a_type(), sk_desc.b_type()); + switch (out_kind) { + case tensor_kind_t::src: + sk_desc.src_tag = sk_desc.src_tag.with_type(acc_type); + break; + case tensor_kind_t::wei: + sk_desc.wei_tag = sk_desc.wei_tag.with_type(acc_type); + break; + case tensor_kind_t::dst: + sk_desc.dst_tag = sk_desc.dst_tag.with_type(acc_type); + break; + default: ir_error_not_expected(); + } + return sk_desc; +} + void init_kernel_info(kernel_info_t &kernel_info, const problem_t &prb, const kernel_desc_t &desc, const grid_t &tg_grid, const pvar_tile_t &grid_dims, dim_t max_tgs, dim_t &stream_k_tgs) { @@ -700,7 +761,7 @@ void init_kernel_info(kernel_info_t &kernel_info, const problem_t &prb, iters_per_tile *= dim_iters_per_tile; } dim_t total_iters = iters_per_tile * tg_grid.size(0, grid_dims); - stream_k_tgs = std::min(total_iters, max_tgs); + stream_k_tgs = stream_k_thread_groups(total_iters, max_tgs); dim_t iters_per_tg = utils::div_up(total_iters, stream_k_tgs); pvar_map[pvar_t("sk_iters_per_tile")] = iters_per_tile; pvar_map[pvar_t("sk_total_iters")] = total_iters; diff --git a/src/gpu/intel/jit/v2/conv/kernel_desc.hpp b/src/gpu/intel/jit/v2/conv/kernel_desc.hpp index 6475e9193b0..60e6c22f44d 100644 --- a/src/gpu/intel/jit/v2/conv/kernel_desc.hpp +++ b/src/gpu/intel/jit/v2/conv/kernel_desc.hpp @@ -231,6 +231,7 @@ enum class extension_kind_t : uint32_t { out_b2 = 2, out_b4 = 4, bias = 8, + stream_k = 16, }; static auto extension_kind_names = nstl::to_array({ @@ -239,6 +240,7 @@ static auto 
extension_kind_names = nstl::to_array({ make_enum_name(extension_kind_t::out_b2, "out_b2"), make_enum_name(extension_kind_t::out_b4, "out_b4"), make_enum_name(extension_kind_t::bias, "bias"), + make_enum_name(extension_kind_t::stream_k, "stream_k"), }); GPU_DEFINE_PARSE_ENUM(extension_kind_t, extension_kind_names) @@ -484,6 +486,10 @@ class grid_t { grid_t create_thread_group_grid(const kernel_desc_t &desc); grid_t create_thread_grid(const kernel_desc_t &desc); +dim_t stream_k_thread_groups( + dim_t total_iters, dim_t max_thread_groups_per_wave); +type_t accumulator_type(const type_t &a_type, const type_t &b_type); +kernel_desc_t to_stream_k(const kernel_desc_t &desc, bool check_ext = true); class kernel_params_t : public kernel_params_base_t { public: diff --git a/src/gpu/intel/jit/v2/conv/model.cpp b/src/gpu/intel/jit/v2/conv/model.cpp index e6b2e766e64..bddfc62c1a4 100644 --- a/src/gpu/intel/jit/v2/conv/model.cpp +++ b/src/gpu/intel/jit/v2/conv/model.cpp @@ -107,91 +107,197 @@ struct hw_config_t { } }; -struct sample_t { - problem_t prb; - kernel_desc_t kernel_desc; - bench_time_t time; +class sample_impl_t { +public: + sample_impl_t(model_kind_t model_kind, const problem_t &prb, + const kernel_desc_t &desc) + : model_kind_(model_kind), prb_(prb), desc_(desc) { + hw_cfg_ = hw_config_t(prb_.hw(), desc_.fma, desc_.a_type()); + } + virtual ~sample_impl_t() = default; + virtual vec1d to_x() const = 0; + virtual float to_y() const = 0; + +protected: + model_kind_t model_kind_ = model_kind_t::undef; + problem_t prb_; + kernel_desc_t desc_; + hw_config_t hw_cfg_; +}; - hw_config_t hw_cfg; +std::vector feature_names(model_kind_t kind) { + switch (kind) { + case model_kind_t::data_parallel: + return std::vector({"kl", "waves"}); + case model_kind_t::stream_k: return std::vector({"iters"}); + case model_kind_t::data_copy: + return std::vector({"bytes"}); + default: ir_error_not_expected(); + } + return std::vector(); +} + +void to_bmnk(prop_kind_t prop, const pvar_tile_t 
&tile, dim_t &b, dim_t &m, + dim_t &n, dim_t &k) { + const auto t = to_gemm(tile, prop); + b = t[pvars::b]; + m = t[pvars::m]; + n = t[pvars::n]; + k = t[pvars::k]; +} + +struct bmnk_helper_t { dim_t b, m, n, k; dim_t bt, mt, nt, kt; dim_t bl, ml, nl, kl; dim_t bi, mi, ni, ki; - float pad_eff = 0; - - sample_t() = default; - sample_t(const problem_t &prb, const kernel_desc_t &kernel_desc, - const bench_time_t &time = bench_time_t()) - : prb(prb), kernel_desc(kernel_desc), time(time) { - hw_cfg = hw_config_t( - prb.hw(), kernel_desc.fma, kernel_desc.src_tag.type()); + dim_t tiles; + dim_t iters; + + bmnk_helper_t(const problem_t &prb, const kernel_desc_t &desc) { auto padded_shape = prb.shape(); - pad_eff = 1; + dim_t tmp_iters = 1; for (auto &d : padded_shape) { if (!is_conv_index(d)) continue; - dim_t tg = kernel_desc.thread_group_tile.get(d, 1); - dim_t iter = kernel_desc.iter_tile.get(d, 1); + dim_t tg = desc.thread_group_tile.get(d, 1); + dim_t iter = desc.iter_tile.get(d, 1); dim_t dim = padded_shape[d]; dim_t padded_dim = utils::rnd_up(dim, tg * iter); padded_shape[d] = padded_dim; - pad_eff *= ((float)dim / padded_dim); + if (!to_gemm(d, prb.prop()).is_undef()) { + tmp_iters *= utils::div_up(dim, iter * tg); + } } to_bmnk(prb.prop(), padded_shape, b, m, n, k); - to_bmnk(prb.prop(), kernel_desc.thread_group_tile, bt, mt, nt, kt); - to_bmnk(prb.prop(), kernel_desc.iter_tile, bi, mi, ni, ki); + to_bmnk(prb.prop(), desc.thread_group_tile, bt, mt, nt, kt); + to_bmnk(prb.prop(), desc.iter_tile, bi, mi, ni, ki); bl = ml = nl = 1; kl = ir_utils::safe_div(k, kt * ki); + tiles = 1; + tiles *= ir_utils::safe_div(b, bl * bt * bi); + tiles *= ir_utils::safe_div(m, ml * mt * mi); + tiles *= ir_utils::safe_div(n, nl * nt * ni); + iters = tiles * kl; + ir_assert(tmp_iters == iters); } +}; - static std::vector feature_names() { - std::vector ret; - ret.push_back("kl"); - ret.push_back("waves"); - return ret; +dim_t layout_size(const layout_tag_t &tag, const problem_t &prb) { 
+ ir_assert(!tag.is_any() && !tag.is_empty()); + pvar_tile_t tile; + for (auto &d : tag.desc().letter_map()) + tile[d] = prb.shape().at(d); + dim_t elems = 1; + for (auto &e : tag.raw_tag().entries()) { + auto d = tag.desc().prb_dim(e.index()); + dim_t e_block = (e.block != 0 ? e.block : tile.at(d)); + elems *= e_block; + tile[d] = utils::div_up(tile[d], e_block); + } + ir_assert(tile.elems() == 1); + return elems * tag.type().size(); +} + +float conv_time_nsec(const bench_time_t &time) { + if (time.nkernels() == 0) return 0; + if (time.nkernels() == 1) return time.total; + ir_assert(utils::one_of(time.nkernels(), 2, 3)) + << "Expecting zero-out -> conv [-> reorder] kernel sequence."; + return time.kernel_times[1]; +} + +class data_parallel_sample_t : public sample_impl_t { +public: + data_parallel_sample_t(const problem_t &prb, const kernel_desc_t &desc, + const bench_time_t &time) + : sample_impl_t(model_kind_t::data_parallel, prb, desc) + , nsec_(conv_time_nsec(time)) { + bmnk_helper_t h(prb, desc); + int tgs_per_wave = hw_cfg_.max_tgs_per_gpu(h.bt * h.mt * h.nt * h.kt); + kl_ = h.kl; + waves_ = (float)h.tiles / tgs_per_wave; } - vec1d to_x() const { + vec1d to_x() const override { std::vector ret; - ret.push_back(kl); - ret.push_back(waves()); + ret.push_back(kl_); + ret.push_back(waves_); return ret; } - float to_y() const { return time.total; } + float to_y() const override { return nsec_; } - float ntgs() const { - float ntgs = 1.0f; - ntgs *= ir_utils::safe_div(b, bl * bt * bi); - ntgs *= ir_utils::safe_div(m, ml * mt * mi); - ntgs *= ir_utils::safe_div(n, nl * nt * ni); - ntgs *= ir_utils::safe_div(k, kl * kt * ki); - return ntgs; +private: + uint64_t nsec_ = 0; + dim_t kl_ = 0; + float waves_ = 0; +}; + +class stream_k_sample_t : public sample_impl_t { +public: + stream_k_sample_t(const problem_t &prb, const kernel_desc_t &desc, + const bench_time_t &time) + : sample_impl_t(model_kind_t::stream_k, prb, desc) + , nsec_(conv_time_nsec(time)) { + 
bmnk_helper_t h(prb, desc); + iters_ = h.iters; } - float ops() const { return 2.0f * b * m * n * k; } + vec1d to_x() const override { return vec1d({(float)iters_}); } + float to_y() const override { return nsec_; } - float waves() const { - int tgs_per_wave = hw_cfg.max_tgs_per_gpu(bt * mt * nt * kt); - return ntgs() / tgs_per_wave; - } +private: + uint64_t nsec_ = 0; + dim_t iters_; +}; - float eff() const { - float sec = time.total / 1e9; - return ops() / 1e9 / sec / hw_cfg.max_gops_per_sec(); +class data_copy_sample_t : public sample_impl_t { +public: + data_copy_sample_t(const problem_t &prb, const kernel_desc_t &desc, + const bench_time_t &time) + : sample_impl_t(model_kind_t::data_copy, prb, desc) + , nsec_(time.total - conv_time_nsec(time)) { + auto &desc_tag = desc.layout_tag(tensor_kind_t::c); + auto prb_tag = prb.layout_tag(tensor_kind_t::c); + if (desc.use_stream_k) bytes_ += layout_size(desc_tag, prb); + if (prb_tag.is_any()) prb_tag = desc_tag.with_type(prb_tag.type()); + if (prb_tag != desc_tag) { + bytes_ += layout_size(prb_tag, prb); + bytes_ += layout_size(desc_tag, prb); + } } - static void to_bmnk(prop_kind_t prop, const pvar_tile_t &tile, dim_t &b, - dim_t &m, dim_t &n, dim_t &k) { - const auto t = to_gemm(tile, prop); - b = t[pvars::b]; - m = t[pvars::m]; - n = t[pvars::n]; - k = t[pvars::k]; - } + vec1d to_x() const override { return vec1d({(float)bytes_}); } + float to_y() const override { return nsec_; } - static model_kind_t model_kind(const kernel_desc_t &desc) { - return model_kind_t::data_parallel; +private: + uint64_t nsec_ = 0; + dim_t bytes_ = 0; +}; + +class sample_t { +public: + sample_t(model_kind_t kind, const problem_t &prb, const kernel_desc_t &desc, + const bench_time_t &time = bench_time_t()) { + switch (kind) { + case model_kind_t::data_parallel: + impl_ = std::make_shared( + prb, desc, time); + break; + case model_kind_t::stream_k: + impl_ = std::make_shared(prb, desc, time); + break; + case model_kind_t::data_copy: + impl_ 
= std::make_shared(prb, desc, time); + break; + default: ir_error_not_expected(); + } } + vec1d to_x() const { return impl_->to_x(); } + float to_y() const { return impl_->to_y(); } + +private: + std::shared_ptr impl_; }; float coef_kl(float x, float a, float b) { @@ -248,9 +354,64 @@ float predict_data_parallel(const vec1d &x, const vec1d &coef) { return Tw * (wf + wp * coef_wp(wf, a_wp, b_wp)); } +float predict_stream_k(const vec1d &x, const vec1d &coef) { + float iters = x[0]; + float a = coef[0]; + float b = coef[1]; + return a + b * iters; +} + +float predict_data_copy(const vec1d &x, const vec1d &coef) { + float bytes = x[0]; + float a = coef[0]; + float b = coef[1]; + return a + b * bytes; +} + +void model_t::coef_ranges(model_kind_t kind, const vec2d &X, const vec1d &y, + std::vector &coef_names, vec1d &coef_init, vec1d &coef_min, + vec1d &coef_max) { + auto add = [&](const char *name, float init, float min, float max) { + coef_names.emplace_back(name); + coef_init.emplace_back(init); + coef_min.emplace_back(min); + coef_max.emplace_back(max); + }; + switch (kind) { + case model_kind_t::data_parallel: + // Empirically-based parameter ranges. 
+ add("T0", 1000, 1, 100000); + add("a_kl", 1, 0.0001f, 100); + add("b_kl", 1, 0.0001f, 100); + add("a_wp", 2, 1, 100); + add("b_wp", 1, 0.0001f, 100); + break; + case model_kind_t::stream_k: + case model_kind_t::data_copy: { + float t_min = *std::min_element(y.begin(), y.end()); + float t_max = *std::max_element(y.begin(), y.end()); + float t0 = *std::min_element(y.begin(), y.end()); + float t1 = 0; + float x1 = 0; + for (size_t i = 0; i < y.size(); i++) { + if (y[i] < 0.5 * t_max) continue; + t1 += (y[i] - t_min); + x1 += X[i][0]; + } + t1 /= x1; + add("T0", t0, t0 / 10, t0 * 10); + add("T1", t1, t1 / 10, t1 * 10); + break; + } + default: ir_error_not_expected() << "Unknown kind: " << to_string(kind); + } +} + float model_t::predict(model_kind_t kind, const vec1d &x, const vec1d &coef) { switch (kind) { case model_kind_t::data_parallel: return predict_data_parallel(x, coef); + case model_kind_t::stream_k: return predict_stream_k(x, coef); + case model_kind_t::data_copy: return predict_data_copy(x, coef); default: ir_error_not_expected() << "Unknown kind: " << to_string(kind); } return 0; @@ -261,47 +422,46 @@ float model_t::predict(const vec1d &x) const { } float model_t::predict(const problem_t &prb, const kernel_desc_t &desc) const { - sample_t s(prb, desc); + sample_t s(kind_, prb, desc); return predict(s.to_x()); } -float model_t::eff(const problem_t &prb, const kernel_desc_t &desc) const { - using namespace ir_utils; - sample_t s(prb, desc); - auto x = s.to_x(); - float raw_eff = s.ops() / predict(x); - return raw_eff * s.pad_eff; -} - void model_t::score(const bench_data_t &bd) { vec2d X; X.reserve(bd.size()); vec1d y_test; vec1d y_pred; for (int i = 0; i < bd.size(); i++) { - sample_t s(bd.prbs[i], bd.kernel_desc, bd.times[i].total); + sample_t s(kind_, bd.prbs[i], bd.kernel_desc, bd.times[i]); y_test.push_back(s.to_y()); y_pred.push_back(predict(bd.prbs[i], bd.kernel_desc)); } } -void model_t::stringify(std::ostream &out) const { - out << 
serialize_to_hex(coef_); -} - -void model_t::parse(std::istream &in) { - auto s_data = stream_parse(in); - deserialize_from_hex(coef_, s_data); -} - size_t model_t::coef_count(model_kind_t kind) { switch (kind) { case model_kind_t::data_parallel: return 5; + case model_kind_t::stream_k: return 2; + case model_kind_t::data_copy: return 2; default: ir_error_not_expected() << "Unknown kind: " << to_string(kind); } return 0; } +bool with_data_copy(const problem_t &prb, const kernel_desc_t &desc) { + if (desc.use_stream_k) return true; + auto &prb_tag = prb.layout_tag(tensor_kind_t::c); + auto &desc_tag = desc.layout_tag(tensor_kind_t::c); + bool is_layout_compatible + = (prb_tag.is_any() || prb_tag.raw_tag() == desc_tag.raw_tag()); + bool is_type_compatible = (prb_tag.type().size() == desc_tag.type().size()); + if (is_layout_compatible && is_type_compatible) return false; + if (is_layout_compatible + && desc.ext.has(extensions_t::out_size(prb_tag.type().size()))) + return false; + return !is_layout_compatible || !is_type_compatible; +} + std::string to_str(const vec1d &x) { std::ostringstream oss; bool is_first = true; @@ -313,12 +473,25 @@ std::string to_str(const vec1d &x) { return oss.str(); } -float model_set_t::eff(const problem_t &prb, const kernel_desc_t &desc) const { - auto kind = sample_t::model_kind(desc); +float model_set_t::time(const problem_t &prb, const kernel_desc_t &desc) const { + float ret = 0; + if (desc.use_stream_k) { + ret += time(model_kind_t::stream_k, prb, desc); + } else { + ret += time(model_kind_t::data_parallel, prb, desc); + } + if (with_data_copy(prb, desc)) { + ret += time(model_kind_t::data_copy, prb, desc); + } + return ret; +} + +float model_set_t::time(model_kind_t kind, const problem_t &prb, + const kernel_desc_t &desc) const { for (auto &m : models_) { - if (m.kind() == kind) return m.eff(prb, desc); + if (m.kind() == kind) return m.predict(prb, desc); } - ir_error_not_expected() << "Matching model not found: " << desc.str(); + 
ir_error_not_expected() << "Unknown kind: " << to_string(kind); return 0; } @@ -355,7 +528,7 @@ void to_model_data( X.reserve(bd.size()); y.reserve(bd.size()); for (int i = 0; i < bd.size(); i++) { - sample_t s(bd.prbs[i], bd.kernel_desc, bd.times[i].total); + sample_t s(kind, bd.prbs[i], bd.kernel_desc, bd.times[i]); X.push_back(s.to_x()); y.push_back(s.to_y()); } @@ -365,12 +538,12 @@ void dump_csv(const bench_data_t &bd, const model_t &model) { auto name = bd.kernel_desc.brief_str(); std::ofstream out(name + ".csv"); out << "desc,"; - for (auto &name : sample_t::feature_names()) { + for (auto &name : feature_names(model.kind())) { out << name << ","; } out << "time,model_time" << std::endl; for (int i = 0; i < bd.size(); i++) { - sample_t s(bd.prbs[i], bd.kernel_desc, bd.times[i]); + sample_t s(model.kind(), bd.prbs[i], bd.kernel_desc, bd.times[i]); auto x = s.to_x(); auto y = s.to_y(); float model_time = model.predict(x); diff --git a/src/gpu/intel/jit/v2/conv/model.hpp b/src/gpu/intel/jit/v2/conv/model.hpp index 87ebdb19f6b..d57eeaff38c 100644 --- a/src/gpu/intel/jit/v2/conv/model.hpp +++ b/src/gpu/intel/jit/v2/conv/model.hpp @@ -35,11 +35,15 @@ using vec2d = std::vector>; enum class model_kind_t : uint8_t { undef = 0, data_parallel = 1, + stream_k = 2, + data_copy = 3, }; static auto model_kind_names = nstl::to_array({ make_enum_name(model_kind_t::undef, "undef"), make_enum_name(model_kind_t::data_parallel, "data_parallel"), + make_enum_name(model_kind_t::stream_k, "stream_k"), + make_enum_name(model_kind_t::data_copy, "data_copy"), }); GPU_DEFINE_PARSE_ENUM(model_kind_t, model_kind_names) @@ -52,11 +56,11 @@ class model_t { const vec1d &coef() const { return coef_; } float predict(const vec1d &x) const; float predict(const problem_t &prb, const kernel_desc_t &desc) const; - float eff(const problem_t &prb, const kernel_desc_t &desc) const; void score(const bench_data_t &bd); - void stringify(std::ostream &out) const; - void parse(std::istream &in); + static 
void coef_ranges(model_kind_t kind, const vec2d &X, const vec1d &y, + std::vector &coef_names, vec1d &coef_init, + vec1d &coef_min, vec1d &coef_max); static float predict(model_kind_t kind, const vec1d &x, const vec1d &coef); static size_t coef_count(model_kind_t kind); @@ -69,11 +73,15 @@ class model_set_t { public: model_set_t() = default; model_set_t(const model_t &model) { models_.push_back(model); } - float eff(const problem_t &prb, const kernel_desc_t &desc) const; + void add(const model_t &model) { models_.push_back(model); } + float time(const problem_t &prb, const kernel_desc_t &desc) const; void stringify(std::ostream &out) const; void parse(std::istream &in); private: + float time(model_kind_t kind, const problem_t &prb, + const kernel_desc_t &desc) const; + std::vector models_; }; diff --git a/src/gpu/intel/jit/v2/conv/plan.cpp b/src/gpu/intel/jit/v2/conv/plan.cpp index bfbc9c93d85..48135555fc6 100644 --- a/src/gpu/intel/jit/v2/conv/plan.cpp +++ b/src/gpu/intel/jit/v2/conv/plan.cpp @@ -72,8 +72,8 @@ class multiply_info_t { , iter_tile_(iter_tile) , bmnk_map_(bmnk_map) , a_type_(a_type) - , b_type_(b_type) { - init_acc_type(); + , b_type_(b_type) + , acc_type_(accumulator_type(a_type, b_type)) { if (!init(a_desc, b_desc, c_desc)) return; is_valid_ = true; } @@ -196,19 +196,6 @@ class multiply_info_t { } private: - void init_acc_type() { - ir_assert(a_type_.size() == b_type_.size()); - switch (fma_) { - case fma_kind_t::mad: - acc_type_ = a_type_.is_fp() ? type_t::f32() : type_t::s32(); - break; - case fma_kind_t::dpas: - acc_type_ = a_type_.is_fp() ? 
type_t::f32() : type_t::s32(); - break; - default: ir_error_not_expected(); - } - } - bool fma_type_supported(const type_t &type) const { switch (fma_) { case fma_kind_t::mad: diff --git a/src/gpu/intel/jit/v2/conv/plan_registry.cpp b/src/gpu/intel/jit/v2/conv/plan_registry.cpp index 269e78ca9b1..a2e53e4d7e9 100644 --- a/src/gpu/intel/jit/v2/conv/plan_registry.cpp +++ b/src/gpu/intel/jit/v2/conv/plan_registry.cpp @@ -51,14 +51,21 @@ plan_registry_t::plan_registry_t(const char **entries) { kernel_desc_t plan_registry_t::find_best(const problem_t &prb) const { kernel_desc_t best; - float best_eff = 0; + float min_time = std::numeric_limits::max(); for (auto &e : entries_) { if (!e.desc.can_fit(prb)) continue; - float eff = e.model_set.eff(prb, e.desc); - if (eff > best_eff) { - best_eff = eff; + float time = e.model_set.time(prb, e.desc); + if (time < min_time) { + min_time = time; best = e.desc; } + auto desc = to_stream_k(e.desc); + if (desc.is_empty() || !desc.can_fit(prb)) continue; + time = e.model_set.time(prb, desc); + if (time < min_time) { + min_time = time; + best = desc; + } } return best; } diff --git a/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp b/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp index 45edc05e377..d85adf6a54f 100644 --- a/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp +++ b/src/gpu/intel/jit/v2/conv/plan_registry_data.cpp @@ -25,218 +25,218 @@ namespace conv { // clang-format off const char** get_plan_registry_entries() { static const char *entries[] = { - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=oc2ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01E0004B4414C4BF3EFEC7B23F0000803FADD1A03C", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 
reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01A83F49440168683FFFBFA13F65C6EA40F6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01914D554405161A3FFF4F853F985FC53FF90B5940", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb32oc64 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=011FF47A441234E13E04F44A3F62FCB53FF6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01A6FBA84400708D3F5F34D23D65FE8C3F15C0E63E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=oc2ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01600924441234C83EFBA1B03F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=013B4C434413F4C63EFBEFA33FFD778F3F05E00E3F", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4 model=01CA3613441438CE3EFD5F9D3F66CE863F6334B93D", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 
dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=oc4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=013368314417888D3EFEC77B3F7FB9F640F6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=01DDFA00440C20F53EFEC38E3F0000803F2D9A093E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=mb4oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=01404F05441260EC3EFD83AB3F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=0131AF5C44140CC83E05A3643F55811542F6FFA741", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=019CEA3B440658053F043A783F326BEA41FD7F0D40", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=010F6D38441828983E043C6C3F349E7140F6FFA741", - "hw=xehpc prop=fwd 
src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=013425F043152CC13EFE8B8A3F0000803F0FEDC93E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4 model=01B19027441222DA3EFE6B873FCDFED33F016C913F", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01AA475245FE479C3F04604D3F9A79823F17809F3E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01473F5645FC3FB83F04204C3F0000803F3200513E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=012A874B45FF27943F02A84B3F0000803F1200E83E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01DDDA4D45FF4F993F02604D3F0000803F1480C73E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=x loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=0173435445FFF7A53F02F85A3F34DB813F3380713E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01B6CF4945FFB7903F03504E3F33F3843F1400C03E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 
model=016BAB5145FF6FA13F02F04E3F0000803F1400DE3E", - "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4 model=01866A544500E89C3F02A0623F0000803F36005B3E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01E78FF543175CA53EFDB5833F0000803F124D9C3E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01A7AE30441140F33E00B08E3F9A69A13F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01624E34440550083F0108913F0094A33F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=oc4ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01D38643441362D13E008C993F0000803FAF8D263B", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=0118BED844FD3B943F1658AE3E0000803FF1FFF741", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb32oc64 tg=mb2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 
ext=out_b1,out_b4 model=014017374415C4973E06F2423F16D25541F6FFA741", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01B744DA44FF3B8A3F12A0E33E33238B3F3100743E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4 model=01A2B2664406D41A3FFE5F7F3F342D843F1700E33E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0188D08744FEF3903F0044963F32450740F8FF8441", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=0104261B440638113F00C1873F9A71C33F0540183F", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=mb8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01E2EE244406A40E3FFE6B8C3FCC44A33F2D9A223E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow64 tg=oc4ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 
model=01F998FA433444493E03C85D3FAA894E40F6FFA741", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow16 tg=oc2ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01FB813C44084E003FFFA58E3F9903AD3F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow16 tg=ic16 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=016AE592440028803F0600173F9959DD3F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01426608441420BE3E06882F3F6590F63FF3DF5E41", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow16 tg=oc2ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4 model=01B421404406FC1A3FFE0A8E3F3295B23F0240633F", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=012EA8084501A4683FFE8FA43F9A79823FA9D1F03C", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0127F6324501E0633FFE6F923F6646963F1300C53E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 
model=0173790745011C693FFFEF9E3F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0153341A450738203F00107F3F9ABA863F3580673E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=015988DF4401C0743FFE2FAF3F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=019CA8AF440508233F00208D3F0000803F309A043E", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=x loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=0145ED03451304E43E0160523F0382883FFC5FA23F", - "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4 model=01A826ED4402283F3FFE679A3F34DB813F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb16oc32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=01BB9F804435A01F3E0054813F67B6833FF93FFE3F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc8ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=0141CC9E440718223F0154683F0000803F1240E43E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=x loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=012F42934405A0303F04204C3F0000803F0470243F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 
dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=016451B844FC0FB63F02F0443F0000803F3600243E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=015D46984406701F3F03BC503F334F803F14C0DC3E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=019A1E964400688B3F0450303F9AD2893F1200C53E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=018FEBB944FF179A3F0468273F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2 model=01D2CF8C4408E0133F04D03C3F4B86843F0680283F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=mb16oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01F80897440830143F0298513FCF628E3F3300333E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow32 tg=x loop_desc=kw,kh,kd,ic 2d=1 
reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=013A274944F2E1F83CFF64703FC9264942F6FFA741", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 tg=x loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=015D394C443510233E00B4583F3B0CF03FF6FFA741", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01EBF1A144FF9F9E3F0568183F0000803F1000E83E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=0133BA864406F0303F05203D3F0000803F0280783F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01B3A0324474C8C63D0192573F99B30140F6FFA741", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=01BCC68E4402885C3F05782B3F0000803F06800F3F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 
reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2 model=017500BE44FECFBE3F0528123F0000803F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=oc2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01FD0198441300D93E01C8623F99FF9B3F00C08D3F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ow2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01E862A44408D0003F00B0753F0000803F0540543F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb8oc32 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=014815DE4403143F3FFE47973F6696C13F17B7D138", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01C187C8440348383FFE1FA13F69F6CD3F1200D43E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=019F20AA4408300A3F00A87D3F0000803F08A0173F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=015C71DC440358323FFEDBA33FFF2FE93F128D8E3E", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=x loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01A673A3441000E83E002C8B3FFF7A833F00B4803F", - "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01F099B64406101B3F0040863FCDECB83F6134C33D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 
reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=01B4235B4405440F3FFE47883F35D1883F5B34F53D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0186C1274405DC123F0078993F6616903F2D5A2B3E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=mb4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=017CDD15441274D33E0268773F7F0BD63FFEB78B3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0172D6C94316048F3E036C643F0086993F0720153F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=ic2iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=010A56F04332943F3E02007B3FB41D803F17C0AA3E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0166732744063C083FFE878E3FCAAA973F17B7D138", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb32oc32 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=016F774F441448AF3E0388613F6834FB3FF6FFA241", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas 
simd=16 regs=256 iter=ic64mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=01C14A5B4405EC203F00C8763F6868923F1480A93E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc64 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01564314440578263FFD37BB3F0290923FFA3F1B40", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc32 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01DC38194406BC1E3FFE17A93F9B28BA3FFD7FAA3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw32oc32 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01B5261D441350E33EFE2F8C3F31652C40F81F8341", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw8oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=011F6D2A44030C383FFEF39F3F0000803F6134D73D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 tg=ic4iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01F2253044161CEA3E00DC8F3F9A27803F17B7D138", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64iw32oc32 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01BD8B3C4416E4BD3E0270473FD40AA242F6FFA741", - "hw=xehpc 
prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64iw32oc32 tg=ic4 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0142ED5F440E30F93E05FA353FE4F73242F6FFA741", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01C2B9424403B8273F8006863F6564813FFFAFCF3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01A2E8DB4403702E3F00E4873F0000803F13C0F03E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01655B1A4508C0153FFFF7783F9A79823F16C0873E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01B51BB1440420423F00B07A3F0000803F2E5A263E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb8oc32 tg=ic4iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=015ED31B4502204C3F00B8633F0000803F5D34FA3D", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=019E62F84413B0D83E0408463F3442933FFB2FAA3F", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 tg=ic4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01D6651C4501005D3F01C0653F0000803F291A483E", - "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=010DDB154504E0453F00E06D3F0000803F128D893E", - "hw=xehpc prop=bwd_d 
src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0114F8C14408080F3F00406D3F6842853F06E0183F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01E3FF984409181D3F0148513F672E933F1300ED3E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb32oc8 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01C267AB44F340723DFE4B7A3F6491933FF6FFA741", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=011E22924409701A3F0490403F0000803F1100E33E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb16oc8 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01A06184442ED05D3E0018723F4D5D833FFC3FB83F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=014A3FAE4408F00D3FFA4FB73FCDE0853F0630193F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=017348974413F0E33E0098973F0000803F0660203F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw4 
loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=0169DE9A440100853F03CC4A3F98ED803F12A0ED3E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 model=01EA958E440610203F0440403F0000803F05D0193F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=013267864404D0373F0508373FE5C5873F03A05D3F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0139AC8F440150803F0368403F67388C3FFF77943F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=x loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01897F9B440680373F03B0433FCD8EA03FFC5F933F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw16oc8 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01F50D784418E0853E00B8723F35E9A83FF87F1C40", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01EE529244FDFF983F0338393F3490843F02E0593F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 
2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0102B688440630373F04703D3F00E5883F03E0593F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=01231DA7440110843FFBFFAC3F00A4843F0BE0093F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 model=0134428C4405605C3F02B0433F0041843F02A0623F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic2iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=018B6AAC4416809A3E0040723F0178AC3FF6FFA741", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=ic2mb2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=013F82B1441060F13EFF0F983F00268D3F1000E83E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic4iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0130A0C14406C00B3FFAFFAC3F99739A3FF6FFA741", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=015D9998441510BA3E01785D3F0000803FFBDFD63F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01E0EAB04408600C3F00A8963F0000803F0880F43E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=018F8F9F443260693E00E4823FCC40923FFFAF843F", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 
iter=ic32mb8oc8 tg=mb2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01C124B6440780373FFF07873FCCA6863F1300DE3E", - "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=x loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=011467A9441480B83E00B07A3F344F803F03F0493F", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc32 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01D51D8C43F540403D01C0563F99F9334017B7D138", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc32 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0135E5084437001F3EFEFF753F66E6A63FF67F2841", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01AC38EF437040EA3D0110663F346FC23FC700AA40", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01C7D86943F383533C0390403F33755B42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc64 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4 model=019383A643F1FFF741F6FFA7419A6D1942F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc32 tg=ic4 
loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=01D7348F437880823D04E8473F022C923F17B7D138", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc32 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias model=0119D7EF43F603333CFF4B843F9631F93FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc64 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4 model=01647D8D43F7A0263D0420423F0000803F17B7D138", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=01569FCC4378A08F3D00206F3F00202540F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow64 tg=ic16 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=0131C70D443680213E01C0653FCD141D40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc64ow32 tg=ic8oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=016B302B443440393E00A0763F672A823F1500A23E", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic8oc2 loop_desc=mb,ow,oh,od 
2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=0194A5D7433380353E03104D3F9889E33FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow16 tg=ic4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=01F84D6A43F4403B3DFF7F873F66CAF53FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=0176F4C44339F0063E0330413FFFEBAA3FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow32 tg=ic8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4 model=011B97A643F7FF0F42F6FFA7413313AD40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias model=018EEFCB437540B33DFF87703FFF879340F6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb32oc16 tg=ic2oc4 loop_desc=mb,ow,oh,od ext=out_b4,bias model=01CCB2FE44F400443D00407C3F6666D83FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 tg=oc8 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0182AAE6447880963D02AC563F65B4AC3FF6FFA741", 
- "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc32 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=0199FC0545F0005D3D01A0623F65C6E43FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=0159E0C344F0407C3D00207E3F66FEA93FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow16 tg=ic8 loop_desc=mb,ow,oh,od ext=out_b4,bias model=01230D9C44F000583D01646C3F01D8B83FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc16 tg=oc2 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0111D8AB44F2C0383D00206A3F32E3C03FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=x loop_desc=mb,ow,oh,od ext=out_b4,bias model=019034F9447800803DFF7F823F3273D33FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 tg=ic2 loop_desc=mb,ow,oh,od ext=out_b4,bias model=0100CCE4447180C83D01885C3FFF3FE73FF8FF8840", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=01A6A88B43F7C01A3D0368453F9A211840F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=011B917644F801A23CFE3F7C3FFFE99D42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16mb8oc32 tg=x loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 
reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=013E380244F603563CFF5F893F34F3A740F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc32 tg=x loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=015815AC43F2035B3C030C4C3F67DAA042F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc32 tg=ic8oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 model=013D15C843F200713D02104D3F9CB94C40F37FEA40", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic2oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=0115E48044F821963C00887A3FCB899F42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb16oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=0102A5B243F5C1AA3C03E84C3F97594341F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc16 tg=ic2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias model=01B3308343F483443C0028673F01809842F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc32ow8 tg=oc8 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 model=0163BB0044F781B33C00B8863F9AA96B40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow32 tg=oc2 
loop_desc=mb,ow,oh,od 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01B7A19B443600103E0040863F6646B940F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=017D537A437140D13D04702E3F3473B63FC7E0C440", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01EFB79B43F600173D0040863F99412842F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0154405244F880053D0140593FE5603040F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=0193739B43F521AA3C01A85F3F9929D73FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic4oc4 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01179E0044F281E03C0110703FCDD09741F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic2oc2 loop_desc=mb,ow,oh,od 2d=1 prefetch=x1 
reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias model=01D8ABF143F621A03C0342573FFD770F40F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic4oc2 loop_desc=mb,ow,oh,od ext=bias model=0133E02B44F4E1CB3C0018683F64D43C42F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc16 loop_desc=mb,ow,oh,od ext=bias model=01C23F2D44F860073DFF7B813FCDDCF43FF6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic8 loop_desc=mb,ow,oh,od ext=bias model=0109F32944F0E1D53C02C85D3F353C7942F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=oc4 loop_desc=mb,ow,oh,od ext=bias model=012FC23044F401C03C0100763FCD3CA440F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb16oc16 tg=ic4 loop_desc=mb,ow,oh,od ext=bias model=01F0B64844F401C03C01805F3F65E49942F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 tg=oc2 loop_desc=mb,ow,oh,od ext=bias model=0169912244F3804B3DFFDF773F33F36740F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic8 loop_desc=mb,ow,oh,od ext=bias model=016CC8F643F481C73C0280643F9813E441F6FFA741", - "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic2oc2 loop_desc=mb,ow,oh,od ext=bias model=01CEC02C44F441C13C0268723FFDCB3642F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=010553AB450000803F0000803F00300640FFFF843F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 
regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01B9ADA5450000803F0000803F6656F73F0200623F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01CE91A9450000803F0000803F99B9E93F0500213F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=019B76A8450000803F0000803F9A7902402F007E3E", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=014360AE450000803F0000803FCD8CC93F0480283F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01C9D4A5450000803F0000803F6726F13FFF7F783F", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=01157AAB450000803F0000803F9911B73F3300603E", - "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4 model=019EE0AB450000803F0000803F67EEC83F3600293E", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01A0C914450000803F0000803F34959942F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01699018450000803F0000803FCEE59A42F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 
model=0158D322450000803F0000803F5F76BF40F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01FE3B1B450000803F0000803FCE8C2F41F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=019D3A17450000803F0000803F98A96E415F002041", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01380415450000803F0000803F00128242F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=017EEC16450000803F0000803F64E6AC405A007041", - "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4 model=01CF0218450000803F0000803F3313CD40F83F1341", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=0142FEDA440000803F0000803F33A33C40FA7F8140", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=017B4EDA4400007B3F0000803F65F64D40FC7F0840", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=01014BD7440000803F0000803F34D34240FB7F0D40", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=013894DE440000853F0000803FCB4C4540F9FF3C40", - "hw=xehpc prop=fwd dw=1 src=axb:f32 
wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=01D57BD7440000803F0000803FCDCC1340FCFFC03F", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=011CF3D6440000803F0000803FCB9C7040F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=016A27DA4400E0863F0000803F9BB98C40F6FFA741", - "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2 model=01EF0CD4440050803F0000803F9B495F40291A5C3E", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01183E1A450000803F0000803FFFB75C41F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01AA431C450000803F0000803FFDFF6541F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01025A16450000803F0000803F99C28F42F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01B63C16450000803F0000803F998B0842F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01014313450000803F0000803F998B8442F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 
model=01D2E415450000803F0000803F67167641F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01DD4E12450000803F0000803FCBCC7941F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01955F18450000803F0000803FCCC49440FBFFD93F", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=017055E0440000803F0000803F34FB2640F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01BD82EB440000803F0000803F66CE0640F77FA440", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0191B1D0440100763F0000803F64F64D40F87FAE40", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=019B1BCD440000803F0000803FCACC7640F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01800BDE440080823F0000803FE6690C4005805A3F", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=017FEBC744001C823F0000803F99B98C40F6FFA741", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=x loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01D60AD5440000803F0000803F335311400240633F", - "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 
loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=013A92C34400807D3F0000803F660E9140F6FF5540", - "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16mb8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=out_b4,bias model=0169CC71447940813D01305F3FCC2C87420000803F", - "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16ow8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=out_b4,bias model=015EBA5944F500493DFEFF7F3F341391420000803F", - "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=bias model=0127096344F980003D0000713F65E65F410000803F", - "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=x loop_desc=mb,ow,oh,od reqs=ic1oc1 ext=bias model=012EB44044F500493D01005D3FFFBF66420000803F", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc32ow64 tg=oc8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k model=0106901E4434B85C3EFFA1833F9CE9D23FF1FFF741020619CB46C63807410399B91D451D78403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 tg=oc4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k model=0147E6604410B4DE3E04187C3FBFE56B41F6FFA74102D00E9B467748B6400300481F45CF74403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=oc2ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k 
model=0163710C441682A23EFD13943FFEC9913F04E03B3F02310A1846A93D32410300B01C45A984403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc16ow16 tg=oc4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k model=013F53F64307940F3FFE1F9C3F02D48A3F11A5C63E02CE56D9459A41AC3F03CDFC114516AE403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow32 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k model=0169AC5B441588CE3E05C8623F32F11842F6FFA74102B601374693F4B03F03001C16450574403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow8 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k model=014496E4431300CA3EFFB3853F67F0823F0760113F02B41EC5458608374003001C1645CBBC403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k model=01BDEFB744FFDF8B3F15F09D3E9A09B23FFE7FA03F0206DEA645CB15354103CFCC334599CA3F3C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64oc64ow16 tg=ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=16777216:g*oc*outsz<=16777216:ic%64==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b2,out_b4,stream_k model=019CAC23441308E03E002E843FCD9EC73FFD97CE3F02E093F445E0BC12400366FE1845915C403C", + "hw=xehpc prop=fwd 
src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb32oc64 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k model=0126407C44138CE83EFF8F4F3F632CBC3FF6FFA74102D6645D465EEB1C400300EC2345B69D403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k model=010F7D434400487E3F0028993F6636A841F6FFA74102133623462D34B63F0300002A45F572403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb32oc64 tg=oc4ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k model=019533854415F8BD3EFC119B3F0000803F17C0C33E02C87762479E8A5D420300181B4539B1403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k model=012E45594403402C3F01A0803F658D0240F7FF9D410270E11046C46B04400300481F451675403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ic8 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k model=0167E7AD44FFAF893F31BA113E01608C3F1380D63E02C011914584DE4341039A211A450A96403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k 
model=011D64174415A6D83EFE0BA63F0000803FD368113D02636C9C4632889D4003006C2445C87D403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb16oc64 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k model=0151AF3944132CC63EFD2D9D3F6480953F0710253F02B15A324657F9094103002818452071403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=16777216:g*oc*od*oh*outsz*ow<=16777216:ic%64==0:mb<=16777216:oc%64==0 ext=out_b2,out_b4,stream_k model=0131A9124410BCE03EFDDB9E3F68CE863F6274BF3D02061D8D468E4E32400300481F455B73403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=0135794D45FF179F3F04C03D3F0000803F1000ED3E02997E8F469E840041039A2F244530D0403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=01111E984513A0DE3EFE9FA33F0000803F17B7D13802E03DD247C32BC44003CCD04E453868403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32mb8oc16 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=0147BE4845000C973F0430373F0000803F0800083F0252637B477E6288410333271E455E94403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=011D4C97451484E63EFFDF9A3F0000803F17B7D138022DE83146E46B9B410333ED46458496403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=01A6225845FEAFB13F0270563F679E803F1D808B3E02800E8F46E048EB3F0301602C45F684403C", + "hw=xehpc 
prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=01E5BC4445FFCD8E3F0428353F9979823F1100ED3E0200762A46663B96410367241C45A7B7403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow16 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=01362E804515F8C23E0180823F0000803F6134E63D026FAA2A471E741D400332ED4645A4C4403C", + "hw=xehpc prop=fwd src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=dpas simd=16 regs=256 iter=ic32oc16ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b2,out_b4,stream_k model=0155485445FEEBA23F03904F3F0000803F1780953E0233BEDB46137A744003CDFC2545B988403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow64 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=01B20F1A443198633E0284423FB18EE041F6FFA74102708C4346121A04400300A8214521A4403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=01A6A9534402B04D3F06A82D3F67DEE73FF6FFA74102C719DE45D317703F0366D22345C69A403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow16 tg=oc2ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=015109394407940A3FFE5D8B3F305BB33F17B7D1380268A41046AE81FB4003000016453585403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=mb8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 
reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=01D3C7244404C8213FFE538B3FCB70983F2A5A4E3E02246C4E46899546400300521845FC7A403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow8 tg=ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=01F6BB2544032C403F00D0733F9A6DB23F1200DE3E02ADB1BA4531FA094003997D1F450E24403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc64ow32 tg=oc2ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=015CBFF1433424463E050E593F6B314440F6FFA74102928CCF464E7EA3400366BA17450366403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32oc32ow32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=01D00107441238C93E065C293FC0A3C73FF13F7C41025D5C0146077DAB3F0300A82145116F403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64oc32ow16 tg=oc2ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=4194304:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b4,stream_k model=013CA64244038C303FFF3B943FFE11B83F0058823F028081F345247997400300A82145175D403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb32oc64 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 
ext=out_b1,out_b4,stream_k model=01AAFC4C4412A8F43E052C453F01041140F4FFCF4102EF061947A87114400367A64345BA84403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4,stream_k model=01DD02664405AA0D3FFF73893F0030863F29FA583E02EE497646548503410333F324457358403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=oc2ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4,stream_k model=019806234408AC013F00508F3F672A823F17B7D1380276BDB24611FBA24003CD7C26458A75403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4,stream_k model=019C2D3144064C113F00508F3F3576AA3F64A3213C02B36E9446C9DB524003000020453F93403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb16oc64 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4,stream_k model=01688C45440575293F0694283F0000803FF6FFA7410206290146165C893F0300A41A45E690403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4,stream_k model=01F23CEE431350B13EFE1F7E3F0000803F119DC43E02781B55460A421A400300A82145E96B403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb32oc32 tg=mb2 loop_desc=kw,kh,kd,ic 2d=1 
reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4,stream_k model=01AE7662440258503F01344C3F348FE73FF6FFA74102F4501A4672E3CC3F0300B01C458896403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=4194304:ic%32==0:mb<=16777216:oc%64==0 ext=out_b1,out_b4,stream_k model=01673461440558233F00E8743FCE54843F15C0EB3E029CD41A46C7CCA7400300002045B57D403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k model=0181FA06450180693FFEEF9E3F0000803F7000BC3D0265EF6445F91FDB410367241C45C203413C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k model=0128F6324501F0713FFF7F8C3F313B8E3F1680BD3E022C3E3C46196F5E4103997D1F45A2B2403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc16 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k model=016C6903450168633FFE37983F0000803F17B7D13802AFA64947FA81B6410300F61C4545B8403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k model=01973A1E4506E0313F00A8783F67CE863F3480583E02B3AE89465A64314003CDFC25456C98403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k model=014F96DD4400C8803FFEA7B43F0000803F17B7D13802D0BDE9463AD3E74003CD2C224541BC403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k 
model=01C29607451270FA3E03A0533F65DF843FFE1FA63F02F9D76C460FDCAC3F0367241C45E4AE403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k model=011D7BDC4404802D3F0098923F9A3F833F18C0823E02CD205C469434F63F0333271E45D8B9403C", + "hw=xehpc prop=fwd src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b4,stream_k model=016255EE4403B44E3FFE3F9A3F0000803FD346A33B02FE8CB246DC0E864003E7A71E45A96B403C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2,stream_k model=0156AE874405902C3F07003A3F99B08D3F07E0183F028DF50746F14BA73F03CC2C404568EA383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2,stream_k model=01F4708D4404204C3F05541D3F0000803FFA3FEA3F0201A2304516BE1B3F03002A3445CA23393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2,stream_k model=01AEA8A144FE8FAE3F0608143F0000803F08F00D3F02997D8B45C4358E3F0333B33D451700393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 
ext=out_b1,out_b2,stream_k model=01E6EE834408681D3F07BC413F0000803F00508A3F02676DEA459D52353F0300004845B6D8383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 tg=ow2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2,stream_k model=01FB983F446E20FB3D0289563FFE31DD3FF1FFF74102898A0B468A57233F0366683C45BDDE383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2,stream_k model=011C5143443740113E0260573F4EE90440F6FFA74102A71AFA45AE08B73E0334D14645DBFB383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2,stream_k model=01C24E8D440250623F06782B3F0000803F07201F3F029A699945C8BF673F0300D83145516D393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc*outsz%64==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b1,out_b2,stream_k model=017600BE44FB37D93F0808003F0000803F17B7D138026059B445A28113400333FB3245E221393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb8oc32 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=010F0764440398513FFF4F943F0000803F17B7D13802CBCE8A46B5E5853F03CDCC334522EB383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad 
simd=32 regs=128 iter=ic8mb16oc32 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=01AC617A4431A06F3E009C663F0000803FFA5F1940027AB611469989AA3E030000344548C5383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=01D569B144FE2BB83F0590273F0000803F3640203E02805FAC45469A823F03322D3845FD18393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc2 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=01511F914400408B3F05F8283F9CA9883F1540D53E02C459B245C5904D3F0333A12C45B251393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow8 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=01C8308B4407E8153F0684423FCFE0853F0640133F02CC565E46DADF2940039AD92E455AF7383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=oc2ow2 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=01F9628F4408D0193F04644E3FFF51823F0660203F02CEFD6A4675F4AB3F03CD843445C0FA383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ic4 loop_desc=kw,kh,kd,ic 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=01D334B344FF0F983F0564173F0000803F17B7D13802DFD468468B04034003333731459439393C", + "hw=xehpc prop=fwd 
src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic16mb8oc32 tg=ow4 loop_desc=kw,kh,kd,ic 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc*outsz%64==0 ext=out_b1,out_b2,stream_k model=01D9F6934406D02D3F03E0543F0000803F1120C83E0222894F46032BAB3F039A093F452DD9383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=01678AEC4436B8203EFE6A793F9A988A3FFE9FB23F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2,stream_k model=01AC29AF440878123F00E8793F0000803F07A0123F02F54D56461202533F03002A3445CE30393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8mb8oc32 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2,stream_k model=01A042DB44038C3A3FFE25913F99D98E3F17B7D138020A58A846EC8F26410300D04145892C393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2,stream_k model=01A08CC8440218363FFF47973F3203E63FF97FDC3F023E741847AB5F6A4003CCAA41458DC2383C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow16 tg=oc2 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2 model=016027E84437B0193EFECF783FE88B883FFAEFFD3F", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2,stream_k model=019FF4B7440C00E33EFB5FBB3F98ED803F0020513F020068F6452BC3D33E039A093F455D16393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic8oc32ow8 tg=ic16 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2,stream_k model=01A4E8DB4404402C3FFEBFA13F33330F402E1A0C3E02F32A1B46EE22294103007C2C45FF42393C", + "hw=xehpc prop=fwd src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 
regs=128 iter=ic8oc32ow8 tg=ic4 loop_desc=kw,kh,kd,ic ext=out_b1,out_b2,stream_k model=01702CB74405001C3FFF47833F9C59C03F279A633E028589B446783BDC3F03007C36459637393C", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw8oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=014D7820440428443FFE4F943F0000803F32806C3E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw32oc32 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0161F1724401C0653FFD4FC13FCD28AC3FF6FFA741", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw32oc32 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01E38B12440834013F0526463F664E3B41F6FFA741", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 tg=ic4iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01716529441022F83EFD6F883F0000803F17B7D138", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw8oc64 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=014FF92D440288663FFFB7903F9989E33FFE3FA43F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw32oc32 tg=ic4iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01918A00441538C43E00A06C3FCCECB83FFAEF9840", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas 
simd=16 regs=256 iter=ic64iw32oc32 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=0194623D4414CEC93E0438433F5A08A242F6FFA741", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc64 tg=ic2iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=8388608:g*oc<=8388608:ic%32==0:iw<=16777216:oc%32==0:ow<=16777216:sd1sh1sw1 model=01BE8B3C440570333F00007B3F0000803FFB1FB03F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=01BBDC2844050E0E3FFFFF8E3FFD5F8C3F17B7D138", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=mb8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=017A752D440214443F015C833F9B91853F17B7D138", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc32 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0170BFC643186C9A3E02CC5E3F330F993F0598103F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=ic2iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=01522EE7432E78743E01EC703F9625A93FF9C33240", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb32oc32 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0140024F440EECF13E0364493F13F75342F6FFA741", + 
"hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=0127D726440870153FFF818F3FCBB4903F17B7D138", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb16oc32 tg=iw16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=01638AE54331E8633E04EE643F49CDD33FF57FFE40", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic64mb8oc64 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:sd1sh1sw1 model=012C2E564405A62A3F01346A3F1B2A973F1540853E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 tg=iw8 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=019DC21C4502A0623F0130643F0000803F28DA503E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 tg=ic4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=010997D84404E03B3F01406D3F0000803F1600C03E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32mb16oc16 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=018F9B1C450740223F0130783F00A4843F18009D3E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb8oc32 tg=ic2iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01BC811F4502E05E3F02E8653F0000803F2D9A0E3E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic32iw16oc16 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01D0D0F9441360E73E04883E3F36F7953FFCDFEA3F", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc16 tg=iw4 
loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=0131AEB1440180553F00207E3F0000803F35801C3E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw8oc32 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01183E1A4501D0733F00D0693F0000803F3200743E", + "hw=xehpc prop=bwd_d src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16iw16oc16 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=010A34C64406102A3F00B0613FCDC8823F07801E3F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2iw8 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=0128AA894403F06C3F03803C3F9979823FFEFF7F3F02405AF345F278A44003671630458E0D393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=01D1AA9244FF1F603F0130783F0000803F0140723F0200940746BB7BA53F03CDD030455831393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw16oc8 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=016A4251443818113E01A4593F32E0CC3FF6FFA741026685F745A3C6AD3E0399512A45F2E3383C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=0109FB9144FC1FAB3F04F03A3F32238B3FFFDF6D3F02E9FFBB45D3DDCA3F0300D831453415393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 
reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=015B6DA644FC2FA53FFD1F973F6731873F1210DD3E0233A7504582AA943F03345331459F21393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=019306894404C03D3F06883E3FCC32883F0200533F029A8D0C469CD1A33F0367163045E820393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=ic2 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=012C2B9044FDCF963F0398333FFF17833F0280693F024BE676458C1F753F03007C3645EA26393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc16 tg=iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic<=4194304:g*oc<=4194304:ic%16==0:iw<=16777216:oc%16==0:ow<=16777216:sd1sh1sw1 ext=stream_k model=0126AA89440000713F0498383F00BC873F00F08A3F0212EBDA456441383F0300D831459D15393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb32oc8 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=01CF60A544F3306E3DFE0B793FFF7E943FF6FFA241028A33D24634B1233F03000034456F0E393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=01E7CA934413A0DE3E00F0993F0000803F06301E3F0200DE80455F61053F03007035451BFF383C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=ic2iw2 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x3 
reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=017F58944407601B3F02D04B3F998E913F0FC0F53E02A0C5624670B3BF3F03CDCC3345B80E393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=010D3090440730283F03B84A3F64388C3F07400E3F02CC7846467FC6A93F0300FC36456B0B393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb16oc8 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=011A30824418408F3EFE5F663F6664813FFF0FA73F029ADAFD45257DAD3E03990145451EE3383C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=01F582AC440850123FFA8FAE3F31A1823F09A01C3F0266C642455162193F0300A03B455523393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw4 loop_desc=kw,kh,kd,oc 2d=1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=0150549644FF8F8B3F02A0493F0000803F07400E3F02D856CB4566A4C73F03CC2E35454018393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc16 tg=iw8 loop_desc=kw,kh,kd,oc 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0:sd1sh1sw1 ext=stream_k model=0169EC8A4408E0223F0418403F0000803F06A01C3F023F1E4B4620A2254003CD84344536FC383C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw16oc8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 
model=018A5CBC447848833DFEEF713F0000803FF67F4440", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic4iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 ext=stream_k model=0149B0B04406E01D3F01F06C3FCD58953F0940183F02B4A31B463295574003002637452B4F393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 ext=stream_k model=016966A04417D6853E0078673F0000803FFBAF004002B8931D460E774A3F03CDCC3345BA31393C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb16oc8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=015419FF4435D0263E00247F3F0000803FFABFD33F", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32mb8oc8 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 ext=stream_k model=01EB90AA4408B0073F0090813F0000803F0500123F026D0E454616E5CD3E0300B83E4553EA383C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw16oc8 tg=ic4 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 model=01CFA0D84473D8D93DFEBB733F0000803FF63FA941", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 tg=ic2 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 ext=stream_k model=01E2BCA34430285B3E0080823F9880873FFFBFA13F02F400074673FF473F0333233F4558FD383C", + "hw=xehpc prop=bwd_d src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=ic32iw8oc8 loop_desc=kw,kh,kd,oc reqs=sd1sh1sw1 ext=stream_k model=01619BAC441760923E0020833F0029813F00A07B3F02CC4FFC451586CD3E0300003E4582C7383C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc64ow16 tg=ic8oc4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k 
model=01B7EFA1437EC0BA3DFEAF753F33BF8A3F17B7D13802D50C1B46D6ABB840034F2B1F4555C63F3C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow64 tg=ic16oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k model=01F78357440F00D43EFEDF953FFFD7B83FF6FFA74102D17EF74547F86D410300E022451349403C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k model=0198D1C2437380CD3D025A493F3453AE3FF6FFA741024ECFB14598CE2C4003A043A8452E9A313C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k model=019E206D4336C0043E05503F3F9BD9B1403A03D23F02F403F0450038F13E0380A0AB457930323C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc64ow32 tg=ic8 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k model=018E4EA843F1FFF741F6FFA741CD04A741F6FFA741022DBFDE4585B9134003A512EA4547892E3C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic8oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k 
model=01B04DD7433380443E0290403FFF0FE13F5A607441025BE2B9455265C140032C45DA45603D323C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc64ow16 tg=ic8 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k model=011BCA70433540393E04D0373F658E9F3F9801344002E6091E46038BC83F03B4244345FA5F3E3C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc32ow64 tg=ic4oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=8388608:g*oc<=8388608:ic%32==0:iw%sw==0:iw<=16777216*sw:oc%32==0:oc<=4194304:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=out_b4,bias,stream_k model=019324CA43F2C0563D0260753F01404A40F6FFA741021350B745F7546640032087EE450A7F293C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc16 tg=ic2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=0151FC7443EA435C3C00C87B3F68E82E42F6FFA74102CCEC6F440DC8803F0314C119453314413C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=013641C143F283623C02E4553F72FEA142F6FFA7410265DE224437A3004103F4681F45BA8C413C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc16 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=01A77A5E43F1433E3C067B2D3F6603A242F6FFA74102CC0E2E458E316E3F03E7B10D45AB45423C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 
dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc64 tg=ic4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=01D6BC9443F6FF1942F6FFA74167C6C73FF6FFA741023BD2A246C3B8363F0367872645EFA43F3C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb64oc32 tg=ic4oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=010CA2E243F481C73CFF179A3F02E0DA3FF6FFA74102D6159B45D14A83400332912D4527A03F3C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc64 tg=ic2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=012BA58843F701B13CFE1F9C3F67663B40F6FFA74102668F9646EFD78B3F0318F728454A42403C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc32 tg=ic4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=01E7D38943F1FFF741F6FFA74167A6A23FF6FFA741026BDD0246816F283F03CD332745EB51403C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc32 tg=ic2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=8388608:g*oc*od*oh*ow<=8388608:ic%32==0:mb<=16777216:oc%32==0:oc<=4194304 ext=out_b4,bias,stream_k model=01A93E8143F6FF1942F6FFA741CD1CAA41F6FFA74102A86E1046B308683F03E6AE2345CA2E403C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc16ow16 tg=ic2oc2 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=0113F8C1447860983D0160613F66A6BF3FF6FFA74102D9490A46B7738C40034DA91F456EBA403C", + 
"hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow16 tg=ic8 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=0182F79C4478F0943D0338523F9AADB63FF6FFA741020000B445D188D9400343E119453C79413C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb32oc16 tg=oc4 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=013C43F244F4003F3D00407C3F3283B43FF6FFA7410200A33B465A77B9400367C111451F45413C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16mb16oc32 tg=oc2 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=01035103457700B23D01007B3F6506CC3FF6FFA741023435644631134C4003002A2045BF8D413C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8mb16oc16 tg=ic4 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=01EA3DAA44F2805A3D0100763F67B6A03FF6FFA741029AD9FE45574155400366981045BFC5413C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 tg=oc8 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=017BE6EA447280A53D02904A3F0120C23FF6FFA74102CDF65B46C3423F410366241C459BFB403C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic16oc32ow16 tg=ic2 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=01ACA0FB4478008A3DFF3F813F0040E73FF6FFA7410233A65A4628814A40038F671E45BAFB403C", + "hw=xehpc prop=bwd_w src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=dpas simd=16 regs=256 iter=ic8oc16ow32 loop_desc=ow,oh,od,mb ext=out_b4,bias,stream_k model=01473CEC447260AC3D01C8583FCD44C03FF6FFA741028C582146ADF6C53F03668E1745BC9D413C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic4oc4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 
ext=bias,stream_k model=0192300144F481CC3C0130783F01AC1142F6FFA741028698D2458F0BA840036710344596AD393C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow16 tg=oc4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias,stream_k model=01DCADEF43F9419E3C0120833F328F3542F6FFA74102B7E91846B58F9F3F03B281294564E53A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias,stream_k model=0108BA9A43F481A43C00AC883F00989B42F6FFA7410267AD06469D0A6E3E034E4A3A4523A43A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic2oc4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias,stream_k model=010B38F543F461C43C016C553F9AC9CA3FF6FFA7410225CADD45BBD62B400399062D45859E393C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow64 tg=ic4oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias,stream_k model=0184CC5544F6000D3D0200583F329F2B40F6FFA74102268BAF450C14D140034C572745E19F3A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic2oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias,stream_k model=0134DC7A437A00803D01904F3FFE1FC23F5A0075410218FFF24566CDBD3E033D432D45E4953A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 
dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic2oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias,stream_k model=014AFC9B43F581B33C0268633FCEACEE3FF6FFA741023339E145D198323F03C06B21451C1D3A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow32 tg=ic2oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*sw<=4194304:g*oc<=4194304:ic%16==0:iw%sw==0:iw<=16777216*sw:oc%16==0:ow<=16777216:sw==1|kw==1:sw==1|pw==0 ext=bias,stream_k model=012A15F343F661A63C0304593F66D62840F6FFA74102011ACE445381F93F038D6E2345E64C3B3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb32oc16 tg=ic2oc4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k model=01F5310544F681953C0308693FB582A142F6FFA74102006A01453167AF4003A7F02445D3003B3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic4oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k model=0129A86B44FA41853CFFB38A3FCEDC5A41F6FFA7410235295F450219D640038D60194502E33D3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16mb8oc32 tg=oc4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k model=010C340444F480283DFFA7873F00A09640F6FFA74102832D65462FDC9C3F0366D22345574B3A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16mb8oc16 tg=ic2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k 
model=01AA77B543FA003A3D0260753F32DB0140F6FFA74102C20F6946C3B7AD3E03E7A0194524C03A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc16 tg=ic2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k model=014E118443F303423CFFA3813FCCCC8641F6FFA741023213B34419BFBA3E030153174592383E3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb64oc16 tg=ic2oc4 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k model=01B1B47344F9C1873C01E0813F00139F42F6FFA7410200001844BFCF53410300231B4532EB3C3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb16oc16 tg=ic2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x1 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k model=013D84AF43F581B33C01F05D3F6786A040F6FFA741020038704550E20C3F0325742745A4243B3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8mb8oc16 tg=ic2oc2 loop_desc=ow,oh,od,mb 2d=1 prefetch=x3 reqs=g*ic*id*ih*iw<=4194304:g*oc*od*oh*ow<=4194304:ic%16==0:mb<=16777216:oc%16==0 ext=bias,stream_k model=01298E8543F307D03B01009E3FFE470940F6FFA7410277262A46776AC33E030EF22545D18E3A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16mb8oc16 tg=oc16 loop_desc=ow,oh,od,mb ext=bias,stream_k model=010C5635447600A83D0060843F9BD90E40FAFF194002E6330C46AEC57F400367463645C0263A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow8 tg=ic8 loop_desc=ow,oh,od,mb ext=bias,stream_k model=014A5CFB43F001E33C02986A3F66B53442F6FFA74102A6D8A645683904400300203B45C5DD383C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 
iter=ic8mb16oc16 tg=ic2oc2 loop_desc=ow,oh,od,mb ext=bias,stream_k model=012B0E4044F381C73CFEE77E3F33CBF641F6FFA741025E84CD46C76AE03F03662B214598513B3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic4oc2 loop_desc=ow,oh,od,mb ext=bias,stream_k model=0171ED2E44F401C03C01A0673FFF1F1042F6FFA74102E6267E4681A3544003E8B5284545483A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=ic8 loop_desc=ow,oh,od,mb ext=bias,stream_k model=0174932F44F421D23C03445F3FE86EA142F6FFA74102E600624662FA5840031BB32645AE543A3C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic16oc16ow8 loop_desc=ow,oh,od,mb ext=bias,stream_k model=01430E24447500FD3D01E0543F6566DE4060000C41021BCB1046123EB53E0332CF3D4533FD383C", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic32oc16ow8 tg=oc16 loop_desc=ow,oh,od,mb ext=bias model=01C6987744F3C1D23CFFC3843F66461941F6FFA741", + "hw=xehpc prop=bwd_w src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=16 regs=128 iter=ic8oc16ow16 tg=oc4 loop_desc=ow,oh,od,mb ext=bias,stream_k model=01F6E23344F401C03C01B46C3F67CEA940F6FFA741020A7B7046B579D93F0300E631454F233A3C", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k model=01815DA6450000803F0000803F9971E03F0800263F02341BA245ACB1004103AD982C4736DE0240", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k model=016972AA450000803F0000803F3303E63F1100C53E0271389545F0D3014103B7B63447FC3A0F40", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k 
model=0142D1B0450000803F0000803F00C0B53F0400353F025D688A4585CF7F41038C2DB64714D3E53F", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k model=01681FAA45000A803F0000803F667EDB3F7600B23D02D9289845EAEA00400369B06C4797060240", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k model=01E793A8450000803F0000803F00B0D43F18809F3E022881A54548F27D40036E50134724190340", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k model=01A886A5450000803F0000803F0080CE3F0900083F028877A445B36580410373110647688A1C40", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32mb8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k model=0169E3AC450000803F0000803F0168AE3F7680BE3D025A839445E8E0773F03AD0A014786531E40", + "hw=xehpc prop=fwd dw=1 src=axb:s8 wei=axcb:s8 dst=axb:s8 fma=mad simd=32 regs=128 iter=g32ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b2,out_b4,stream_k model=01C95CAB450000803F0000803F9BC9AD3F7200CB3D0276EFA645E3C87A3F03A99A08471ECC1040", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4,stream_k model=0105F91E450000803F0000803FCB0C5E40F6FFA7410228ADAE4515AF914103A5F6F74689700640", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4,stream_k model=0105351C450000803F0000803F65767F40F6FFA7410236E2AD45D5498D4003FADB1747CA06FB3F", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=ow16 loop_desc=kw,kh,kd,ic reqs=ic1oc1 
ext=out_b1,out_b4,stream_k model=0198FE20450000803F0000803F98898640F6FFA741026059A64513C8934103084B3D4716210740", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4,stream_k model=010F9F18450000803F0000803F0240ED40F6FFA74102303BA845B68F174003A4B6174736B40E40", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4,stream_k model=017DE51E450000803F0000803F9B495F40F6FFA74102D032AA4540910F4103DE54284780630340", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4,stream_k model=01254E17450000803F0000803F66C64740F6FFA74102C02B9D45FE4B8C3F039BD20A47AACD0C40", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4,stream_k model=013E5016450000803F0000803F6138AF40F6FFA74102407F99451BEF104003885B45479BC11240", + "hw=xehpc prop=fwd dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b4,stream_k model=01D68A16450000803F0000803F66467940FFFF893F0200CCB0451AB68E3F034ACF10472C740A40", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=012FC8E0440000803F0000803F67CE0640FF7FA53F02B7028E450D05834003D2F93047488DE93F", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow4 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=01F9F0E54400A0803F0000803F66DEE73F02004E3F02374FAB45789A7F4003C561D646F7D01640", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 
tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=01B290E9440060843F0000803F00600C400040863F02807BA84521F30640030F04FB461F321040", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=01CB92E8440080823F0000803FCC7CE83FFCBFAB3F029DC8A84592D7004103E0313B477F17D83F", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=01B20FDC4400C07E3F0000803F3263F23FFB7FD73F02FBF59A454A7EFF3F03718F38475F410840", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=01DB38DB440000803F0000803F99B12A40F6BF3841029859A94545AC763F0387E6E246D8CE1340", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=01800BDE440000803F0000803FCB7CE83FFE7FB93F02FBE099456C8A773F038978F84675D00840", + "hw=xehpc prop=fwd dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 tg=ow2 loop_desc=kw,kh,kd,ic reqs=ic1oc1 ext=out_b1,out_b2,stream_k model=014F07E04400C07E3F0000803F9919F63F1100D93E02E9FCAA454DD0014003E1591F4703EF0340", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01B95319450000803F0000803F61049840F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=014DE818450000803F0000803F63C68740F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 
model=0178A015450000803F0000803F66E66C40F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=0104AD16450000803F0000803F5C68B540F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=010FDB15450000803F0000803FFA93C640F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=010A1E12450000803F0000803F6646B940F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01913112450000803F0000803F32B3C040F6FFA741", + "hw=xehpc prop=bwd_d dw=1 src=axb:bf16 wei=axbc:bf16 dst=axb:bf16 fma=mad simd=32 regs=128 iter=g32iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 model=01521D17450000803F0000803F0110444003005D3F", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw4 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k model=011F03E24400B07F3F0000803F6726F13FFCBFC93F0243DD9E45E938814003B5DC7C462CA44D40", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw16 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k model=0190A3E0440040813F0000803F9A01F33FF7FFDE3F0265A0A345902A71410338650247B6F4F93F", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k model=0170E4DD440000803F0000803F68FE0C40FFFF843F022EE5A145F925014003AA01EF46A6AB0D40", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 tg=mb2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k 
model=01DB38DB4400B07F3F0000803FCD0CFB3FF67F23410243DD9E452A03004003E47ACF465C371A40", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k model=01D57BD74400A0803F0000803FCBACEE3FFC7F03400238FB9C455F70F6400385792D472C4DFE3F", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k model=019B1DE8440040863F0000803FCB8CC93F0460663F028CCF9B454974773F03630C92460BCB2F40", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 tg=iw2 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k model=01CA01D0440000853F0000803F9A391B40FB7F1740029D62A045E6F5F13F03E4C8074732001040", + "hw=xehpc prop=bwd_d dw=1 src=axb:f32 wei=axbc:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32iw8 loop_desc=kw,kh,kd,oc reqs=ic1oc1sd1sh1sw1 ext=stream_k model=010362DA440000803F0000803F99D1EC3FFF3F813F02C0C9A045C1A16C3F0397B32247EF820040", + "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16mb8 loop_desc=ow,oh,od,mb reqs=ic1oc1 ext=out_b4,bias,stream_k model=0148676E44F73F9A411580953E99D901410000803F02522C8C460655AF3E034B81964535E13F40", + "hw=xehpc prop=bwd_w dw=1 src=axb:bf16 wei=axcb:bf16 dst=axb:bf16 fma=mad simd=16 regs=128 iter=g16ow8 loop_desc=ow,oh,od,mb reqs=ic1oc1 ext=out_b4,bias,stream_k model=0177D35B443500243E0100533F00208C420000803F02C2668146BB4DAB3E03712E2446DACFB33F", + "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32mb8 loop_desc=ow,oh,od,mb reqs=ic1oc1 ext=bias,stream_k model=01C2615E4403005D3F03805A3F9999FD410000803F02C2ED6446FBD7943F03ECF20346E7D4E13F", + "hw=xehpc prop=bwd_w dw=1 src=axb:f32 wei=axcb:f32 dst=axb:f32 fma=mad simd=32 regs=128 iter=g32ow8 loop_desc=ow,oh,od,mb reqs=ic1oc1 ext=bias,stream_k 
model=016B324644F800303DFE9F6C3FFF5F5A420000803F025673A5463879FA3F035BDCEC4546553A40", nullptr, }; return entries; diff --git a/src/gpu/intel/jit/v2/conv/planner/bench.cpp b/src/gpu/intel/jit/v2/conv/planner/bench.cpp index bfcbf456d97..f28bc8c909d 100644 --- a/src/gpu/intel/jit/v2/conv/planner/bench.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/bench.cpp @@ -134,26 +134,45 @@ class bench_task_base_t { strm.wait(); int ntasks = (int)vec.size(); int nentries = 0; + int nkernels = 0; CHECK(dnnl_query_profiling_data( strm.get(), profiling_data_kind::time, &nentries, nullptr)); + CHECK(dnnl_query_profiling_data(strm.get(), + profiling_data_kind::time_per_kernel, &nkernels, nullptr)); ir_assert(nentries == ntasks * iters); std::vector entries(nentries); + std::vector kernel_entries; CHECK(dnnl_query_profiling_data(strm.get(), profiling_data_kind::time, &nentries, entries.data())); - for (int i = 0; i < ntasks * iters; i += iters) { - auto time = entries[i]; + int kernels_per_entry = ir_utils::safe_div(nkernels, nentries); + if (kernels_per_entry > 1) { + kernel_entries.resize(nkernels); + CHECK(dnnl_query_profiling_data(strm.get(), + profiling_data_kind::time_per_kernel, &nkernels, + kernel_entries.data())); + } + auto get_bench_time = [&](int i, int j) { + int idx = iters * i + j; + if (kernels_per_entry == 1) return bench_time_t(entries[idx]); + int beg = idx * kernels_per_entry; + int end = idx * kernels_per_entry + kernels_per_entry; + return bench_time_t(entries[idx], kernel_entries.begin() + beg, + kernel_entries.begin() + end); + }; + for (int i = 0; i < ntasks; i++) { + auto time = get_bench_time(i, 0); for (int j = 1; j < iters; j++) { - time = std::min(time, entries[i + j]); + auto j_time = get_bench_time(i, j); + time = time.min(j_time); } - vec[i / iters].set_time(time); + vec[i].set_time(time); } - return status::success; } - uint64_t time() const { return time_; } - void set_time(uint64_t time) { time_ = time; } + const bench_time_t &time() const { return 
time_; } + void set_time(const bench_time_t &time) { time_ = time; } protected: void set_primitive(const primitive &prim) { prim_ = prim; } @@ -181,12 +200,13 @@ class bench_task_base_t { } primitive prim_; - uint64_t time_ = 0; + bench_time_t time_; }; using problem_t = dnnl::impl::gpu::intel::jit::v2::conv::problem_t; using kernel_desc_t = dnnl::impl::gpu::intel::jit::v2::conv::kernel_desc_t; using bench_data_t = dnnl::impl::gpu::intel::jit::v2::conv::bench_data_t; +using bench_time_t = dnnl::impl::gpu::intel::jit::v2::conv::bench_time_t; using pvar_tile_t = dnnl::impl::gpu::intel::jit::pvar_tile_t; namespace pvars = dnnl::impl::gpu::intel::jit::pvars; @@ -489,7 +509,7 @@ std::vector generate_problems(const bench_input_params_t &params) { continue; auto prb = params.problem(); prb.set_shape(shape); - if (!params.reqs.fits(prb.shape())) continue; + if (!params.reqs.fits(prb.shape() | prb.vars())) continue; ret.push_back(prb); if ((int)ret.size() >= params.nprbs) break; } @@ -531,11 +551,6 @@ bench_data_t bench(const bench_manager_t &bench_mger, << std::endl; clear_primitive_cache(); - { - auto guard = plan_preset_t::instance().make_guard(kernel_desc); - if (!tasks[0].init_primitive(eng)) return {}; - } - ir_assert(kernel_desc.spec_strategy == spec_strategy_t::none); auto kernel_desc_min_dims = kernel_desc; kernel_desc_min_dims.spec_strategy = spec_strategy_t::min_dims; @@ -683,6 +698,21 @@ kernel_desc_t try_extensions( } } + // Try Stream-K.
+ if (kernel_desc.prop != prop_kind::backward_data + || (kernel_desc.a_type() == type_t::f32() + && kernel_desc.b_type() == type_t::f32())) { + auto d = to_stream_k(kernel_desc, /*check_ext=*/false); + if (!d.is_empty()) { + d.is_finalized = false; + d.set_defaults(); + if (finalize_conv_desc(d, bench_mger.hw()) + && try_create(bench_mger, d)) { + ext.add(extension_kind_t::stream_k); + } + } + } + prb_reqs_t out_reqs; prb_reqs_t::merge(reqs_vec, out_type_sizes, pvar_t("outsz"), out_reqs); auto _kernel_desc = kernel_desc; diff --git a/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp b/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp index 6e96aad9e0e..e7419ac705a 100644 --- a/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/model_fit.cpp @@ -125,14 +125,8 @@ void find_optimal_param( } // namespace -model_t model_fit_data_parallel(const vec2d &X, const vec1d &y, bool verbose) { - model_params_t params(model_kind_t::data_parallel); - // Empirically-based parameter ranges. - params.add("T0", 1000, 1, 100000); - params.add("a_kl", 1, 0.0001f, 100); - params.add("b_kl", 1, 0.0001f, 100); - params.add("a_wp", 2, 1, 100); - params.add("b_wp", 1, 0.0001f, 100); +model_t model_fit( + model_params_t &params, const vec2d &X, const vec1d &y, bool verbose) { int nparams = params.size(); // Perform a coordinate descent search optimizing one parameter at a time. // The goal is to maximize R2.
See conv/model.cpp file for more details on @@ -153,21 +147,21 @@ model_t model_fit_data_parallel(const vec2d &X, const vec1d &y, bool verbose) { model_t model_fit(model_kind_t kind, const vec2d &X, const vec1d &y, bool verbose = false) { - switch (kind) { - case model_kind_t::data_parallel: - return model_fit_data_parallel(X, y, verbose); - default: ir_error_not_expected() << "Unknown kind: " << to_string(kind); + model_params_t params(kind); + std::vector param_names; + std::vector param_values; + std::vector param_min; + std::vector param_max; + model_t::coef_ranges( + kind, X, y, param_names, param_values, param_min, param_max); + for (size_t i = 0; i < param_names.size(); i++) { + params.add(param_names[i], param_values[i], param_min[i], param_max[i]); } - return model_t(); + return model_fit(params, X, y, verbose); } -model_t model_fit(const bench_data_t &bd) { - if (!bd) { - std::cout << "Warning: empty bench_data." << std::endl; - return model_t(); - } +model_t model_fit(model_kind_t kind, const bench_data_t &bd) { // Step 1. Fit model. - model_kind_t kind = model_kind_t::data_parallel; vec2d X; vec1d y; to_model_data(kind, bd, X, y); @@ -188,7 +182,23 @@ model_t model_fit(const bench_data_t &bd) { model = model_fit(kind, X_adjusted, y_adjusted, /*verbose=*/true); dump_csv(bd, model); dump_model_params(bd.kernel_desc, model); - return model_t(model); + return model; +} + +void model_fit(const bench_data_t &bd, model_set_t &model_set) { + if (!bd) { + std::cout << "Warning: empty bench_data." 
<< std::endl; + return; + } + if (bd.kernel_desc.use_stream_k) { + auto model1 = model_fit(model_kind_t::stream_k, bd); + auto model2 = model_fit(model_kind_t::data_copy, bd); + model_set.add(model1); + model_set.add(model2); + } else { + auto model = model_fit(model_kind_t::data_parallel, bd); + model_set.add(model); + } } } // namespace planner diff --git a/src/gpu/intel/jit/v2/conv/planner/model_fit.hpp b/src/gpu/intel/jit/v2/conv/planner/model_fit.hpp index f5f7be24d02..5a659ff3ae2 100644 --- a/src/gpu/intel/jit/v2/conv/planner/model_fit.hpp +++ b/src/gpu/intel/jit/v2/conv/planner/model_fit.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ namespace v2 { namespace conv { namespace planner { -model_t model_fit(const bench_data_t &bd); +void model_fit(const bench_data_t &bd, model_set_t &model_set); } // namespace planner } // namespace conv diff --git a/src/gpu/intel/jit/v2/conv/planner/planner.cpp b/src/gpu/intel/jit/v2/conv/planner/planner.cpp index 3115584e8ce..3e4a3edf6ac 100644 --- a/src/gpu/intel/jit/v2/conv/planner/planner.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/planner.cpp @@ -135,7 +135,8 @@ void DNNL_API planner_main(int argc, const char **argv) { } case planner_mode_t::bench: { auto bd = bench(bench_mger, params.desc); - auto model = model_fit(bd); + model_set_t model_set; + model_fit(bd, model_set); break; } case planner_mode_t::auto_search: diff --git a/src/gpu/intel/jit/v2/conv/planner/search.cpp b/src/gpu/intel/jit/v2/conv/planner/search.cpp index eb183c5bb76..3b029cee151 100644 --- a/src/gpu/intel/jit/v2/conv/planner/search.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/search.cpp @@ -384,18 +384,12 @@ class kernel_search_manager_t { void search() { std::cout <<
"Starting kernel search" << std::endl; auto desc_groups = gen_desc_groups(); - auto &registry = plan_registry(); for (auto &dg : desc_groups) { auto bench_data_set = bench_kernel_desc_group( bench_mger_, dg, bench_nprbs, max_descs); auto best = bench_data_set.find_best(registry_top_k); for (auto &bd : best) { - auto &d = bd.kernel_desc; - auto bd_model = bench(bench_mger_, d, model_nprbs); - if (!bd_model) continue; - auto model = model_fit(bd_model); - auto d_ext = try_extensions(bench_mger_, d); - registry.set(d_ext, model_set_t(model)); + update_registry(bd.kernel_desc); } } std::cout << "Kernel search completed" << std::endl; @@ -479,24 +473,20 @@ class kernel_search_manager_t { return tiles; } - // TODO: Use search_desc. - void search_desc(const kernel_desc_t &_desc) const { - auto iter_outer_tiles = generate_iter_outer_tiles(_desc); + void update_registry(const kernel_desc_t &desc) const { auto &registry = plan_registry(); - for (auto &iter_outer : iter_outer_tiles) { - auto desc = _desc; - desc.iter_outer_tile = iter_outer; - std::cout << "Running benchmark for descriptor: " << desc.cmd_str() - << std::endl; - auto bd = bench(bench_mger_, desc); - if (!bd) { - std::cout << "Benchmarking failed" << std::endl; - continue; - } - auto model = model_fit(bd); - registry.set(desc, model); - return; + auto bd = bench(bench_mger_, desc, model_nprbs); + if (!bd) return; + model_set_t model_set; + model_fit(bd, model_set); + auto d_ext = try_extensions(bench_mger_, desc); + if (d_ext.ext.has(extension_kind_t::stream_k)) { + // Fit another model for Stream-K. + auto d_sk = to_stream_k(d_ext); + auto bd = bench(bench_mger_, d_sk, model_nprbs); + model_fit(bd, model_set); } + registry.set(d_ext, model_set); } const bench_manager_t &bench_mger_; @@ -666,7 +656,6 @@ void auto_search( kernel_desc_t desc; parse_result_t parse_result; iface.parse(line, desc, &parse_result); - //auto r = std::string(_r) + " --iter x --tg x"; // TODO: Remove.
desc.hw = hw_t(bench_mger.get_engine().get()); kernel_search_manager_t mger( diff --git a/src/gpu/intel/jit/v2/conv/problem.cpp b/src/gpu/intel/jit/v2/conv/problem.cpp index 1d1fb692b30..a6e46f882ba 100644 --- a/src/gpu/intel/jit/v2/conv/problem.cpp +++ b/src/gpu/intel/jit/v2/conv/problem.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -156,12 +156,14 @@ std::string problem_t::desc_str() const { std::string problem_t::str() const { std::ostringstream oss; oss << "Conv problem" << std::endl; - oss << " HW: " << to_string(hw_.to_ngen()) << std::endl; - oss << " Propagation: " << jit::to_string(prop_) << std::endl; - oss << " Source: " << src_tag_ << std::endl; - oss << " Weights: " << wei_tag_ << std::endl; - oss << " Destination: " << dst_tag_ << std::endl; - oss << " Descriptor: " << desc_str(); + oss << " HW: " << to_string(hw_.to_ngen()) << std::endl; + oss << " Propagation: " << jit::to_string(prop_) << std::endl; + oss << " Source: " << src_tag_ << std::endl; + oss << " Weights: " << wei_tag_ << std::endl; + oss << " Destination: " << dst_tag_ << std::endl; + oss << " With post-ops: " << ir_utils::to_string(with_post_ops_) + << std::endl; + oss << " Descriptor: " << desc_str(); return oss.str(); } diff --git a/src/gpu/intel/jit/v2/conv/problem.hpp b/src/gpu/intel/jit/v2/conv/problem.hpp index 101150d4c69..56e2862274a 100644 --- a/src/gpu/intel/jit/v2/conv/problem.hpp +++ b/src/gpu/intel/jit/v2/conv/problem.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -52,6 +52,7 @@ class problem_t { return src_tag_; } const pvar_tile_t &shape() const { return shape_; } + bool with_post_ops() const { return with_post_ops_; } pvar_map_t vars() const; bool is_depthwise() const { dim_t g = shape_.at(pvars::g); @@ -70,6 +71,7 @@ class problem_t { void set_dst_tag(const layout_tag_t &tag) { dst_tag_ = tag; } void set_bias_type(const type_t &bias_type) { bias_type_ = bias_type; } void set_shape(const pvar_tile_t &shape) { shape_ = shape; } + void set_with_post_ops(bool value) { with_post_ops_ = value; } bool with_bias_fwd() const { return prop_ == prop_kind::forward && !bias_type_.is_undef(); } @@ -98,6 +100,7 @@ class problem_t { type_t bias_type_; pvar_tile_t shape_; std::array dhw_map_; + bool with_post_ops_ = false; }; } // namespace conv diff --git a/src/gpu/intel/jit/v2/ir/tensor.hpp b/src/gpu/intel/jit/v2/ir/tensor.hpp index 021cac44676..86d8361d5b3 100644 --- a/src/gpu/intel/jit/v2/ir/tensor.hpp +++ b/src/gpu/intel/jit/v2/ir/tensor.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -305,12 +305,20 @@ class layout_tag_t { const layout_raw_tag_t &raw_tag() const { return raw_tag_; } bool matches(const layout_tag_t &other, const pvar_tile_t &sizes, bool check_type = true) const; + layout_tag_t with_type(const type_t &new_type) const { + return layout_tag_t(desc_, new_type, raw_tag_); + } std::string str() const; IR_DEFINE_DUMP() -#if __cplusplus >= 202002L - bool operator==(const layout_tag_t &other) const = default; -#endif + bool operator==(const layout_tag_t &other) const { + return (desc_ == other.desc_) && (type_ == other.type_) + && (raw_tag_ == other.raw_tag_); + } + + bool operator!=(const layout_tag_t &other) const { + return !operator==(other); + } void stringify(std::ostream &out) const { jit::stringify(out, raw_tag_); From f766ccd3072c0a214cb0dcf70fd80c7f6e6a72a5 Mon Sep 17 00:00:00 2001 From: "Chereshnev, Eugene" Date: Mon, 6 Jan 2025 17:16:59 -0800 Subject: [PATCH 14/14] xe: conv_v2: remove hw from kernel descriptor --- src/gpu/intel/jit/v2/conv/builder.cpp | 4 +- src/gpu/intel/jit/v2/conv/kernel_desc.cpp | 15 ++++--- src/gpu/intel/jit/v2/conv/kernel_desc.hpp | 4 +- src/gpu/intel/jit/v2/conv/plan.cpp | 45 ++++++++++---------- src/gpu/intel/jit/v2/conv/plan.hpp | 2 +- src/gpu/intel/jit/v2/conv/planner/bench.cpp | 5 ++- src/gpu/intel/jit/v2/conv/planner/bench.hpp | 9 ++-- src/gpu/intel/jit/v2/conv/planner/search.cpp | 9 ++-- src/gpu/intel/utils.hpp | 2 +- 9 files changed, 48 insertions(+), 47 deletions(-) diff --git a/src/gpu/intel/jit/v2/conv/builder.cpp b/src/gpu/intel/jit/v2/conv/builder.cpp index 74daff02911..72ddfba4c64 100644 --- a/src/gpu/intel/jit/v2/conv/builder.cpp +++ b/src/gpu/intel/jit/v2/conv/builder.cpp @@ -544,7 +544,7 @@ class post_op_builder_t : public ir_builder_t { emit(func.call({expr_t(rhs.elems()), rhs_buf})); } ir_assert(lhs.nblocks() > 0); - int max_simd = (2 * desc_.hw.grf_size()) / sizeof(float); + int max_simd = (2 * desc_.hw_desc.grf_size()) / sizeof(float); auto &lhs0 = lhs.blocks()[0]; int elems = 
math::gcd(max_simd, lhs0.int_size()); bool is_bcast = !rhs.dim_sizes().has(lhs0.dim); @@ -966,7 +966,7 @@ class conv_builder_t : public ir_builder_t { stmt_t build_ir(const exec_config_t &exec_cfg, const kernel_desc_t &desc, var_manager_t &var_mgr) { - auto plan = create_conv_plan(desc); + auto plan = create_conv_plan(desc, exec_cfg.hw()); if (!plan) ir_except_not_implemented("Cannot create plan."); ir_info() << desc << std::endl; diff --git a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp index 643269ab0ab..f6a74a74b6f 100644 --- a/src/gpu/intel/jit/v2/conv/kernel_desc.cpp +++ b/src/gpu/intel/jit/v2/conv/kernel_desc.cpp @@ -192,26 +192,27 @@ int estimate_grf_usage_bytes(const kernel_desc_t &desc) { return into(abc_size); } -bool is_tg_size_ok(const kernel_desc_t &desc) { - int max_tg_size = desc.hw.max_tg_size(desc.regs, desc.simd); +bool is_tg_size_ok(const kernel_desc_t &desc, const hw_t &hw) { + int max_tg_size = hw.max_tg_size(desc.regs, desc.simd); return desc.thread_group_tile.elems() <= max_tg_size; } bool is_grf_usage_ok(const kernel_desc_t &desc) { int size = estimate_grf_usage_bytes(desc); - if (size > desc.hw.grf_size() * desc.regs) { return false; } + if (size > desc.hw_desc.grf_size() * desc.regs) { return false; } return true; } -bool kernel_desc_t::is_supported() const { +bool kernel_desc_t::is_supported(const hw_t &hw) const { ir_check(prop != prop_kind::undef) << "Invalid prop: " << ir_utils::to_string(prop); - ir_check(!hw.is_undef()) << "Invalid hw: " << jit::to_string(hw.to_ngen()); + ir_check(hw_desc.hw != ngen::HW::Unknown) + << "Invalid hw: " << jit::to_string(hw_desc.hw); ir_check(fma != fma_kind_t::undef) << "Invalid fma: " << jit::to_string(fma); ir_check(simd != 0) << "Invalid simd: " << simd; ir_check(regs != 0) << "Invalid regs: " << regs; - ir_check(is_tg_size_ok(*this)) + ir_check(is_tg_size_ok(*this, hw)) << "Invalid thread_group_tile: " << thread_group_tile; if (use_stream_k) { 
ir_check(c_type() == accumulator_type(a_type(), b_type())) @@ -417,7 +418,7 @@ std::string kernel_desc_t::str() const { oss << "Source tag: " << src_tag << std::endl; oss << "Weights tag: " << wei_tag << std::endl; oss << "Destination tag: " << dst_tag << std::endl; - oss << "HW: " << jit::to_string(hw.to_ngen()) + oss << "HW: " << jit::to_string(hw_desc.hw) << std::endl; oss << "FMA kind: " << to_string(fma) << std::endl; oss << "SIMD: " << simd << std::endl; diff --git a/src/gpu/intel/jit/v2/conv/kernel_desc.hpp b/src/gpu/intel/jit/v2/conv/kernel_desc.hpp index 60e6c22f44d..a17c844736e 100644 --- a/src/gpu/intel/jit/v2/conv/kernel_desc.hpp +++ b/src/gpu/intel/jit/v2/conv/kernel_desc.hpp @@ -49,6 +49,7 @@ namespace conv { struct hw_desc_t { ngen::HW hw = ngen::HW::Unknown; + int grf_size() const { return ngen::GRF::bytes(hw); } void stringify(std::ostream &out) const { jit::stringify(out, hw); } void parse(std::istream &in) { jit::parse(in, hw); } #if __cplusplus >= 202002L @@ -289,11 +290,10 @@ class kernel_desc_t : public kernel_desc_base_t { extensions_t ext; gpu_post_ops_t post_ops; - hw_t hw; bool is_finalized = false; bool is_empty() const { return prop == prop_kind::undef; } - bool is_supported() const; + bool is_supported(const hw_t &hw) const; void set(const std::string &s); void set_defaults(); void finalize(const prb_reqs_t &final_reqs); diff --git a/src/gpu/intel/jit/v2/conv/plan.cpp b/src/gpu/intel/jit/v2/conv/plan.cpp index 48135555fc6..87bdff34798 100644 --- a/src/gpu/intel/jit/v2/conv/plan.cpp +++ b/src/gpu/intel/jit/v2/conv/plan.cpp @@ -335,7 +335,8 @@ class multiply_info_t { class plan_builder_t { public: plan_builder_t() = default; - plan_builder_t(const kernel_desc_t &desc) : desc_(desc) { + plan_builder_t(const kernel_desc_t &desc, const hw_t &hw) + : desc_(desc), hw_(hw) { reqs_ = desc_.reqs; desc_.reqs = prb_reqs_t(); } @@ -437,12 +438,12 @@ class plan_builder_t { } plan_t init_plan() { - plan_t plan(desc_.hw); + plan_t plan(hw_); if 
(!try_init_plan(plan, reqs_) || !check_plan(plan)) return plan_t(); // Re-create plan to ensure all collected requirements are cross-used // between sub-plans. - plan = plan_t(desc_.hw); + plan = plan_t(hw_); if (!try_init_plan(plan, reqs_) || !check_plan(plan)) { ir_error_not_expected(); return plan_t(); @@ -525,10 +526,10 @@ class plan_builder_t { } else { auto &src = load.reg_layout(); auto dst = mul_info_.to_compatible_layout(abc, load.reg_layout()); - reorder = reorder_plan_t(desc_.hw, src, dst); + reorder = reorder_plan_t(hw_, src, dst); reg_layout = reorder.dst; } - plan = x2r_plan_t(desc_.hw); + plan = x2r_plan_t(hw_); plan.tensor_kind = abc; plan.load = std::move(load); plan.reorder = std::move(reorder); @@ -545,7 +546,7 @@ class plan_builder_t { auto inst_tile = mul_info_.inst_tile(); auto acc_layout = mul_info_.acc_layout(a, b, c_layout_); ir_check(!acc_layout.is_empty()) << "init_fma_plan: cannot vectorize."; - plan = fma_plan_t(desc_.hw); + plan = fma_plan_t(hw_); plan.simd = desc_.simd; plan.fma = desc_.fma; plan.a_layout = a; @@ -648,7 +649,7 @@ class plan_builder_t { if (bias_reg_layout != store.reg_layout()) { auto store_layout = store.reg_layout(); if (bias_reg_layout != store_layout) { - plan.bias_reorder = reorder_plan_t(desc_.hw); + plan.bias_reorder = reorder_plan_t(hw_); plan.bias_reorder.src = std::move(bias_reg_layout); plan.bias_reorder.dst = std::move(store_layout); } @@ -691,7 +692,7 @@ class plan_builder_t { ir_assert(k_tg > 1); ir_assert(desc_.thread_group_tile.elems() == k_tg) << "Local k-slicing assumes no split by M/N."; - ir_check(c_layout.size() % desc_.hw.grf_size() == 0) + ir_check(c_layout.size() % hw_.grf_size() == 0) << "init_slm_reduce_plan: c_layout is not aligned to a " "reigster boundary."; @@ -740,11 +741,11 @@ class plan_builder_t { auto &load_layout = load.reg_layout(); auto reduced_layout = load_layout.map(split_view.tile()); - auto reduce = reduce_plan_t(desc_.hw, load_layout, reduced_layout); + auto reduce = 
reduce_plan_t(hw_, load_layout, reduced_layout); auto c_post_layout = std::move(reduced_layout); c_post_layout.remove(k_dim); - plan = slm_reduce_plan_t(desc_.hw); + plan = slm_reduce_plan_t(hw_); plan.store = std::move(store); plan.load = std::move(load); plan.reduce = std::move(reduce); @@ -770,7 +771,7 @@ class plan_builder_t { auto c_reg_tile_layout = c_reg_layout.map(tile); auto store_layout = store.reg_layout().map(tile); if (c_reg_tile_layout != store_layout) { - plan.reorder = reorder_plan_t(desc_.hw); + plan.reorder = reorder_plan_t(hw_); plan.reorder.src = std::move(c_reg_tile_layout); plan.reorder.dst = std::move(store_layout); } @@ -798,11 +799,11 @@ class plan_builder_t { } bool check_plan(const plan_t &plan) const { - int grf_bound = desc_.hw.grf_size() * desc_.regs; + int grf_bound = hw_.grf_size() * desc_.regs; int grf_bytes = plan.grf_usage_bytes(); ir_check(grf_bytes <= grf_bound) << "check_plan: out of registers"; int slm_bound = compute::device_info_t::max_slm_size_per_tg( - convert_ngen_arch_to_dnnl(desc_.hw.to_ngen()), + convert_ngen_arch_to_dnnl(hw_.to_ngen()), into(desc_.thread_group_tile.elems()), desc_.regs > 128); int slm_bytes = plan.slm_usage_bytes(); ir_check(slm_bytes <= slm_bound) << "check_plan: out of SLM"; @@ -818,7 +819,7 @@ class plan_builder_t { if (type.is_f32()) op = send_op_t::atomic_fadd; } send_params_t params; - params.hw = desc_.hw; + params.hw = hw_; params.kind = (send_kind != send_kind_t::undef ? 
send_kind : desc_.access_kind(op, abc)); @@ -837,6 +838,7 @@ class plan_builder_t { } kernel_desc_t desc_; + hw_t hw_; dim_mapper_manager_t dim_mapper_manager_; multiply_info_t mul_info_; @@ -852,8 +854,8 @@ class plan_builder_t { }; template -plan_t create_conv_plan_impl(KernelDescT &desc, bool finalize) { - if (!desc.is_supported()) return plan_t(); +plan_t create_conv_plan_impl(KernelDescT &desc, const hw_t &hw, bool finalize) { + if (!desc.is_supported(hw)) return plan_t(); ir_assert(!desc.has_spec_strategy()) << "Kernel descriptor strategies are required to be specialized " "before plan creation"; @@ -861,7 +863,7 @@ plan_t create_conv_plan_impl(KernelDescT &desc, bool finalize) { ir_assert(desc.is_finalized) << "Kernel descriptor must be finalized before plan creation"; } - plan_builder_t builder(desc); + plan_builder_t builder(desc, hw); auto plan = builder.build(); if (plan) { if (finalize) { @@ -873,18 +875,17 @@ plan_t create_conv_plan_impl(KernelDescT &desc, bool finalize) { return plan; } -plan_t create_conv_plan(const kernel_desc_t &desc) { - return create_conv_plan_impl(desc, /*finalize=*/false); +plan_t create_conv_plan(const kernel_desc_t &desc, const hw_t &hw) { + return create_conv_plan_impl(desc, hw, /*finalize=*/false); } bool finalize_conv_desc_impl(kernel_desc_t &desc, const hw_t &hw, const problem_t *prb, plan_t *out_plan) { if (desc.is_empty()) return false; if (desc.hw_desc.hw != hw.to_ngen()) return false; - desc.hw = hw; - if (!desc.is_supported()) return false; + if (!desc.is_supported(hw)) return false; if (desc.is_finalized) return true; - auto plan = create_conv_plan_impl(desc, /*finalize=*/true); + auto plan = create_conv_plan_impl(desc, hw, /*finalize=*/true); if (plan) { if (out_plan) *out_plan = plan; if (prb && !desc.matches(*prb)) return false; diff --git a/src/gpu/intel/jit/v2/conv/plan.hpp b/src/gpu/intel/jit/v2/conv/plan.hpp index 5da2fd467dd..a737c424af5 100644 --- a/src/gpu/intel/jit/v2/conv/plan.hpp +++ 
b/src/gpu/intel/jit/v2/conv/plan.hpp @@ -452,7 +452,7 @@ struct plan_t : public base_plan_t { IR_DEFINE_DUMP() }; -plan_t create_conv_plan(const kernel_desc_t &desc); +plan_t create_conv_plan(const kernel_desc_t &desc, const hw_t &hw); bool finalize_conv_desc( kernel_desc_t &desc, const problem_t &prb, plan_t *plan = nullptr); bool finalize_conv_desc( diff --git a/src/gpu/intel/jit/v2/conv/planner/bench.cpp b/src/gpu/intel/jit/v2/conv/planner/bench.cpp index f28bc8c909d..fc9b34f7ea8 100644 --- a/src/gpu/intel/jit/v2/conv/planner/bench.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/bench.cpp @@ -623,14 +623,15 @@ bench_data_t bench(const bench_manager_t &bench_mger, const kernel_desc_t &_kernel_desc, int nprbs) { auto kernel_desc = _kernel_desc; if (!finalize_conv_desc(kernel_desc, bench_mger.hw())) return {}; - bench_runner_t runner(bench_mger, bench_input_params_t(kernel_desc, nprbs)); + bench_runner_t runner(bench_mger, + bench_input_params_t(kernel_desc, bench_mger.hw(), nprbs)); return runner.bench(kernel_desc); } bool try_create( const bench_manager_t &bench_mger, const kernel_desc_t &kernel_desc) { clear_primitive_cache(); - bench_input_params_t params(kernel_desc, /*nprbs=*/1); + bench_input_params_t params(kernel_desc, bench_mger.hw(), /*nprbs=*/1); bench_task_t task(generate_problems(params)[0]); auto engine = bench_mger.get_engine(); auto guard = plan_preset_t::instance().make_guard(kernel_desc); diff --git a/src/gpu/intel/jit/v2/conv/planner/bench.hpp b/src/gpu/intel/jit/v2/conv/planner/bench.hpp index 81aecc497b6..815cbdb24fc 100644 --- a/src/gpu/intel/jit/v2/conv/planner/bench.hpp +++ b/src/gpu/intel/jit/v2/conv/planner/bench.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,11 +67,10 @@ struct bench_input_params_t { int nprbs = 0; bench_input_params_t() = default; - bench_input_params_t( - const kernel_desc_t &kernel_desc, int nprbs = default_nprbs) - : nprbs(nprbs) { + bench_input_params_t(const kernel_desc_t &kernel_desc, const hw_t &hw, + int nprbs = default_nprbs) + : hw(hw), nprbs(nprbs) { ir_assert(kernel_desc.is_finalized); - hw = kernel_desc.hw; prop = kernel_desc.prop; src_tag = kernel_desc.src_tag; wei_tag = kernel_desc.wei_tag; diff --git a/src/gpu/intel/jit/v2/conv/planner/search.cpp b/src/gpu/intel/jit/v2/conv/planner/search.cpp index 3b029cee151..2c0b3e1fdd1 100644 --- a/src/gpu/intel/jit/v2/conv/planner/search.cpp +++ b/src/gpu/intel/jit/v2/conv/planner/search.cpp @@ -340,11 +340,11 @@ class search_kernel_desc_group_t { descs_.push_back(desc); } - bench_input_params_t bench_input_params(int nprbs) const { + bench_input_params_t bench_input_params(int nprbs, const hw_t &hw) const { if (descs_.empty()) return bench_input_params_t(); auto &kd = descs_.front(); bench_input_params_t params; - params.hw = kd.hw; + params.hw = hw; params.prop = kd.prop; params.src_tag = kd.src_tag; params.wei_tag = kd.wei_tag; @@ -575,7 +575,8 @@ bench_data_set_t bench_kernel_desc_group(const bench_manager_t &bench_mger, const search_kernel_desc_group_t &desc_group, int nprbs, int max_descs) { auto eng = bench_mger.get_engine(); - bench_runner_t runner(bench_mger, desc_group.bench_input_params(nprbs)); + bench_runner_t runner( + bench_mger, desc_group.bench_input_params(nprbs, bench_mger.hw())); bench_data_set_t bd_set; search_sequence_t seq(desc_group.descs(), max_descs); while (seq) { @@ -656,8 +657,6 @@ void auto_search( kernel_desc_t desc; parse_result_t parse_result; iface.parse(line, desc, &parse_result); - // TODO: Remove. 
- desc.hw = hw_t(bench_mger.get_engine().get()); kernel_search_manager_t mger( bench_mger, search_params_t(desc, parse_result)); mger.search(); diff --git a/src/gpu/intel/utils.hpp b/src/gpu/intel/utils.hpp index 5336d3ba39a..42e804d6b05 100644 --- a/src/gpu/intel/utils.hpp +++ b/src/gpu/intel/utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.