Skip to content

Commit

Permalink
Added OpenMP nabla implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
fthaler committed Jun 25, 2024
1 parent b80fd3b commit 71b7c7d
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 9 deletions.
72 changes: 63 additions & 9 deletions stencil_benchmarks/benchmarks_collection/unstructured/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def v2e_table(self):
return data

def remove_nproma(self, data):
return np.concatenate([data[:, i, :] for i in range(data.shape[1])], axis=0)
return np.concatenate([data[:, i, ...] for i in range(data.shape[1])], axis=0)

def e2v_table(self):
nx, ny, _ = self.domain
Expand Down Expand Up @@ -248,17 +248,19 @@ def run(self):
result["bandwidth"] = self.data_size / result["time"] / 1e9
return result

@property
def data_size(self):
return sum(
np.prod(shape) * np.dtype(dtype).itemsize for _, shape, dtype in self.args
)


class UnstructuredCopy(Unstructured):
@property
def args(self):
shape = self.nvertices, self.nlevels
return ("inp", shape, self.dtype), ("out", shape, self.dtype)

@property
def data_size(self):
return 2 * self.nvertices * self.nlevels * self.dtype_size

def verify_stencil(self, data_before, data_after):
validation.check_equality("inp", data_before.inp, data_after.inp)
inp = self.remove_nproma(data_before.inp)[: self.nvertices, :]
Expand All @@ -275,10 +277,6 @@ def args(self):
self.dtype,
)

@property
def data_size(self):
return (self.nvertices + self.nedges) * self.nlevels * self.dtype_size

def verify_stencil(self, data_before, data_after):
validation.check_equality("inp", data_before.inp, data_after.inp)
out = self.remove_nproma(data_after.out)[: self.nvertices, :]
Expand All @@ -295,3 +293,59 @@ def verify_stencil(self, data_before, data_after):
axis=1,
),
)


class Nabla(Unstructured):
@property
def args(self):
vshape = self.nvertices, self.nlevels
eshape = self.nedges, self.nlevels
return (
("pp", vshape, self.dtype),
("vol", (self.nvertices,), self.dtype),
("out0", vshape, self.dtype),
("out1", vshape, self.dtype),
("zavg0", eshape, self.dtype),
("zavg1", eshape, self.dtype),
("s0", eshape, self.dtype),
("s1", eshape, self.dtype),
("sign", (self.nedges,), self.dtype),
)

def verify_stencil(self, data_before, data_after):
validation.check_equality("pp", data_before.pp, data_after.pp)
validation.check_equality("vol", data_before.vol, data_after.vol)
validation.check_equality("s0", data_before.s0, data_after.s0)
validation.check_equality("s1", data_before.s1, data_after.s1)
validation.check_equality("sign", data_before.sign, data_after.sign)
pp = self.remove_nproma(data_before.pp)[: self.nvertices, :]
s0 = self.remove_nproma(data_before.s0)[: self.nedges, :]
s1 = self.remove_nproma(data_before.s1)[: self.nedges, :]
sign = self.remove_nproma(data_before.sign)[: self.nedges]
vol = self.remove_nproma(data_before.vol)[: self.nvertices]

pp_sum = np.sum(pp[self._e2v_table, :], axis=1)
zavg0 = pp_sum / 2 * s0
zavg1 = pp_sum / 2 * s1

zavg0_sum = np.sum(
np.where(
(self._v2e_table != -1)[:, :, np.newaxis],
zavg0[self._v2e_table, :] * sign[self._v2e_table][:, :, np.newaxis],
0,
),
axis=1,
)
zavg1_sum = np.sum(
np.where(
(self._v2e_table != -1)[:, :, np.newaxis],
zavg1[self._v2e_table, :] * sign[self._v2e_table][:, :, np.newaxis],
0,
),
axis=1,
)

out0 = self.remove_nproma(data_after.out0)[: self.nvertices, :]
out1 = self.remove_nproma(data_after.out1)[: self.nvertices, :]
validation.check_equality("out0", out0, zavg0_sum / vol[:, np.newaxis])
validation.check_equality("out1", out1, zavg1_sum / vol[:, np.newaxis])
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,8 @@ def template_file(self):
class EdgeSum(UnstructuredMixin, base.EdgeSum):
def template_file(self):
return "edge_sum.j2"


class Nabla(UnstructuredMixin, base.Nabla):
def template_file(self):
return "nabla.j2"
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{#
Stencil Benchmarks
Copyright (c) 2017-2021, ETH Zurich and MeteoSwiss
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
SPDX-License-Identifier: BSD-3-Clause
#}
{% extends "base.j2" %}

{% block kernel_invoke %}
#pragma omp parallel for collapse(2)
for (index_t eo = 0; eo < {{ (nedges + nproma - 1) // nproma }}; ++eo)
{
for (index_t k = 0; k < {{ nlevels }}; ++k)
{
#pragma omp simd {% if vector_size > 1 %}simdlen({{ vector_size }}){%- endif %}
for (index_t ei = 0; ei < {{ nproma }}; ++ei) {
index_t const edge = ei + {{ nproma }} * eo;
if (edge < {{ nedges }})
{
index_t index = ei * {{ strides.zavg0[0] }} + eo * {{ strides.zavg0[1] }} + k * {{ strides.zavg0[2] }};
index_t const edge_0 = e2v[edge * {{ strides.e2v[0] }} + 0 * {{ strides.e2v[1] }}];
index_t const edge_1 = e2v[edge * {{ strides.e2v[0] }} + 1 * {{ strides.e2v[1] }}];

index_t const pp_index_0 = edge_0 % {{ nproma }} * {{ strides.pp[0] }} + edge_0 / {{ nproma }} * {{ strides.pp[1] }} + k * {{ strides.pp[2] }};
index_t const pp_index_1 = edge_1 % {{ nproma }} * {{ strides.pp[0] }} + edge_1 / {{ nproma }} * {{ strides.pp[1] }} + k * {{ strides.pp[2] }};

{{ ctype }} sum = pp[pp_index_0] + pp[pp_index_1];

zavg0[index] = sum / 2 * s0[index];
zavg1[index] = sum / 2 * s1[index];
}
}
}
}

#pragma omp parallel for collapse(2)
for (index_t vo = 0; vo < {{ (nvertices + nproma - 1) // nproma }}; ++vo)
{
for (index_t k = 0; k < {{ nlevels }}; ++k)
{

#pragma omp simd {% if vector_size > 1 %}simdlen({{ vector_size }}){%- endif %}
{%- if streaming_stores %}
#ifdef __INTEL_COMPILER
#pragma vector nontemporal
#endif
{%- endif %}
for (index_t vi = 0; vi < {{ nproma }}; ++vi) {
index_t const vertex = vi + {{ nproma }} * vo;
if (vertex < {{ nvertices }})
{
{{ ctype }} sum0 = 0, sum1 = 0;
{%- for nb in range(v2e_max_neighbors) %}
{
index_t neighbor_edge = v2e[vertex * {{ strides.v2e[0] }} + {{ nb }} * {{ strides.v2e[1] }}];
{%- if skip_values %}
if (neighbor_edge != -1)
{%- endif %}
{
index_t zavg_index = neighbor_edge % {{ nproma }} * {{ strides.zavg0[0] }} + neighbor_edge / {{ nproma }} * {{ strides.zavg0[1] }} + k * {{ strides.zavg0[2] }};
index_t sign_index = neighbor_edge % {{ nproma }} * {{ strides.sign[0] }} + neighbor_edge / {{ nproma }} * {{ strides.sign[1] }};
sum0 += zavg0[zavg_index] * sign[sign_index];
sum1 += zavg1[zavg_index] * sign[sign_index];
}
}
{%- endfor %}
index_t out_index = vi * {{ strides.out0[0] }} + vo * {{ strides.out0[1] }} + k * {{ strides.out0[2] }};
index_t vol_index = vi * {{ strides.vol[0] }} + vo * {{ strides.vol[1] }};
out0[out_index] = sum0 / vol[vol_index];
out1[out_index] = sum1 / vol[vol_index];
}
}
}
}
{% endblock kernel_invoke %}

0 comments on commit 71b7c7d

Please sign in to comment.