kernel bitwise_or #158

Open · wants to merge 2 commits into master
19 changes: 19 additions & 0 deletions aten/src/ATen/native/hammerblade/Or.cpp
@@ -0,0 +1,19 @@
#include <cmath>
#include <ATen/Dispatch.h>
#include <ATen/hammerblade/HammerBladeContext.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/ReduceOps.h>
#include <ATen/native/hammerblade/Offload.h>

namespace at { namespace native {

Tensor or_kernel_hb(const Tensor& self, const Tensor& other) {
  TORCH_CHECK(self.numel() == other.numel(), "The size of two tensors should match.");
  // TORCH_CHECK(self.scalar_type() == ScalarType::Int || self.scalar_type() == ScalarType::Bool, "HammerBlade or is implemented for Int and Bool only");
  // TORCH_CHECK(other.scalar_type() == ScalarType::Int || other.scalar_type() == ScalarType::Bool, "HammerBlade or is implemented for Int and Bool only");
Contributor:
Same as and: I think self.scalar_type() == kInt will work. Maybe you need at::kInt ...

  Tensor result = at::empty_like(self, self.options());
  hb_offload_kernel(result, self, other, "tensorlib_or");
  return result;
}

}} // namespace at::native
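For reference, a minimal sketch of the dtype guard the contributor suggests for the commented-out checks above, assuming the same pattern as the and kernel; at::kInt and at::kBool are the ATen ScalarType shorthands, and the message strings are reused from the commented-out lines:

  // Hypothetical replacement for the commented-out checks (untested sketch):
  TORCH_CHECK(self.scalar_type() == at::kInt || self.scalar_type() == at::kBool,
              "HammerBlade or is implemented for Int and Bool only");
  TORCH_CHECK(other.scalar_type() == at::kInt || other.scalar_type() == at::kBool,
              "HammerBlade or is implemented for Int and Bool only");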
1 change: 1 addition & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -4009,6 +4009,7 @@
  dispatch:
    CPU: legacy::cpu::_th_or
    CUDA: legacy::cuda::_th_or
    HammerBlade: or_kernel_hb

- func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
  variants: method
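With the HammerBlade dispatch entry in place, the | operator on hammerblade tensors should route to or_kernel_hb. A rough usage sketch, assuming the entry modified above is the one backing the | operator, which is what the tests in this PR exercise:

import torch

a = torch.ones(4, 5, dtype=torch.int).hammerblade()
b = torch.ones(4, 5, dtype=torch.int).hammerblade()
c = a | b        # dispatched to or_kernel_hb through the yaml entry above
print(c.cpu())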
37 changes: 37 additions & 0 deletions hammerblade/torch/kernel/kernel_or.cpp
@@ -0,0 +1,37 @@
//========================================================================
// Element-wise or kernel
//========================================================================
//
// Authors : Janice Wei
// Date : 10/08/2020

#include <kernel_common.hpp>
#include <cstdint>

extern "C" {

__attribute__ ((noinline)) int tensorlib_or(
    hb_tensor_t* t0_p,
    hb_tensor_t* t1_p,
    hb_tensor_t* t2_p) {
  auto res = HBTensor<int>(t0_p);
Contributor:
Similar to and, we need to handle booleans and test for it

  auto input1 = HBTensor<int>(t1_p);
  auto input2 = HBTensor<int>(t2_p);

  bsg_cuda_print_stat_kernel_start();

  hb_tiled_foreach(
    [](int a, int b) {
      return a | b;
    },
    res, input1, input2);

  bsg_cuda_print_stat_kernel_end();

  g_barrier.sync();
  return 0;
}

HB_EMUL_REG_KERNEL(tensorlib_or, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*)

}
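Following up on the contributor's note about booleans: one possible direction is a second entry point inside the same extern "C" block that instantiates the element-wise body for bool. This is only a sketch; whether HBTensor<bool> is supported by the HammerBlade runtime, and the kernel name tensorlib_or_bool, are assumptions, and the host function in Or.cpp would also have to select the kernel name based on self.scalar_type():

// Hypothetical bool variant (assumes HBTensor<bool> exists; untested).
__attribute__ ((noinline)) int tensorlib_or_bool(
    hb_tensor_t* t0_p,
    hb_tensor_t* t1_p,
    hb_tensor_t* t2_p) {
  auto res    = HBTensor<bool>(t0_p);
  auto input1 = HBTensor<bool>(t1_p);
  auto input2 = HBTensor<bool>(t2_p);

  bsg_cuda_print_stat_kernel_start();

  hb_tiled_foreach(
    [](bool a, bool b) {
      return a || b;   // for bools, logical or matches bitwise or
    },
    res, input1, input2);

  bsg_cuda_print_stat_kernel_end();

  g_barrier.sync();
  return 0;
}

HB_EMUL_REG_KERNEL(tensorlib_or_bool, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*)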
57 changes: 57 additions & 0 deletions hammerblade/torch/tests/test_or.py
@@ -0,0 +1,57 @@
"""
Tests of the or kernel
Authors : Janice Wei
Date : 10/08/2020
"""

import torch
import random
from hypothesis import given, settings
from .hypothesis_test_util import HypothesisUtil as hu

torch.manual_seed(42)
random.seed(42)

# ------------------------------------------------------------------------
# test of x1 | x2
# ------------------------------------------------------------------------

def _test_or(x1, x2):
    h1 = x1.hammerblade()
    h2 = x2.hammerblade()
    assert h1 is not x1
    assert h2 is not x2
    y_c = x1 | x2
    y_h = h1 | h2
    assert y_h.device == torch.device("hammerblade")
    assert torch.allclose(y_c, y_h.cpu())

# ------------------------------------------------------------------------
# tests of or kernel with integer elements
# ------------------------------------------------------------------------

def test_or_1():
    x = torch.ones(1, 10, dtype=torch.int)
    _test_or(x, x)

def test_or_2():
    x1 = torch.ones(4, 5, dtype=torch.int)
    x2 = torch.ones(4, 5, dtype=torch.int)
    _test_or(x1, x2)

def test_or_3():
    x = torch.randint(-2 ** 30, 2 ** 30 - 1, (1, 128)).to(torch.int32)
    y = torch.randint(-2 ** 30, 2 ** 30 - 1, (1, 128)).to(torch.int32)
    _test_or(x, y)

def test_or_4():
    x = torch.randint(-2 ** 30, 2 ** 30 - 1, (16, 32)).to(torch.int32)
    y = torch.randint(-2 ** 30, 2 ** 30 - 1, (16, 32)).to(torch.int32)
    _test_or(x, y)

@settings(deadline=None)
@given(inputs=hu.tensors(n=2))
def test_or_hypothesis(inputs):
    x1 = torch.tensor(inputs[0]).to(torch.int32)
    x2 = torch.tensor(inputs[1]).to(torch.int32)
    _test_or(x1, x2)
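
If bool support is added as requested in the review, a test along these lines could cover it. This is a sketch only: it assumes the kernel accepts torch.bool tensors, and it uses torch.equal for an exact element-wise comparison:

# ------------------------------------------------------------------------
# hypothetical test of or kernel with boolean elements (assumes bool support)
# ------------------------------------------------------------------------

def test_or_bool():
    x1 = torch.tensor([[True, False], [False, True]])
    x2 = torch.tensor([[True, True], [False, False]])
    h1 = x1.hammerblade()
    h2 = x2.hammerblade()
    y_c = x1 | x2
    y_h = h1 | h2
    assert y_h.device == torch.device("hammerblade")
    assert torch.equal(y_c, y_h.cpu())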