Jlw422 cat #170

Open: wants to merge 6 commits into master
18 changes: 14 additions & 4 deletions aten/src/ATen/native/hammerblade/Cat.cpp
@@ -6,7 +6,7 @@ namespace at { namespace native {

Tensor _cat_hb(TensorList tensors, int64_t dim) {
TORCH_CHECK(tensors.size() > 0, "_cat_hb: cannot concatenate empty tensor list");
TORCH_CHECK(dim == 0, "this simple cat only takes dim=0");
TORCH_CHECK(dim == 0 || dim == 1, "this simple cat only takes dim=0 or dim=1");
TORCH_CHECK(tensors[0].dim() <= 3, "this simple cat only takes up to 3-dimension tensors");
// convert TensorList length to uint32
uint32_t length_u32 = safe_downcast<uint32_t, size_t>(tensors.size());
@@ -25,18 +25,28 @@ Tensor _cat_hb(TensorList tensors, int64_t dim) {
uint32_t space = 0;
for (size_t i = 0; i < length_u32; i++) {
TORCH_CHECK(tensors[i].dim() == ndim, "tensors have different dimensions");
space += tensors[i].size(0);
space += tensors[i].size(dim);
}

Tensor result;
if (ndim == 1) {
result = at::empty({space}, tensors[0].options());
}
else if (ndim == 2) {
result = at::empty({space, tensors[0].size(1)}, tensors[0].options());
    if (dim == 1) {
      result = at::empty({tensors[0].size(0), space}, tensors[0].options());
    }
    else {
      result = at::empty({space, tensors[0].size(1)}, tensors[0].options());
    }
}
else if (ndim == 3) {
result = at::empty({space, tensors[0].size(1), tensors[0].size(2)}, tensors[0].options());
    if (dim == 1) {
      result = at::empty({tensors[0].size(0), space, tensors[0].size(2)}, tensors[0].options());
    }
    else {
      result = at::empty({space, tensors[0].size(1), tensors[0].size(2)}, tensors[0].options());
    }
}

tensor_args.push_back(result);
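For reference, the allocation above encodes the cat shape rule: the size along the concat dimension is summed across inputs (`space`), and every other dimension is copied from `tensors[0]`. A minimal sketch of the expected shapes in plain PyTorch (hypothetical sizes, CPU tensors):

```python
import torch

x = torch.randn(4, 2)
y = torch.randn(4, 3)

# dim=0: sizes along dim 0 add up; remaining dims come from the first input
print(torch.cat([x, x], 0).shape)  # torch.Size([8, 2])

# dim=1: sizes along dim 1 add up; dim 0 must match across inputs
print(torch.cat([x, y], 1).shape)  # torch.Size([4, 5])
```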
17 changes: 17 additions & 0 deletions aten/src/ATen/native/hammerblade/TanhBackward.cpp
@@ -0,0 +1,17 @@
#include <ATen/Dispatch.h>
#include <ATen/hammerblade/HammerBladeContext.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/UnaryOps.h>
#include <ATen/native/hammerblade/Offload.h>

namespace at { namespace native {

Tensor tanh_backward_hb(const Tensor & grad_output, const Tensor & output) {
  // AT_DISPATCH_FLOAT_TYPE_ONLY(grad_output.dtype(), output.dtype(), "tanh_backward_hb", [&]() {
  hb_offload_kernel(grad_output, output, "tensorlib_tanh_backward");
  // });
  // the kernel writes its result into the first tensor it receives, so
  // grad_output holds the gradient after the offload; return it to the caller
  return grad_output;
}

//REGISTER_HAMMERBLADE_DISPATCH(tanh_backward_stub, &tanh_backward_kernel_hb);
//
}} // namespace at::native
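For y = tanh(x), dy/dx = 1 - y^2. ATen hands tanh_backward the saved forward output (already tanh(x)) together with grad_output, so the full gradient is grad_output * (1 - output^2). A minimal CPU sketch of that identity, independent of the HammerBlade path:

```python
import torch

x = torch.randn(16)
output = torch.tanh(x)        # what ATen passes in as `output`
grad_output = torch.randn(16)

# full tanh backward: grad_output * (1 - tanh(x)^2)
grad_input = grad_output * (1 - output.pow(2))
```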
1 change: 1 addition & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -6366,6 +6366,7 @@
dispatch:
CPU: legacy::cpu::_thnn_tanh_backward
CUDA: legacy::cuda::_thnn_tanh_backward
HammerBlade: tanh_backward_hb

# What's a thnn_conv_ versus a slow_conv_?
#
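The one-line HammerBlade dispatch entry added above is what routes autograd's tanh backward to tanh_backward_hb. A sketch of exercising it end to end, assuming a HammerBlade-enabled build:

```python
import torch

x = torch.randn(8, requires_grad=True)
y = torch.tanh(x.hammerblade())
# backward through tanh on a HammerBlade tensor dispatches to tanh_backward_hb
y.backward(torch.ones_like(y))
print(x.grad)  # should equal 1 - torch.tanh(x) ** 2
```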
42 changes: 33 additions & 9 deletions hammerblade/torch/kernel/kernel_cat.cpp
@@ -3,7 +3,7 @@
//====================================================================
// simple _cat kernel that works along dim 0 and dim 1
//
// Authors : Lin Cheng, Janice Wei
// Authors : Lin Cheng, Jack Weber
// Date : 07/29/2020, 08/04/2020

#define BUF_SIZE 16
@@ -14,7 +14,7 @@ extern "C" {
//====================================================================
// tensorlib__cat
//====================================================================
// This is a simple _cat kernel only works with 0 dim
// This is a simple _cat kernel that only works along dim 0 and dim 1

__attribute__ ((noinline))
int tensorlib__cat( hb_tensor_t** tensors_p, hb_tensor_t* result_p,
@@ -23,30 +23,54 @@ int tensorlib__cat( hb_tensor_t** tensors_p, hb_tensor_t* result_p,
HBTensor<float> result(result_p);
uint32_t length = *length_p;
hb_assert(length <= BUF_SIZE);
int32_t dim = *dim_p;
uint32_t dim = *dim_p;
int32_t arr[BUF_SIZE];
int32_t dim0[BUF_SIZE];
int32_t dim1[BUF_SIZE];

// collect tensors' size
for(size_t i = 0; i < length; i++) {
HBTensor<float> tensor(tensors_p[i]);
arr[i] = tensor.numel();
dim0[i] = tensor.dim(0);
int32_t n = tensor.ndim();
    if (n > 1) {
      dim1[i] = tensor.dim(1);
    }
}
bsg_cuda_print_stat_kernel_start();
bsg_saif_start();


bsg_cuda_print_stat_kernel_start();
hb_tiled_for(result.numel(), [&] (int32_t i) {
int32_t j = 0;
int32_t index = 0;
int32_t size = arr[0];
int32_t size = arr[0];
int32_t q = 0;

    if (dim == 1) {
      size = dim0[0]*dim1[0];
    }

while (i >= size) {
index = i - size;
j++;
size += arr[j];

      if (dim == 1) {
        q = j % length;
        size += dim0[q]*dim1[q];
      }
      else {
        q = j;
        size += arr[j];
      }
}
if (j == 0) {
index = i;
}
HBTensor<float> t(tensors_p[j]);
    if (j >= length && dim == 1) {
      index = index + dim0[q]*dim1[q];
    }

HBTensor<float> t(tensors_p[q]);
result(i) = t(index);
});

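For orientation, the mapping a dim-1 concat has to realize for row-major 2-D inputs: flat output index i lands in row r = i // C_total, and the column offset within that row selects the source tensor. A reference sketch in Python (hypothetical helper, not a transcription of the loop above):

```python
def cat1_source(i, shapes):
    """Map flat output index i to (tensor_idx, flat_local_idx) for a
    dim-1 concat of row-major 2-D tensors shaped [(R, C0), (R, C1), ...]."""
    c_total = sum(c for _, c in shapes)
    r, col = divmod(i, c_total)
    for q, (_, c) in enumerate(shapes):
        if col < c:
            return q, r * c + col
        col -= c

# cat of (4, 2) and (4, 3) along dim 1 -> output shape (4, 5)
assert cat1_source(0, [(4, 2), (4, 3)]) == (0, 0)  # row 0, col 0 -> tensor 0
assert cat1_source(2, [(4, 2), (4, 3)]) == (1, 0)  # row 0, col 2 -> tensor 1
assert cat1_source(7, [(4, 2), (4, 3)]) == (1, 3)  # row 1, col 2 -> tensor 1
```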
42 changes: 42 additions & 0 deletions hammerblade/torch/kernel/kernel_tanhbackwards.cpp
@@ -0,0 +1,42 @@
//====================================================================
// tanh backward kernel
//====================================================================
// Computes the derivative of tanh from the saved forward output:
// the incoming tensor is already tanh(x), so the kernel returns
// 1 - tanh(x)^2. Used in RNN.
//
// Authors : Jack Weber
// Date : 05/07/2020
//
#include <kernel_common.hpp>
#include <cmath>

extern "C" {

//====================================================================
// tensorlib_tanh_backward
//====================================================================
// This is the tanh backward kernel for tensors with float elements.

__attribute__ ((noinline))
int tensorlib_tanh_backward(hb_tensor_t* t0_p, hb_tensor_t* t1_p)
{
  auto res = HBTensor<float>(t0_p);    // grad_output's buffer; the result lands here
  auto output = HBTensor<float>(t1_p); // saved forward output, i.e. tanh(x)

  bsg_cuda_print_stat_kernel_start();
  hb_tiled_foreach(
    [](float a) {
      // d/dx tanh(x) = 1 - tanh(x)^2; `a` is already tanh(x), so square
      // it directly instead of applying tanh a second time.
      // Note: grad_output is not multiplied in by this kernel.
      return 1.0f - a * a;
    },
    res, output);

bsg_cuda_print_stat_kernel_end();
return 0;
}
HB_EMUL_REG_KERNEL(tensorlib_tanh_backward, hb_tensor_t*, hb_tensor_t*)

} /* extern C */
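A quick CPU cross-check of the arithmetic the kernel targets, using PyTorch's autograd as the oracle:

```python
import torch

x = torch.randn(64, requires_grad=True)
y = torch.tanh(x)
grad_out = torch.randn(64)

(expected,) = torch.autograd.grad(y, x, grad_out)
manual = grad_out * (1 - y.detach().pow(2))  # grad_output * (1 - tanh(x)^2)
assert torch.allclose(expected, manual)
```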

53 changes: 45 additions & 8 deletions hammerblade/torch/tests/test_cat.py
@@ -15,60 +15,97 @@ def _test_torch_cat(x, y, z):
assert y_h.device == torch.device("hammerblade")
assert torch.allclose(y, y_h.cpu())

def _test_torch_cat1d(x, y, z):
    x_h = x.hammerblade()
    y_h = y.hammerblade()
    z_h = z.hammerblade()
    a = torch.cat([x, y, z], 1)
    a_h = torch.cat([x_h, y_h, z_h], 1)
    assert a_h.device == torch.device("hammerblade")
    assert torch.allclose(a, a_h.cpu())

def test_cat_1():
x = torch.ones(10)
_test_torch_cat(x, x, x)


def test_cat_1_dif_sizes():
x = torch.randn(3)
y = torch.randn(2)
z = torch.tensor([])
_test_torch_cat(x, y, z)
# _test_torch_cat1d(x, y, z)

def test_cat_2():
x = torch.randn(3, 4)
_test_torch_cat(x, x, x)
_test_torch_cat1d(x, x, x)

def test_cat_2_dif_sizes():
x = torch.randn(3, 4)
y = torch.randn(2, 4)
z = torch.randn(4, 4)
_test_torch_cat(x, y, z)

def test_cat1d_2_dif_sizes():
x = torch.randn(4, 2)
y = torch.randn(4, 3)
z = torch.randn(4, 4)
_test_torch_cat1d(x, y, z)

def test_cat_3():
x = torch.randn(3, 4, 5)
_test_torch_cat(x, x, x)
_test_torch_cat1d(x, x, x)

def test_cat_3_dif_sizes():
x = torch.randn(3, 4, 5)
y = torch.randn(2, 4, 5)
z = torch.randn(4, 4, 5)
_test_torch_cat(x, y, z)

def test_cat1d_3_dif_sizes():
x = torch.randn(4, 4, 5)
y = torch.randn(4, 3, 5)
z = torch.randn(4, 2, 5)
_test_torch_cat1d(x, y, z)


@settings(deadline=None)
@given(inputs=hu.tensors(n=3, min_dim=1, max_dim=3))
def test_cat_hypothesis(inputs):
@given(inputs=hu.tensors(n=3, min_dim=2, max_dim=3))
def test_cat1d_hypothesis(inputs):
x1 = torch.tensor(inputs[0])
x2 = torch.tensor(inputs[1])
x3 = torch.tensor(inputs[2])
_test_torch_cat(x1, x2, x3)
_test_torch_cat1d(x1, x2, x3)

def test_cat_error_1():
x = torch.randn(3, 4, 5, 2).hammerblade()
with pytest.raises(RuntimeError):
torch.cat([x, x, x], 0)

def test_cat_error_2():
x = torch.randn(3, 4).hammerblade()
with pytest.raises(RuntimeError):
torch.cat([x, x, x], 1)

def test_cat_error_3():
with pytest.raises(RuntimeError):
torch.cat([], 0)
with pytest.raises(RuntimeError):
torch.cat([], 1)

def test_cat_error_4():
x = torch.ones(2).hammerblade()
y = torch.randn(3, 4).hammerblade()
with pytest.raises(RuntimeError):
torch.cat([x, y], 0)

def test_cat_error_5():
    x = torch.ones(2, 3).hammerblade()
    y = torch.randn(3, 4, 5).hammerblade()
    with pytest.raises(RuntimeError):
        torch.cat([x, y], 1)

@settings(deadline=None)
@given(inputs=hu.tensors(n=3, min_dim=1, max_dim=3))
def test_cat_hypothesis(inputs):
x1 = torch.tensor(inputs[0])
x2 = torch.tensor(inputs[1])
x3 = torch.tensor(inputs[2])
_test_torch_cat(x1, x2, x3)
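For completeness, the invariant the error tests above rely on: inputs must agree in rank and in every dimension other than the concat dimension. A plain-CPU illustration:

```python
import torch

x = torch.ones(2, 3)
y = torch.randn(3, 4, 5)
try:
    torch.cat([x, y], 1)  # mismatched ranks -> RuntimeError
except RuntimeError as err:
    print("raised as expected:", err)
```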