From 50a579b3367af653816f73f3b2e4a459baead771 Mon Sep 17 00:00:00 2001
From: Justin Rosner <justin.rosner@intel.com>
Date: Sun, 20 Oct 2024 23:02:59 -0700
Subject: [PATCH] FPGA: Fix II error and update namespace for device_ptr
 (#2525)

There was a functional change that went into the compiler recently that means it will now correctly identify memory dependences. One of the results of this is that this design will now emit a message that it is unable to achieve a user specified II.

To regain this performance we can use annotated_ptr's in the SYCL HLS flow to specify a larger interface width which will allow for the compiler to coalesce stores to memory, thus resulting in being able to achieve the user specified II again.

This change also corrects the address space of a call to device_ptr
---
 .../svd/src/memory_transfers.hpp              | 13 ++++++-
 .../ReferenceDesigns/svd/src/svd.hpp          | 39 ++++++++++++++++---
 2 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/memory_transfers.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/memory_transfers.hpp
index 38144b3ffd..6edab8b42c 100755
--- a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/memory_transfers.hpp
+++ b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/memory_transfers.hpp
@@ -5,6 +5,10 @@
 #include "tuple.hpp"
 #include "unrolled_loop.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
+constexpr int BL0 = 0;
 
 // Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
 // elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -65,7 +69,12 @@ template <typename TT,            // Datatype of the elements of the matrix
           typename MatrixPipe     // Input matrix
           >
 void MatrixReadPipeToDDR(
+#if defined (IS_BSP)
     TT* matrix_ptr,    // Output matrix pointer
+#else
+    annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
+	    				  dwidth<512>})> matrix_ptr,
+#endif
     int matrix_count,  // Number of matrix to write to DDR
     int repetitions    // Number of time to read the same matrix to the pipe
 ) {
@@ -147,7 +156,7 @@ void VectorReadPipeToDDR(
   // lives on the device.
   // Knowing this, the compiler won't generate hardware to
   // potentially get data from the host.
-  sycl::device_ptr<TT> vector_ptr_located(vector_ptr);
+  sycl::ext::intel::device_ptr<TT> vector_ptr_located(vector_ptr);
 #else
   // Device pointers are not supported when targeting an FPGA
   // family/part
@@ -166,4 +175,4 @@ void VectorReadPipeToDDR(
   }      // end of repetition
 }
 
-#endif /* __MEMORY_TRANSFERS_HPP__ */
\ No newline at end of file
+#endif /* __MEMORY_TRANSFERS_HPP__ */
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/svd.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/svd.hpp
index 4ceffd1868..8a1c4ce911 100755
--- a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/svd.hpp
+++ b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/svd.hpp
@@ -17,6 +17,8 @@
 #include "memory_transfers.hpp"
 #include "usv_from_eigens.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
 
 // Forward declare the kernel and pipe names
 // (This prevents unwanted name mangling in the optimization report.)
@@ -118,6 +120,15 @@ double SingularValueDecomposition(
     std::terminate();
   }
 
+#if not defined (IS_BSP)
+  constexpr int BL0 = 0;
+  using PtrAnn = annotated_ptr<T, decltype(properties{buffer_location<BL0>,
+		  				      dwidth<512>})>;
+  PtrAnn u_matrix_device_ptr(u_matrix_device);
+  PtrAnn s_matrix_device_ptr(s_matrix_device);
+  PtrAnn v_matrix_device_ptr(v_matrix_device);
+#endif
+
   // Check that the malloc succeeded.
   if (nullptr == input_matrix_device) {
     std::cerr << "Error when allocating the input matrix." << std::endl;
@@ -151,7 +162,7 @@ double SingularValueDecomposition(
         [=]() [[intel::kernel_args_restrict]] {
           MatrixReadFromDDRTo2PipesByBlocks<
               T, cols, rows, kNumElementsPerDDRBurst, InputMatrixPipe, InputMatrixPipe2>(
-              input_matrix_device, matrix_count, repetitions);
+	      input_matrix_device, matrix_count, repetitions);
         });
   });
 
@@ -207,21 +218,39 @@ double SingularValueDecomposition(
   sycl::event u_matrix_event = q.single_task<IDUMatrixFromLocalMemToDDR>(
       [=]() [[intel::kernel_args_restrict]] {
         MatrixReadPipeToDDR<T, rows, rows, kNumElementsPerDDRBurst,
-                            UMatrixPipe>(u_matrix_device, matrix_count, repetitions);
+                            UMatrixPipe>(
+#if defined (IS_BSP)
+		u_matrix_device,
+#else
+		u_matrix_device_ptr,
+#endif	
+		matrix_count, repetitions);
       });
 
   // collecting s matrix from pipe into DDR
   sycl::event s_matrix_event = q.single_task<IDSMatrixFromLocalMemToDDR>(
       [=]() [[intel::kernel_args_restrict]] {
         MatrixReadPipeToDDR<T, rows, cols, kNumElementsPerDDRBurst,
-                            SMatrixPipe>(s_matrix_device, matrix_count, repetitions);
+                            SMatrixPipe>(
+#if defined (IS_BSP)
+		s_matrix_device,
+#else
+		s_matrix_device_ptr,
+#endif
+	       	matrix_count, repetitions);
       });
 
   // collecting V matrix from pipe into DDR
   sycl::event v_matrix_event = q.single_task<IDVMatrixFromLocalMemToDDR>(
       [=]() [[intel::kernel_args_restrict]] {
         MatrixReadPipeToDDR<T, cols, cols, kNumElementsPerDDRBurst,
-                            VMatrixPipe>(v_matrix_device, matrix_count, repetitions);
+                            VMatrixPipe>(
+#if defined (IS_BSP)
+		v_matrix_device,
+#else
+		v_matrix_device_ptr,
+#endif
+	       	matrix_count, repetitions);
       });
 
   // Wait for output memory access kernels to finish
@@ -260,4 +289,4 @@ double SingularValueDecomposition(
   return diff;
 }
 
-#endif  // __SVD_HPP__
\ No newline at end of file
+#endif  // __SVD_HPP__