FPGA: Fix II error and update namespace for device_ptr (#2525)

There was a functional change that went into the compiler recently that means it will now correctly identify memory dependences. One of the results of this is that this design will now emit a message that it is unable to achieve a user specified II. To regain this performance we can use annotated_ptr's in the SYCL HLS flow to specify a larger interface width which will allow for the compiler to coalesce stores to memory, thus resulting in being able to achieve the user specified II again. This change also corrects the address space of a call to device_ptr
oneapi-src · Oct 21, 2024 · 50a579b · 50a579b
1 parent 50ad8bb
commit 50a579b
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 7 deletions.
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/memory_transfers.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/memory_transfers.hpp
@@ -5,6 +5,10 @@
 #include "tuple.hpp"
 #include "unrolled_loop.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
+constexpr int BL0 = 0;
 
 // Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
 // elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -65,7 +69,12 @@ template <typename TT,            // Datatype of the elements of the matrix
           typename MatrixPipe     // Input matrix
           >
 void MatrixReadPipeToDDR(
+#if defined (IS_BSP)
     TT* matrix_ptr,    // Output matrix pointer
+#else
+    annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
+	    				  dwidth<512>})> matrix_ptr,
+#endif
     int matrix_count,  // Number of matrix to write to DDR
     int repetitions    // Number of time to read the same matrix to the pipe
 ) {
@@ -147,7 +156,7 @@ void VectorReadPipeToDDR(
   // lives on the device.
   // Knowing this, the compiler won't generate hardware to
   // potentially get data from the host.
-  sycl::device_ptr<TT> vector_ptr_located(vector_ptr);
+  sycl::ext::intel::device_ptr<TT> vector_ptr_located(vector_ptr);
 #else
   // Device pointers are not supported when targeting an FPGA
   // family/part
@@ -166,4 +175,4 @@ void VectorReadPipeToDDR(
   }      // end of repetition
 }
 
-#endif /* __MEMORY_TRANSFERS_HPP__ */
+#endif /* __MEMORY_TRANSFERS_HPP__ */
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/svd.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/svd.hpp
@@ -17,6 +17,8 @@
 #include "memory_transfers.hpp"
 #include "usv_from_eigens.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
 
 // Forward declare the kernel and pipe names
 // (This prevents unwanted name mangling in the optimization report.)
@@ -118,6 +120,15 @@ double SingularValueDecomposition(
     std::terminate();
   }
 
+#if not defined (IS_BSP)
+  constexpr int BL0 = 0;
+  using PtrAnn = annotated_ptr<T, decltype(properties{buffer_location<BL0>,
+		  				      dwidth<512>})>;
+  PtrAnn u_matrix_device_ptr(u_matrix_device);
+  PtrAnn s_matrix_device_ptr(s_matrix_device);
+  PtrAnn v_matrix_device_ptr(v_matrix_device);
+#endif
+
   // Check that the malloc succeeded.
   if (nullptr == input_matrix_device) {
     std::cerr << "Error when allocating the input matrix." << std::endl;
@@ -151,7 +162,7 @@ double SingularValueDecomposition(
         [=]() [[intel::kernel_args_restrict]] {
           MatrixReadFromDDRTo2PipesByBlocks<
               T, cols, rows, kNumElementsPerDDRBurst, InputMatrixPipe, InputMatrixPipe2>(
-              input_matrix_device, matrix_count, repetitions);
+	      input_matrix_device, matrix_count, repetitions);
         });
   });
 
@@ -207,21 +218,39 @@ double SingularValueDecomposition(
   sycl::event u_matrix_event = q.single_task<IDUMatrixFromLocalMemToDDR>(
       [=]() [[intel::kernel_args_restrict]] {
         MatrixReadPipeToDDR<T, rows, rows, kNumElementsPerDDRBurst,
-                            UMatrixPipe>(u_matrix_device, matrix_count, repetitions);
+                            UMatrixPipe>(
+#if defined (IS_BSP)
+		u_matrix_device,
+#else
+		u_matrix_device_ptr,
+#endif	
+		matrix_count, repetitions);
       });
 
   // collecting s matrix from pipe into DDR
   sycl::event s_matrix_event = q.single_task<IDSMatrixFromLocalMemToDDR>(
       [=]() [[intel::kernel_args_restrict]] {
         MatrixReadPipeToDDR<T, rows, cols, kNumElementsPerDDRBurst,
-                            SMatrixPipe>(s_matrix_device, matrix_count, repetitions);
+                            SMatrixPipe>(
+#if defined (IS_BSP)
+		s_matrix_device,
+#else
+		s_matrix_device_ptr,
+#endif
+	       	matrix_count, repetitions);
       });
 
   // collecting V matrix from pipe into DDR
   sycl::event v_matrix_event = q.single_task<IDVMatrixFromLocalMemToDDR>(
       [=]() [[intel::kernel_args_restrict]] {
         MatrixReadPipeToDDR<T, cols, cols, kNumElementsPerDDRBurst,
-                            VMatrixPipe>(v_matrix_device, matrix_count, repetitions);
+                            VMatrixPipe>(
+#if defined (IS_BSP)
+		v_matrix_device,
+#else
+		v_matrix_device_ptr,
+#endif
+	       	matrix_count, repetitions);
       });
 
   // Wait for output memory access kernels to finish
@@ -260,4 +289,4 @@ double SingularValueDecomposition(
   return diff;
 }
 
-#endif  // __SVD_HPP__
+#endif  // __SVD_HPP__