Skip to content

Commit

Permalink
FPGA: Fix II error and update namespace for device_ptr (#2525)
Browse files Browse the repository at this point in the history
There was a functional change that went into the compiler recently that means it will now correctly identify memory dependences. One of the results of this is that this design will now emit a message that it is unable to achieve a user specified II.

To regain this performance we can use annotated_ptr's in the SYCL HLS flow to specify a larger interface width which will allow for the compiler to coalesce stores to memory, thus resulting in being able to achieve the user specified II again.

This change also corrects the address space of a call to device_ptr
  • Loading branch information
justin-rosner authored Oct 21, 2024
1 parent 50ad8bb commit 50a579b
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#include "tuple.hpp"
#include "unrolled_loop.hpp"

using namespace sycl::ext::intel::experimental;
using namespace sycl::ext::oneapi::experimental;

constexpr int BL0 = 0;

// Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
// elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
Expand Down Expand Up @@ -65,7 +69,12 @@ template <typename TT, // Datatype of the elements of the matrix
typename MatrixPipe // Input matrix
>
void MatrixReadPipeToDDR(
#if defined (IS_BSP)
TT* matrix_ptr, // Output matrix pointer
#else
annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
dwidth<512>})> matrix_ptr,
#endif
int matrix_count, // Number of matrix to write to DDR
int repetitions // Number of time to read the same matrix to the pipe
) {
Expand Down Expand Up @@ -147,7 +156,7 @@ void VectorReadPipeToDDR(
// lives on the device.
// Knowing this, the compiler won't generate hardware to
// potentially get data from the host.
sycl::device_ptr<TT> vector_ptr_located(vector_ptr);
sycl::ext::intel::device_ptr<TT> vector_ptr_located(vector_ptr);
#else
// Device pointers are not supported when targeting an FPGA
// family/part
Expand All @@ -166,4 +175,4 @@ void VectorReadPipeToDDR(
} // end of repetition
}

#endif /* __MEMORY_TRANSFERS_HPP__ */
#endif /* __MEMORY_TRANSFERS_HPP__ */
39 changes: 34 additions & 5 deletions DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/svd/src/svd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include "memory_transfers.hpp"
#include "usv_from_eigens.hpp"

using namespace sycl::ext::intel::experimental;
using namespace sycl::ext::oneapi::experimental;

// Forward declare the kernel and pipe names
// (This prevents unwanted name mangling in the optimization report.)
Expand Down Expand Up @@ -118,6 +120,15 @@ double SingularValueDecomposition(
std::terminate();
}

#if not defined (IS_BSP)
constexpr int BL0 = 0;
using PtrAnn = annotated_ptr<T, decltype(properties{buffer_location<BL0>,
dwidth<512>})>;
PtrAnn u_matrix_device_ptr(u_matrix_device);
PtrAnn s_matrix_device_ptr(s_matrix_device);
PtrAnn v_matrix_device_ptr(v_matrix_device);
#endif

// Check that the malloc succeeded.
if (nullptr == input_matrix_device) {
std::cerr << "Error when allocating the input matrix." << std::endl;
Expand Down Expand Up @@ -151,7 +162,7 @@ double SingularValueDecomposition(
[=]() [[intel::kernel_args_restrict]] {
MatrixReadFromDDRTo2PipesByBlocks<
T, cols, rows, kNumElementsPerDDRBurst, InputMatrixPipe, InputMatrixPipe2>(
input_matrix_device, matrix_count, repetitions);
input_matrix_device, matrix_count, repetitions);
});
});

Expand Down Expand Up @@ -207,21 +218,39 @@ double SingularValueDecomposition(
sycl::event u_matrix_event = q.single_task<IDUMatrixFromLocalMemToDDR>(
[=]() [[intel::kernel_args_restrict]] {
MatrixReadPipeToDDR<T, rows, rows, kNumElementsPerDDRBurst,
UMatrixPipe>(u_matrix_device, matrix_count, repetitions);
UMatrixPipe>(
#if defined (IS_BSP)
u_matrix_device,
#else
u_matrix_device_ptr,
#endif
matrix_count, repetitions);
});

// collecting s matrix from pipe into DDR
sycl::event s_matrix_event = q.single_task<IDSMatrixFromLocalMemToDDR>(
[=]() [[intel::kernel_args_restrict]] {
MatrixReadPipeToDDR<T, rows, cols, kNumElementsPerDDRBurst,
SMatrixPipe>(s_matrix_device, matrix_count, repetitions);
SMatrixPipe>(
#if defined (IS_BSP)
s_matrix_device,
#else
s_matrix_device_ptr,
#endif
matrix_count, repetitions);
});

// collecting V matrix from pipe into DDR
sycl::event v_matrix_event = q.single_task<IDVMatrixFromLocalMemToDDR>(
[=]() [[intel::kernel_args_restrict]] {
MatrixReadPipeToDDR<T, cols, cols, kNumElementsPerDDRBurst,
VMatrixPipe>(v_matrix_device, matrix_count, repetitions);
VMatrixPipe>(
#if defined (IS_BSP)
v_matrix_device,
#else
v_matrix_device_ptr,
#endif
matrix_count, repetitions);
});

// Wait for output memory access kernels to finish
Expand Down Expand Up @@ -260,4 +289,4 @@ double SingularValueDecomposition(
return diff;
}

#endif // __SVD_HPP__
#endif // __SVD_HPP__

0 comments on commit 50a579b

Please sign in to comment.