
Commit

Clean up old pragma comments. Switched to exclusively ac_math activations.
David Burnette committed Jan 22, 2025
1 parent f6e4f91 commit 400d645
Showing 43 changed files with 6 additions and 1,858 deletions.
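Most of the deletions below are mechanical: earlier versions of these Catapult templates kept the Vivado-style directives (//#pragma HLS ...) as commented-out reminders next to the already-active Catapult directives, and this commit removes the dead comments. A minimal sketch of the idiom that remains, as seen in the hunks below (the wrapper function here is illustrative, not from the diff):

    // The Catapult pragma takes a named constant, so the reuse factor is
    // materialized as a constexpr; the (void) cast silences unused-variable
    // warnings since the constant is consumed only by the pragma.
    template <typename CONFIG_T, class data_T, class res_T>
    void pipelined_copy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
        constexpr int ce_reuse_factor = CONFIG_T::reuse_factor;
        (void)ce_reuse_factor;
    #pragma hls_pipeline_init_interval ce_reuse_factor
        for (int i = 0; i < CONFIG_T::n_in; i++) {
            res[i] = data[i];
        }
    }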
650 changes: 1 addition & 649 deletions hls4ml/templates/catapult/nnet_utils/nnet_activation.h
100644 → 100755

Large diffs are not rendered by default.

571 changes: 0 additions & 571 deletions hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h

Large diffs are not rendered by default.
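The two activation headers account for over 1,200 of the deleted lines: per the commit message, the Catapult backend now delegates exclusively to the ac_math library rather than carrying its own table-based activation implementations. A hedged sketch of the resulting style, assuming the piecewise-linear ac_math entry points (ac_math::ac_sigmoid_pwl and friends); the actual wrapper names and CONFIG_T fields in nnet_activation.h are not rendered on this page:

    #include <ac_math/ac_sigmoid_pwl.h>

    // Illustrative wrapper only: the signature is modeled on the other
    // nnet_utils functions in this diff, not copied from the header.
    template <class data_T, class res_T, typename CONFIG_T>
    void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
        for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
            ac_math::ac_sigmoid_pwl(data[i], res[i]); // PWL sigmoid from ac_math
        }
    }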

1 change: 0 additions & 1 deletion hls4ml/templates/catapult/nnet_utils/nnet_array.h
@@ -14,7 +14,6 @@ struct transpose_config {

template <class data_T, class res_T, typename CONFIG_T>
void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) {
//#pragma HLS PIPELINE

for (int i = 0; i < CONFIG_T::height; i++) {
for (int j = 0; j < CONFIG_T::width; j++) {
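The deleted comment was the only change here; transpose_2d itself is a plain row-major to column-major copy. A sketch of the full function, assuming the standard hls4ml index math (the diff truncates the loop body):

    template <class data_T, class res_T, typename CONFIG_T>
    void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width],
                      res_T data_t[CONFIG_T::height * CONFIG_T::width]) {
        for (int i = 0; i < CONFIG_T::height; i++) {
            for (int j = 0; j < CONFIG_T::width; j++) {
                // element (i, j) of the input becomes element (j, i) of the output
                data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j];
            }
        }
    }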
9 changes: 0 additions & 9 deletions hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h
@@ -34,19 +34,14 @@ void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
data_T cache;

// Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
//#pragma HLS function_instantiate variable=scale,bias

// For parallel inputs:
// - completely partition arrays -- target fabric
// - if we have an unroll factor, limit number of multipliers
//#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
constexpr int ce_reuse_factor = CONFIG_T::reuse_factor;
(void)ce_reuse_factor;
#pragma hls_pipeline_init_interval ce_reuse_factor

// #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
//#pragma HLS ARRAY_PARTITION variable=scale complete
//#pragma HLS ARRAY_PARTITION variable=bias complete

int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor));
CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::limit(multiplier_limit);
@@ -84,8 +79,6 @@ struct batchnorm_quantized_tanh_config {
template <class data_T, typename CONFIG_T>
void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in],
data_T threshold[CONFIG_T::n_in]) {
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_PARTITION variable=res complete

data_T datareg;
ac_int<1, false> cache;
@@ -104,8 +97,6 @@ void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CON
template <class data_T, typename CONFIG_T>
void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in],
data_T threshold_hi[CONFIG_T::n_in], data_T threshold_lo[CONFIG_T::n_in]) {
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_PARTITION variable=res complete

data_T datareg;
ac_int<2, true> cache;
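The two quantized-tanh helpers above compare each input against precomputed thresholds rather than evaluating tanh; the diff truncates their loop bodies. A sketch of the per-element decisions, with the comparison semantics assumed from the standard hls4ml implementations (names illustrative):

    #include <ac_int.h>

    // Binary: one unsigned bit, +1 encoded as 1 and -1 encoded as 0.
    template <class data_T>
    ac_int<1, false> binary_tanh_bit(data_T x, data_T threshold) {
        return (x > threshold) ? ac_int<1, false>(1) : ac_int<1, false>(0);
    }

    // Ternary: two signed bits holding -1, 0, or +1, via a low/high threshold pair.
    template <class data_T>
    ac_int<2, true> ternary_tanh_bits(data_T x, data_T threshold_hi, data_T threshold_lo) {
        if (x > threshold_hi)
            return ac_int<2, true>(1);
        if (x <= threshold_lo)
            return ac_int<2, true>(-1);
        return ac_int<2, true>(0);
    }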
12 changes: 0 additions & 12 deletions hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h
@@ -17,8 +17,6 @@ namespace nnet {
template <class data_T, class res_T, typename CONFIG_T>
void normalize(ac_channel<data_T> &data, ac_channel<res_T> &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
//#pragma HLS ARRAY_PARTITION variable=scale complete
//#pragma HLS ARRAY_PARTITION variable=bias complete

constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor);
constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit;
@@ -28,16 +26,13 @@ void normalize(ac_channel<data_T> &data, ac_channel<res_T> &res, typename CONFIG
#pragma hls_pipeline_init_interval ii
BatchNormLoop:
for (unsigned int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
//#pragma HLS PIPELINE II=ii

data_T in_data = data.read();
res_T out_data;
//#pragma HLS DATA_PACK variable=out_data

#pragma hls_unroll
BatchNormpack:
for (unsigned int j = 0; j < data_T::size; j++) {
// #pragma HLS UNROLL
int norm_index;
if (CONFIG_T::n_filt == -1) {
norm_index = i * data_T::size + j;
@@ -59,15 +54,12 @@ void normalize(ac_channel<data_T> &data, ac_channel<res_T> &res, typename CONFIG
template <class data_T, typename CONFIG_T>
void normalize_binary_tanh(ac_channel<data_T> &data, ac_channel<nnet::array<ac_int<1, false>, CONFIG_T::n_in>> &res,
typename data_T::value_type threshold[CONFIG_T::n_in]) {
//#pragma HLS ARRAY_PARTITION variable=threshold complete

BinaryNormLoop:
for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
//#pragma HLS PIPELINE

data_T in_data = data.read();
nnet::array<ac_int<1, false>, CONFIG_T::n_scale_bias> out_data;
//#pragma HLS DATA_PACK variable=out_data

BatchNormPack:
for (int j = 0; j < data_T::size; j++) {
@@ -83,16 +75,12 @@ template <class data_T, typename CONFIG_T>
void normalize_ternary_tanh(ac_channel<data_T> &data, ac_channel<nnet::array<ac_int<2, true>, CONFIG_T::n_in>> &res,
typename data_T::value_type threshold_hi[CONFIG_T::n_in],
typename data_T::value_type threshold_lo[CONFIG_T::n_in]) {
//#pragma HLS ARRAY_PARTITION variable=threshold_hi complete
//#pragma HLS ARRAY_PARTITION variable=threshold_lo complete

TernaryNormLoop:
for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
//#pragma HLS PIPELINE

data_T in_data = data.read();
nnet::array<ac_int<2, true>, CONFIG_T::n_scale_bias> out_data;
//#pragma HLS DATA_PACK variable=out_data

BatchNormPack:
for (int j = 0; j < data_T::size; j++) {
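The streaming normalize derives its initiation interval from the same multiplier budget: multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor), then ii = n_in / multiplier_limit. DIV_ROUNDUP is the usual ceiling-division macro from nnet_common.h; a worked example with illustrative numbers:

    // Ceiling division, as defined in nnet_common.h.
    #define DIV_ROUNDUP(n, d) (((n) + (d) - 1) / (d))

    // Example: n_in = 64, reuse_factor = 6
    //   multiplier_limit = DIV_ROUNDUP(64, 6) = 11 parallel multipliers
    //   ii               = 64 / 11            = 5 cycles between loop iterations
    static_assert(DIV_ROUNDUP(64, 6) == 11, "ceil(64/6) == 11");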
Empty file modified (mode changed 100644 → 100755): hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h
Empty file modified (mode changed 100644 → 100755): hls4ml/templates/catapult/nnet_utils/nnet_common.h
12 changes: 0 additions & 12 deletions hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h
@@ -47,19 +47,13 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width];
typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=mult complete dim=0
//#pragma HLS ARRAY_PARTITION variable=acc complete dim=0

// Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
//#pragma HLS function_instantiate variable=weights,biases

// Parallel mode
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_PARTITION variable=biases complete dim=0

// Limit multipliers to control parallelization
const int multiplier_limit = compute_multiplier_limit<CONFIG_T>(weights);
//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation

// Convolve, saving all multiplication results to accumulate later
ConvOut:
@@ -130,19 +124,13 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan];
typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=mult complete dim=0
//#pragma HLS ARRAY_PARTITION variable=acc complete dim=0

// Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
//#pragma HLS function_instantiate variable=weights,biases

// Parallel mode
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_PARTITION variable=biases complete dim=0

// Limit multipliers to control parallelization
const int multiplier_limit = compute_multiplier_limit<CONFIG_T>(weights);
//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation

// Convolve, saving all multiplication results to accumulate later
ConvOut:
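The ALLOCATION pragma that consumed multiplier_limit is deleted, but the value is still computed and available to Catapult's resource constraints. The helper's body is not in this diff; a hedged sketch of what a weight-aware limit computation looks like (zero weights cost no multiplier, and the work is spread over the reuse factor):

    #include <cmath>

    // Illustrative only: the real compute_multiplier_limit also accounts for
    // padding positions; this version just counts nonzero kernel weights.
    template <typename CONFIG_T>
    int compute_multiplier_limit_sketch(
        typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt]) {
        int n_mult = 0;
        for (int i = 0; i < CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt; i++) {
            if (weights[i] != 0) // multiplications by zero are optimized away
                n_mult++;
        }
        n_mult *= CONFIG_T::out_width; // each output position reuses the kernel
        return (int)std::ceil((float)n_mult / (float)CONFIG_T::reuse_factor);
    }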
26 changes: 0 additions & 26 deletions hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h
@@ -11,7 +11,6 @@ void im2col_1d(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width]) {
// int index = 0;
for (int channel = CONFIG_T::n_chan; channel--; data += CONFIG_T::in_width) {
//#pragma HLS PIPELINE II=1 rewind
for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
#pragma hls_unroll
int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation;
@@ -39,9 +38,6 @@ void conv_1d_full(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[
data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan];
res_T res_col[CONFIG_T::n_filt];

////#pragma HLS ARRAY_PARTITION variable=data_conv complete
//#pragma HLS ARRAY_PARTITION variable=data_col complete
//#pragma HLS ARRAY_PARTITION variable=res_col complete

im2col_1d<data_T, CONFIG_T>(data, data_conv);

@@ -64,7 +60,6 @@ void im2col_1d_cf_idx(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
ChannelLoop:
for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
//#pragma hls_unroll
//#pragma HLS PIPELINE II=1 rewind
KernelLoop:
for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
#pragma hls_unroll
@@ -112,21 +107,14 @@ void conv_1d_resource_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width],
const int rufactor = CONFIG_T::reuse_factor;
const int block_factor = DIV_ROUNDUP(nin * nout, rufactor);

////#pragma HLS function_instantiate variable=weights,biases
////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose
/// correctly
////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
////#pragma HLS ARRAY_PARTITION variable=biases complete

data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan];
res_T res_col[CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=data_col complete
//#pragma HLS ARRAY_PARTITION variable=res_col complete

ColLoop:
for (int i = 0; i < CONFIG_T::out_width; i++) {
//#pragma HLS PIPELINE
im2col_1d_cf<data_T, CONFIG_T>(data, data_col, i);
dense_resource<data_T, res_T, typename CONFIG_T::mult_config>(data_col, res_col, weights, biases);
for (int j = 0; j < CONFIG_T::n_filt; j++) {
@@ -187,21 +175,14 @@ void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
const int rufactor = CONFIG_T::reuse_factor;
const int block_factor = DIV_ROUNDUP(nin * nout, rufactor);

////#pragma HLS function_instantiate variable=weights,biases
////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose
/// correctly
////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
////#pragma HLS ARRAY_PARTITION variable=biases complete

data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan];
res_T res_col[CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=data_col complete
//#pragma HLS ARRAY_PARTITION variable=res_col complete

ColLoop:
for (int i = 0; i < CONFIG_T::out_width; i++) {
//#pragma HLS PIPELINE
im2col_1d_cl<data_T, CONFIG_T>(data, data_col, i);
dense_resource<data_T, res_T, typename CONFIG_T::mult_config>(data_col, res_col, weights, biases);
for (int j = 0; j < CONFIG_T::n_filt; j++) {
@@ -222,21 +203,14 @@ void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_
const int rufactor = CONFIG_T::reuse_factor;
const int block_factor = DIV_ROUNDUP(nin * nout, rufactor);

////#pragma HLS function_instantiate variable=weights,biases
////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose
/// correctly
////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
////#pragma HLS ARRAY_PARTITION variable=biases complete

data_T data_col[CONFIG_T::n_chan];
res_T res_col[CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=data_col complete
//#pragma HLS ARRAY_PARTITION variable=res_col complete

ColLoop:
for (int i = 0; i < CONFIG_T::out_width; i++) {
//#pragma HLS PIPELINE
im2col_1d_pointwise_cl<data_T, CONFIG_T>(data, data_col, i);
dense_resource<data_T, res_T, typename CONFIG_T::mult_config>(data_col, res_col, weights, biases);
for (int j = 0; j < CONFIG_T::n_filt; j++) {
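All of the resource-strategy kernels in this file share one shape: gather the receptive field for one output position (im2col), push it through the shared dense_resource matrix-vector kernel, then scatter the per-filter results. The scatter is truncated in every hunk above; a sketch of the full column loop inside conv_1d_resource_cl, with the output indexing assumed from the channels-last layout:

    ColLoop:
    for (int i = 0; i < CONFIG_T::out_width; i++) {
        im2col_1d_cl<data_T, CONFIG_T>(data, data_col, i); // gather window i
        dense_resource<data_T, res_T, typename CONFIG_T::mult_config>(data_col, res_col, weights, biases);
        for (int j = 0; j < CONFIG_T::n_filt; j++) {
            res[i * CONFIG_T::n_filt + j] = res_col[j];    // scatter filter outputs
        }
    }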
10 changes: 0 additions & 10 deletions hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h
@@ -14,7 +14,6 @@ void compute_scaled_indices_1d(const unsigned w_idx, ac_int<CONFIG_T::filt_width
#pragma hls_unroll
ComputeIndex:
for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) {
// #pragma HLS UNROLL
unsigned sw_idx =
CONFIG_T::template scale_index<CONFIG_T::filt_width, CONFIG_T::stride_width, CONFIG_T::in_width>::scale_index(
wp_idx + p);
@@ -31,27 +30,21 @@ void conv_1d_encoded_cl(ac_channel<data_T> &data, ac_channel<res_T> &res,
ac_channel<typename data_T::value_type> data_window[CONFIG_T::filt_width * CONFIG_T::n_chan];
// const int win_depth = CONFIG_T::out_width;
// for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) {
// #pragma HLS STREAM variable=data_window[i_out] depth=win_depth
// }

//#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete

res_T res_pack;
//#pragma HLS DATA_PACK variable=res_pack
unsigned outputs_ready = 0;

ac_int<CONFIG_T::filt_width, false> pixel_idx[data_T::size / CONFIG_T::n_chan];
//#pragma HLS ARRAY_PARTITION variable=pixel_idx complete

constexpr int ce_reuse_factor =
CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1);
(void)ce_reuse_factor;
#pragma hls_pipeline_init_interval ce_reuse_factor
ReadInputWidth:
for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) {
//#pragma HLS LOOP_FLATTEN
if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) {
//#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
}
compute_scaled_indices_1d<data_T, CONFIG_T>(i_iw, pixel_idx);
compute_output_encoded<data_T, res_T, CONFIG_T>(data.read(), data_window, res, res_pack, outputs_ready, weights,
@@ -70,9 +63,7 @@ void conv_1d_buffer_cl(ac_channel<data_T> &data, ac_channel<res_T> &res,
#pragma hls_pipeline_init_interval ce_reuse_factor
ReadInputWidth:
for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) {
//#pragma HLS LOOP_FLATTEN
if (CONFIG_T::strategy == nnet::latency) {
//#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
}
compute_output_buffer_1d<data_T, res_T, CONFIG_T>(data.read(), res, weights, biases);
}
Expand All @@ -83,7 +74,6 @@ template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_cl(ac_channel<data_T> &data, ac_channel<res_T> &res,
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
//#pragma HLS inline region
switch (CONFIG_T::implementation) {
case conv_implementation::linebuffer:
conv_1d_buffer_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
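The dispatch in conv_1d_cl is truncated after its first case. Assuming the encoded branch mirrors the linebuffer one (conv_1d_encoded_cl is defined earlier in this same file), the full switch is:

    switch (CONFIG_T::implementation) {
    case conv_implementation::linebuffer:
        conv_1d_buffer_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
        break;
    case conv_implementation::encoded:
        conv_1d_encoded_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
        break;
    }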
18 changes: 0 additions & 18 deletions hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h
@@ -57,19 +57,13 @@ void conv_2d_latency_cf(
CONFIG_T::filt_height * CONFIG_T::filt_width];
typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=mult complete dim=0
//#pragma HLS ARRAY_PARTITION variable=acc complete dim=0

// Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
//#pragma HLS function_instantiate variable=weights,biases

// Parallel mode
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_PARTITION variable=biases complete dim=0

// Limit multipliers to control parallelization
const int multiplier_limit = compute_multiplier_limit_conv2d<CONFIG_T>(weights);
//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation

// Convolve, saving all multiplication results to accumulate later
ConvOutHeight:
@@ -180,19 +174,13 @@ void conv_2d_latency_cl(
CONFIG_T::filt_height * CONFIG_T::filt_width];
typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=mult complete dim=0
//#pragma HLS ARRAY_PARTITION variable=acc complete dim=0

// Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
//#pragma HLS function_instantiate variable=weights,biases

// Parallel mode
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_PARTITION variable=biases complete dim=0

// Limit multipliers to control parallelization
const int multiplier_limit = compute_multiplier_limit_conv2d<CONFIG_T>(weights);
//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation

// Convolve, saving all multiplication results to accumulate later
ConvOutHeight:
@@ -300,19 +288,13 @@ void pointwise_conv_2d_latency_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in
typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan];
typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt];

//#pragma HLS ARRAY_PARTITION variable=mult complete dim=0
//#pragma HLS ARRAY_PARTITION variable=acc complete dim=0

// Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
//#pragma HLS function_instantiate variable=weights,biases

// Parallel mode
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_PARTITION variable=biases complete dim=0

// Limit multipliers to control parallelization
const int multiplier_limit = compute_multiplier_limit_conv2d<CONFIG_T>(weights);
//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation

// Convolve, saving all multiplication results to accumulate later
ConvOutHeight:
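All three latency kernels in this file follow the same multiply-then-accumulate pattern: stage every product in mult[], seed acc[] with the biases, reduce, and write res[]. The loop bodies are truncated above; a compact, self-contained sketch for the pointwise (1x1) case, with the channels-last data layout and channels-major weight layout assumed:

    template <class data_T, class res_T, typename CONFIG_T>
    void pointwise_conv_2d_sketch(
        data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
        res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
        for (int p = 0; p < CONFIG_T::out_height * CONFIG_T::out_width; p++) {
            for (int f = 0; f < CONFIG_T::n_filt; f++) {
                typename CONFIG_T::accum_t acc = biases[f]; // seed with bias
                for (int c = 0; c < CONFIG_T::n_chan; c++) {
                    acc += data[p * CONFIG_T::n_chan + c] * weights[c * CONFIG_T::n_filt + f];
                }
                res[p * CONFIG_T::n_filt + f] = acc; // one output pixel per filter
            }
        }
    }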
Diffs for the remaining files in this commit are not shown here.
