From ce287f050529a788aebae942c3cd148ca0ad41ea Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 20 Dec 2023 23:17:44 -0600 Subject: [PATCH 001/100] snapshot adding oneapi --- hls4ml/backends/__init__.py | 2 + hls4ml/backends/oneapi/__init__.py | 0 hls4ml/backends/oneapi/oneapi_backend.py | 338 ++++++++++ hls4ml/backends/oneapi/passes/__init__.py | 0 .../oneapi/passes/convolution_templates.py | 183 ++++++ .../oneapi/passes/convolution_winograd.py | 177 ++++++ .../backends/oneapi/passes/core_templates.py | 221 +++++++ .../backends/oneapi/passes/merge_templates.py | 108 ++++ hls4ml/backends/oneapi/passes/pointwise.py | 95 +++ .../oneapi/passes/pooling_templates.py | 111 ++++ .../oneapi/passes/quantization_templates.py | 36 ++ .../oneapi/passes/recurrent_templates.py | 305 +++++++++ .../oneapi/passes/reshaping_templates.py | 138 +++++ .../oneapi/passes/resource_strategy.py | 77 +++ .../backends/oneapi/passes/transform_types.py | 54 ++ hls4ml/templates/oneapi/CMakeLists.txt | 320 ++++++++++ hls4ml/templates/oneapi/exception_handler.hpp | 22 + hls4ml/templates/oneapi/firmware/defines.h | 21 + .../templates/oneapi/firmware/myproject.cpp | 20 + hls4ml/templates/oneapi/firmware/myproject.h | 36 ++ .../firmware/nnet_utils/nnet_activation.h | 516 ++++++++++++++++ .../firmware/nnet_utils/nnet_batchnorm.h | 104 ++++ .../oneapi/firmware/nnet_utils/nnet_common.h | 78 +++ .../oneapi/firmware/nnet_utils/nnet_conv1d.h | 64 ++ .../nnet_utils/nnet_conv1d_resource.h | 241 ++++++++ .../oneapi/firmware/nnet_utils/nnet_conv2d.h | 72 +++ .../nnet_utils/nnet_conv2d_resource.h | 303 +++++++++ .../oneapi/firmware/nnet_utils/nnet_dense.h | 170 +++++ .../nnet_utils/nnet_dense_compressed.h | 81 +++ .../oneapi/firmware/nnet_utils/nnet_embed.h | 45 ++ .../oneapi/firmware/nnet_utils/nnet_helpers.h | 119 ++++ .../oneapi/firmware/nnet_utils/nnet_merge.h | 249 ++++++++ .../oneapi/firmware/nnet_utils/nnet_mult.h | 113 ++++ .../oneapi/firmware/nnet_utils/nnet_padding.h | 99 +++ .../oneapi/firmware/nnet_utils/nnet_pooling.h | 319 ++++++++++ .../firmware/nnet_utils/nnet_recurrent.h | 583 ++++++++++++++++++ .../nnet_utils/nnet_recurrent_activation.h | 53 ++ .../oneapi/firmware/nnet_utils/nnet_resize.h | 38 ++ .../firmware/nnet_utils/nnet_transpose.h | 50 ++ .../oneapi/firmware/nnet_utils/nnet_types.h | 44 ++ hls4ml/templates/oneapi/firmware/parameters.h | 11 + hls4ml/templates/oneapi/myproject_test.cpp | 167 +++++ 42 files changed, 5783 insertions(+) create mode 100644 hls4ml/backends/oneapi/__init__.py create mode 100644 hls4ml/backends/oneapi/oneapi_backend.py create mode 100644 hls4ml/backends/oneapi/passes/__init__.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_winograd.py create mode 100644 hls4ml/backends/oneapi/passes/core_templates.py create mode 100644 hls4ml/backends/oneapi/passes/merge_templates.py create mode 100644 hls4ml/backends/oneapi/passes/pointwise.py create mode 100644 hls4ml/backends/oneapi/passes/pooling_templates.py create mode 100644 hls4ml/backends/oneapi/passes/quantization_templates.py create mode 100644 hls4ml/backends/oneapi/passes/recurrent_templates.py create mode 100644 hls4ml/backends/oneapi/passes/reshaping_templates.py create mode 100644 hls4ml/backends/oneapi/passes/resource_strategy.py create mode 100644 hls4ml/backends/oneapi/passes/transform_types.py create mode 100644 hls4ml/templates/oneapi/CMakeLists.txt create mode 100644 hls4ml/templates/oneapi/exception_handler.hpp create mode 100644 
hls4ml/templates/oneapi/firmware/defines.h create mode 100644 hls4ml/templates/oneapi/firmware/myproject.cpp create mode 100644 hls4ml/templates/oneapi/firmware/myproject.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h create mode 100644 hls4ml/templates/oneapi/firmware/parameters.h create mode 100644 hls4ml/templates/oneapi/myproject_test.cpp diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 6396d7815..cbd39813f 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -3,6 +3,7 @@ from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend +from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 @@ -13,3 +14,4 @@ register_backend('Vitis', VitisBackend) register_backend('Quartus', QuartusBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) +register_backend('oneAPI', OneAPIBackend) diff --git a/hls4ml/backends/oneapi/__init__.py b/hls4ml/backends/oneapi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py new file mode 100644 index 000000000..799c28963 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -0,0 +1,338 @@ +import os +from contextlib import contextmanager + +import numpy as np + +from hls4ml.backends import FPGABackend +from hls4ml.model.attributes import ConfigurableAttribute, TypeAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax +from hls4ml.model.optimizer import get_backend_passes, 
layer_optimizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType +#from hls4ml.report import parse_oneapi_report + + +@contextmanager +def chdir(newdir): + prevdir = os.getcwd() + os.chdir(os.path.expanduser(newdir)) + try: + yield + finally: + os.chdir(prevdir) + + +class OneAPIBackend(FPGABackend): + def __init__(self): + super().__init__('oneAPI') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + # Add RNN-specific recurrent_reuse_factor attribute + rnn_layers = [ + SimpleRNN, + LSTM, + GRU, + ] + + for layer in rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('table_size', default=1024)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + def _register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + streaming_passes = ['oneapi:reshape_stream', 'oneapi:clone_output'] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) + + oneapi_types = [ + 'oneapi:transform_types', + 'oneapi:register_bram_weights', + 'oneapi:apply_resource_strategy', + 'oneapi:apply_winograd_kernel_transformation', + ] + oneapi_types_flow = register_flow('specific_types', oneapi_types, requires=[init_flow], backend=self.name) + + quantization_passes = [ + 'oneapi:merge_batch_norm_quantized_tanh', + 'oneapi:quantize_dense_output', + 'fuse_consecutive_batch_normalization', + 'oneapi:xnor_pooling', + ] + quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) + + optimization_passes = [ + 'oneapi:remove_final_reshape', + 'oneapi:optimize_pointwise_conv', + 'oneapi:inplace_parallel_reshape', + 'oneapi:inplace_stream_flatten', + 'oneapi:skip_softmax', + 'oneapi:fix_softmax_table_size', + ] + optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) + + templates = self._get_layer_templates() + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'oneapi:write_hls'] + + self._writer_flow = register_flow('write', writer_passes, requires=['oneapi:ip'], backend=self.name) + + all_passes = get_backend_passes(self.name) + + extras = [ + # Ideally this should be empty + opt_pass + for opt_pass in all_passes + if opt_pass + not in initializers + + streaming_passes + + oneapi_types + + quantization_passes + + templates + + optimization_passes + + writer_passes + ] + + if len(extras) > 0: + extras_flow = register_flow('extras', extras, requires=[init_flow], backend=self.name) + else: + extras_flow = None + + ip_flow_requirements = [ + 'optimize', + init_flow, + streaming_flow, + quantization_flow, + optimization_flow, + oneapi_types_flow, + extras_flow, + template_flow, + ] + ip_flow_requirements = list(filter(None, ip_flow_requirements)) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel'): + config = {} + + config['Part'] = part if part is not 
None else 'Arria10' + config['ClockPeriod'] = clock_period + config['IOType'] = io_type + config['HLSConfig'] = {} + + return config + + def build(self, model, synth=True, fpgasynth=False, log_level=1, cont_if_large_area=False): + """ + Builds the project using Intel DPC++ (oneAPI) compiler. + + Args: + model (ModelGraph): The model to build + synth, optional: Whether to run HLS synthesis + fpgasynth, optional: Whether to run FPGA synthesis (oneAPI Compile) + log_level, optional: Logging level to be displayed during HLS synthesis (0, 1, 2) + cont_if_large_area: Instruct the HLS compiler to continue synthesis if the estimated resource usage exceeds + device resources + Errors raise exceptions + """ + + # Check software needed is present + pass + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + target_cycles = layer.model.config.get_target_cycles(layer) + layer.set_attr('target_cycles', target_cycles) + + @layer_optimizer(Dense) + def init_dense(self, layer): + index_t = IntegerPrecisionType(width=1, signed=False) + + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + if layer.model.config.get_compression(layer): + layer.set_attr('strategy', 'compressed') + else: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + + if layer.model.config.is_resource_strategy(layer): + if layer.model.config.get_compression(layer): + index_t = layer.get_weights('weight').type.index_precision + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + + @layer_optimizer(Activation) + def init_activation(self, layer): + if layer.get_attr('activation') == 'tanh': + layer.set_attr('activation', 'dense_tanh') + if layer.get_attr('recurrent_activation') == 'tanh': + layer.set_attr('recurrent_activation', 'dense_tanh') + + @layer_optimizer(Softmax) + def init_softmax(self, layer): + if layer.model.config.get_config_value('IOType') == 'io_parallel': + assert ( + len(layer.get_input_variable().shape) == 1 + ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + + @layer_optimizer(Embedding) + def init_embed(self, layer): + if layer.attributes['n_in'] is None: + raise Exception('Input length of Embedding layer must be specified.') + + @layer_optimizer(GRU) + def init_gru(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # Dense multiplication properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + index_t = IntegerPrecisionType(width=1, signed=False) + layer.set_attr('index_t', index_t) + + if 'table_t' not in layer.attributes: + layer.set_attr( + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=18, integer=8)) + ) + if 'table_size' not in layer.attributes: + layer.set_attr('table_size', 1024) + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + layer.set_attr('index_t', index_t) + + @layer_optimizer(Conv1D) + def init_conv1d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv1D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width determines the filter size post-Winograd transformation + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'combination')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(Conv2D) + def init_conv2d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv2D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width & impl_filt_height determine the filter size post-Winograd transformation + layer.set_attr('impl_filt_height', layer.get_attr('filt_height')) + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'combination')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(LSTM) + def init_lstm(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # We don't use RF yet + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + # Split weights for easier storage in on-chip memory and implementation in HLS + weights_data = layer.weights['weight'].data + rec_weights_data = layer.weights['recurrent_weight'].data + bias_data = layer.weights['bias'].data + + weight_types = ['i', 'f', 'c', 'o'] + for i in range(0, 4): + layer.add_weights_variable( + name=f'weight_{weight_types[i]}', + var_name=f'kernel_{weight_types[i]}_{{index}}', + data=weights_data[ + 0 : layer.get_attr('n_in'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'recurrent_weight_{weight_types[i]}', + var_name=f'recurrent_kernel_{weight_types[i]}_{{index}}', + data=rec_weights_data[ + 0 : layer.get_attr('n_out'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'bias_{weight_types[i]}', + var_name=f'bias_{weight_types[i]}_{{index}}', + data=bias_data[i * layer.get_attr('n_out') : (i + 1) * (layer.get_attr('n_out'))], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + + @layer_optimizer(SimpleRNN) + def init_simple_rnn(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # TODO - Consider setting and using RF diff --git a/hls4ml/backends/oneapi/passes/__init__.py b/hls4ml/backends/oneapi/passes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py new file mode 100644 index 000000000..75f8ca687 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -0,0 +1,183 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm + +# TODO - Dilation rate ? 
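Editor's aside (not part of the diff): the `init_lstm` optimizer shown above splits the combined LSTM kernel, which stores the four gates stacked along the output axis in i, f, c, o order, into one (n_in, n_out) block per gate before registering the per-gate weight variables. A minimal numpy sketch of that slicing, using hypothetical sizes, is:

import numpy as np

n_in, n_out = 8, 16                       # hypothetical layer sizes
kernel = np.random.rand(n_in, 4 * n_out)  # combined kernel, gates stacked along the last axis

# Same slicing as init_lstm above: one (n_in, n_out) block per gate, in i, f, c, o order
gate_weights = {
    gate: kernel[:, i * n_out:(i + 1) * n_out]
    for i, gate in enumerate(['i', 'f', 'c', 'o'])
}
assert all(w.shape == (n_in, n_out) for w in gate_weights.values())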
+ +''' Shared multiplication config ''' +conv_mult_config_template = """struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + +''' 1D Conv ''' +conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned filt_width = {filt_width}; + static const unsigned impl_filt_width = {impl_filt_width}; + static const unsigned kernel_size = filt_width; + + static const unsigned n_filt = {n_filt}; + static const unsigned out_width = {out_width}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned parallelisation_factor = {parallelization}; + static const bool store_weights_in_bram = false; + + static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation}; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; +}}; +""" + +conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] + + +class Conv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Conv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + conv_params = self._default_config_params(node) + conv_params['dilation'] = node.get_attr('dilation', 1) + if conv_params['dilation'] != 1: + raise Exception('dilation != 1 not supported yet') + conv_params['config_t'] = f'config{node.index}_mult' + conv_config = self.template.format(**conv_params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Conv1D, include_header=conv1d_include_list) + self.template = conv1d_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not
supported on Quartus') + params['data_format'] = 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +''' 2D Conv ''' +conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned impl_filt_height = {impl_filt_height}; + static const unsigned impl_filt_width = {impl_filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned parallelisation_factor = {parallelization}; + static const bool store_weights_in_bram = false; + + static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation}; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; +}};\n""" + +conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h'] + + +class Conv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm)) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + conv_params = self._default_config_params(node) + conv_params['dilation'] = node.get_attr('dilation', 1) + if conv_params['dilation'] != 1: + raise Exception('dilation != 1 not supported yet') + conv_params['config_t'] = f'config{node.index}_mult' + conv_config = self.template.format(**conv_params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm), include_header=conv2d_include_list) + self.template = conv2d_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for Quartus') + params['data_format'] = 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/convolution_winograd.py b/hls4ml/backends/oneapi/passes/convolution_winograd.py new file mode 100644 index 000000000..9a6686412 
--- /dev/null +++ b/hls4ml/backends/oneapi/passes/convolution_winograd.py @@ -0,0 +1,177 @@ +import math + +import numpy as np + +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyWinogradKernelTransformation(OptimizerPass): + ''' + Transforms the weights of a Conv1D/Conv2D kernel to a format suitable for Winograd convolution + For further information, refer to Lavin & Gray, 2015 - Fast Algorithms for Convolutional Neural Networks + ''' + + def match(self, node): + node_matches = isinstance(node, (Conv1D, Conv2D)) + + # This optimizer works only after the Resource Strategy Optimizer, since order of transposition matters + weights_transformed = node.get_attr('_weights_transposed', False) is True + + # User opted for Winograd + implementation_is_winograd = ( + node.get_attr('implementation', 'combination') == 'combination' + or node.get_attr('implementation', 'combination') == 'winograd' + ) + + parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel' + + # Winograd algorithm-specific conditions + if isinstance(node, Conv1D): + # Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_width') > 2 + + winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type + + elif isinstance(node, (Conv2D)): + # Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_height', 1) == 1 and node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_height') > 2 and node.get_attr('out_width') > 2 + + padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr( + 'pad_left', 0 + ) == node.get_attr('pad_right', 0) + + winograd_conditions = ( + filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type + ) + + else: + winograd_conditions = False + + # Check any previous transformations + already_transformed = node.get_attr('_winograd_transformation_applied', False) is True + + if not winograd_conditions and node.get_attr('implementation', 'combination') == 'winograd': + raise RuntimeError( + 'Not possible to use Winograd algorithm with current architecture.
' + 'Please set implementation to im2col or combination' + ) + + return ( + node_matches + and weights_transformed + and winograd_conditions + and not already_transformed + and implementation_is_winograd + ) + + def transform(self, model, node): + if isinstance(node, Conv1D): + if node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, W, C) + # Therefore, (F, W, C) => (F, C, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 2, 1]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3) => (4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4)) + + # Transformation matrices for 3x1 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(G, weights[filter][channel]) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accomodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.fractional += 2 + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_width', 4) + + elif isinstance(node, Conv2D): + if node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, H, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, H, W, C) + # Therefore, (F, H, W, C) => (F, C, H, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 3, 1, 2]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3x3) => (4x4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4, 4)) + + # Transformation matrices for 3x3 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + GT = np.array([[1, 0.5, 0.5, 0], [0, 0.5, -0.5, 0], [0, 0.5, 0.5, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(np.matmul(G, weights[filter][channel]), GT) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm 
transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accomodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.fractional += 2 + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_height', 4) + node.set_attr('impl_filt_width', 4) + else: + raise Exception(f'Unexpected layer {node.class_name} with Winograd kernel optimizer') + + node.set_attr('_winograd_transformation_applied', True) + + return False diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py new file mode 100644 index 000000000..aece9fc22 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -0,0 +1,221 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax + +# Dense templates + +dense_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const bool store_weights_in_bram = false; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned compressed_block_factor = DIV_ROUNDUP(n_nonzeros, reuse_factor); + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {index_t.name} index_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + +dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + + +class DenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_config_template + + def format(self, node): + params = self._default_config_params(node) + 
params['nzeros'] = node.get_weights('weight').nzeros + params['nonzeros'] = node.get_weights('weight').nonzeros + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + + return self.template.format(**params) + + +class DenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Dense, include_header=dense_include_list) + self.template = dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# BatchNormalization templates + +batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' + +batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift};\n""" + +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {exp_table_t.name} exp_table_t; + typedef {inv_table_t.name} inv_table_t; 
+}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' +param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Activation, ParametrizedActivation, PReLU)) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class SoftmaxConfigTemplate(ActivationConfigTemplate): + def __init__(self): + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + self.template = softmax_config_template + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['param'] = node.get_attr('activ_param', 1.0) + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class PReLUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PReLU, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['param'] = node.get_weights('alpha').name + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/merge_templates.py b/hls4ml/backends/oneapi/passes/merge_templates.py new file mode 100644 index 000000000..0cf612166 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/merge_templates.py @@ -0,0 +1,108 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Dot, Merge + +# TODO - Very similar to vivado/merge_templates.py - only difference is on line 67: +# TODO - get_backend('vivado').product_type(inp1.type.precision, inp2.type.precision) +# TODO - Look into ways of having passes similar accross many backends in a shared folder thorugh inheritance and overriding. 
+ +# Merge templates +merge_config_template = """struct config{index} : nnet::merge_config {{ + static const unsigned n_elem = {n_elem}; +}};\n""" + +merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' +merge_include_list = ['nnet_utils/nnet_merge.h', 'nnet_utils/nnet_merge_stream.h'] + + +class MergeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Merge) + self.template = merge_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_elem'] = node.get_input_variable(node.inputs[0]).size_cpp() + + return self.template.format(**params) + + +class MergeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot), include_header=merge_include_list) + self.template = merge_function_template + + def format(self, node): + params = {} + params['merge'] = node.get_attr('op').lower() + params['config'] = f'config{node.index}' + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name + params['output_t'] = node.get_output_variable().type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + params['output'] = node.get_output_variable().name + + return self.template.format(**params) + + +# Dot templates +dot_config_template = """struct config{index} : nnet::dot_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned reuse_factor = {reuse}; + + typedef {accum_t.name} accum_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + + +class DotConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dot) + self.template = dot_config_template + + def format(self, node): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + params = self._default_config_params(node) + params['n_out'] = 1 + params['n_in'] = inp1.shape[0] + params['product_type'] = get_backend('quartus').product_type(inp1.type.precision, inp2.type.precision) + + return self.template.format(**params) + + +# Concatenate templates +concat_config_template = """struct config{index} : nnet::concat_config {{ + static const unsigned n_elem1_0 = {n_elem1_0}; + static const unsigned n_elem1_1 = {n_elem1_1}; + static const unsigned n_elem1_2 = {n_elem1_2}; + static const unsigned n_elem2_0 = {n_elem2_0}; + static const unsigned n_elem2_1 = {n_elem2_1}; + static const unsigned n_elem2_2 = {n_elem2_2}; + + static const int axis = {axis}; +}};\n""" + + +class ConcatenateConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Concatenate) + self.template = concat_config_template + + def format(self, node): + params = self._default_config_params(node) + for i in range(3): + params.setdefault(f'n_elem1_{i}', 0) + params.setdefault(f'n_elem2_{i}', 0) + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + for i, (s1, s2) in enumerate(zip(inp1.shape, inp2.shape)): + params[f'n_elem1_{i}'] = s1 + params[f'n_elem2_{i}'] = s2 + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py new file mode 100644 index 000000000..84ae79e49 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/pointwise.py @@ -0,0 +1,95 @@ +from copy 
import copy + +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D +from hls4ml.backends.quartus.passes.convolution_templates import ( + Conv1DConfigTemplate, + Conv1DFunctionTemplate, + Conv2DConfigTemplate, + Conv2DFunctionTemplate, + conv1d_config_template, + conv2d_config_template, + conv_mult_config_template, +) +from hls4ml.model.layers import register_layer +from hls4ml.model.optimizer import OptimizerPass + +''' +Custom hls4ml layer implementation for 1x1 Conv filters using im2col +Allows lower latency and resource usage, due to fewer loop invocations +''' + +pointwise_conv1d_function_template = ( + 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +pointwise_conv2d_function_template = ( + 'nnet::pointwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h'] + + +class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate): + def __init__(self): + super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(PointwiseConv1D, include_header=sepconv1d_include_list) + self.template = pointwise_conv1d_function_template + + +class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): + def __init__(self): + super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(PointwiseConv2D, include_header=sepconv2d_include_list) + self.template = pointwise_conv2d_function_template + + +def register_pointwise(backend): + # Register the layer types to the layer map + register_layer('PointwiseConv1D', PointwiseConv1D) + register_layer('PointwiseConv2D', PointwiseConv2D) + + # Register the optimization passes + backend.register_pass('optimize_pointwise_conv', OptimizePointwiseConv) + + # Register template passes + backend.register_template(PointwiseConv1DConfigTemplate) + backend.register_template(PointwiseConv1DFunctionTemplate) + backend.register_template(PointwiseConv2DConfigTemplate) + backend.register_template(PointwiseConv2DFunctionTemplate) + + +class OptimizePointwiseConv(OptimizerPass): + def match(self, node): + return ( + node.class_name in ('Conv1D', 'Conv2D') + and node.get_attr('filt_height', 1) == 1 + and node.get_attr('filt_width') == 1 + and node.model.config.get_config_value('IOType') == 'io_parallel' + ) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + pw_node = model.make_node( + 'PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy(), outputs=node.outputs.copy() + ) + if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D + expand_axis = tuple(range(int(dim[0]))) + pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=expand_axis) + pw_node.weights['bias'].data = node.weights['bias'].data + model.replace_node(node, pw_node) + + return True diff --git a/hls4ml/backends/oneapi/passes/pooling_templates.py
b/hls4ml/backends/oneapi/passes/pooling_templates.py new file mode 100644 index 000000000..9a3ee4192 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/pooling_templates.py @@ -0,0 +1,111 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GlobalPooling1D, GlobalPooling2D, Pooling1D, Pooling2D + +# TODO - Move to ../fpga/passes, once streaming is supported on Quartus (should be identical to Vivado) + +pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned stride_width = {stride_width}; + static const unsigned pool_width = {pool_width}; + + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned filt_width = {pool_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned in_width = {n_in}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned pool_height = {pool_height}; + static const unsigned pool_width = {pool_width}; + static const unsigned filt_height = {pool_height}; + static const unsigned filt_width = {pool_width}; + + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +global_pooling1d_function_template = ( + 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) +global_pooling2d_function_template = ( + 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) + +pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h'] + + +class PoolingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + 
self.templates = { + 'Pooling1D': pooling1d_config_template, + 'Pooling2D': pooling2d_config_template, + 'GlobalPooling1D': global_pooling1d_config_template, + 'GlobalPooling2D': global_pooling2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class PoolingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D), include_header=pooling_include_list) + self.templates = { + 'Pooling1D': pooling1d_function_template, + 'Pooling2D': pooling2d_function_template, + 'GlobalPooling1D': global_pooling1d_function_template, + 'GlobalPooling2D': global_pooling2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for Quartus') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) diff --git a/hls4ml/backends/oneapi/passes/quantization_templates.py b/hls4ml/backends/oneapi/passes/quantization_templates.py new file mode 100644 index 000000000..d6cf2d2da --- /dev/null +++ b/hls4ml/backends/oneapi/passes/quantization_templates.py @@ -0,0 +1,36 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.quartus.passes.core_templates import ( + batchnorm_config_template, + batchnorm_function_template, + batchnorm_include_list, +) +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.optimizer.passes.qkeras import ApplyAlpha + + +class ApplyAlphaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class ApplyAlphaFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/recurrent_templates.py b/hls4ml/backends/oneapi/passes/recurrent_templates.py new file mode 100644 index 000000000..2bf45351b --- /dev/null +++ b/hls4ml/backends/oneapi/passes/recurrent_templates.py @@ -0,0 +1,305 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GRU, LSTM, SimpleRNN + +recurrent_include_list = ['nnet_utils/nnet_recurrent.h', 'nnet_utils/nnet_recurrent_stream.h'] + +################################################ +# Shared Matrix Multiplication Template (Dense) +################################################ +recr_mult_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static 
const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +################################################ +# Shared Activation Template +################################################ +activ_config_template = '''struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n''' + +################################################ +# GRU Template +################################################ +gru_config_template = '''struct config{index} : nnet::gru_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_units = {n_units}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned n_outputs = {n_outputs}; + static const bool return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + + typedef {config_mult_x} mult_config_x; + typedef {config_mult_h} mult_config_h; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n''' + +gru_function_template = 'nnet::gru<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' + + +class GRUConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(GRU) + self.gru_template = gru_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + self.mult_x_template = recr_mult_config_template + self.mult_h_template = recr_mult_config_template + + def format(self, node): + # Input has shape (n_timesteps, inp_dimensionality) + # Output / hidden units has shape (1 if !return_sequences else n_timesteps , n_units) + params = self._default_config_params(node) + params['n_units'] = node.get_attr('n_out') + params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + params['return_sequences'] = 'true' if node.get_attr('return_sequences', False) else 'false' + params['config_mult_x'] = f'config{node.index}_x_mult' + params['config_mult_h'] = f'config{node.index}_h_mult' + params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + params['act_recurrent_t'] = '{}_config{}'.format(node.get_attr('recurrent_activation'), str(node.index) + '_rec_act') + gru_config = self.gru_template.format(**params) + + # Activation is on candidate hidden state, dimensionality (1, n_units) + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = 
node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + # Recurrent activation is on reset and update gates (therefore x2), dimensionality (1, n_units) + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = str(node.get_attr('n_out')) + ' * 2' + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + # Multiplication config for matrix multiplications of type Wx (reset, update and candidate states) + mult_params_x = self._default_config_params(node) + mult_params_x['n_in'] = node.get_attr('n_in') + mult_params_x['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_x['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_params_x['index'] = str(node.index) + '_x' + mult_config_x = self.mult_x_template.format(**mult_params_x) + + # Multiplication config for matrix multiplications of type Wh (reset, update and candidate states) + mult_params_h = self._default_config_params(node) + mult_params_h['n_in'] = node.get_attr('n_out') + mult_params_h['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_h['reuse_factor'] = params['recurrent_reuse_factor'] + mult_params_h['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision + ) + mult_params_h['index'] = str(node.index) + '_h' + mult_config_h = self.mult_h_template.format(**mult_params_h) + + return mult_config_x + '\n' + mult_config_h + '\n' + recr_act_config + '\n' + act_config + '\n' + gru_config + + +class GRUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(GRU, include_header=recurrent_include_list) + self.template = gru_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + return self.template.format(**params) + + +################################################ +# LSTM Template +################################################ +lstm_config_template = """struct config{index} : nnet::lstm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +lstm_function_template = 'nnet::lstm<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class LSTMConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(LSTM) + self.template = lstm_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + lstm_params = 
self._default_config_params(node) + lstm_params['n_in'] = node.get_attr('n_in') + lstm_params['n_out'] = node.get_attr('n_out') + lstm_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + + lstm_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + lstm_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + lstm_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + lstm_config = self.template.format(**lstm_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + lstm_config + + +class LSTMFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(LSTM, include_header=recurrent_include_list) + self.template = lstm_function_template + + def format(self, node): + params = self._default_function_params(node) + + types = ['i', 'f', 'c', 'o'] + params['weights'] = '' + for t in types: + params['weights'] += f'kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += f'recurrent_kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += 'bias_{}_{}{}'.format(t, str(node.index), ',' if t != 'o' else '') + + return self.template.format(**params) + + +################################################ +# SimpleRNN Template +################################################ +simple_rnn_config_template = """struct config{index} : nnet::simpleRNN_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_outputs = {n_outputs}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +simple_rnn_function_template = 'nnet::simple_rnn<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class SimpleRNNConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SimpleRNN) + self.template = simple_rnn_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + simple_rnn_params = self._default_config_params(node) + simple_rnn_params['n_in'] = node.get_attr('n_in') + simple_rnn_params['n_out'] = node.get_attr('n_out') + simple_rnn_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + simple_rnn_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + simple_rnn_params['act_t'] = 
'{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + simple_rnn_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + simple_rnn_params['recurrent_activation'] = 'relu' + + simple_rnn_config = self.template.format(**simple_rnn_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + simple_rnn_config + + +class SimpleRNNFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SimpleRNN, include_header=recurrent_include_list) + self.template = simple_rnn_function_template + + def format(self, node): + params = self._default_function_params(node) + params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index)) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/reshaping_templates.py b/hls4ml/backends/oneapi/passes/reshaping_templates.py new file mode 100644 index 000000000..0db01e654 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/reshaping_templates.py @@ -0,0 +1,138 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Resize, Transpose, ZeroPadding1D, ZeroPadding2D + +# ZeroPadding templates + +zeropad1d_config_template = """struct config{index} : nnet::padding1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad2d_config_template = """struct config{index} : nnet::padding2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' + +padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h'] + + +class ZeroPaddingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_config_template, + 'ZeroPadding2D': zeropad2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D), include_header=padding_include_list) + self.templates = { + 
'ZeroPadding1D': zeropad1d_function_template, + 'ZeroPadding2D': zeropad2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('Quartus only supports channels_last data format') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +# Resize templates + +resize_config_template = """struct config{index} : nnet::resize_config {{ + static const unsigned height = {in_height}; + static const unsigned width = {in_width}; + + static const unsigned new_height = {out_height}; + static const unsigned new_width = {out_width}; + + static const unsigned n_chan = {n_chan}; +}};\n""" + +resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {config}>({input}, {output});' +resize_include_list = ['nnet_utils/nnet_resize.h', 'nnet_utils/nnet_resize_stream.h'] + + +class ResizeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Resize) + self.template = resize_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class ResizeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Resize, include_header=resize_include_list) + self.template = resize_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +# Transpose templates + +transpose_config_template = """struct config{index} : nnet::transpose_config {{ + static const unsigned depth = {depth}; + static const unsigned height = {height}; + static const unsigned width = {width}; + static constexpr unsigned perm[3] = {{{perm_str}}}; +}};\n""" + +transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' +transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h'] + + +class TransposeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class TransposeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Transpose, include_header=transpose_include_list) + self.template = transpose_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/resource_strategy.py b/hls4ml/backends/oneapi/passes/resource_strategy.py new file mode 100644 index 000000000..00fe89038 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/resource_strategy.py @@ -0,0 +1,77 @@ +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SimpleRNN +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyResourceStrategy(OptimizerPass): + '''Transposes the weights to use the dense_resource matrix multiply routine''' + + def match(self, node): + node_matches = isinstance(node, (Dense, Conv1D, Conv2D, GRU, LSTM, SimpleRNN)) + is_resource_strategy = ( + True # node.get_attr('strategy', '').lower() == 'resource' -> Quartus only supportr Resource strategy + ) + 
already_transformed = node.get_attr('_weights_transposed', False) is True + return node_matches and is_resource_strategy and not already_transformed + + def transform(self, model, node): + if isinstance(node, Dense) and not node.model.config.get_compression(node): + rf = node.get_attr('reuse_factor') + bf = int((node.attributes['n_in'] * node.attributes['n_out']) / rf) + bf_rounded = int(pow(2, np.ceil(np.log2(bf)))) + rf_rounded = int(pow(2, np.ceil(np.log2(rf)))) + + node.weights['weight'].data = np.transpose(node.weights['weight'].data).flatten() + + if node.attributes['n_in'] * node.attributes['n_out'] > 2048 and rf_rounded != rf: + node.set_attr('rfpad', rf_rounded - rf) + node.set_attr('bfpad', bf_rounded - bf) + + temp = np.empty([bf_rounded, rf_rounded]) + for i in range(rf_rounded): + for j in range(bf_rounded): + if i < rf and j < bf: + w_index = i + rf * j + temp[j][i] = node.weights['weight'].data[w_index] + else: + temp[j][i] = 0 + node.weights['weight'].data = temp.flatten() + node.weights['weight'].data_length = node.weights['weight'].data.size + + elif isinstance(node, Conv1D): + # (W,C,F) => (F,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[2, 0, 1]) + + elif isinstance(node, Conv2D): + # (H,W,C,F) => (F,H,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[3, 0, 1, 2]) + + elif isinstance(node, GRU): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, SimpleRNN): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, LSTM): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + for weight_type in ['i', 'f', 'c', 'o']: + node.weights[f'weight_{weight_type}'].data = np.transpose(node.weights[f'weight_{weight_type}'].data) + node.weights[f'recurrent_weight_{weight_type}'].data = np.transpose( + node.weights[f'recurrent_weight_{weight_type}'].data + ) + + else: + raise Exception(f'Unexpected layer {node.class_name} with resource strategy') + node.set_attr('_weights_transposed', True) + return False diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py new file mode 100644 index 000000000..67de32ab6 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -0,0 +1,54 @@ +from hls4ml.backends.fpga.fpga_types import ( + ACTypeConverter, + HLSTypeConverter, + QuartusArrayVariableConverter, + QuartusInplaceArrayVariableConverter, + QuartusInplaceStreamVariableConverter, + QuartusStreamVariableConverter, + QuartusStructMemberVariableConverter, + StaticWeightVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + + +class TransformTypes(GlobalOptimizerPass): + def 
__init__(self):
+ self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter())
+ self.array_var_converter = QuartusArrayVariableConverter(type_converter=self.type_converter)
+ self.inplace_array_var_converter = QuartusInplaceArrayVariableConverter(type_converter=self.type_converter)
+ self.struct_var_converter = QuartusStructMemberVariableConverter(type_converter=self.type_converter)
+ self.stream_var_converter = QuartusStreamVariableConverter(type_converter=self.type_converter)
+ self.inplace_stream_var_converter = QuartusInplaceStreamVariableConverter(type_converter=self.type_converter)
+ self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter)
+
+ def transform(self, model, node):
+ io_type = node.model.config.get_config_value('IOType')
+
+ for out_name, var in node.variables.items():
+ if io_type == 'io_stream':
+ if isinstance(var, InplaceTensorVariable):
+ new_var = self.inplace_stream_var_converter.convert(var)
+ else:
+ new_var = self.stream_var_converter.convert(var)
+ elif io_type == 'io_parallel':
+ if out_name in node.model.inputs:
+ new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='inputs')
+ elif out_name in node.model.outputs:
+ new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='outputs')
+ elif isinstance(var, InplaceTensorVariable):
+ new_var = self.inplace_array_var_converter.convert(var, pragma='')
+ else:
+ new_var = self.array_var_converter.convert(var, pragma='hls_register')
+ else:
+ raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})')
+
+ node.set_attr(out_name, new_var)
+
+ for w_name, weight in node.weights.items():
+ new_weight = self.weight_var_converter.convert(weight)
+ node.set_attr(w_name, new_weight)
+
+ for t_name, type in node.types.items():
+ new_type = self.type_converter.convert(type)
+ node.set_attr(t_name, new_type)
diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt
new file mode 100644
index 000000000..a3a6e5c4a
--- /dev/null
+++ b/hls4ml/templates/oneapi/CMakeLists.txt
@@ -0,0 +1,320 @@
+# Direct CMake to use icpx rather than the default C++ compiler/linker on Linux
+# and icx-cl on Windows
+if(UNIX)
+ set(CMAKE_CXX_COMPILER icpx)
+else() # Windows
+ include (CMakeForceCompiler)
+ CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
+ include (Platform/Windows-Clang)
+endif()
+
+cmake_minimum_required (VERSION 3.7.2)
+
+project(fpga_template CXX)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+###############################################################################
+### Customize these build variables
+###############################################################################
+set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp)
+set(TARGET_NAME fpga_template)
+
+# Use cmake -DFPGA_DEVICE=<board-support-package>:<board-variant> to choose a
+# different device. Here are a few device examples (this list is not
+# exhaustive):
+# intel_s10sx_pac:pac_s10
+# intel_s10sx_pac:pac_s10_usm
+# intel_a10gx_pac:pac_a10
+# Note that depending on your installation, you may need to specify the full
+# path to the board support package (BSP); this is usually in your install
+# folder.
+#
+# You can also specify a device family (E.g. "Arria10" or "Stratix10") or a
+# specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
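+#
+# For illustration only (the board names below are simply the examples listed
+# above; substitute whatever BSPs or device families are installed on your
+# system), a configure step might look like:
+#   cmake .. -DFPGA_DEVICE=intel_a10gx_pac:pac_a10   # full-system flow against a BSP variant
+#   cmake .. -DFPGA_DEVICE=Arria10                   # device family only, standalone IP
+#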
+if(NOT DEFINED FPGA_DEVICE) + set(FPGA_DEVICE "Arria10") +endif() + +# Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend +# compilation. +set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) + +# Use cmake -DUSER_FLAGS= to set extra flags for general compilation. +set(USER_FLAGS -Wno-unused-label ${USER_FLAGS}) + +# Use cmake -DUSER_INCLUDE_PATHS= to set extra paths for general +# compilation. +set(USER_INCLUDE_PATHS src;src/firmware;${USER_INCLUDE_PATHS}) + +############################################################################### +### no changes after here +############################################################################### + +# Print the device being used for the compiles +message(STATUS "Configuring the design to run on FPGA board ${FPGA_DEVICE}") + +# Set the names of the makefile targets to be generated by cmake +set(EMULATOR_TARGET fpga_emu) +set(SIMULATOR_TARGET fpga_sim) +set(REPORT_TARGET report) +set(FPGA_TARGET fpga) +set(IP_EXPORT_TARGET fpga_ip_export) + +# Set the names of the generated files per makefile target +set(EMULATOR_OUTPUT_NAME ${TARGET_NAME}.${EMULATOR_TARGET}) +set(SIMULATOR_OUTPUT_NAME ${TARGET_NAME}.${SIMULATOR_TARGET}) +set(REPORT_OUTPUT_NAME ${TARGET_NAME}.${REPORT_TARGET}) +set(FPGA_OUTPUT_NAME ${TARGET_NAME}.${FPGA_TARGET}) +set(IP_EXPORT_OUTPUT_NAME ${TARGET_NAME}.${IP_EXPORT_TARGET}) + +message(STATUS "Additional USER_FPGA_FLAGS=${USER_FPGA_FLAGS}") +message(STATUS "Additional USER_FLAGS=${USER_FLAGS}") + +include_directories(${USER_INCLUDE_PATHS}) +message(STATUS "Additional USER_INCLUDE_PATHS=${USER_INCLUDE_PATHS}") + +link_directories(${USER_LIB_PATHS}) +message(STATUS "Additional USER_LIB_PATHS=${USER_LIB_PATHS}") + +link_libraries(${USER_LIBS}) +message(STATUS "Additional USER_LIBS=${USER_LIBS}") + +if(WIN32) + # add qactypes for Windows + set(QACTYPES "-Qactypes") + # This is a Windows-specific flag that enables exception handling in host code + set(WIN_FLAG "/EHsc") +else() + # add qactypes for Linux + set(QACTYPES "-qactypes") +endif() + +set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) +set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) + +# A SYCL ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate +# representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. For +# this reason, FPGA backend flags must be passed as link flags in CMake. 
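+# Illustration only: a manual ahead-of-time build of the hardware target would
+# follow the same two-stage split, using the source and flag variables defined in
+# this file (abbreviated here to one source file; the exact commands for each
+# flow are printed by the display*CompileCommands targets further down):
+#   icpx -fsycl -fintelfpga -qactypes -Wall -DFPGA_HARDWARE -c src/firmware/myproject.cpp -o myproject.o
+#   icpx -fsycl -fintelfpga -qactypes -Xshardware -Xstarget=${FPGA_DEVICE} myproject.o -o fpga_template.fpga
+#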
+set(EMULATOR_COMPILE_FLAGS -DFPGA_EMULATOR) +set(EMULATOR_LINK_FLAGS ) +set(REPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(REPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early) +set(SIMULATOR_COMPILE_FLAGS -Xssimulation -DFPGA_SIMULATOR) +set(SIMULATOR_LINK_FLAGS -Xssimulation -Xsghdl -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${SIMULATOR_OUTPUT_NAME}) +set(FPGA_COMPILE_FLAGS -DFPGA_HARDWARE) +set(FPGA_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_OUTPUT_NAME}) +# get rid of this once host pipes work properly +set(IP_EXPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early -fsycl-device-code-split=per_kernel) + +############################################################################### +### FPGA Emulator +############################################################################### +add_executable(${EMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${EMULATOR_LINK_FLAGS}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${EMULATOR_OUTPUT_NAME}) + +############################################################################### +### FPGA Simulator +############################################################################### +add_executable(${SIMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${SIMULATOR_COMPILE_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${SIMULATOR_LINK_FLAGS}) +set_target_properties(${SIMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${SIMULATOR_OUTPUT_NAME}) + +############################################################################### +### Generate Report +############################################################################### +add_executable(${REPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${REPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${REPORT_TARGET} PRIVATE ${REPORT_COMPILE_FLAGS}) + +# The report target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_REPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_REPORT ${QACTYPES}) + +target_link_libraries(${REPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_REPORT}) +target_link_libraries(${REPORT_TARGET} ${REPORT_LINK_FLAGS}) +set_target_properties(${REPORT_TARGET} PROPERTIES OUTPUT_NAME ${REPORT_OUTPUT_NAME}) + +############################################################################### +### FPGA Hardware +############################################################################### +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILES}) +target_compile_options(${FPGA_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${FPGA_TARGET} PRIVATE ${FPGA_COMPILE_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${FPGA_LINK_FLAGS}) +set_target_properties(${FPGA_TARGET} PROPERTIES OUTPUT_NAME ${FPGA_OUTPUT_NAME}) + +############################################################################### +### FPGA IP Export 
(only necessary until native host pipes) +############################################################################### +add_executable(${IP_EXPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${IP_EXPORT_COMPILE_FLAGS}) + +# The ip export target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_EXPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_EXPORT ${QACTYPES}) + +target_link_libraries(${IP_EXPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_EXPORT}) +target_link_libraries(${IP_EXPORT_TARGET} ${IP_EXPORT_LINK_FLAGS}) +set_target_properties(${IP_EXPORT_TARGET} PROPERTIES OUTPUT_NAME ${IP_EXPORT_OUTPUT_NAME}) + +############################################################################### +### This part only manipulates cmake variables to print the commands to the user +############################################################################### + +# set the correct object file extension depending on the target platform +if(WIN32) + set(OBJ_EXTENSION "obj") +else() + set(OBJ_EXTENSION "o") +endif() + +# Set the source file names in a string +set(SOURCE_FILE_NAME "${SOURCE_FILES}") + +function(getCompileCommands common_compile_flags special_compile_flags common_link_flags special_link_flags target output_name) + + set(file_names ${SOURCE_FILE_NAME}) + set(COMPILE_COMMAND ) + set(LINK_COMMAND ) + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH CURRENT_SOURCE_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${source}) + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Creating a string that contains the compile command + # Start by the compiler invocation + set(COMPILE_COMMAND "${COMPILE_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the potential includes + foreach(INCLUDE ${USER_INCLUDE_PATHS}) + if(NOT IS_ABSOLUTE ${INCLUDE}) + file(RELATIVE_PATH INCLUDE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${INCLUDE}) + endif() + set(COMPILE_COMMAND "${COMPILE_COMMAND} -I${INCLUDE}") + endforeach() + + # Add all the common compile flags + foreach(FLAG ${common_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific compile flags + foreach(FLAG ${special_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Get the location of the object file + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(COMPILE_COMMAND "${COMPILE_COMMAND} -c ${CURRENT_SOURCE_FILE} -o ${OBJ_FILE}\n") + endforeach() + + set(COMPILE_COMMAND "${COMPILE_COMMAND}" PARENT_SCOPE) + + # Creating a string that contains the link command + # Start by the compiler invocation + set(LINK_COMMAND "${LINK_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the common link flags + foreach(FLAG ${common_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific link flags + foreach(FLAG ${special_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add the output file + set(LINK_COMMAND "${LINK_COMMAND} -o ${output_name}") + + foreach(source ${file_names}) + # Get the relative path to the source and object files + 
file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(LINK_COMMAND "${LINK_COMMAND} ${OBJ_FILE}") + endforeach() + + # Add all the potential library paths + foreach(LIB_PATH ${USER_LIB_PATHS}) + if(NOT IS_ABSOLUTE ${LIB_PATH}) + file(RELATIVE_PATH LIB_PATH ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${LIB_PATH}) + endif() + if(NOT WIN32) + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH}") + else() + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH} -Wl,-rpath,${LIB_PATH}") + endif() + endforeach() + + # Add all the potential includes + foreach(LIB ${USER_LIBS}) + set(LINK_COMMAND "${LINK_COMMAND} -l${LIB}") + endforeach() + + set(LINK_COMMAND "${LINK_COMMAND}" PARENT_SCOPE) + +endfunction() + +# Windows executable is going to have the .exe extension +if(WIN32) + set(EXECUTABLE_EXTENSION ".exe") +endif() + +# Display the compile instructions in the emulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${EMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${EMULATOR_LINK_FLAGS}" "${EMULATOR_TARGET}" "${EMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayEmulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${EMULATOR_TARGET} displayEmulationCompileCommands) + +# Display the compile instructions in the simulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${SIMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${SIMULATOR_LINK_FLAGS}" "${SIMULATOR_TARGET}" "${SIMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displaySimulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${SIMULATOR_TARGET} displaySimulationCompileCommands) + +# Display the compile instructions in the report flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${REPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_REPORT}" "${REPORT_LINK_FLAGS}" "${REPORT_TARGET}" "${REPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayReportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${REPORT_TARGET} displayReportCompileCommands) + +# Display the compile instructions in the IP export flow (Remove after native host pipes work properly) +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${IP_EXPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_EXPORT}" "${IP_EXPORT_LINK_FLAGS}" "${IP_EXPORT_TARGET}" "${IP_EXPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayExportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${IP_EXPORT_TARGET} displayExportCompileCommands) + +# Display the compile instructions in the fpga flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${FPGA_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${FPGA_LINK_FLAGS}" "${FPGA_TARGET}" "${FPGA_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayFPGACompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") 
+add_dependencies(${FPGA_TARGET} displayFPGACompileCommands) diff --git a/hls4ml/templates/oneapi/exception_handler.hpp b/hls4ml/templates/oneapi/exception_handler.hpp new file mode 100644 index 000000000..f5b9c8433 --- /dev/null +++ b/hls4ml/templates/oneapi/exception_handler.hpp @@ -0,0 +1,22 @@ +#ifndef __EXCEPTIONHANDLER_HPP__ +#define __EXCEPTIONHANDLER_HPP__ +#include +#include +#include + +namespace fpga_tools { + +void exception_handler(sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" + << e.what() << std::endl; + } + } +} + +} // namespace fpga_tools + +#endif //__EXCEPTIONHANDLER_HPP__ diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h new file mode 100644 index 000000000..622d9f2bf --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -0,0 +1,21 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include +#include +#include +#include +#include + +// Include nnet::array - a custom array-like struct, mainly used with io_stream +#include "nnet_utils/nnet_types.h" + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert layer-precision + +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n < d ? d : n) + +#endif diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp new file mode 100644 index 000000000..93f11c837 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -0,0 +1,20 @@ +#include "myproject.h" +#include "parameters.h" + +// hls-fpga-machine-learning insert weights + +void MyProject::operator()() const { + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + auto inputsArr = InPipe::read(); + +// hls-fpga-machine-learning insert layers + +// hls-fpga-machine-learning return + + OutPipe::write(outData); +} + + diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h new file mode 100644 index 000000000..f01b5978c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -0,0 +1,36 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "defines.h" + +// This file defines the interface to the kernel + + +using input_data_t = std::array; +using output_data_t = std::array; + +class InPipeID; +class OutPipeID; + +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties( + sycl::ext::intel::experimental::ready_latency<0>)); + +using InPipe = sycl::ext::intel::experimental::pipe; +using OutPipe = sycl::ext::intel::experimental::pipe; + +class MyProjectID; + +struct MyProject { + + // kernel property method to config invocation interface + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::streaming_interface<>, + sycl::ext::intel::experimental::pipelined<>}; + } + + SYCL_EXTERNAL void operator()() const; +}; + + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h new file mode 100644 index 000000000..d874741ec --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -0,0 +1,516 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include 
"nnet_common.h" + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 512; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ac_fixed<16, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + res[ii] = datareg; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +#include "activation_tables/sigmoid_table.tb" + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T absoluteValue hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + absoluteValue = -data[ii]; + } else { + absoluteValue = data[ii]; + } + int index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (absoluteValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)sigmoid_table[index]; + if (data[ii] < 0) { + res[ii] = 1 - temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +template inline unsigned softmax_stable_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + hls_register ac_int y = x.template slc(x.width - N - 1); + // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness + if (x != 0 && y == 0) + y[0] = 1; + return y.to_uint(); +} + +template inline unsigned softmax_latency_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + hls_register ac_int y = x.template slc(x.width - N); + return y.to_uint(); +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Look-up tables +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + // Find maximum + Op_max 
op_max; + hls_register data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; + } + + // Explicitly sum previously calculated exponentials with an adder tree + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +// TODO - Improve accuracy +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; + } + + // Explicitly sum the results with an adder tree. + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + + hls_register int data_round[CONFIG_T::n_in]; +New_loop: + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round[ii] = (data[ii] * CONFIG_T::table_size / 16).to_int(); + } +NN_Outer: + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + typename CONFIG_T::exp_table_t exp_res_temp = 0; + NN_Inner: + #pragma unroll + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) { + exp_res_temp += 1; + } else { + int _data_cache = (data_round[jj] - data_round[ii]); + int index = _data_cache + 8 * CONFIG_T::table_size / 16; + + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + + typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index]; + exp_res_temp += temp_exp; + } + } + int exp_res_index = (exp_res_temp * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + res[ii] = invert_table_legacy[exp_res_index]; + } +} + +template +void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + res[i] = (res_T)0; + } + + hls_register data_T maximum = data[0]; + hls_register int idx = 0; + + #pragma ii 1 + for (int i = 1; i < CONFIG_T::n_in; i++) { + if 
(data[i] > maximum) { + maximum = data[i]; + idx = i; + } + } + + res[idx] = (res_T)1; +} + +template +inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + switch (CONFIG_T::implementation) { + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + default: + softmax_stable(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template +void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 4; +// Initialize the lookup table +#include "activation_tables/tanh_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T temp hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + temp = -data[ii]; + } else { + temp = data[ii]; + } + ac_int<16> index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)tanh_table[index]; + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/softplus_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<16> data_round = (data[ii] * CONFIG_T::table_size / 16).to_int(); + ac_int<16> index = data_round + 8 * CONFIG_T::table_size 
/ 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +// Initialize the lookup table +#include "activation_tables/softsign_table.tb" + + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T temp hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + temp = -data[ii]; + } else { + temp = data[ii]; + } + ac_int<16> index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)softsign_table[index]; + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/elu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/selu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + res_T cache; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = 2 * data[ii]; + res_T cache; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h new file mode 100644 index 000000000..7b84a9c0f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h @@ -0,0 +1,104 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { +// Calcuate result +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = + CONFIG_T::template product::product(data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], + const data_T threshold[CONFIG_T::n_scale_bias]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<1, false> cache; + data_T datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in], + const data_T threshold_hi[CONFIG_T::n_scale_bias], + const data_T threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<2, true> cache; + data_T datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h new file mode 100644 index 000000000..0c2e94e02 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -0,0 +1,78 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + + +#include "nnet_helpers.h" +#include +#include +#include + +typedef ac_fixed<16, 6> table_default_t; + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; + +// Default data types (??) TODO: Deprecate +typedef ac_fixed<16, 4> weight_t_def; +typedef ac_fixed<16, 4> bias_t_def; +typedef ac_fixed<32, 10> accum_t_def; + +template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { + #pragma unroll + for (int ii = 0; ii < NIN1; ii++) { + res[ii] = data1[ii]; + } + #pragma unroll + for (int ii = 0; ii < NIN2; ii++) { + res[NIN1 + ii] = data2[ii]; + } +} + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +// template T reduce(const T *x, Op op) { +// static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; +// static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; +// if (N == 1) { +// return x[0]; +// } +// if (N == 2) { +// return op(x[0], x[1]); +// } +// return op(reduce(x, op), reduce(x + leftN, op)); +// } + +// alternate reduce - basic +template T reduce(const T *x, Op op) { + if (N == 1) { + return x[0]; + } + auto val = op(x[0], x[1]); + for (int i = 2; i < N; i++) { + val = op(val, x[i]); + } + return val; +} + + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h new file mode 100644 index 000000000..8897e1315 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h @@ -0,0 +1,64 @@ +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_resource.h" + +namespace nnet { + +struct conv1d_config { + // I/O sizes + static const unsigned in_width = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + + // Modified filter size (post-Wionograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + + // Run-time Configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelisation_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + conv_1d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + pointwise_conv_1d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 000000000..a110d6d42 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,241 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +enum class conv1d_implementation { combination, im2col, winograd }; + +// **************************************************************** +// im2col - General-purpose 1D Convolution algorithm +// **************************************************************** + +template +void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int col) { + // im2col can be unrolled fully, since number of parallel executions = filt_w x n_chann ~ O(100) and very little DSP + // usage + + hls_register int index = 0; + +KernelLoop: + #pragma unroll + for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + hls_register int 
index_data = + (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } + } +} + +template +void conv_1d_im2col_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + +ColLoop: + #pragma unroll pf + #pragma ii CONFIG_T::reuse_factor + for (int i = 0; i < CONFIG_T::out_width; i++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; + im2col_1d_cl(data, data_col, i); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int j = 0; j < CONFIG_T::n_filt; j++) { + res[i * CONFIG_T::n_filt + j] = res_col[j]; + } + } +} + +// **************************************************************** +// 1D Convolution for 3x1 kernels from Winograd's algoirithm +// **************************************************************** + +// Explicity transofrmed input (B'dB) needed for Winograd convolution, as explained by Lavin & Gray (2015) +template +inline void winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[4]) { + D[0] = I[0] - I[2]; + D[1] = I[1] + I[2]; + D[2] = -I[1] + I[2]; + D[3] = I[1] - I[3]; +} + +template +void winograd_conv1d_3x1_kernel_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_width == 1); + assert(CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +WidthLoop: + #pragma unroll pf + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x1 tile + hls_register 
data_T T[16]; + hls_register uint8_t p = 0; + + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + + // Transform input tile + hls_register typename CONFIG_T::accum_t D[4]; + winograd_transform_input_tile_3x1_kernel(T, D); + + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + hls_register int filter_offset = 4 * (CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + hls_register typename CONFIG_T::accum_t Y[4]; + #pragma unroll + for (int i = 0; i < 4; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); + if ((col + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * (col + 1) + filter] += static_cast(Y[1] - Y[2] - Y[3]); + } + } + } +} + +// **************************************************************** +// 1D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan], + const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + hls_register int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + hls_register int index_data = (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + +ColLoop: + #pragma unroll pf + #pragma ii CONFIG_T::reuse_factor + for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T data_col[CONFIG_T::n_chan]; + im2col_1d_pointwise_cl(data, data_col, col); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[col * CONFIG_T::n_filt + k] = res_col[k]; + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_1d_resource_cl( + data_T 
data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + (CONFIG_T::implementation == nnet::conv1d_implementation::combination || + CONFIG_T::implementation == nnet::conv1d_implementation::winograd); + + if (CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv1d_3x1_kernel_cl(data, res, weights, biases); + } else { + conv_1d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h new file mode 100644 index 000000000..3aa71a74b --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -0,0 +1,72 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_conv2d_resource.h" + +namespace nnet { + +struct conv2d_config { + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + + // Modified filter size (post-Winograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + // Run-time configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelisation_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t + weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + conv_2d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); +
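// Illustrative sketch, not part of this patch: under the assumption that the backend emits one
// config struct per layer, a generated network might specialise conv2d_config as below. The name
// config2 and the ac_fixed widths are hypothetical and shown only to indicate which fields the
// templates read.
//
// struct config2 : nnet::conv2d_config {
//     static const unsigned in_height = 28, in_width = 28, n_chan = 1;
//     static const unsigned out_height = 26, out_width = 26, n_filt = 4;
//     static const unsigned filt_height = 3, filt_width = 3;
//     static const unsigned impl_filt_height = 3, impl_filt_width = 3;
//     static const unsigned parallelisation_factor = 1;
//     static constexpr nnet::conv2d_implementation implementation = nnet::conv2d_implementation::combination;
//     typedef ac_fixed<18, 8> accum_t;
//     typedef ac_fixed<16, 6> weight_t;
//     typedef ac_fixed<16, 6> bias_t;
// };
//
// With a 3x3 filter, unit stride and out_height/out_width > 2, conv_2d_resource_cl would take the
// Winograd path; any other shape falls back to the im2col implementation.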
pointwise_conv_2d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 000000000..73ad45592 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,303 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_helpers.h" + +namespace nnet { + +enum class conv2d_implementation { combination, im2col, winograd }; + +// **************************************************************** +// im2col - General-purpose 2D Convolution algorithm +// **************************************************************** + +template +void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int row, + const int col) { + // im2col can be unrolled fully, since number of parallel executions = filt_h x filt_w x n_chann ~ O(100) and very little + // DSP usage + + hls_register int index = 0; + +FiltHeightLoop: + #pragma unroll + for (int kernel_row = 0; kernel_row < CONFIG_T::impl_filt_height; kernel_row++) { + hls_register int input_row = + -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; + + FiltWidthLoop: + #pragma unroll + for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { + hls_register int input_col = + -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width; + + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } + } + } +} + +template +void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * + CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_height == CONFIG_T::impl_filt_height && CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factors for loop traversing input image, derived from parallelisation_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), CONFIG_T::out_height); + +HeightLoop: + #pragma unroll pfr + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + #pragma unroll pfc + #pragma ii CONFIG_T::reuse_factor + for (int j = 0; j < CONFIG_T::out_width; j++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T 
data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; + im2col_2d_cl(data, data_col, i, j); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +// **************************************************************** +// 2D Convolution for 3x3 kernels from Winograd's algoirithm +// **************************************************************** + +// Explicity transofrmed input (B'dB) needed for Winograd calculation, as explained by Lavin & Gray, 2015 +template +inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D[16]) { + D[0] = I[0] - I[2] - I[8] + I[10]; + D[1] = I[1] + I[2] - I[9] - I[10]; + D[2] = -I[1] + I[2] + I[9] - I[10]; + D[3] = I[1] - I[3] - I[9] + I[11]; + + D[4] = I[4] - I[6] + I[8] - I[10]; + D[5] = I[5] + I[6] + I[9] + I[10]; + D[6] = -I[5] + I[6] - I[9] + I[10]; + D[7] = I[5] - I[7] + I[9] - I[11]; + + D[8] = -I[4] + I[6] + I[8] - I[10]; + D[9] = -I[5] - I[6] + I[9] + I[10]; + D[10] = I[5] - I[6] - I[9] + I[10]; + D[11] = -I[5] + I[7] + I[9] - I[11]; + + D[12] = I[4] - I[6] - I[12] + I[14]; + D[13] = I[5] + I[6] - I[13] - I[14]; + D[14] = I[6] - I[5] + I[13] - I[14]; + D[15] = I[5] - I[7] - I[13] + I[15]; +} + +template +void winograd_conv2d_3x3_kernel_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t + weights[CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1); + assert(CONFIG_T::pad_left == CONFIG_T::pad_right && CONFIG_T::pad_top == CONFIG_T::pad_bottom); + assert(CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, DIV_ROUNDUP(CONFIG_T::out_width, 2)); + static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), DIV_ROUNDUP(CONFIG_T::out_height, 2)); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_height * CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +HeightLoop: + #pragma unroll pfr + for (int row = 0; row < CONFIG_T::out_height; row += 2) { + WidthLoop: + #pragma unroll pfc + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x4 tile + hls_register data_T T[16]; + hls_register typename CONFIG_T::accum_t D[16]; + hls_register uint8_t p = 0; + + #pragma unroll 
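// The two nested loops below gather the 4x4 input patch for this output position into T,
// zero-filling any element that falls outside the image; winograd_transform_input_tile_3x3_kernel
// then computes the transformed tile D = B'dB. Each pass of the row/col loops produces a 2x2
// block of outputs, which is why those loops advance in steps of 2: F(2x2, 3x3) needs only
// 16 multiplications per channel where a direct 3x3 convolution would need 36 for the same four outputs.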
+ for (int r = row - (int)CONFIG_T::pad_top; r < row + 4 - (int)CONFIG_T::pad_top; r++) { + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (r < CONFIG_T::in_height && r >= 0 && c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[r * CONFIG_T::in_width * CONFIG_T::n_chan + c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + } + + // Transform input tile + winograd_transform_input_tile_3x3_kernel(T, D); + + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + hls_register int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + hls_register typename CONFIG_T::accum_t Y[16]; + #pragma unroll + for (int i = 0; i < 16; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] += + static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10]); + if ((col + 1) < CONFIG_T::out_height) + res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] += + static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11]); + if ((row + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] += + static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14]); + if ((row + 1) < (CONFIG_T::out_width) && (col + 1) < CONFIG_T::out_height) + res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] += + static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - Y[13] + Y[14]); + } + } + } + } +} + +// **************************************************************** +// 2D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan], const int row, const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + hls_register int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + + hls_register int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; + hls_register int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; + + if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + // Unroll factors for loop traversing input image, derived from parallelisation_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), 
CONFIG_T::out_height); + +HeightLoop: + #pragma unroll pfr + for (int row = 0; row < CONFIG_T::out_height; row++) { + WidthLoop: + #pragma unroll pfc + #pragma ii CONFIG_T::reuse_factor + for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T data_col[CONFIG_T::n_chan]; + im2col_2d_pointwise_cl(data, data_col, row, col); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[row * CONFIG_T::out_width * CONFIG_T::n_filt + col * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * + CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + CONFIG_T::implementation == nnet::conv2d_implementation::combination || + CONFIG_T::implementation == nnet::conv2d_implementation::winograd; + + if (CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv2d_3x3_kernel_cl(data, res, weights, biases); + } else { + conv_2d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h new file mode 100644 index 000000000..c1786ef78 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -0,0 +1,170 @@ +#ifndef NNET_DENSE_LARGE_H_ +#define NNET_DENSE_LARGE_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + static const unsigned reuse_factor = 1; + static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) + static const unsigned multiplier_factor = 1; // min n_in, rf + static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; + static const unsigned reciprocal = 1; // 2^35 / 25 + static const unsigned rf_pad = 0; + static const unsigned bf_pad = 0; + // Resource reuse info + static const unsigned io_type = io_parallel; + static const bool store_weights_in_bram = false; + 
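// Illustrative worked example, not part of this patch, following the formulas noted in the
// comments above: with n_in = 16, n_out = 8 and reuse_factor = 4, the backend would be expected
// to set block_factor = DIV_ROUNDUP(16 * 8, 4) = 32, multiplier_factor = min(n_in, reuse_factor) = 4,
// multiplier_limit = DIV_ROUNDUP(16 * 8, 4) = 32 and multiplier_scale = 32 / 8 = 4, so each
// reuse iteration contributes four partial products to every output accumulator.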
static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + //#pragma ii CONFIG_T::reuse_factor + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product(data[data_index], weights[w_index]); + } + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + #pragma unroll + for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { + mult[imult] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + int o_index = out_index[ir][im]; + if (o_index >= CONFIG_T::n_out) + continue; // check out of bounds + mult[o_index] += tmp_acc[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { + acc[im] += mult[im]; + } + } +Store: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); // acc[jj]; + } +} +template +void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +InitAccum: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register 
typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + MultLoop: + #pragma unroll + for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; + // Modified this + mult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) + in_index = ir; + } + AccumLoop: + #pragma unroll + for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { + acc[out_index] += mult[im]; + if (acc_step + 1 >= CONFIG_T::multiplier_scale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } +// Cast to "res_t" type +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} +template +void dense_resource( + const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_rf_lt(data, res, weights, biases); + } else { + dense_rf_gt(data, res, weights, biases); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 000000000..ba50a631b --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,81 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32_t w = ir + CONFIG_T::reuse_factor * im; + inputs[ir][im] = data[weights[w].row_index]; + out_index[ir][im] = weights[w].col_index; + } + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; + CompressedMultLoop: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32_t w = ir + CONFIG_T::reuse_factor * im; + // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; + typename CONFIG_T::accum_t prod = mult[im] = + CONFIG_T::template product::product(inputs[0][im], weights[w].weight); + #pragma unroll + for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { + inputs[is][im] = inputs[is + 1][im]; + } + } + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; + ResetMult: + #pragma unroll + for (int tacc = 0; tacc < CONFIG_T::n_out; 
tacc++) { + tmp_acc[tacc] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + int col = out_index[ir][im]; + tmp_acc[col] += mult[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += tmp_acc[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h new file mode 100644 index 000000000..5191239b6 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h @@ -0,0 +1,45 @@ +#ifndef NNET_EMBED_H_ +#define NNET_EMBED_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +struct embed_config { + // Internal data type definitions + typedef float embeddings_t; + + // (Default layer sizes, overwritten form the backend + static const unsigned n_in = 10; + static const unsigned n_out = 16; + static const unsigned vocab_size = 50; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; +}; + +template +void embedding(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + + /* + * Can store embeddings[] in a register, but a large multiiplexer + * is created due to a non-constant access pattern + */ + +InputSequence: + #pragma ii CONFIG_T::reuse_factor + #pragma unroll + for (int j = 0; j < CONFIG_T::n_in; j++) { + DenseEmbedding: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + res[j * CONFIG_T::n_out + i] = embeddings[data[j].to_uint() * CONFIG_T::n_out + i]; + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h new file mode 100644 index 000000000..888ea4a6f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -0,0 +1,119 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data_back(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = static_cast(src[i].to_double()); + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +// constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } +// replace with template metaprogramming +template struct ceillog2 +{ + enum { val = 1 + ceillog2<((n + 1) / 2)>::val }; +}; + +template<> struct ceillog2<2> +{ + enum { val = 1 }; +}; + +template<> struct ceillog2<1> +{ + enum { val = 0 }; +}; + + +// constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } +// replace with template metaprogramming +template struct floorlog2 +{ + enum { val = 1 + floorlog2<(n / 2)>::val }; +}; + +template<> struct floorlog2<1> +{ + enum { val = 0 }; +}; + +template<> struct floorlog2<0> +{ + enum { val = 0 }; +}; + +// constexpr int pow2(int x) { return x == 0 ? 
1 : 2 * pow2(x - 1); } +// replace with template metaprogramming +template struct pow2 +{ + enum { val = 2 * pow2<(n - 1)>::val }; +}; + +template<> struct pow2<0> +{ + enum { val = 1 }; +}; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << data[i] << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h new file mode 100644 index 000000000..766ef2e20 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h @@ -0,0 +1,249 @@ +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "nnet_mult.h" + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + + static const unsigned reuse_factor = 1; + + typedef float accum_t; + + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] + data2[i]); + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] - data2[i]); + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] * data2[i]); + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] + data2[i]) / (res_T)2); + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T 
data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] > data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] < data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; +Product: + #pragma unroll multiplier_limit + for (int i = 0; i < CONFIG_T::n_in; i++) { + mult[i] = CONFIG_T::template product::product(data1[i], data2[i]); + } + + hls_register typename CONFIG_T::accum_t acc = 0; +Accum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + acc += mult[i]; + } + + res[0] = static_cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = + static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); + } + + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = + static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma unroll + for (int i = 0; i < 
CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast(data2[i]); + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = + i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + } + + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { + int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h new file mode 100644 index 000000000..5be772832 --- /dev/null +++ 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h @@ -0,0 +1,113 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Different methods to perform the product of input and weight, depending on their types. +namespace product { + +class Product { + public: + static void limit(unsigned multiplier_limit) {} +}; + +template class both_binary : public Product { + public: + inline static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + return a == w; + } +}; + +template class weight_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + return a * w; + } + static void limit(unsigned multiplier_limit) { + // TODO: Implement for Quartus + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS + // pragma + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>; + inline static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? 
y : static_cast(-y); + } +}; +} // namespace product + +// TO-DO: These may need extra variants if ac_int types are used in more places +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h new file mode 100644 index 000000000..a95f9ab00 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h @@ -0,0 +1,99 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +namespace nnet { + +struct padding1d_config { + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned n_chan = 10; + + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; + + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h new file mode 100644 index 000000000..bbfc0908e --- /dev/null +++ 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h @@ -0,0 +1,319 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include "nnet_common.h" + +namespace nnet { + +// Returns the maximum value from an array of size N +template T max(T x[N]) { + hls_register T y = x[0]; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 1; i < N; i++) { + if (x[i] > y) + y = x[i]; + } + + return y; +} + +// Returns the mean value of an array of size N +template T avg(T (&x)[N]) { + hls_register T y = 0; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 0; i < N; i++) { + y += x[i]; + } + + y /= N; + return y; +} + +// Returns the mean value of an array of size N +// Overload of the above function; using a wider accumulator than the input to avoid overflow +template ac_int avg(ac_int (&x)[N]) { + hls_register ac_int tmp = 0; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + + tmp /= N; + + // Cast back to original type + ac_int y = static_cast>(tmp); + return y; +} + +// Returns the mean value of an array of size N +// Overload of the above function; using a wider accumulator than the input to avoid overflow +template ac_fixed avg(ac_fixed (&x)[N]) { + hls_register ac_fixed tmp = 0; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + + tmp /= N; + + // Cast back to original type + ac_fixed y = tmp; + return y; +} + +// Enumeration for pooling functions +enum Pool_Op { Max, Average }; +template T pool_op(T (&x)[N]) { + switch (op) { + case Max: + return max(x); + case Average: + return avg(x); + } +} + +/* + * In TensorFlow, pooling ignores the value in the padded cells + * For Avg pooling, return 0 (the divisor is modified to the area overlapping the unpadded image.) + * For Max pooling, return the most negative value for the type. + */ +template inline T pad_val() { + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // Pooling parameters + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + + // I/O sizes + static const unsigned n_in = 10; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned n_filt = 4; + + // Padding + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { + // For 'same' padding, increase input width by left- and right-side padding + // For 'valid' padding, reduce input width to area covered by pooling function + static constexpr int padded_width = (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) + ?
(CONFIG_T::n_in / CONFIG_T::stride_width * CONFIG_T::stride_width) + : (CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + InputWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int inp_col = 0; inp_col < padded_width; inp_col += CONFIG_T::stride_width) { + hls_register data_T pool[CONFIG_T::pool_width]; + + // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling + hls_register unsigned img_overlap = 0; + + PoolWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int pool_col = 0; pool_col < CONFIG_T::stride_width; pool_col++) { + if (inp_col + pool_col < CONFIG_T::pad_left || inp_col + pool_col >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[pool_col] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + // Current element is from input image + pool[pool_col] = data[(inp_col + pool_col - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; + img_overlap++; + } + } + + // Pooling operation + res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = + static_cast(pool_op(pool)); + + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) + res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] *= + (static_cast(CONFIG_T::pool_width) / img_overlap); + } + } +} + +template +void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + hls_register data_T pool[CONFIG_T::n_in]; + + InputWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int col = 0; col < CONFIG_T::n_in; col++) { + pool[col] = data[col * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +struct pooling2d_config { + // Pooling parameters + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + // For 'same' padding, increase input width by left- and right-side padding + // For 'valid' padding, reduce input width to area covered by pooling function + static constexpr int padded_width = (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) + ? 
(CONFIG_T::in_width / CONFIG_T::stride_width * CONFIG_T::stride_width) + : (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right); + static constexpr int padded_height = (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0) + ? (CONFIG_T::in_height / CONFIG_T::stride_height * CONFIG_T::stride_height) + : (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + InputHeightLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int inp_col = 0; inp_col < padded_height; inp_col += CONFIG_T::stride_height) { + InputWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int inp_width = 0; inp_width < padded_width; inp_width += CONFIG_T::stride_width) { + hls_register data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + + // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling + hls_register unsigned img_overlap = 0; + + PoolHeightLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int pool_col = 0; pool_col < CONFIG_T::stride_height; pool_col++) { + PoolWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int pool_row = 0; pool_row < CONFIG_T::stride_width; pool_row++) { + if (inp_col + pool_col < CONFIG_T::pad_top || + inp_col + pool_col >= (padded_height - CONFIG_T::pad_bottom) || + inp_width + pool_row < CONFIG_T::pad_left || + inp_width + pool_row >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[pool_col * CONFIG_T::stride_width + pool_row] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + // Current element is from input image + pool[pool_col * CONFIG_T::stride_width + pool_row] = + data[(inp_col + pool_col - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (inp_width + pool_row - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; + img_overlap++; + } + } + } + + // Pooling operation + res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = + static_cast( + pool_op(pool)); + + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) + res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] *= + (static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / + img_overlap); + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + hls_register data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h new file mode 100644 index 000000000..464c6d415 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h @@ -0,0 +1,583 @@ +#ifndef NNET_RECURRENT_H_ +#define NNET_RECURRENT_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recurrent_activation.h" + +namespace nnet { + +//---------------------- +// Utils +//---------------------- + +template +void multiply_W(data_T input[N_IN], res_T out[N_OUT], const weight_t weight[N_IN * N_OUT]) { +MULTIPLY_W_LOOP_I: + #pragma unroll + for (int i = 0; i < N_OUT; i++) { + out[i] = 0; + + MULTIPLY_W_LOOP_J: + #pragma unroll + for (int j = 0; j < N_IN; j++) { + out[i] += input[j] * weight[i * N_IN + j]; + } + } +} + +template +void multiply_U(data_T input[N_OUT], res_T out[N_OUT], const weight_t weight[N_OUT * N_OUT]) { +MULTIPLY_U_LOOP_I: + #pragma unroll + for (int i = 0; i < N_OUT; i++) { + out[i] = 0; + + MULTIPLY_U_LOOP_J: + #pragma unroll + for (int j = 0; j < N_OUT; j++) { + out[i] += input[j] * weight[i * N_OUT + j]; + } + } +} + +template +void add_bias(data_T inputs[N], res_T out[N], const bias_t bias[N]) { +ADD_BIAS_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = inputs[i] + bias[i]; + } +} + +template void multiply_vectors(data_T in1[N], data_T in2[N], res_T out[N]) { +MULTIPLY_VECT_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = in1[i] * in2[i]; + } +} + +template void add_vectors(data_T in1[N], data_T in2[N], res_T out[N]) { +ADD_VECTOR_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = in1[i] + in2[i]; + } +} + +//---------------------- +// GRU +//---------------------- + +struct gru_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_units = 1; + static const unsigned n_timesteps = 1; + static const unsigned n_outputs = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = nnet::activation::relu; + + template using activation = nnet::activation::relu; +}; + +template +void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], + const typename CONFIG_T::weight_t weights[3 * CONFIG_T::n_units * CONFIG_T::n_in], + const typename CONFIG_T::weight_t recurrent_weights[3 * CONFIG_T::n_units * CONFIG_T::n_units], + const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], + const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { + static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor; + // A matrix containing the values of matrix product between input (x) and weights (weights), for update, reset and + // candidate state gates, for each of the units + hls_register typename CONFIG_T::accum_t mat_mul_x_w[3 * CONFIG_T::n_units]; + nnet::dense_resource(x, mat_mul_x_w, weights, + bias); + + // A matrix containing the values of matrix product between previou state (h) and recurrent weights (recurrent_weights), + // for update, reset and candidate state gates, for each of the units + hls_register typename CONFIG_T::accum_t mat_mul_h_wr[3 * CONFIG_T::n_units]; + nnet::dense_resource( + h, mat_mul_h_wr, recurrent_weights, recurrent_bias); + + // A 
vector containing both the values of z(t) and r(t) for every state + hls_register typename CONFIG_T::accum_t z_r[2 * CONFIG_T::n_units]; + + // Add the individual vectors from the multiplication of mat_mul_x_w = Wx*x(t) and mat_mul_h_wr = Wh*h(t-1) + // Unrolled fully, no DSPs used + #pragma unroll + for (int i = 0; i < (2 * CONFIG_T::n_units); i++) { + z_r[i] = mat_mul_x_w[i] + mat_mul_h_wr[i]; + } + + // Activation on z(t) and r(t) + hls_register typename CONFIG_T::accum_t z_r_act[2 * CONFIG_T::n_units]; + CONFIG_T::template activation_recr::activation(z_r, z_r_act); + + // A matrix containing the values of Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h + hls_register typename CONFIG_T::accum_t hadamard_r_h[CONFIG_T::n_units]; + #pragma unroll recurrent_unroll_factor + for (int i = 0; i < (CONFIG_T::n_units); i++) { + hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units]; + } + + // The candidate state; X * W_{hx} + hadamard(r(t), h_(t-1)) * W_{hh} + b_{h} + typename CONFIG_T::accum_t h_cand[CONFIG_T::n_units]; + // Addition - can unroll fully; no DSPs used here + #pragma unroll + for (int i = 0; i < (CONFIG_T::n_units); i++) { + h_cand[i] = mat_mul_x_w[i + 2 * CONFIG_T::n_units] + hadamard_r_h[i]; + } + + // Activation on candidate state + hls_register typename CONFIG_T::accum_t h_cand_act[CONFIG_T::n_units]; + CONFIG_T::template activation::activation(h_cand, h_cand_act); + + // Update state + #pragma unroll recurrent_unroll_factor + for (int i = 0; i < (CONFIG_T::n_units); i++) { + h[i] = static_cast(h_cand_act[i] * (1 - z_r_act[i]) + h[i] * z_r_act[i]); + } +} + +template +void gru(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T::n_units], + const typename CONFIG_T::weight_t weights[3 * CONFIG_T::n_units * CONFIG_T::n_in], + const typename CONFIG_T::weight_t recurrent_weights[3 * CONFIG_T::n_units * CONFIG_T::n_units], + const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], + const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { + + hls_register data_T x[CONFIG_T::n_in]; + hls_register res_T h[CONFIG_T::n_units]; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_units; i++) { + h[i] = 0; + } + + // Loop dependency - cannot pipeline + #pragma disable_loop_pipelining + for (int t = 0; t < CONFIG_T::n_timesteps; t++) { + // Get data at current time step + #pragma unroll + for (int j = 0; j < CONFIG_T::n_in; j++) { + x[j] = data[j + t * CONFIG_T::n_in]; + } + + nnet::gru_cell(x, h, weights, recurrent_weights, bias, recurrent_bias); + + if (CONFIG_T::return_sequences) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_units; i++) { + res[CONFIG_T::n_units * t + i] = h[i]; + } + } + } + + if (!CONFIG_T::return_sequences) { + #pragma unroll + for (int i = 0; i < (CONFIG_T::n_units); i++) { + res[i] = h[i]; + } + } +} + +//---------------------- +// SimpleRNN +//---------------------- + +struct simpleRNN_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_outputs = 1; + static const unsigned n_timesteps = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = nnet::activation::relu; +
template using activation = nnet::activation::relu; +}; + +template +void simple_rnn_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T::n_out], + res_T hidden_state_o[CONFIG_T::n_out], + const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { + // Weight multiplication + typename CONFIG_T::accum_t afterW[CONFIG_T::n_out] hls_register; + multiply_W( + inputs, afterW, kernel); + + // Bias addition + typename CONFIG_T::accum_t afterBias[CONFIG_T::n_out] hls_register; + add_bias( + afterW, afterBias, bias); + + // Hidden state + typename CONFIG_T::accum_t hiddenCand[CONFIG_T::n_out] hls_register; + multiply_U(hidden_state, hiddenCand, + rec_kernel); + + // Vector addition + typename CONFIG_T::accum_t afterAdd[CONFIG_T::n_out]; + add_vectors(afterBias, hiddenCand, afterAdd); + + // Activation + CONFIG_T::template activation::activation( + afterAdd, hidden_state_o); +} + +template +void simple_rnn(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T::n_out], + const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; + res_T hidden_state_temp[CONFIG_T::n_out] hls_register; + res_T h[CONFIG_T::n_out] hls_register; + data_T in[CONFIG_T::n_in] hls_register; + +// Set initially hidden state (output) to zero +INIT_LOOP: + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][0] = 0; + } + + #pragma disable_loop_pipelining + for (int i = 0; i < CONFIG_T::n_timesteps; i++) { + + // Data at current time step + #pragma unroll + for (int x = 0; x < CONFIG_T::n_in; x++) { + in[x] = data[x + i * CONFIG_T::n_in]; + } + + // Hidden state at current time step + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state_temp[x] = hidden_state[x][i]; + } + + // Do SimpleRNN + simple_rnn_cell(in, hidden_state_temp, h, kernel, rec_kernel, bias); + + // Write result + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][i + 1] = h[x]; + } + } + + if (CONFIG_T::return_sequences == 0) { + // Output when return_sequences is false + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + res[x] = hidden_state[x][CONFIG_T::n_timesteps]; + } + } else { + // Output when return_sequences is true + #pragma unroll + for (int x = 0; x < CONFIG_T::n_timesteps; x++) { + #pragma unroll + for (int h = 0; h < CONFIG_T::n_out; h++) { + res[x * CONFIG_T::n_out + h] = hidden_state[h][x + 1]; + } + } + } +} + +//---------------------- +// LSTM +//---------------------- + +struct lstm_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_outputs = 1; + + static const unsigned n_timesteps = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = nnet::activation::relu; + + template using activation = nnet::activation::relu; +}; + 
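// --- Illustrative note (editorial, not part of the patch) ----------------------------------
// The gru_config / simpleRNN_config / lstm_config structs above only provide defaults: for
// each recurrent layer the backend emits a concrete specialization in parameters.h and passes
// it as CONFIG_T to nnet::gru / nnet::simple_rnn / nnet::lstm. Below is a minimal hand-written
// sketch of what such a specialization could look like. The struct name, the ac_fixed widths,
// the layer sizes and the exact template-parameter lists of the activation aliases are
// assumptions for illustration only (the angle-bracketed parameter lists are elided in the
// patch text); the sigmoid/tanh wrappers used here are the ones defined in
// nnet_recurrent_activation.h later in this patch.

struct lstm_config_1 : nnet::lstm_config {
    // Fixed-point types (widths are assumptions)
    typedef ac_fixed<16, 6, true> weight_t;
    typedef ac_fixed<16, 6, true> bias_t;
    typedef ac_fixed<18, 8, true> accum_t;

    // Layer sizes (example values)
    static const unsigned n_in = 4;        // features per timestep
    static const unsigned n_out = 8;       // hidden units
    static const unsigned n_timesteps = 10;
    static const unsigned n_outputs = 1;   // would equal n_timesteps if return_sequences were true
    static const bool return_sequences = false;

    static const unsigned reuse_factor = 1;

    // Recurrent (gate) activation and output activation, using the wrappers from
    // nnet_recurrent_activation.h; the three-parameter form is an assumption.
    template <class x_T, class y_T, class config_T>
    using activation_recr = nnet::activation::sigmoid<x_T, y_T, config_T>;
    template <class x_T, class y_T, class config_T>
    using activation = nnet::activation::tanh<x_T, y_T, config_T>;
};

// The generated network body would then call, roughly (type and argument names are placeholders):
//   nnet::lstm<input_t, result_t, lstm_config_1>(input, output, w_i, w_f, w_c, w_o,
//                                                rw_i, rw_f, rw_c, rw_o, b_i, b_f, b_c, b_o);
// --------------------------------------------------------------------------------------------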
+template +void lstm_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T::n_out], res_T hidden_state_o[CONFIG_T::n_out], + res_T cell_state[CONFIG_T::n_out], res_T cell_state_o[CONFIG_T::n_out], + const typename CONFIG_T::weight_t WI[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WF[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WC[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WO[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWI[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWF[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWC[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWO[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t BI[CONFIG_T::n_out], const typename CONFIG_T::bias_t BF[CONFIG_T::n_out], + const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { + + // Internals definitions + typename CONFIG_T::accum_t i_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_afterBias[CONFIG_T::n_out] hls_register; + + // Hidden state Gate candidates, intermediate variables + typename CONFIG_T::accum_t i_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_hiddenCand[CONFIG_T::n_out] hls_register; + + // After addition, intermediate variables + typename CONFIG_T::accum_t i_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_afterAdd[CONFIG_T::n_out] hls_register; + + // Gate outputs + typename CONFIG_T::accum_t gate_i[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_f[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_c[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_o[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_ic[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_forget[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t h[CONFIG_T::n_out] hls_register; + + // Intermediate variable cell calculation + typename CONFIG_T::accum_t cell_act_multp[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t cell_act_add[CONFIG_T::n_out] hls_register; + + //-----------Gate I Calculations + // Weight multiplication + multiply_W( + inputs, i_afterW, WI); + + // Bias addition + add_bias( + i_afterW, i_afterBias, BI); + + // Hidden Candidate + multiply_U(hidden_state, i_hiddenCand, + RWI); + + // Vector addition + add_vectors(i_afterBias, i_hiddenCand, + i_afterAdd); + + // Activation + CONFIG_T::template activation_recr::activation(i_afterAdd, gate_i); + + //-----------Gate F Calculations + // Weight multiplication + multiply_W( + inputs, f_afterW, WF); + + // Bias addition + add_bias( + f_afterW, f_afterBias, 
BF); + + // Hidden Candidate + multiply_U(hidden_state, f_hiddenCand, + RWF); + + // Vector addition + add_vectors(f_afterBias, f_hiddenCand, + f_afterAdd); + + // Activation + CONFIG_T::template activation_recr::activation(f_afterAdd, gate_f); + + //-----------Gate C Calculations + // Weight multiplication + multiply_W( + inputs, c_afterW, WC); + + // Bias addition + add_bias( + c_afterW, c_afterBias, BC); + + // Hidden Candidate + multiply_U(hidden_state, c_hiddenCand, + RWC); + + // Vector addition + add_vectors(c_afterBias, c_hiddenCand, + c_afterAdd); + + // Activation + CONFIG_T::template activation::activation(c_afterAdd, gate_c); + + //-----------gate I and C multiply + // Vector multiplication + multiply_vectors(gate_i, gate_c, gate_ic); + + //-----------Gate O Calculations + // Weight multiplication + multiply_W( + inputs, o_afterW, WO); + + // Bias addition + add_bias( + o_afterW, o_afterBias, BO); + + // Hidden Candidate + multiply_U(hidden_state, o_hiddenCand, + RWO); + + // Vector addition + add_vectors(o_afterBias, o_hiddenCand, + o_afterAdd); + + // Activation + CONFIG_T::template activation_recr::activation(o_afterAdd, gate_o); + + //-----------Cell State Calculation + // Vector multiplication + multiply_vectors(gate_f, cell_state, cell_act_multp); + + // Vector addition + add_vectors(gate_ic, cell_act_multp, + cell_act_add); + + //-----------Forget gate Calculation + // Activation + CONFIG_T::template activation::activation(cell_act_add, gate_forget); + + // Vector multiplication + multiply_vectors(gate_o, gate_forget, h); + +OUTPUT_WRITE_LOOP: + #pragma unroll + for (int x = (CONFIG_T::n_out - 1); x >= 0; x--) { + hidden_state_o[x] = h[x]; + cell_state_o[x] = cell_act_add[x]; + } +} + +template +void lstm(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WI[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WF[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WC[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WO[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWI[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWF[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWC[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWO[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t BI[CONFIG_T::n_out], const typename CONFIG_T::bias_t BF[CONFIG_T::n_out], + const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; + res_T hidden_state_temp[CONFIG_T::n_out] hls_register; + res_T cell_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; + res_T cell_state_temp[CONFIG_T::n_out] hls_register; + res_T h[CONFIG_T::n_out] hls_register; + res_T c[CONFIG_T::n_out] hls_register; + data_T in[CONFIG_T::n_in] hls_register; + +// Set initially hidden state (output) to zero +INIT_LOOP: + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][0] = 0; + cell_state[x][0] = 0; + } + + // Input dimension + #pragma disable_loop_pipelining + for (int i = 0; i < CONFIG_T::n_timesteps; i++) { + // Data at current time step + for (int x = 0; x < CONFIG_T::n_in; x++) { + in[x] = data[x + i * CONFIG_T::n_in]; + } + + // Hidden state at current time step + #pragma unroll + for (int x = 0; x < 
CONFIG_T::n_out; x++) { + hidden_state_temp[x] = hidden_state[x][i]; + cell_state_temp[x] = cell_state[x][i]; + } + + // Do LSTM + lstm_cell(in, hidden_state_temp, h, cell_state_temp, c, WI, WF, WC, WO, RWI, RWF, RWC, RWO, + BI, BF, BC, BO); + + // Write result + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][i + 1] = h[x]; + cell_state[x][i + 1] = c[x]; + } + } + + if (CONFIG_T::return_sequences == 0) { + // Output when return_sequences is false + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + res[x] = hidden_state[x][CONFIG_T::n_timesteps]; + } + } else { + // Output when return_sequences is true + #pragma unroll + for (int x = 0; x < CONFIG_T::n_timesteps; x++) { + for (int h = 0; h < CONFIG_T::n_out; h++) { + res[x * CONFIG_T::n_out + h] = hidden_state[h][x + 1]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h new file mode 100644 index 000000000..e5896e6da --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h @@ -0,0 +1,53 @@ +#ifndef NNET_RECR_ACTIVATION_H_ +#define NNET_RECR_ACTIVATION_H_ + +#include "nnet_activation.h" +#include "nnet_common.h" + +namespace nnet { + +namespace activation { + +template class Activation { + public: + // ************************************************* + // Blank Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} +}; + +template class relu : public Activation { + public: + // ************************************************* + // Relu Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::relu(data, res); + } +}; + +template class sigmoid : public Activation { + public: + // ************************************************* + // Sigmoid Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::sigmoid(data, res); + } +}; + +template class tanh : public Activation { + public: + // ************************************************* + // TanH Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::dense_tanh(data, res); + } +}; + +} // namespace activation + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h new file mode 100644 index 000000000..a8e3ffe85 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h @@ -0,0 +1,38 @@ +#ifndef NNET_IMAGE_H_ +#define NNET_IMAGE_H_ + +namespace nnet { + +struct resize_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; +}; + +template +void resize_nearest(data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], + data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan]) { + int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1; + int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1; + + for (int i = 0; i < 
CONFIG_T::new_height; i++) { + for (int j = 0; j < CONFIG_T::new_width; j++) { + int x = ((j * x_ratio) >> 16); + int y = ((i * y_ratio) >> 16); + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] = + image[(y * CONFIG_T::width * CONFIG_T::n_chan) + x * CONFIG_T::n_chan + k]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h new file mode 100644 index 000000000..05fd5fe76 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h @@ -0,0 +1,50 @@ +#ifndef NNET_TRANSPOSE_H_ +#define NNET_TRANSPOSE_H_ + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T res[CONFIG_T::height * CONFIG_T::width]) { + for (int i = 0; i < CONFIG_T::height; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::width; j++) { + res[j * CONFIG_T::height + i] = static_cast(data[i * CONFIG_T::width + j]); + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T res[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + static constexpr unsigned dim_data[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + static constexpr unsigned dim_res[3] = {dim_data[CONFIG_T::perm[0]], dim_data[CONFIG_T::perm[1]], + dim_data[CONFIG_T::perm[2]]}; + + int index_data[3] = {0}, index_res[3] = {0}; + + for (index_data[0] = 0; index_data[0] < dim_data[0]; index_data[0]++) { + #pragma unroll + for (index_data[1] = 0; index_data[1] < dim_data[1]; index_data[1]++) { + #pragma unroll + for (index_data[2] = 0; index_data[2] < dim_data[2]; index_data[2]++) { + index_res[0] = index_data[CONFIG_T::perm[0]]; + index_res[1] = index_data[CONFIG_T::perm[1]]; + index_res[2] = index_data[CONFIG_T::perm[2]]; + + res[index_res[0] * dim_res[1] * dim_res[2] + index_res[1] * dim_res[2] + index_res[2]] = static_cast( + data[index_data[0] * dim_data[1] * dim_data[2] + index_data[1] * dim_data[2] + index_data[2]]); + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h new file mode 100644 index 000000000..221055938 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -0,0 +1,44 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include + +namespace nnet { + +/* + * HLS Shift Register Implementation + * To verify a shift register is used in hardware, go to report.html > Area Analysis of System + * Unrolling the shift loop minimizes resource usage and latency at the same time + * The shift loop should be either fully unrolled or not unrolled at all + * Unrolling with a specific unroll factor or pipelining with certain ii's, can cause an irregular access pattern, which + * wouldn't allow shift register usage in RTL + */ +template struct shift_reg { + private: + T data[N]; + + public: + // Default constructor + shift_reg() {} + + // Shift queue, insert new element and return element from the front + T shift(T inp) { + T out = data[N - 1]; + + #pragma unroll + for (int i = N - 1; i > 0; i--) { + data[i] = data[i - 1]; + } + data[0] = 
inp; + + return out; + } + + T read(int pos) { return data[pos]; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/parameters.h b/hls4ml/templates/oneapi/firmware/parameters.h new file mode 100644 index 000000000..e23ca9770 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/parameters.h @@ -0,0 +1,11 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include "defines.h" + +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// hls-fpga-machine-learning insert layer-config + +#endif diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp new file mode 100644 index 000000000..6c7ae68fb --- /dev/null +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -0,0 +1,167 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/parameters.h" + +#include +#include + +#include "exception_handler.hpp" +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + + +int main(int argc, char **argv) { + +#if FPGA_SIMULATOR + auto selector = sycl::ext::intel::fpga_simulator_selector_v; +#elif FPGA_HARDWARE + auto selector = sycl::ext::intel::fpga_selector_v; +#else // #if FPGA_EMULATOR + auto selector = sycl::ext::intel::fpga_emulator_selector_v; +#endif + + sycl::queue q(selector, fpga_tools::exception_handler, + sycl::property::queue::enable_profiling{}); + + auto device = q.get_device(); + + // make sure the device supports USM host allocations + if (!device.has(sycl::aspect::usm_host_allocations)) { + std::cerr << "This design must either target a board that supports USM " + "Host/Shared allocations, or IP Component Authoring. " + << std::endl; + std::terminate(); + } + + std::cout << "Running on device: " + << device.get_info().c_str() + << std::endl; + + + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + + std::string RESULTS_LOG = "tb_data/results.log"; + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + + std::vector inputs; + std::vector outputs; + + if (fin.is_open() && fpr.is_open()) { + std::vector> predictions; + unsigned int num_iterations = 0; + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::stringstream ssin(iline); + while (ssin >> current) { + in.push_back(current); + } + + std::stringstream sspred(pline); + while (sspred >> current) { + pr.push_back(current); + } + if (in.size() != N_INPUT_1_1) { + throw std::runtime_error("The input size does not match"); + } + if (pr.size() != N_LAYER_11) { + throw std::runtime_error("The output size does not match"); + } + + // hls-fpga-machine-learning insert data + inputs.emplace_back(); + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + outputs.emplace_back(); + predictions.push_back(std::move(pr)); + } + // Do this separately to avoid vector reallocation + // hls-fpga-machine-learning insert top-level-function + for(int i = 0; i < num_iterations; i++) { + InPipe::write(q, inputs[i]); + q.single_task(MyProject{}); // once or once for each + } + q.wait(); + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert tb-output + outputs[j] = 
OutPipe::read(q); + for(int i = 0; i < N_LAYER_11; i++) { + fout << outputs[j][i] << " "; + } + fout << std::endl; + if (j % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for(int i = 0; i < N_LAYER_11; i++) { + std::cout << predictions[j][i] << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + for(int i = 0; i < N_LAYER_11; i++) { + std::cout << outputs[j][i] << " "; + } + std::cout << std::endl; + } + } + fin.close(); + fpr.close(); + } else { + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; + // hls-fpga-machine-learning insert zero + for(int i = 0; i < num_iterations; i++) { + inputs.emplace_back(); + outputs.emplace_back(); + outputs.back().fill(0.0); + } + + // hls-fpga-machine-learning insert top-level-function + for(int i = 0; i < num_iterations; i++) { + InPipe::write(q, inputs[i]); + q.single_task(MyProject{}); + } + q.wait(); + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert output + outputs[j] = OutPipe::read(q); + for(int i = 0; i < N_LAYER_11; i++) { + std::cout << outputs[j][i] << " "; + } + std::cout << std::endl; + + // hls-fpga-machine-learning insert tb-output + for(int i = 0; i < N_LAYER_11; i++) { + fout << outputs[j][i] << " "; + } + fout << std::endl; + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} From cd0a2b8eb401e8b57f1b6d1e2c309804087af085 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 21 Dec 2023 14:47:34 -0600 Subject: [PATCH 002/100] fix reduce constexpr --- .../oneapi/firmware/nnet_utils/nnet_common.h | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h index 0c2e94e02..abefd87b8 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -38,30 +38,32 @@ template void merge(data_T data1[NIN1], data_ * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section * before applying and accumulate the result over the rolled dimension. * --- */ -// template T reduce(const T *x, Op op) { -// static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; -// static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; -// if (N == 1) { -// return x[0]; -// } -// if (N == 2) { -// return op(x[0], x[1]); -// } -// return op(reduce(x, op), reduce(x + leftN, op)); -// } - -// alternate reduce - basic template T reduce(const T *x, Op op) { - if (N == 1) { + static constexpr int leftN = pow2::val>::val > 0 ? + pow2::val>::val : + 0; + static constexpr int rightN = N - leftN > 0 ? 
N - leftN : 0; + if constexpr (N == 1) { return x[0]; } - auto val = op(x[0], x[1]); - for (int i = 2; i < N; i++) { - val = op(val, x[i]); + else if constexpr (N == 2) { + return op(x[0], x[1]); + } else { + return op(reduce(x, op), reduce(x + leftN, op)); } - return val; } +// alternate reduce - basic +// template T reduce(const T *x, Op op) { +// if (N == 1) { +// return x[0]; +// } +// auto val = op(x[0], x[1]); +// for (int i = 2; i < N; i++) { +// val = op(val, x[i]); +// } +// return val; +// } template class Op_add { public: From 3b3d40d8570b8914f5fa0a61ae87b71eb5b64a27 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 22 Dec 2023 18:36:13 -0600 Subject: [PATCH 003/100] further updates --- hls4ml/backends/oneapi/oneapi_types.py | 71 ++ .../backends/oneapi/passes/transform_types.py | 37 +- hls4ml/templates/oneapi/CMakeLists.txt | 4 +- hls4ml/templates/oneapi/firmware/defines.h | 1 + .../templates/oneapi/firmware/myproject.cpp | 3 +- hls4ml/templates/oneapi/firmware/myproject.h | 14 +- hls4ml/templates/oneapi/myproject_test.cpp | 3 + hls4ml/writer/__init__.py | 2 + hls4ml/writer/oneapi_writer.py | 967 ++++++++++++++++++ 9 files changed, 1071 insertions(+), 31 deletions(-) create mode 100644 hls4ml/backends/oneapi/oneapi_types.py create mode 100644 hls4ml/writer/oneapi_writer.py diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py new file mode 100644 index 000000000..d76449f1e --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -0,0 +1,71 @@ +''' +This package includes oneAPI-specific customizations to the variable types +''' +from hls4ml.backends.fpga.fpga_types import VariableDefinition, ArrayVariableConverter + +# region ArrayVarable + +class OneAPIArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return f'[[{self.pragma}]] std::array<{self.type.name}, {self.size_cpp()}> {self.name}{name_suffix}' + +class OneAPIInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + +class OneAPIArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) + +class OneAPIInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition + ) + +# endregion + +# region InterfaceMemberVariable + + +class OneAPIInterfaceVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return f'[[{self.pragma}]] {self.array_type} {self.name}{name_suffix}' + + def declare_cpp(self, pipe_min_size=0, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + (f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n') + return lines + + +class InterfaceVariableConverter: + def __init__(self, type_converter, prefix, definition_cls): + self.type_converter = type_converter + self.prefix = prefix + self.definition_cls = definition_cls + + def convert(self, tensor_var, pipe_name, pipe_id, array_type, pragma='partition'): + if isinstance(tensor_var, self.definition_cls): # Already converted + return 
tensor_var + + tensor_var.pragma = pragma + tensor_var.type = self.type_converter.convert(tensor_var.type) + + tensor_var.pipe_name = pipe_name + tensor_var.pipe_id = pipe_id + tensor_var.array_type = array_type + + tensor_var.__class__ = type(self.prefix + 'InterfaceMemberVariable', (type(tensor_var), self.definition_cls), {}) + return tensor_var + + +class OneAPIInterfaceVariableConverter(InterfaceVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition + ) + + +# endregion diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 67de32ab6..121392eda 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -1,25 +1,23 @@ from hls4ml.backends.fpga.fpga_types import ( ACTypeConverter, HLSTypeConverter, - QuartusArrayVariableConverter, - QuartusInplaceArrayVariableConverter, - QuartusInplaceStreamVariableConverter, - QuartusStreamVariableConverter, - QuartusStructMemberVariableConverter, StaticWeightVariableConverter, ) +from hls4ml.backends.oneapi.oneapi_types import ( + OneAPIArrayVariableConverter, + OneAPIInplaceArrayVariableConverter, + OneAPIInterfaceVariableConverter +) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable - +from hls4ml.utils.string_utils import convert_to_pascal_case class TransformTypes(GlobalOptimizerPass): def __init__(self): self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter()) - self.array_var_converter = QuartusArrayVariableConverter(type_converter=self.type_converter) - self.inplace_array_var_converter = QuartusInplaceArrayVariableConverter(type_converter=self.type_converter) - self.struct_var_converter = QuartusStructMemberVariableConverter(type_converter=self.type_converter) - self.stream_var_converter = QuartusStreamVariableConverter(type_converter=self.type_converter) - self.inplace_stream_var_converter = QuartusInplaceStreamVariableConverter(type_converter=self.type_converter) + self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) + self.interface_var_converter = OneAPIInterfaceVariableConverter(type_converter=self.type_converter) self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) def transform(self, model, node): @@ -27,19 +25,22 @@ def transform(self, model, node): for out_name, var in node.variables.items(): if io_type == 'io_stream': - if isinstance(var, InplaceTensorVariable): - new_var = self.inplace_stream_var_converter.convert(var) - else: - new_var = self.stream_var_converter.convert(var) + raise NotImplementedError("io_stream is not yet implemented for oneAPI") elif io_type == 'io_parallel': if out_name in node.model.inputs: - new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='inputs') + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t') elif out_name in node.model.outputs: - new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='outputs') + new_var = self.interface_var_converter.convert(var, 
pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t') elif isinstance(var, InplaceTensorVariable): new_var = self.inplace_array_var_converter.convert(var, pragma='') else: - new_var = self.array_var_converter.convert(var, pragma='hls_register') + new_var = self.array_var_converter.convert(var, pragma='intel::fpga_register') else: raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index a3a6e5c4a..66c505450 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -10,7 +10,7 @@ endif() cmake_minimum_required (VERSION 3.7.2) -project(fpga_template CXX) +project(myproject CXX) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) @@ -20,7 +20,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) ### Customize these build variables ############################################################################### set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp) -set(TARGET_NAME fpga_template) +set(TARGET_NAME myproject) # Use cmake -DFPGA_DEVICE=: to choose a # different device. Here are a few device examples (this list is not diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 622d9f2bf..04dc640a1 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -6,6 +6,7 @@ #include #include #include +#include // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp index 93f11c837..0dc79a21c 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.cpp +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -8,13 +8,12 @@ void MyProject::operator()() const { // NETWORK INSTANTIATION // **************************************** - auto inputsArr = InPipe::read(); +// hls-fpga-machine-learning read in // hls-fpga-machine-learning insert layers // hls-fpga-machine-learning return - OutPipe::write(outData); } diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index f01b5978c..52f457344 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -5,18 +5,14 @@ // This file defines the interface to the kernel - -using input_data_t = std::array; -using output_data_t = std::array; - -class InPipeID; -class OutPipeID; - +// currently this is fixed using PipeProps = decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::intel::experimental::ready_latency<0>)); -using InPipe = sycl::ext::intel::experimental::pipe; -using OutPipe = sycl::ext::intel::experimental::pipe; +// Need to declare the input and output pipes + +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs class MyProjectID; diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 6c7ae68fb..a9830245a 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -57,6 +57,9 @@ int main(int argc, char **argv) { std::string iline; std::string pline; + // hls-fpga-machine-learning 
insert inputs + // hls-fpga-machine-learning insert results + std::vector inputs; std::vector outputs; diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index f4eed945a..942964fc8 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,4 +1,5 @@ from hls4ml.writer.quartus_writer import QuartusWriter +from hls4ml.writer.oneapi_writer import OneAPIWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter @@ -9,4 +10,5 @@ register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) register_writer('Quartus', QuartusWriter) +register_writer('oneAPI', OneAPIWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py new file mode 100644 index 000000000..39a5f14c4 --- /dev/null +++ b/hls4ml/writer/oneapi_writer.py @@ -0,0 +1,967 @@ +import glob +import os +import tarfile +from collections import OrderedDict +from shutil import copyfile, copytree, rmtree + +import numpy as np +import yaml + +from hls4ml.backends import get_backend +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm, Dense +from hls4ml.utils.fixed_point_utils import FixedPointEmulator, ceil_log2, uint_to_binary +from hls4ml.utils.string_utils import convert_to_pascal_case +from hls4ml.writer.writers import Writer + +config_filename = 'hls4ml_config.yml' + + +class OneAPIWriter(Writer): + def next_pow2(self, x): + return 1 << (x - 1).bit_length() + + def __make_dat_file(self, original_path, project_path): + """ + Convert other input/output data types into a dat file, which is + a text file with the falttened matrix printed out. Note that ' ' is + assumed to be the delimiter. + """ + + # Take in data from current supported data files + if original_path[-3:] == "npy": + data = np.load(original_path) + else: + raise Exception("Unsupported input/output data files.") + + # Faltten data, just keep first dimension + data = data.reshape(data.shape[0], -1) + + def print_data(f): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + f.write(str(data[i][j]) + " ") + f.write("\n") + + # Print out in dat file + with open(project_path, "w") as f: + print_data(f) + + def get_max_reuse_factor(self, model): + max_rf = 0 + for layer in model.get_layers(): + rf = int(layer.get_attr('reuse_factor')) + if rf > max_rf: + max_rf = rf + return max_rf + + def print_array_to_cpp(self, var, layer, odir): + """Write a weights array to C++ header files. 
+ + Args: + var (WeightVariable): Weight to write + layer (Layer): Instance of the layer to which the weights belong + odir (str): Output directory + """ + with open(f"{odir}/firmware/weights/{var.name}.h", "w") as h_file: + + # meta data + h_file.write(f"//Numpy array shape {var.shape}\n") + h_file.write(f"//Min {np.min(var.min):.12f}\n") + h_file.write(f"//Max {np.max(var.max):.12f}\n") + h_file.write(f"//Number of zeros {var.nzeros}\n") + h_file.write("\n") + + h_file.write(f"#ifndef {var.name.upper()}_H_\n") + h_file.write(f"#define {var.name.upper()}_H_\n") + h_file.write("\n") + + rf = int(layer.get_attr('reuse_factor', 1)) + weight_header = '' + + weight_size = 0 + if isinstance(layer, (Conv2D, Conv2DBatchnorm)): + weight_size = ( + layer.get_attr('impl_filt_height') + * layer.get_attr('impl_filt_width') + * layer.get_attr('n_filt') + * layer.get_attr('n_chan') + ) + elif isinstance(layer, (Conv1D)): + weight_size = layer.get_attr('impl_filt_width') * layer.get_attr('n_filt') * layer.get_attr('n_chan') + elif isinstance(layer, (Dense)): + weight_size = layer.get_attr('n_in') * layer.get_attr('n_out') + + if rf == 1 or var.name[0] == 'b' or weight_size <= 2048 or (var.name[0] == 'w' and var.type.precision.width < 3): + pass # might want to modify this + else: + block_factor = (layer.get_attr('n_in') * layer.get_attr('n_out')) / rf + nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2) + var_width = int(np.ceil(var.type.precision.width / 8)) + bwidth = self.next_pow2(var_width) + weight_header += ( + f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + ) + if var.storage.lower() == 'bram': + weight_header += 'static ' + else: + weight_header += 'static const ' + h_file.write(weight_header + var.definition_cpp() + " = {") + + # fill c++ array. + # not including internal brackets for multidimensional case + sep = '' + for x in var: + h_file.write(sep + x) + sep = ", " + h_file.write("};\n") + h_file.write("\n#endif\n") + + + def write_project_dir(self, model): + """Write the base project directory + + Args: + model (ModelGraph): the hls4ml model. + """ + if not os.path.isdir(f"{model.config.get_output_dir()}/src/firmware/weights"): + os.makedirs(f"{model.config.get_output_dir()}/src/firmware/weights") + + def write_project_cpp(self, model): + """Write the main architecture source file (myproject.cpp) + + Args: + model (ModelGraph): the hls4ml model. """ + project_name = model.config.get_project_name() + + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w') as fout: + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + if len(model_brams) != 0: + raise NotImplementedError("Weights on the interface are currently not supported") + + io_type = model.config.get_config_value('IOType') + indent = ' ' + + for line in f.readlines(): + # Add headers to weights and biases + if 'myproject' in line: + newline = line.replace('myproject', project_name) + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + # Read in inputs + elif '// hls-fpga-machine-learning read in' in line: + newline = line + if io_type == 'io_parallel': + for inp in model_inputs: + newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' + else: + raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + + # Insert weights + elif '// hls-fpga-machine-learning insert weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w not in model_brams: + newline += f'#include "weights/{w.name}.h"\n' + + + # Neural net instantiation + elif '// hls-fpga-machine-learning insert layers' in line: + newline = line + '\n' + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + for layer in model.get_layers(): + if io_type != 'io_stream': + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + def_cpp = var.definition_cpp() + if def_cpp is not None: + newline += ' ' + def_cpp + ';\n' + func = layer.get_attr('function_cpp', None) + if func: + newline += ' ' + func + '\n' + if model.config.trace_output and layer.get_attr('trace', False): + newline += '#ifndef HLS_SYNTHESIS\n' + for var in vars: + newline += ' nnet::save_layer_output<{}>({}, "{}", {});\n'.format( + var.type.name, var.name, layer.name, var.size_cpp() + ) + newline += '#endif\n' + newline += '\n' + + # Write the output + elif '// hls-fpga-machine-learning return' in line: + newline = line + if io_type == 'io_parallel': + for out in model_outputs: + newline += indent + f'{out.pipe_name}::write({out.name});\n' + else: + raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + + # Just copy line + else: + newline = line + + fout.write(newline) + + + def write_project_header(self, model): + """Write the main architecture header file (myproject.h) + + Args: + model (ModelGraph): the hls4ml model.
+ """ + + project_name = model.config.get_project_name() + + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w') as fout: + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + # io_parallel and io_stream instantiate the top-level function differently + io_type = model.config.get_config_value('IOType') + indent = ' ' + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(project_name.upper())) + + elif 'myproject' in line: + newline = line.replace('myproject', project_name) + + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + # Declarations for the inputs. May need modification when io_stream is supported + elif '// hls-fpga-machine-learning insert inputs' in line: + newline = line + for inp in model_inputs: + newline += inp.declare_cpp() + + # and declareations for the outputs + elif '// hls-fpga-machine-learning insert outputs' in line: + newline = line + for out in model_outputs: + newline += out.declare_cpp() + + # Simply copy line, if no inserts are required + else: + newline = line + + fout.write(newline) + + def write_defines(self, model): + """Write the C++ type definitions file (defines.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w') as fout: + + for line in f.readlines(): + # Insert numbers + if '// hls-fpga-machine-learning insert numbers' in line: + newline = line + + defines_list = [] + for layer in model.get_layers(): + defines = '' + # Note: this assumes all the layers have one ouput + # (or in clones, one type of output) + for k, v in layer.get_output_variable().get_shape(): + defines += f'#define {k} {v}\n' + + defines_list.append(defines) + + newline += ''.join(defines_list) + + elif '// hls-fpga-machine-learning insert layer-precision' in line: + newline = line + all_precision = OrderedDict() + for layer in model.get_layers(): + layer_precision = layer.get_layer_precision() + for type_name, type_var in layer_precision.items(): + # Ensure that layer's types doesn't override existing types + # This can happen in case of InplaceVariable types + if type_name not in all_precision: + all_precision[type_name] = type_var + for used_type in all_precision.values(): + newline += used_type.definition_cpp() + + else: + newline = line + fout.write(newline) + + def write_parameters(self, model): + """Write the C++ layer config file (parameters.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w') as fout: + + for line in f.readlines(): + if '// hls-fpga-machine-learning insert includes' in line: + newline = line + for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + newline += '#include "%s"\n' % include + + elif "// hls-fpga-machine-learning insert layer-config" in line: + newline = line + for layer in model.get_layers(): + config = layer.get_attr('config_cpp', None) + if config: + newline += config + '\n' + else: + newline = line + fout.write(newline) + + def write_weights(self, model): + """Write the weights into header files + + Args: + model (ModelGraph): the hls4ml model. + """ + for layer in model.get_layers(): + for weights in layer.get_weights(): + self.print_array_to_cpp(weights, layer, model.config.get_output_dir()) + + def write_test_bench(self, model): + """Write the testbench + + Args: + model (ModelGraph): the hls4ml model. + """ + # TODO - This function only works with one model input + # (NOT one data point - it works as expected with multiple data points) + + # copy the exception handler + filedir = os.path.dirname(os.path.abspath(__file__)) + srcpath = os.path.join(filedir, '../templates/oneapi/exception_handler.hpp') + dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' + copyfile(srcpath, dstpath) + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + if len(model_brams != 0): + raise NotImplementedError("Weights on the interface is currently not supported") + + if not os.path.exists(f'{model.config.get_output_dir()}/tb_data/'): + os.mkdir(f'{model.config.get_output_dir()}/tb_data/') + + input_data = model.config.get_config_value('InputData') + output_predictions = model.config.get_config_value('OutputPredictions') + + if input_data: + if input_data[-3:] == "dat": + copyfile(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + else: + self.__make_dat_file(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + + if output_predictions: + if output_predictions[-3:] == "dat": + copyfile(output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat') + else: + self.__make_dat_file( + output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' + ) + + with open(os.path.join(filedir, '../templates/oneapi/myproject_test_parallel.cpp')) as f, \ + open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp', 'w') as fout: + + for line in f.readlines(): + indent = ' ' * (len(line) - len(line.lstrip(' '))) + + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert inputs': + newline = line + for inp in model_inputs: + newline += indent + f'std::vector<{inp.array_type}> {inp.name};\n' + newline += indent + f'input_counts.push_back({inp.size_cpp()});\n' + elif '// 
hls-fpga-machine-learning insert results': + newline = line + for out in model_outputs: + newline += indent + f'std::vector<{out.array_type}> {out.name};\n' + newline += indent + f'output_counts.push_back({out.size_cpp()});\n' + elif '// hls-fpga-machine-learning insert data' in line: + newline = line + newline += ' std::vector::const_iterator in_begin = in.cbegin();\n' + newline += ' std::vector::const_iterator in_end;\n' + newline += ' inputs.emplace_back();\n' + for inp in model.get_input_variables(): + newline += f' in_end = in_begin + ({inp.size_cpp()});\n' + newline += f' std::copy(in_begin, in_end, inputs.back().{inp.member_name});\n' + newline += ' in_begin = in_end;\n' + newline += ' outputs.emplace_back();\n' + elif '// hls-fpga-machine-learning insert zero' in line: + newline = line + newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' + for inp in model.get_input_variables(): + newline += indent + ' inputs.emplace_back();\n' + newline += indent + ' outputs.emplace_back();\n' + newline += indent + f' std::fill_n(inputs[i].{inp.member_name}, {inp.size_cpp()}, 0.0);\n' + newline += indent + '}\n' + + elif '// hls-fpga-machine-learning insert top-level-function' in line: + newline = line + newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' + newline += indent + f' ihc_hls_enqueue(&outputs[i], {model.config.get_project_name()}, inputs[i]' + if model_brams: + bram_vars = ','.join([b.name for b in model_brams]) + newline += f', {bram_vars});\n' + else: + newline += ');\n' + newline += indent + '}\n' + elif 'hls-fpga-machine-learning insert run' in line: + newline = line + newline += ' ' + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' + elif '// hls-fpga-machine-learning insert predictions' in line: + newline = line + newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' + newline += indent + ' std::cout << predictions[j][i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + elif '// hls-fpga-machine-learning insert tb-output' in line: + newline = line + newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' + newline += indent + f' fout << outputs[j].{outvar.member_name}[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'fout << std::endl;\n' + elif ( + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line + ): + newline = line + newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' + newline += indent + f' std::cout << outputs[j].{outvar.member_name}[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + else: + newline = line + + fout.write(newline) + + def write_test_bench(self, model): + """Write the testbench + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + # TODO - This function only works with one model input + # (NOT one data point - it works as expected with multiple data points) + + # copy the exception handler + filedir = os.path.dirname(os.path.abspath(__file__)) + srcpath = os.path.join(filedir, '../templates/oneapi/exception_handler.hpp') + dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' + copyfile(srcpath, dstpath) + + io_type = model.config.get_config_value('IOType') + if io_type == 'io_parallel': + self.write_testbench_parallel(model) + elif io_type == 'io_stream': + self.write_testbench_stream(model) + + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + + Args: + model (ModelGraph): the hls4ml model. + """ + pass + + def write_build_script(self, model): + """Write the build scripts (Makefile, build_lib.sh) + + Args: + model (ModelGraph): the hls4ml model. + """ + + # Makefile + filedir = os.path.dirname(os.path.abspath(__file__)) + device = model.config.get_config_value('Part') + with open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, \ + open(f'{model.config.get_output_dir()}/CMakeLists.txt', 'w') as fout: + + for line in f.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + + if 'set(FPGA_DEVICE' in line: + line = f' set(FPGA_DEVICE "{device}")' + + fout.write(line) + + + def write_nnet_utils(self, model): + """Copy the nnet_utils, AP types headers and any custom source to the project output directory + + Args: + model (ModelGraph): the hls4ml model. + """ + + # nnet_utils + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir, '../templates/oneapi/firmware/nnet_utils/') + dstpath = f'{model.config.get_output_dir()}/src/firmware/nnet_utils/' + + if not os.path.exists(dstpath): + os.mkdir(dstpath) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + for h in headers: + copyfile(srcpath + h, dstpath + h) + + + # custom source + filedir = os.path.dirname(os.path.abspath(__file__)) + + custom_source = get_backend('oneAPI').get_custom_source() + for dst, srcpath in custom_source.items(): + dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}' + copyfile(srcpath, dstpath) + + def __get_table_size(self, model, activation): + for layer in model.get_layers(): + if ( + layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation + ) and layer.get_attr('table_size') is not None: + return int(layer.get_attr('table_size')) + return 1024 + + def __get_table_header(self, table_name, table_size): + table_header += f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{' + return table_header + + def __write_elu_table(self, model, path): + table_name = 'elu_table' + table_size = self.__get_table_size(model, 'elu') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = -8.0 * i / float(table_size) + real_val = np.exp(in_val) - 1.0 + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_sigmoid_table(self, model, path): + MAX_VALUE = 8 + MIN_VALUE = 0 + + table_name = 'sigmoid_table' + table_size = self.__get_table_size(model, 'sigmoid') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(int(table_size)): + in_val = ( + i * (MAX_VALUE - MIN_VALUE) / 
float(table_size) + + (MAX_VALUE - MIN_VALUE) / (float(table_size) * 2) + + MIN_VALUE + ) + real_val = 1.0 / (1 + np.exp(-in_val)) + if real_val >= 0.5: + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_tanh_table(self, model, path): + MAX_VALUE = 4 + MIN_VALUE = 0 + + table_name = 'tanh_table' + table_size = self.__get_table_size(model, 'tanh') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = ( + i * (MAX_VALUE - MIN_VALUE) / float(table_size) + + (MAX_VALUE - MIN_VALUE) / (float(table_size) * 2) + + MIN_VALUE + ) + real_val = np.tanh(in_val) + if real_val >= 0: + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_softplus_table(self, model, path): + table_name = 'softplus_table' + table_size = self.__get_table_size(model, 'softplus') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) + real_val = np.log(np.exp(in_val) + 1.0) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_softsign_table(self, model, path): + MAX_VALUE = 8 + MIN_VALUE = 0 + table_name = 'softsign_table' + table_size = self.__get_table_size(model, 'softsign') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = ( + i * (MAX_VALUE - MIN_VALUE) / float(table_size) + + (MAX_VALUE - MIN_VALUE) / (float(table_size) * 2) + + MIN_VALUE + ) + + real_val = in_val / (np.fabs(in_val) + 1.0) + if real_val >= 0: + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_selu_table(self, model, path): + table_name = 'selu_table' + table_size = self.__get_table_size(model, 'selu') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = -8.0 * i / float(table_size) + real_val = 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (np.exp(in_val) - 1.0)) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_exp_table(self, model, path): + table_name = 'exp_table' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + if fp_signed is False: + raise Exception('Softmax types need to be signed') + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = 
FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + b = uint_to_binary(i, N) + if i == 0: + b.insert(0, 0) + else: + b.insert(0, 1) + f.set_msb_bits(b) + real_val = f.exp_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_invert_table(self, model, path): + table_name = 'invert_table' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + if fp_signed is False: + raise Exception('Softmax types need to be signed') + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + b = uint_to_binary(i, N) + b.insert(0, 0) + f.set_msb_bits(b) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_exp_table_latency(self, model, path): + table_name = 'exp_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.exp_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_invert_table_latency(self, model, path): + table_name = 'invert_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = 
layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_exp_table_legacy(self, model, path): + table_name = 'exp_table_legacy' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) + real_val = np.exp(in_val) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_invert_table_legacy(self, model, path): + table_name = 'invert_table_legacy' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + real_val = 0 + in_val = 64.0 * i / float(table_size) + if in_val > 0.0: + real_val = 1.0 / in_val + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def write_activation_tables(self, model): + """Write the lookup tables for activation functions + + Args: + model (ModelGraph): the hls4ml model. + """ + # Output path + dstpath = f'{model.config.get_output_dir()}/src/firmware/nnet_utils/activation_tables' + if not os.path.exists(dstpath): + os.mkdir(dstpath) + + # Tables + # TODO - Only write tables needed by model, not all of them + self.__write_elu_table(model, dstpath) + self.__write_sigmoid_table(model, dstpath) + self.__write_tanh_table(model, dstpath) + self.__write_softplus_table(model, dstpath) + self.__write_softsign_table(model, dstpath) + self.__write_selu_table(model, dstpath) + self.__write_exp_table(model, dstpath) + self.__write_invert_table(model, dstpath) + self.__write_exp_table_latency(model, dstpath) + self.__write_invert_table_latency(model, dstpath) + self.__write_exp_table_legacy(model, dstpath) + self.__write_invert_table_legacy(model, dstpath) + + def write_yml(self, model): + """Write the config to the YAML file + + Args: + model (ModelGraph): the hls4ml model. + """ + + def keras_model_representer(dumper, keras_model): + model_path = model.config.get_output_dir() + '/keras_model.h5' + keras_model.save(model_path) + return dumper.represent_scalar('!keras_model', model_path) + + try: + from tensorflow.keras import Model as KerasModel + + yaml.add_multi_representer(KerasModel, keras_model_representer) + except Exception: + pass + + with open(model.config.get_output_dir() + '/' + config_filename, 'w') as file: + yaml.dump(model.config.config, file) + + def write_tar(self, model): + """Write the generated project as a .tar.gz archive + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive: + archive.add(model.config.get_output_dir(), recursive=True) + + def write_hls(self, model): + print('Writing HLS project') + self.write_project_dir(model) + self.write_project_cpp(model) + self.write_project_header(model) + self.write_weights(model) + self.write_defines(model) + self.write_parameters(model) + self.write_test_bench(model) + self.write_bridge(model) + self.write_build_script(model) + self.write_nnet_utils(model) + self.write_activation_tables(model) + self.write_yml(model) + self.write_tar(model) + print('Done') From b7429015f285d8cd1cf51727331d3f4572646ffe Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 26 Dec 2023 17:26:08 -0600 Subject: [PATCH 004/100] update the bridge and testbench --- hls4ml/templates/oneapi/myproject_bridge.cpp | 74 ++++++++ hls4ml/templates/oneapi/myproject_test.cpp | 51 +++--- hls4ml/writer/oneapi_writer.py | 178 +++++++++++-------- 3 files changed, 199 insertions(+), 104 deletions(-) create mode 100644 hls4ml/templates/oneapi/myproject_bridge.cpp diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp new file mode 100644 index 000000000..4b7a6b170 --- /dev/null +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -0,0 +1,74 @@ +#ifndef MYPROJECT_BRIDGE_H_ +#define MYPROJECT_BRIDGE_H_ + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" +#include +#include + +#include "exception_handler.hpp" +/ +// hls-fpga-machine-learning insert bram + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + // hls-fpga-machine-learning insert trace_outputs +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void myproject_float( + // hls-fpga-machine-learning insert header #float +) { + auto selector = sycl::ext::intel::fpga_emulator_selector_v; + sycl::queue q(selector, fpga_tools::exception_handler, + sycl::property::queue::enable_profiling{}); + + // hls-fpga-machine-learning insert wrapper #float +} + +void myproject_double( + // hls-fpga-machine-learning insert header #double +) { + auto selector = sycl::ext::intel::fpga_emulator_selector_v; + sycl::queue q(selector, fpga_tools::exception_handler, + sycl::property::queue::enable_profiling{}); + // hls-fpga-machine-learning insert wrapper #double +} +} + +#endif diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index a9830245a..39b2e17c1 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -60,9 +60,6 @@ int main(int argc, 
char **argv) { // hls-fpga-machine-learning insert inputs // hls-fpga-machine-learning insert results - std::vector inputs; - std::vector outputs; - if (fin.is_open() && fpr.is_open()) { std::vector> predictions; unsigned int num_iterations = 0; @@ -84,45 +81,46 @@ int main(int argc, char **argv) { while (sspred >> current) { pr.push_back(current); } - if (in.size() != N_INPUT_1_1) { - throw std::runtime_error("The input size does not match"); - } - if (pr.size() != N_LAYER_11) { - throw std::runtime_error("The output size does not match"); - } // hls-fpga-machine-learning insert data inputs.emplace_back(); + if (in.size() != inputs[0].size()) { + throw std::runtime_error("The input size does not match"); + } + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + outputs.emplace_back(); - predictions.push_back(std::move(pr)); + if (pr.size() != outputs[0].size()) { + throw std::runtime_error("The output size does not match"); + } + std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); + } // Do this separately to avoid vector reallocation - // hls-fpga-machine-learning insert top-level-function for(int i = 0; i < num_iterations; i++) { - InPipe::write(q, inputs[i]); + // hls-fpga-machine-learning insert tb-input q.single_task(MyProject{}); // once or once for each } q.wait(); for (int j = 0; j < num_iterations; j++) { // hls-fpga-machine-learning insert tb-output - outputs[j] = OutPipe::read(q); - for(int i = 0; i < N_LAYER_11; i++) { - fout << outputs[j][i] << " "; + for(auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; if (j % CHECKPOINT == 0) { std::cout << "Predictions" << std::endl; // hls-fpga-machine-learning insert predictions - for(int i = 0; i < N_LAYER_11; i++) { - std::cout << predictions[j][i] << " "; + for(auto predval : predictions[j]) { + std::cout << predval << " "; } std::cout << std::endl; std::cout << "Quantized predictions" << std::endl; // hls-fpga-machine-learning insert quantized - for(int i = 0; i < N_LAYER_11; i++) { - std::cout << outputs[j][i] << " "; + for(auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; } @@ -137,27 +135,26 @@ int main(int argc, char **argv) { for(int i = 0; i < num_iterations; i++) { inputs.emplace_back(); outputs.emplace_back(); - outputs.back().fill(0.0); + inputs.back().fill(0.0); } // hls-fpga-machine-learning insert top-level-function for(int i = 0; i < num_iterations; i++) { - InPipe::write(q, inputs[i]); + // hls-fpga-machine-learning insert tb-input q.single_task(MyProject{}); } q.wait(); for (int j = 0; j < num_iterations; j++) { - // hls-fpga-machine-learning insert output - outputs[j] = OutPipe::read(q); - for(int i = 0; i < N_LAYER_11; i++) { - std::cout << outputs[j][i] << " "; + // hls-fpga-machine-learning insert tb-output + for(auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; // hls-fpga-machine-learning insert tb-output - for(int i = 0; i < N_LAYER_11; i++) { - fout << outputs[j][i] << " "; + for(auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; } diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 39a5f14c4..e9d45ff08 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -359,6 +359,7 @@ def write_test_bench(self, model): dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' copyfile(srcpath, dstpath) + project_name = model.config.get_project_name() model_inputs = model.get_input_variables() model_outputs = 
model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -366,6 +367,11 @@ def write_test_bench(self, model): if len(model_brams != 0): raise NotImplementedError("Weights on the interface is currently not supported") + if len(model_inputs) != 1 or len(model_outputs) != 1: + print("The testbench supports only single input arrays and single output arrays.") + print("Please modify it before using it.") + return + if not os.path.exists(f'{model.config.get_output_dir()}/tb_data/'): os.mkdir(f'{model.config.get_output_dir()}/tb_data/') @@ -387,13 +393,13 @@ def write_test_bench(self, model): ) with open(os.path.join(filedir, '../templates/oneapi/myproject_test_parallel.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp', 'w') as fout: + open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout: for line in f.readlines(): indent = ' ' * (len(line) - len(line.lstrip(' '))) if 'myproject' in line: - newline = line.replace('myproject', model.config.get_project_name()) + newline = line.replace('myproject', project_name) elif 'MyProject' in line: newline = line.replace('MyProject', convert_to_pascal_case(project_name)) @@ -403,100 +409,118 @@ def write_test_bench(self, model): newline += f'#include \"firmware/weights/{bram.name}.h\"\n' elif '// hls-fpga-machine-learning insert inputs': newline = line - for inp in model_inputs: - newline += indent + f'std::vector<{inp.array_type}> {inp.name};\n' - newline += indent + f'input_counts.push_back({inp.size_cpp()});\n' + # there should really be only one input + inp = model_inputs[0] + newline += indent + f'std::vector<{inp.array_type}> inputs;\n' + elif '// hls-fpga-machine-learning insert results': newline = line - for out in model_outputs: - newline += indent + f'std::vector<{out.array_type}> {out.name};\n' - newline += indent + f'output_counts.push_back({out.size_cpp()});\n' - elif '// hls-fpga-machine-learning insert data' in line: - newline = line - newline += ' std::vector::const_iterator in_begin = in.cbegin();\n' - newline += ' std::vector::const_iterator in_end;\n' - newline += ' inputs.emplace_back();\n' - for inp in model.get_input_variables(): - newline += f' in_end = in_begin + ({inp.size_cpp()});\n' - newline += f' std::copy(in_begin, in_end, inputs.back().{inp.member_name});\n' - newline += ' in_begin = in_end;\n' - newline += ' outputs.emplace_back();\n' - elif '// hls-fpga-machine-learning insert zero' in line: - newline = line - newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' - for inp in model.get_input_variables(): - newline += indent + ' inputs.emplace_back();\n' - newline += indent + ' outputs.emplace_back();\n' - newline += indent + f' std::fill_n(inputs[i].{inp.member_name}, {inp.size_cpp()}, 0.0);\n' - newline += indent + '}\n' - - elif '// hls-fpga-machine-learning insert top-level-function' in line: + # there should really be only one out + out = model_outputs[0] + newline += indent + f'std::vector<{out.array_type}> predictions;\n' + elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line - newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' - newline += indent + f' ihc_hls_enqueue(&outputs[i], {model.config.get_project_name()}, inputs[i]' - if model_brams: - bram_vars = ','.join([b.name for b in model_brams]) - newline += f', {bram_vars});\n' - else: - newline += ');\n' - newline += indent + '}\n' - elif 'hls-fpga-machine-learning insert 
run' in line: - newline = line - newline += ' ' + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' - elif '// hls-fpga-machine-learning insert predictions' in line: - newline = line - newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' - newline += indent + ' std::cout << predictions[j][i] << " ";\n' - newline += indent + '}\n' - newline += indent + 'std::cout << std::endl;\n' + inp = model_inputs[0] + newline += indent + f'{inp.pipe_name}::write(q, inputs[i]);\n' elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line - newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' - newline += indent + f' fout << outputs[j].{outvar.member_name}[i] << " ";\n' - newline += indent + '}\n' - newline += indent + 'fout << std::endl;\n' - elif ( - '// hls-fpga-machine-learning insert output' in line - or '// hls-fpga-machine-learning insert quantized' in line - ): - newline = line - newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' - newline += indent + f' std::cout << outputs[j].{outvar.member_name}[i] << " ";\n' - newline += indent + '}\n' - newline += indent + 'std::cout << std::endl;\n' + out = model_outputs[0] + newline += indent + f'outputs[i] = {out.pipe_name}::read(q);\n' else: newline = line fout.write(newline) - def write_test_bench(self, model): - """Write the testbench + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) Args: model (ModelGraph): the hls4ml model. """ - # TODO - This function only works with one model input - # (NOT one data point - it works as expected with multiple data points) + project_name = model.config.get_project_name() + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + # model brambs aren't actually supported yet + + io_type = model.config.get_config_value('IOType') + indent = ' ' - # copy the exception handler filedir = os.path.dirname(os.path.abspath(__file__)) - srcpath = os.path.join(filedir, '../templates/oneapi/exception_handler.hpp') - dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' - copyfile(srcpath, dstpath) + with open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, \ + open(f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w') as fout: - io_type = model.config.get_config_value('IOType') - if io_type == 'io_parallel': - self.write_testbench_parallel(model) - elif io_type == 'io_stream': - self.write_testbench_stream(model) + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(project_name.upper())) - def write_bridge(self, model): - """Write the Python-C++ bridge (myproject_bridge.cpp) + elif 'myproject' in line: + newline = line.replace('myproject', format(project_name)) + + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + 
newline += indent + outputs_str + '\n' + + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' + newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input);\n' + newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' + + newline += '\n' + + for o in model_outputs: + newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) + + newline += '\n' + + # input_vars = ','.join([i.name + '_input' for i in model_inputs]) + # bram_vars = ','.join([b.name for b in model_brams]) + # output_vars = ','.join([o.name + '_output' for o in model_outputs]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' + newline += top_level + + newline += '\n' + + for o in model_outputs: + newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' + newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp}>({o.name}_output, {o.name});\n' + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp') + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + else: + newline = line + fout.write(newline) - Args: - model (ModelGraph): the hls4ml model. 
- """ - pass def write_build_script(self, model): """Write the build scripts (Makefile, build_lib.sh) From 8f6ef788d9afd5507ecaeb122c3c4f280fe1c865 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 27 Dec 2023 19:06:18 -0600 Subject: [PATCH 005/100] fix issues discovered when compiling --- .../backends/oneapi/passes/core_templates.py | 8 +- .../firmware/nnet_utils/nnet_activation.h | 79 ++++++++++--------- .../oneapi/firmware/nnet_utils/nnet_dense.h | 21 ++--- hls4ml/writer/oneapi_writer.py | 26 +++--- 4 files changed, 68 insertions(+), 66 deletions(-) diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index aece9fc22..608e6b7ff 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -35,8 +35,8 @@ dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' -dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] - +# dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] +dense_include_list = ['nnet_utils/nnet_dense.h'] class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): @@ -147,8 +147,8 @@ def format(self, node): activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' -activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] - +# activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] +activ_include_list = ['nnet_utils/nnet_activation.h'] class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index d874741ec..191bf5613 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -2,6 +2,7 @@ #define NNET_ACTIVATION_H_ #include "nnet_common.h" +#include namespace nnet { @@ -23,7 +24,7 @@ struct activ_config { // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void linear(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -34,7 +35,7 @@ template void linear(data_T data[ // ************************************************* // RELU Activation // ************************************************* -template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void relu(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -46,7 +47,7 @@ template void relu(data_T data[CO } template -void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void relu_max(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -59,11 +60,11 @@ void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } } -template void relu6(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_in]) { +template void relu6(const std::array data, std::array res) { relu_max(data, res); } -template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void relu1(const std::array data, std::array res) { relu_max(data, res); } @@ -71,13 +72,13 @@ template void relu1(data_T data[C // Sigmoid Activation // ************************************************* template -void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void sigmoid(const std::array data, std::array res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T absoluteValue hls_register; - res_T temp2 hls_register; + [[intel::fpga_register]] data_T absoluteValue; + [[intel::fpga_register]] res_T temp2; if (data[ii] < 0) { absoluteValue = -data[ii]; } else { @@ -106,7 +107,7 @@ template inline unsigned softmax_stable_idx_fr static constexpr int N = ceillog2::val; // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width - N - 1); + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N - 1); // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness if (x != 0 && y == 0) y[0] = 1; @@ -118,29 +119,29 @@ template inline unsigned softmax_latency_idx_f static constexpr int N = ceillog2::val; // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width - N); + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N); return y.to_uint(); } template -void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_stable(const std::array data, std::array res) { // Look-up tables #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" // Find maximum Op_max op_max; - hls_register data_T x_max = reduce>(data, op_max); + [[intel::fpga_register]] data_T x_max = reduce>(data.data(), op_max); // For the diffs, use the same type as the input but force rounding and saturation - hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; + [[intel::fpga_register]] ac_fixed d_xi_xmax[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { d_xi_xmax[i] = data[i] - x_max; } // Calculate all the e^x's - hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; @@ -148,11 +149,11 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Explicitly sum previously calculated exponentials with an adder tree Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { @@ -162,12 +163,12 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // TODO - Improve accuracy template -void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_latency(const std::array data, std::array 
res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" // Calculate all the e^x's - hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; @@ -175,11 +176,11 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Explicitly sum the results with an adder tree. Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { @@ -188,11 +189,11 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_legacy(const std::array data, std::array res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" - hls_register int data_round[CONFIG_T::n_in]; + [[intel::fpga_register]] int data_round[CONFIG_T::n_in]; New_loop: #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { @@ -230,14 +231,14 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_argmax(const std::array data, std::array res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { res[i] = (res_T)0; } - hls_register data_T maximum = data[0]; - hls_register int idx = 0; + [[intel::fpga_register]] data_T maximum = data[0]; + [[intel::fpga_register]] int idx = 0; #pragma ii 1 for (int i = 1; i < CONFIG_T::n_in; i++) { @@ -251,7 +252,7 @@ void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +inline void softmax(const std::array data, std::array res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -275,15 +276,15 @@ inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // TanH Activation // ************************************************* template -void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void dense_tanh(const std::array data, std::array res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T temp hls_register; - res_T temp2 hls_register; + [[intel::fpga_register]] data_T temp; + [[intel::fpga_register]] res_T temp2; if (data[ii] < 0) { temp = -data[ii]; } else { @@ -305,7 +306,7 @@ void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Hard sigmoid Activation // ************************************************* template -void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void hard_sigmoid(const std::array data, std::array res) { #pragma 
unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -318,7 +319,7 @@ void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void hard_tanh(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -364,7 +365,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // Softplus Activation // ************************************************* template -void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softplus(const std::array data, std::array res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -384,7 +385,7 @@ void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Softsign Activation // ************************************************* template -void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softsign(const std::array data, std::array res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -392,8 +393,8 @@ void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T temp hls_register; - res_T temp2 hls_register; + [[intel::fpga_register]] data_T temp; + [[intel::fpga_register]] res_T temp2; if (data[ii] < 0) { temp = -data[ii]; } else { @@ -433,14 +434,14 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_i } } -template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void elu(const std::array data, std::array res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void selu(const std::array data, std::array res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data @@ -477,7 +478,7 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // Binary TanH Activation // ************************************************* template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void binary_tanh(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -495,7 +496,7 @@ void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Ternary TanH Activation // ************************************************* template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void ternary_tanh(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = 2 * data[ii]; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index c1786ef78..5071a7d6a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -5,6 +5,7 @@ #include "nnet_helpers.h" #include "nnet_mult.h" #include 
+#include namespace nnet { @@ -37,21 +38,21 @@ struct dense_config { }; template -void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], +void dense_rf_gt(const std::array data, std::array res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); //#pragma ii CONFIG_T::reuse_factor - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; Load: #pragma unroll for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; } - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + [[intel::fpga_register]] int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + [[intel::fpga_register]] int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; #pragma unroll for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { @@ -66,7 +67,7 @@ void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma nofusion #pragma speculated_iterations 0 for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; Product2: #pragma unroll for (int im = 0; im < CONFIG_T::block_factor; im++) { @@ -78,7 +79,7 @@ void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], tmp_acc[im] = CONFIG_T::template product::product(data[data_index], weights[w_index]); } - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; ResetMult: #pragma unroll for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { @@ -105,14 +106,14 @@ void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], } } template -void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], +void dense_rf_lt(const std::array data, std::array res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; InitAccum: #pragma unroll for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { @@ -122,7 +123,7 @@ void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma nofusion #pragma speculated_iterations 0 for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; MultLoop: #pragma unroll for (int im = 0, in_index = ir; im < 
CONFIG_T::block_factor; im++) { @@ -157,7 +158,7 @@ void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], } template void dense_resource( - const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const std::array data, std::array res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index e9d45ff08..bc0b13c46 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -62,7 +62,7 @@ def print_array_to_cpp(self, var, layer, odir): layer (Layer): Instance of the layer to which the weights belong odir (str): Output directory """ - with open(f"{odir}/firmware/weights/{var.name}.h", "w") as h_file: + with open(f"{odir}/src/firmware/weights/{var.name}.h", "w") as h_file: # meta data h_file.write(f"//Numpy array shape {var.shape}\n") @@ -142,7 +142,7 @@ def write_project_cpp(self, model): model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] - if len(model_brams != 0): + if len(model_brams) != 0: raise NotImplementedError("Weights on the interface is currently not supported") io_type = model.config.get_config_value('IOType') @@ -182,7 +182,7 @@ def write_project_cpp(self, model): if io_type != 'io_stream': vars = layer.get_variables() for var in vars: - if var not in model_inputs and var not in model_outputs: + if var not in model_inputs: def_cpp = var.definition_cpp() if def_cpp is not None: newline += ' ' + def_cpp + ';\n' @@ -364,7 +364,7 @@ def write_test_bench(self, model): model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] - if len(model_brams != 0): + if len(model_brams) != 0: raise NotImplementedError("Weights on the interface is currently not supported") if len(model_inputs) != 1 or len(model_outputs) != 1: @@ -392,11 +392,11 @@ def write_test_bench(self, model): output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' ) - with open(os.path.join(filedir, '../templates/oneapi/myproject_test_parallel.cpp')) as f, \ + with open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, \ open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout: for line in f.readlines(): - indent = ' ' * (len(line) - len(line.lstrip(' '))) + indent = ' ' * (len(line) - len(line.lstrip(' '))) if 'myproject' in line: newline = line.replace('myproject', project_name) @@ -407,17 +407,17 @@ def write_test_bench(self, model): newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' - elif '// hls-fpga-machine-learning insert inputs': + elif '// hls-fpga-machine-learning insert inputs' in line: newline = line # there should really be only one input inp = model_inputs[0] newline += indent + f'std::vector<{inp.array_type}> inputs;\n' - elif '// hls-fpga-machine-learning insert results': + elif '// hls-fpga-machine-learning insert results' in line: newline = line # there should really be only one out out = model_outputs[0] - newline += indent + f'std::vector<{out.array_type}> predictions;\n' + newline += indent + f'std::vector<{out.array_type}> outputs;\n' elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line inp = 
model_inputs[0] @@ -425,7 +425,7 @@ def write_test_bench(self, model): elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line out = model_outputs[0] - newline += indent + f'outputs[i] = {out.pipe_name}::read(q);\n' + newline += indent + f'outputs[j] = {out.pipe_name}::read(q);\n' else: newline = line @@ -494,7 +494,7 @@ def write_bridge(self, model): # output_vars = ','.join([o.name + '_output' for o in model_outputs]) # Concatenate the input, output, and bram variables. Filter out empty/null values - all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + # all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) top_level = indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' newline += top_level @@ -539,7 +539,7 @@ def write_build_script(self, model): line = line.replace('myproject', model.config.get_project_name()) if 'set(FPGA_DEVICE' in line: - line = f' set(FPGA_DEVICE "{device}")' + line = f' set(FPGA_DEVICE "{device}")\n' fout.write(line) @@ -583,7 +583,7 @@ def __get_table_size(self, model, activation): return 1024 def __get_table_header(self, table_name, table_size): - table_header += f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{' + table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{' return table_header def __write_elu_table(self, model, path): From 2e56be430b8427dd27f45c5ce84fd1b5a4298d92 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 8 Jan 2024 15:18:24 -0600 Subject: [PATCH 006/100] update bridge writing files --- hls4ml/templates/oneapi/CMakeLists.txt | 14 ++++++++++++++ hls4ml/templates/oneapi/myproject_bridge.cpp | 2 +- hls4ml/writer/oneapi_writer.py | 5 +++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index 66c505450..5e5490b51 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -20,7 +20,10 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) ### Customize these build variables ############################################################################### set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp) +set(LIBRARY_FILES src/firmware/myproject.cpp src/myproject_bridge.cpp) +set(LIB_STAMP mystamp) set(TARGET_NAME myproject) +set(LIBRARY_NAME myproject-${LIB_STAMP}) # Use cmake -DFPGA_DEVICE=: to choose a # different device. 
Here are a few device examples (this list is not @@ -62,6 +65,7 @@ set(SIMULATOR_TARGET fpga_sim) set(REPORT_TARGET report) set(FPGA_TARGET fpga) set(IP_EXPORT_TARGET fpga_ip_export) +set(LIBRARY_TARGET lib) # Set the names of the generated files per makefile target set(EMULATOR_OUTPUT_NAME ${TARGET_NAME}.${EMULATOR_TARGET}) @@ -112,6 +116,16 @@ set(FPGA_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reu set(IP_EXPORT_COMPILE_FLAGS -DFPGA_HARDWARE) set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early -fsycl-device-code-split=per_kernel) +############################################################################### +### FPGA Emulator library +############################################################################### +add_library(${LIBRARY_TARGET} SHARED ${LIBRARY_FILES}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${EMULATOR_LINK_FLAGS}) +set_target_properties(${LIBRARY_TARGET} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) + ############################################################################### ### FPGA Emulator ############################################################################### diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp index 4b7a6b170..f4974ad8b 100644 --- a/hls4ml/templates/oneapi/myproject_bridge.cpp +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -7,7 +7,7 @@ #include #include "exception_handler.hpp" -/ + // hls-fpga-machine-learning insert bram namespace nnet { diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index bc0b13c46..6ccac4459 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -479,7 +479,7 @@ def write_bridge(self, model): newline = '' for i in model_inputs: newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' - newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input);\n' + newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' newline += '\n' @@ -503,7 +503,7 @@ def write_bridge(self, model): for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' - newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp}>({o.name}_output, {o.name});\n' + newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): @@ -537,6 +537,7 @@ def write_build_script(self, model): for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) if 'set(FPGA_DEVICE' in line: line = f' set(FPGA_DEVICE "{device}")\n' From b90021fb94195ddf97e791368d3a880020bda94a Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 8 Jan 2024 18:37:07 -0600 Subject: [PATCH 007/100] build library (but not tested) --- hls4ml/backends/oneapi/oneapi_backend.py | 40 +++++++++++++++++------- hls4ml/templates/oneapi/CMakeLists.txt | 12 ++++--- 2 files 
changed, 35 insertions(+), 17 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index 799c28963..53f8d83b3 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -1,5 +1,5 @@ -import os -from contextlib import contextmanager +import subprocess +from pathlib import Path import numpy as np @@ -9,17 +9,8 @@ from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType -#from hls4ml.report import parse_oneapi_report - -@contextmanager -def chdir(newdir): - prevdir = os.getcwd() - os.chdir(os.path.expanduser(newdir)) - try: - yield - finally: - os.chdir(prevdir) +# from hls4ml.report import parse_oneapi_report class OneAPIBackend(FPGABackend): @@ -134,6 +125,31 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para return config + def compile(self, model): + """Compile the generated project that can be linked into Python runtime. + + Args: + model (ModelGraph): Model to compile. + + Raises: + Exception: If the project failed to compile + + Returns: + string: Returns the name of the compiled library. + """ + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run('make lib', shell=True, cwd=builddir, check=True) + + lib_name = builddir / f'lib{model.config.get_project_name()}-{model.config.get_config_value("Stamp")}.so' + return lib_name + def build(self, model, synth=True, fpgasynth=False, log_level=1, cont_if_large_area=False): """ Builds the project using Intel DPC++ (oneAPI) compiler. diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index 5e5490b51..abadd5395 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -31,8 +31,8 @@ set(LIBRARY_NAME myproject-${LIB_STAMP}) # intel_s10sx_pac:pac_s10 # intel_s10sx_pac:pac_s10_usm # intel_a10gx_pac:pac_a10 -# Note that depending on your installation, you may need to specify the full -# path to the board support package (BSP), this usually is in your install +# Note that depending on your installation, you may need to specify the full +# path to the board support package (BSP), this usually is in your install # folder. # # You can also specify a device family (E.g. "Arria10" or "Stratix10") or a @@ -42,7 +42,7 @@ if(NOT DEFINED FPGA_DEVICE) endif() # Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend -# compilation. +# compilation. set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) # Use cmake -DUSER_FLAGS= to set extra flags for general compilation. @@ -97,6 +97,8 @@ else() endif() set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) +# for debugging need to do this. 
Not sure why +# set(COMMON_LINK_FLAGS -v -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) # A SYCL ahead-of-time (AoT) compile processes the device code in two stages. @@ -210,7 +212,7 @@ function(getCompileCommands common_compile_flags special_compile_flags common_li # Get the relative path to the source and object files file(RELATIVE_PATH CURRENT_SOURCE_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${source}) file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) - + # Creating a string that contains the compile command # Start by the compiler invocation set(COMPILE_COMMAND "${COMPILE_COMMAND}${CMAKE_CXX_COMPILER}") @@ -254,7 +256,7 @@ function(getCompileCommands common_compile_flags special_compile_flags common_li # Add all the specific link flags foreach(FLAG ${special_link_flags}) set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") - endforeach() + endforeach() # Add the output file set(LINK_COMMAND "${LINK_COMMAND} -o ${output_name}") From f086aa2c6c142c79bf51845574e74040bf548dd7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 10 Jan 2024 17:50:19 -0600 Subject: [PATCH 008/100] fix a bug in testbench --- hls4ml/templates/oneapi/myproject_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 39b2e17c1..fce1c19db 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -152,7 +152,6 @@ int main(int argc, char **argv) { } std::cout << std::endl; - // hls-fpga-machine-learning insert tb-output for(auto outval : outputs[j]) { fout << outval << " "; } From 1f28cbf934af8bfe54010b5f96addcfd4816c0a6 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 10 Jan 2024 20:13:47 -0600 Subject: [PATCH 009/100] snapshot after some debugging --- hls4ml/model/graph.py | 4 +- hls4ml/templates/oneapi/CMakeLists.txt | 6 ++- .../firmware/nnet_utils/nnet_activation.h | 42 +++++++++---------- .../oneapi/firmware/nnet_utils/nnet_dense.h | 6 +-- .../oneapi/firmware/nnet_utils/nnet_printf.h | 18 ++++++++ hls4ml/writer/oneapi_writer.py | 23 +++------- 6 files changed, 55 insertions(+), 44 deletions(-) create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index a6b5c29e8..ba10e0285 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -740,7 +740,9 @@ def predict(self, x): n_outputs = len(self.get_output_variables()) curr_dir = os.getcwd() - os.chdir(self.config.get_output_dir() + '/firmware') + newdir = self.config.get_output_dir() + '/firmware' if os.path.exists(self.config.get_output_dir() + '/firmware') \ + else self.config.get_output_dir() + '/src/firmware' + os.chdir(newdir) output = [] if n_samples == 1 and n_inputs == 1: diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index abadd5395..d6b2a4745 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -107,7 +107,9 @@ set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) # 2. The "link" stage invokes the compiler's FPGA backend before linking. For # this reason, FPGA backend flags must be passed as link flags in CMake. 
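The emulator/hardware split described in the comment above carries through to the generated host code: the same FPGA_* definitions that these per-target compile flags set are what pick the SYCL device at run time, and the new 'lib' target is simply the emulator flavour packaged as a shared library for the Python bridge. A minimal host-side sketch, assembled from calls that appear elsewhere in this patch; it is not verbatim writer output, and the pipe and array names (Fc1InputPipe, Layer2OutPipe, fc1_input_array_t, layer2_out_array_t) are illustrative placeholders for whatever a real model generates:

    #include <sycl/ext/intel/fpga_extensions.hpp>
    #include "exception_handler.hpp"
    #include "firmware/myproject.h" // declares the kernel functor and the I/O pipes

    void run_once(const fc1_input_array_t &in, layer2_out_array_t &out) {
    #if FPGA_SIMULATOR
        auto selector = sycl::ext::intel::fpga_simulator_selector_v;
    #elif FPGA_HARDWARE
        auto selector = sycl::ext::intel::fpga_selector_v;
    #else // FPGA_EMULATOR, also what the 'lib' target is compiled with
        auto selector = sycl::ext::intel::fpga_emulator_selector_v;
    #endif
        sycl::queue q(selector, fpga_tools::exception_handler, sycl::property::queue::enable_profiling{});
        Fc1InputPipe::write(q, in);   // host-to-device pipe carries one input sample
        q.single_task(MyProject{});   // enqueue the kernel functor once per sample
        q.wait();
        out = Layer2OutPipe::read(q); // device-to-host pipe returns the result
    }

The same body, compiled with -DFPGA_EMULATOR into the shared library, is what the generated bridge exposes to Python, while the fpga and fpga_sim targets reuse it against the hardware and simulator selectors.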
set(EMULATOR_COMPILE_FLAGS -DFPGA_EMULATOR) +set(LIBRARY_COMPILE_FLAGS -DFPGA_EMULATOR) set(EMULATOR_LINK_FLAGS ) +set(LIBRARY_LINK_FLAGS -L$ENV{FPGA_VARS_DIR}/host/linux64/lib) set(REPORT_COMPILE_FLAGS -DFPGA_HARDWARE) set(REPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early) set(SIMULATOR_COMPILE_FLAGS -Xssimulation -DFPGA_SIMULATOR) @@ -123,9 +125,9 @@ set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} ############################################################################### add_library(${LIBRARY_TARGET} SHARED ${LIBRARY_FILES}) target_compile_options(${LIBRARY_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) -target_compile_options(${LIBRARY_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${LIBRARY_COMPILE_FLAGS}) target_link_libraries(${LIBRARY_TARGET} ${COMMON_LINK_FLAGS}) -target_link_libraries(${LIBRARY_TARGET} ${EMULATOR_LINK_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${LIBRARY_LINK_FLAGS}) set_target_properties(${LIBRARY_TARGET} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) ############################################################################### diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 191bf5613..19fbdb3b5 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -24,7 +24,7 @@ struct activ_config { // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template void linear(const std::array data, std::array res) { +template void linear(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -35,7 +35,7 @@ template void linear(const std::a // ************************************************* // RELU Activation // ************************************************* -template void relu(const std::array data, std::array res) { +template void relu(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -47,7 +47,7 @@ template void relu(const std::arr } template -void relu_max(const std::array data, std::array res) { +void relu_max(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -60,11 +60,11 @@ void relu_max(const std::array data, std::array void relu6(const std::array data, std::array res) { +template void relu6(const std::array& data, std::array& res) { relu_max(data, res); } -template void relu1(const std::array data, std::array res) { +template void relu1(const std::array& data, std::array& res) { relu_max(data, res); } @@ -72,7 +72,7 @@ template void relu1(const std::ar // Sigmoid Activation // ************************************************* template -void sigmoid(const std::array data, std::array res) { +void sigmoid(const std::array& data, std::array& res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll @@ -124,7 +124,7 @@ template inline unsigned softmax_latency_idx_f } template -void softmax_stable(const std::array data, std::array res) { +void softmax_stable(const std::array& data, std::array& res) { // Look-up tables #include "activation_tables/exp_table.tb" #include 
"activation_tables/invert_table.tb" @@ -163,7 +163,7 @@ void softmax_stable(const std::array data, std::array -void softmax_latency(const std::array data, std::array res) { +void softmax_latency(const std::array& data, std::array& res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" @@ -189,7 +189,7 @@ void softmax_latency(const std::array data, std::array -void softmax_legacy(const std::array data, std::array res) { +void softmax_legacy(const std::array& data, std::array& res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -231,7 +231,7 @@ void softmax_legacy(const std::array data, std::array -void softmax_argmax(const std::array data, std::array res) { +void softmax_argmax(const std::array& data, std::array& res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { res[i] = (res_T)0; @@ -252,7 +252,7 @@ void softmax_argmax(const std::array data, std::array -inline void softmax(const std::array data, std::array res) { +inline void softmax(const std::array& data, std::array& res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -276,7 +276,7 @@ inline void softmax(const std::array data, std::array -void dense_tanh(const std::array data, std::array res) { +void dense_tanh(const std::array& data, std::array& res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" @@ -306,7 +306,7 @@ void dense_tanh(const std::array data, std::array -void hard_sigmoid(const std::array data, std::array res) { +void hard_sigmoid(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -319,7 +319,7 @@ void hard_sigmoid(const std::array data, std::array -void hard_tanh(const std::array data, std::array res) { +void hard_tanh(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -365,7 +365,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // Softplus Activation // ************************************************* template -void softplus(const std::array data, std::array res) { +void softplus(const std::array& data, std::array& res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -385,7 +385,7 @@ void softplus(const std::array data, std::array -void softsign(const std::array data, std::array res) { +void softsign(const std::array& data, std::array& res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -416,7 +416,7 @@ void softsign(const std::array data, std::array -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +void elu(const std::array& data, const res_T alpha, std::array& res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data @@ -434,14 +434,14 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_i } } -template void elu(const std::array data, std::array res) { +template void elu(const std::array& data, std::array& res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // 
************************************************* -template void selu(const std::array data, std::array res) { +template void selu(const std::array& data, std::array& res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data @@ -478,7 +478,7 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // Binary TanH Activation // ************************************************* template -void binary_tanh(const std::array data, std::array res) { +void binary_tanh(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -496,7 +496,7 @@ void binary_tanh(const std::array data, std::array -void ternary_tanh(const std::array data, std::array res) { +void ternary_tanh(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = 2 * data[ii]; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index 5071a7d6a..f6bbfc04a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -38,7 +38,7 @@ struct dense_config { }; template -void dense_rf_gt(const std::array data, std::array res, +void dense_rf_gt(const std::array& data, std::array& res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -106,7 +106,7 @@ void dense_rf_gt(const std::array data, std::array -void dense_rf_lt(const std::array data, std::array res, +void dense_rf_lt(const std::array& data, std::array& res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -158,7 +158,7 @@ void dense_rf_lt(const std::array data, std::array void dense_resource( - const std::array data, std::array res, + const std::array& data, std::array& res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h new file mode 100644 index 000000000..830a322de --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h @@ -0,0 +1,18 @@ +#ifndef NNET_PRINTF_H_ +#define NNET_PRINTF_H_ + +#ifdef __SYCL_DEVICE_ONLY__ +#define CL_CONSTANT __attribute__((opencl_constant)) +#else +#define CL_CONSTANT +#endif + +using namespace sycl; + +#define PRINTF(format, ...) 
\ + { \ + static const CL_CONSTANT char _format[] = format; \ + ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \ + } + +#endif diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 6ccac4459..75555475c 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -160,7 +160,7 @@ def write_project_cpp(self, model): newline = line if io_type == 'io_parallel': for inp in model_inputs: - newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' + newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' else: raise NotImplementedError("Only io_parallel is currently supported with oneAPI") @@ -203,7 +203,7 @@ def write_project_cpp(self, model): newline = line if io_type == 'io_parallel': for out in model_outputs: - newline += indent + f'{out.pipe_name}::write({out.name});\n' + newline += indent + f'{out.pipe_name}::write({out.name});\n' else: raise NotImplementedError("Only io_parallel is currently supported with oneAPI") @@ -425,7 +425,7 @@ def write_test_bench(self, model): elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line out = model_outputs[0] - newline += indent + f'outputs[j] = {out.pipe_name}::read(q);\n' + newline += indent + f'outputs[j] = {out.pipe_name}::read(q);\n' else: newline = line @@ -484,25 +484,14 @@ def write_bridge(self, model): newline += '\n' - for o in model_outputs: - newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) - - newline += '\n' - - # input_vars = ','.join([i.name + '_input' for i in model_inputs]) - # bram_vars = ','.join([b.name for b in model_brams]) - # output_vars = ','.join([o.name + '_output' for o in model_outputs]) - - # Concatenate the input, output, and bram variables. Filter out empty/null values - # all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) - - top_level = indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' - newline += top_level + newline += indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' + newline += indent + 'q.wait();\n' newline += '\n' for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' + newline += indent + f'for (auto val : {o.name}_output) std::cout << val.to_double() << std::endl;\n' newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' From 3e69b9a7f2d25aaca4c513b4c3fa62a0c24990ca Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 10 Jan 2024 20:35:57 -0600 Subject: [PATCH 010/100] remove forgotten debug printing --- hls4ml/writer/oneapi_writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 75555475c..deff589ed 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -491,7 +491,6 @@ def write_bridge(self, model): for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' - newline += indent + f'for (auto val : {o.name}_output) std::cout << val.to_double() << std::endl;\n' newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' From 17e6856e58351863d04aad1b0d2e2e934146964c Mon Sep 17 00:00:00 2001 From: Jovan 
Mitrevski Date: Wed, 10 Jan 2024 20:58:20 -0600 Subject: [PATCH 011/100] add build --- hls4ml/backends/oneapi/oneapi_backend.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index 53f8d83b3..0f7ca7a77 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -150,22 +150,31 @@ def compile(self, model): lib_name = builddir / f'lib{model.config.get_project_name()}-{model.config.get_config_value("Stamp")}.so' return lib_name - def build(self, model, synth=True, fpgasynth=False, log_level=1, cont_if_large_area=False): + def build(self, model, build_type='fpga_emu', run=False): """ Builds the project using Intel DPC++ (oneAPI) compiler. Args: model (ModelGraph): The model to build - synth, optional: Whether to run HLS synthesis - fpgasynth, optional: Whether to run FPGA synthesis (oneAPI Compile) - log_level, optional: Logging level to be displayed during HLS synthesis (0, 1, 2) - cont_if_large_area: Instruct the HLS compiler to continue synthesis if the estimated resource usage exceeds - device resources + build_type, optional: What to build (e.g. fpga_emu, fpga_sim, fpga, report) + run, optional: Whether to run the testbench Errors raise exceptions """ # Check software needed is present - pass + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run(f'make {build_type}', shell=True, cwd=builddir, check=True) + + if run and build_type in ('fpga_emu', 'fpga_sim', 'fpga'): + executable = builddir / f'{model.config.get_project_name()}.{build_type}' + subprocess.run(f'{str(executable)}', shell=True, cwd=builddir, check=True) @layer_optimizer(Layer) def init_base_layer(self, layer): From 2766a6e5717a8f11d323345e0ffe157c078e431d Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 12 Jan 2024 14:50:40 -0600 Subject: [PATCH 012/100] pre-commit fixes --- hls4ml/backends/__init__.py | 2 +- hls4ml/backends/oneapi/oneapi_types.py | 23 ++++--- .../backends/oneapi/passes/core_templates.py | 2 + .../backends/oneapi/passes/transform_types.py | 31 +++++---- hls4ml/model/graph.py | 5 +- hls4ml/templates/oneapi/exception_handler.hpp | 15 ++--- hls4ml/templates/oneapi/firmware/defines.h | 6 +- .../templates/oneapi/firmware/myproject.cpp | 9 +-- hls4ml/templates/oneapi/firmware/myproject.h | 9 +-- .../firmware/nnet_utils/nnet_activation.h | 48 +++++++------ .../oneapi/firmware/nnet_utils/nnet_common.h | 10 +-- .../oneapi/firmware/nnet_utils/nnet_dense.h | 8 +-- .../oneapi/firmware/nnet_utils/nnet_helpers.h | 32 +++------ .../oneapi/firmware/nnet_utils/nnet_printf.h | 10 +-- hls4ml/templates/oneapi/myproject_bridge.cpp | 6 +- hls4ml/templates/oneapi/myproject_test.cpp | 50 ++++++-------- hls4ml/writer/__init__.py | 2 +- hls4ml/writer/oneapi_writer.py | 67 +++++++++---------- 18 files changed, 161 insertions(+), 174 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index cbd39813f..ac6dd73fe 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,9 +1,9 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, 
register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 +from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend -from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index d76449f1e..f28d697a4 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -1,27 +1,30 @@ ''' This package includes oneAPI-specific customizations to the variable types ''' -from hls4ml.backends.fpga.fpga_types import VariableDefinition, ArrayVariableConverter +from hls4ml.backends.fpga.fpga_types import ArrayVariableConverter, VariableDefinition # region ArrayVarable + class OneAPIArrayVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): return f'[[{self.pragma}]] std::array<{self.type.name}, {self.size_cpp()}> {self.name}{name_suffix}' + class OneAPIInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' + class OneAPIArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) + class OneAPIInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition - ) + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition) + # endregion @@ -31,12 +34,14 @@ def __init__(self, type_converter): class OneAPIInterfaceVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): return f'[[{self.pragma}]] {self.array_type} {self.name}{name_suffix}' - + def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' - lines += indent + (f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n') + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n' + ) return lines @@ -63,9 +68,7 @@ def convert(self, tensor_var, pipe_name, pipe_id, array_type, pragma='partition' class OneAPIInterfaceVariableConverter(InterfaceVariableConverter): def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition - ) + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition) # endregion diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index 608e6b7ff..929b5a8be 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ 
b/hls4ml/backends/oneapi/passes/core_templates.py @@ -38,6 +38,7 @@ # dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] dense_include_list = ['nnet_utils/nnet_dense.h'] + class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(Dense) @@ -150,6 +151,7 @@ def format(self, node): # activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] activ_include_list = ['nnet_utils/nnet_activation.h'] + class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__((Activation, ParametrizedActivation, PReLU)) diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 121392eda..2cfcd02c7 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -1,17 +1,14 @@ -from hls4ml.backends.fpga.fpga_types import ( - ACTypeConverter, - HLSTypeConverter, - StaticWeightVariableConverter, -) +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter from hls4ml.backends.oneapi.oneapi_types import ( OneAPIArrayVariableConverter, OneAPIInplaceArrayVariableConverter, - OneAPIInterfaceVariableConverter + OneAPIInterfaceVariableConverter, ) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable from hls4ml.utils.string_utils import convert_to_pascal_case + class TransformTypes(GlobalOptimizerPass): def __init__(self): self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter()) @@ -28,15 +25,21 @@ def transform(self, model, node): raise NotImplementedError("io_stream is not yet implemented for oneAPI") elif io_type == 'io_parallel': if out_name in node.model.inputs: - new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t') + new_var = self.interface_var_converter.convert( + var, + pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t', + ) elif out_name in node.model.outputs: - new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t') + new_var = self.interface_var_converter.convert( + var, + pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t', + ) elif isinstance(var, InplaceTensorVariable): new_var = self.inplace_array_var_converter.convert(var, pragma='') else: diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index ba10e0285..eb7c6f36e 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -740,8 +740,11 @@ def predict(self, x): n_outputs = len(self.get_output_variables()) curr_dir = os.getcwd() - newdir = self.config.get_output_dir() + '/firmware' if os.path.exists(self.config.get_output_dir() + '/firmware') \ + newdir = ( + self.config.get_output_dir() + '/firmware' + if os.path.exists(self.config.get_output_dir() + '/firmware') else self.config.get_output_dir() + '/src/firmware' + ) os.chdir(newdir) output = [] diff 
--git a/hls4ml/templates/oneapi/exception_handler.hpp b/hls4ml/templates/oneapi/exception_handler.hpp index f5b9c8433..bb7976f61 100644 --- a/hls4ml/templates/oneapi/exception_handler.hpp +++ b/hls4ml/templates/oneapi/exception_handler.hpp @@ -1,20 +1,19 @@ #ifndef __EXCEPTIONHANDLER_HPP__ #define __EXCEPTIONHANDLER_HPP__ -#include #include #include +#include namespace fpga_tools { void exception_handler(sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const &e) { - std::cout << "Caught asynchronous SYCL exception:\n" - << e.what() << std::endl; + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl; + } } - } } } // namespace fpga_tools diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 04dc640a1..7e6bb6b6e 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -1,12 +1,12 @@ #ifndef DEFINES_H_ #define DEFINES_H_ -#include -#include +#include #include #include +#include #include -#include +#include // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp index 0dc79a21c..38e18e6ac 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.cpp +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -8,12 +8,9 @@ void MyProject::operator()() const { // NETWORK INSTANTIATION // **************************************** -// hls-fpga-machine-learning read in + // hls-fpga-machine-learning read in -// hls-fpga-machine-learning insert layers - -// hls-fpga-machine-learning return + // hls-fpga-machine-learning insert layers + // hls-fpga-machine-learning return } - - diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index 52f457344..082ae5dc8 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -6,8 +6,7 @@ // This file defines the interface to the kernel // currently this is fixed -using PipeProps = decltype(sycl::ext::oneapi::experimental::properties( - sycl::ext::intel::experimental::ready_latency<0>)); +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); // Need to declare the input and output pipes @@ -20,13 +19,11 @@ struct MyProject { // kernel property method to config invocation interface auto get(sycl::ext::oneapi::experimental::properties_tag) { - return sycl::ext::oneapi::experimental::properties{ - sycl::ext::intel::experimental::streaming_interface<>, - sycl::ext::intel::experimental::pipelined<>}; + return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, + sycl::ext::intel::experimental::pipelined<>}; } SYCL_EXTERNAL void operator()() const; }; - #endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 19fbdb3b5..411d42b09 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -24,7 +24,8 @@ struct activ_config { // 
************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template void linear(const std::array& data, std::array& res) { +template +void linear(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -35,7 +36,8 @@ template void linear(const std::a // ************************************************* // RELU Activation // ************************************************* -template void relu(const std::array& data, std::array& res) { +template +void relu(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -47,7 +49,7 @@ template void relu(const std::arr } template -void relu_max(const std::array& data, std::array& res) { +void relu_max(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -60,11 +62,13 @@ void relu_max(const std::array& data, std::array void relu6(const std::array& data, std::array& res) { +template +void relu6(const std::array &data, std::array &res) { relu_max(data, res); } -template void relu1(const std::array& data, std::array& res) { +template +void relu1(const std::array &data, std::array &res) { relu_max(data, res); } @@ -72,7 +76,7 @@ template void relu1(const std::ar // Sigmoid Activation // ************************************************* template -void sigmoid(const std::array& data, std::array& res) { +void sigmoid(const std::array &data, std::array &res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll @@ -124,7 +128,7 @@ template inline unsigned softmax_latency_idx_f } template -void softmax_stable(const std::array& data, std::array& res) { +void softmax_stable(const std::array &data, std::array &res) { // Look-up tables #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" @@ -163,7 +167,7 @@ void softmax_stable(const std::array& data, std::array -void softmax_latency(const std::array& data, std::array& res) { +void softmax_latency(const std::array &data, std::array &res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" @@ -189,7 +193,7 @@ void softmax_latency(const std::array& data, std::array< } template -void softmax_legacy(const std::array& data, std::array& res) { +void softmax_legacy(const std::array &data, std::array &res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -231,7 +235,7 @@ void softmax_legacy(const std::array& data, std::array -void softmax_argmax(const std::array& data, std::array& res) { +void softmax_argmax(const std::array &data, std::array &res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { res[i] = (res_T)0; @@ -252,7 +256,7 @@ void softmax_argmax(const std::array& data, std::array -inline void softmax(const std::array& data, std::array& res) { +inline void softmax(const std::array &data, std::array &res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -276,7 +280,7 @@ inline void softmax(const std::array& data, std::array -void dense_tanh(const std::array& data, std::array& res) { +void dense_tanh(const std::array &data, std::array &res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" @@ -306,7 
+310,7 @@ void dense_tanh(const std::array& data, std::array -void hard_sigmoid(const std::array& data, std::array& res) { +void hard_sigmoid(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -319,7 +323,7 @@ void hard_sigmoid(const std::array& data, std::array -void hard_tanh(const std::array& data, std::array& res) { +void hard_tanh(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -365,7 +369,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // Softplus Activation // ************************************************* template -void softplus(const std::array& data, std::array& res) { +void softplus(const std::array &data, std::array &res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -385,7 +389,7 @@ void softplus(const std::array& data, std::array -void softsign(const std::array& data, std::array& res) { +void softsign(const std::array &data, std::array &res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -416,7 +420,7 @@ void softsign(const std::array& data, std::array -void elu(const std::array& data, const res_T alpha, std::array& res) { +void elu(const std::array &data, const res_T alpha, std::array &res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data @@ -434,14 +438,16 @@ void elu(const std::array& data, const res_T alpha, std: } } -template void elu(const std::array& data, std::array& res) { +template +void elu(const std::array &data, std::array &res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template void selu(const std::array& data, std::array& res) { +template +void selu(const std::array &data, std::array &res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data @@ -478,7 +484,7 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // Binary TanH Activation // ************************************************* template -void binary_tanh(const std::array& data, std::array& res) { +void binary_tanh(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -496,7 +502,7 @@ void binary_tanh(const std::array& data, std::array -void ternary_tanh(const std::array& data, std::array& res) { +void ternary_tanh(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = 2 * data[ii]; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h index abefd87b8..f37a61cb0 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -1,11 +1,10 @@ #ifndef NNET_COMMON_H_ #define NNET_COMMON_H_ - #include "nnet_helpers.h" -#include #include #include +#include typedef ac_fixed<16, 6> table_default_t; @@ -39,14 +38,11 @@ template void merge(data_T data1[NIN1], data_ * before applying and accumulate the result over the rolled 
dimension. * --- */ template T reduce(const T *x, Op op) { - static constexpr int leftN = pow2::val>::val > 0 ? - pow2::val>::val : - 0; + static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; if constexpr (N == 1) { return x[0]; - } - else if constexpr (N == 2) { + } else if constexpr (N == 2) { return op(x[0], x[1]); } else { return op(reduce(x, op), reduce(x + leftN, op)); diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index f6bbfc04a..d4a5ad895 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -4,8 +4,8 @@ #include "nnet_common.h" #include "nnet_helpers.h" #include "nnet_mult.h" -#include #include +#include namespace nnet { @@ -38,7 +38,7 @@ struct dense_config { }; template -void dense_rf_gt(const std::array& data, std::array& res, +void dense_rf_gt(const std::array &data, std::array &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -106,7 +106,7 @@ void dense_rf_gt(const std::array& data, std::array -void dense_rf_lt(const std::array& data, std::array& res, +void dense_rf_lt(const std::array &data, std::array &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -158,7 +158,7 @@ void dense_rf_lt(const std::array& data, std::array void dense_resource( - const std::array& data, std::array& res, + const std::array &data, std::array &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h index 888ea4a6f..284bbfd6f 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -2,13 +2,13 @@ #define NNET_HELPERS_H #include +#include +#include +#include #include #include #include -#include #include -#include -#include namespace nnet { @@ -30,48 +30,39 @@ extern size_t trace_type_size; // constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } // replace with template metaprogramming -template struct ceillog2 -{ +template struct ceillog2 { enum { val = 1 + ceillog2<((n + 1) / 2)>::val }; }; -template<> struct ceillog2<2> -{ +template <> struct ceillog2<2> { enum { val = 1 }; }; -template<> struct ceillog2<1> -{ +template <> struct ceillog2<1> { enum { val = 0 }; }; - // constexpr int floorlog2(int x) { return (x < 2) ? 
0 : 1 + floorlog2(x / 2); } // replace with template metaprogramming -template struct floorlog2 -{ +template struct floorlog2 { enum { val = 1 + floorlog2<(n / 2)>::val }; }; -template<> struct floorlog2<1> -{ +template <> struct floorlog2<1> { enum { val = 0 }; }; -template<> struct floorlog2<0> -{ +template <> struct floorlog2<0> { enum { val = 0 }; }; // constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } // replace with template metaprogramming -template struct pow2 -{ +template struct pow2 { enum { val = 2 * pow2<(n - 1)>::val }; }; -template<> struct pow2<0> -{ +template <> struct pow2<0> { enum { val = 1 }; }; @@ -81,7 +72,6 @@ template void save_output_array(data_T *data, save_ } } - // We don't want to include save_T in this function because it will be inserted into myproject.cpp // so a workaround with element size is used template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h index 830a322de..5fec90d1a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h @@ -9,10 +9,10 @@ using namespace sycl; -#define PRINTF(format, ...) \ - { \ - static const CL_CONSTANT char _format[] = format; \ - ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \ - } +#define PRINTF(format, ...) \ + { \ + static const CL_CONSTANT char _format[] = format; \ + ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \ + } #endif diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp index f4974ad8b..3beb224ea 100644 --- a/hls4ml/templates/oneapi/myproject_bridge.cpp +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -55,8 +55,7 @@ void myproject_float( // hls-fpga-machine-learning insert header #float ) { auto selector = sycl::ext::intel::fpga_emulator_selector_v; - sycl::queue q(selector, fpga_tools::exception_handler, - sycl::property::queue::enable_profiling{}); + sycl::queue q(selector, fpga_tools::exception_handler, sycl::property::queue::enable_profiling{}); // hls-fpga-machine-learning insert wrapper #float } @@ -65,8 +64,7 @@ void myproject_double( // hls-fpga-machine-learning insert header #double ) { auto selector = sycl::ext::intel::fpga_emulator_selector_v; - sycl::queue q(selector, fpga_tools::exception_handler, - sycl::property::queue::enable_profiling{}); + sycl::queue q(selector, fpga_tools::exception_handler, sycl::property::queue::enable_profiling{}); // hls-fpga-machine-learning insert wrapper #double } } diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index fce1c19db..c64fb6549 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -1,10 +1,10 @@ #include #include +#include #include #include #include #include -#include #include "firmware/myproject.h" #include "firmware/parameters.h" @@ -17,34 +17,29 @@ #define CHECKPOINT 5000 - int main(int argc, char **argv) { #if FPGA_SIMULATOR auto selector = sycl::ext::intel::fpga_simulator_selector_v; #elif FPGA_HARDWARE auto selector = sycl::ext::intel::fpga_selector_v; -#else // #if FPGA_EMULATOR +#else // #if FPGA_EMULATOR auto selector = sycl::ext::intel::fpga_emulator_selector_v; #endif - sycl::queue q(selector, fpga_tools::exception_handler, - sycl::property::queue::enable_profiling{}); + sycl::queue q(selector, 
fpga_tools::exception_handler, sycl::property::queue::enable_profiling{}); auto device = q.get_device(); // make sure the device supports USM host allocations if (!device.has(sycl::aspect::usm_host_allocations)) { - std::cerr << "This design must either target a board that supports USM " - "Host/Shared allocations, or IP Component Authoring. " - << std::endl; - std::terminate(); + std::cerr << "This design must either target a board that supports USM " + "Host/Shared allocations, or IP Component Authoring. " + << std::endl; + std::terminate(); } - std::cout << "Running on device: " - << device.get_info().c_str() - << std::endl; - + std::cout << "Running on device: " << device.get_info().c_str() << std::endl; // load input data from text file std::ifstream fin("tb_data/tb_input_features.dat"); @@ -95,32 +90,31 @@ int main(int argc, char **argv) { throw std::runtime_error("The output size does not match"); } std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); - } // Do this separately to avoid vector reallocation - for(int i = 0; i < num_iterations; i++) { + for (int i = 0; i < num_iterations; i++) { // hls-fpga-machine-learning insert tb-input - q.single_task(MyProject{}); // once or once for each + q.single_task(MyProject{}); // once or once for each } q.wait(); for (int j = 0; j < num_iterations; j++) { // hls-fpga-machine-learning insert tb-output - for(auto outval : outputs[j]) { - fout << outval << " "; + for (auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; if (j % CHECKPOINT == 0) { std::cout << "Predictions" << std::endl; // hls-fpga-machine-learning insert predictions - for(auto predval : predictions[j]) { - std::cout << predval << " "; + for (auto predval : predictions[j]) { + std::cout << predval << " "; } std::cout << std::endl; std::cout << "Quantized predictions" << std::endl; // hls-fpga-machine-learning insert quantized - for(auto outval : outputs[j]) { - std::cout << outval << " "; + for (auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; } @@ -132,14 +126,14 @@ int main(int argc, char **argv) { std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations << " invocations." 
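The test bench above follows the host-pipe pattern the oneAPI backend relies on: the host pushes an input array into an input pipe, launches the generated kernel functor with q.single_task(...), and pulls the result back from an output pipe. Below is a rough, untested sketch of that interaction with a toy kernel; every name, size, element type, and include line is an assumption for illustration (the generated project also passes fpga_tools::exception_handler to the queue and attaches pipe properties, both omitted here):

    #include <sycl/sycl.hpp>
    #include <sycl/ext/intel/fpga_extensions.hpp>
    #include <array>
    #include <iostream>

    // Illustrative stand-ins for the generated types in defines.h / myproject.h
    using input_t = std::array<float, 4>;
    using result_t = std::array<float, 2>;

    class InputPipeID;
    class OutputPipeID;
    using InputPipe = sycl::ext::intel::experimental::pipe<InputPipeID, input_t, 1>;
    using OutputPipe = sycl::ext::intel::experimental::pipe<OutputPipeID, result_t, 1>;

    // Toy kernel functor standing in for the generated MyProject kernel
    struct MyProject {
        void operator()() const {
            input_t in = InputPipe::read();
            result_t out{in[0] + in[1], in[2] + in[3]};
            OutputPipe::write(out);
        }
    };

    int main() {
        auto selector = sycl::ext::intel::fpga_emulator_selector_v;
        sycl::queue q(selector, sycl::property::queue::enable_profiling{});

        // Host side: push one input vector into the pipe, launch the kernel once,
        // then pull the result back, mirroring the generated test-bench loop.
        InputPipe::write(q, input_t{1, 2, 3, 4});
        q.single_task(MyProject{});
        q.wait();
        result_t res = OutputPipe::read(q);
        std::cout << res[0] << " " << res[1] << std::endl;
    }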
<< std::endl; // hls-fpga-machine-learning insert zero - for(int i = 0; i < num_iterations; i++) { + for (int i = 0; i < num_iterations; i++) { inputs.emplace_back(); outputs.emplace_back(); inputs.back().fill(0.0); } // hls-fpga-machine-learning insert top-level-function - for(int i = 0; i < num_iterations; i++) { + for (int i = 0; i < num_iterations; i++) { // hls-fpga-machine-learning insert tb-input q.single_task(MyProject{}); } @@ -147,13 +141,13 @@ int main(int argc, char **argv) { for (int j = 0; j < num_iterations; j++) { // hls-fpga-machine-learning insert tb-output - for(auto outval : outputs[j]) { - std::cout << outval << " "; + for (auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; - for(auto outval : outputs[j]) { - fout << outval << " "; + for (auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; } diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 942964fc8..c53163a4b 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,5 +1,5 @@ -from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.oneapi_writer import OneAPIWriter +from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index deff589ed..31ebaeaf9 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -63,7 +63,6 @@ def print_array_to_cpp(self, var, layer, odir): odir (str): Output directory """ with open(f"{odir}/src/firmware/weights/{var.name}.h", "w") as h_file: - # meta data h_file.write(f"//Numpy array shape {var.shape}\n") h_file.write(f"//Min {np.min(var.min):.12f}\n") @@ -98,9 +97,7 @@ def print_array_to_cpp(self, var, layer, odir): nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2) var_width = int(np.ceil(var.type.precision.width / 8)) bwidth = self.next_pow2(var_width) - weight_header += ( - f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' - ) + weight_header += f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' if var.storage.lower() == 'bram': weight_header += 'static ' else: @@ -116,7 +113,6 @@ def print_array_to_cpp(self, var, layer, odir): h_file.write("};\n") h_file.write("\n#endif\n") - def write_project_dir(self, model): """Write the base project directory @@ -135,9 +131,9 @@ def write_project_cpp(self, model): project_name = model.config.get_project_name() filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w' + ) as fout: model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -172,7 +168,6 @@ def write_project_cpp(self, model): if w not in model_brams: newline += f'#include "weights/{w.name}.h"\n' - # Neural net instantiation elif '// hls-fpga-machine-learning insert layers' in line: 
newline = line + '\n' @@ -213,7 +208,6 @@ def write_project_cpp(self, model): fout.write(newline) - def write_project_header(self, model): """Write the main architecture header file (myproject.h) @@ -224,9 +218,9 @@ def write_project_header(self, model): project_name = model.config.get_project_name() filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w' + ) as fout: model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -271,9 +265,9 @@ def write_defines(self, model): model (ModelGraph): the hls4ml model. """ filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w' + ) as fout: for line in f.readlines(): # Insert numbers if '// hls-fpga-machine-learning insert numbers' in line: @@ -315,13 +309,15 @@ def write_parameters(self, model): model (ModelGraph): the hls4ml model. """ filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w' + ) as fout: for line in f.readlines(): if '// hls-fpga-machine-learning insert includes' in line: newline = line - for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + for include in sorted( + set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), [])) + ): newline += '#include "%s"\n' % include elif "// hls-fpga-machine-learning insert layer-config" in line: @@ -392,9 +388,9 @@ def write_test_bench(self, model): output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' ) - with open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, open( + f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w' + ) as fout: for line in f.readlines(): indent = ' ' * (len(line) - len(line.lstrip(' '))) @@ -447,9 +443,9 @@ def write_bridge(self, model): indent = ' ' filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, open( + f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w' + ) as fout: for line in f.readlines(): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', 
format(project_name.upper())) @@ -479,7 +475,10 @@ def write_bridge(self, model): newline = '' for i in model_inputs: newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' - newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' + newline += ( + indent + + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' + ) newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' newline += '\n' @@ -491,7 +490,10 @@ def write_bridge(self, model): for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' - newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' + newline += ( + indent + + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' + ) elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): @@ -509,7 +511,6 @@ def write_bridge(self, model): newline = line fout.write(newline) - def write_build_script(self, model): """Write the build scripts (Makefile, build_lib.sh) @@ -520,9 +521,9 @@ def write_build_script(self, model): # Makefile filedir = os.path.dirname(os.path.abspath(__file__)) device = model.config.get_config_value('Part') - with open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, \ - open(f'{model.config.get_output_dir()}/CMakeLists.txt', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, open( + f'{model.config.get_output_dir()}/CMakeLists.txt', 'w' + ) as fout: for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) line = line.replace('mystamp', model.config.get_config_value('Stamp')) @@ -532,7 +533,6 @@ def write_build_script(self, model): fout.write(line) - def write_nnet_utils(self, model): """Copy the nnet_utils, AP types headers and any custom source to the project output directory @@ -554,7 +554,6 @@ def write_nnet_utils(self, model): for h in headers: copyfile(srcpath + h, dstpath + h) - # custom source filedir = os.path.dirname(os.path.abspath(__file__)) From c4ce138799970981cb39987ef0884118d8e26f1a Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 12 Jan 2024 15:01:49 -0600 Subject: [PATCH 013/100] fix more pre-commit --- hls4ml/writer/oneapi_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 31ebaeaf9..3d826e1f0 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -2,7 +2,7 @@ import os import tarfile from collections import OrderedDict -from shutil import copyfile, copytree, rmtree +from shutil import copyfile import numpy as np import yaml From 354d70857ac91eab083e0bdc1b045ed9b15f1147 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 12 Jan 2024 15:33:47 -0600 Subject: [PATCH 014/100] fix more pre-commit errors --- hls4ml/writer/oneapi_writer.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 3d826e1f0..7ff0ccf08 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -97,7 +97,10 @@ def print_array_to_cpp(self, var, layer, odir): nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2) var_width = int(np.ceil(var.type.precision.width / 
8)) bwidth = self.next_pow2(var_width) - weight_header += f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + weight_header += ( + f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), ' + 'intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + ) if var.storage.lower() == 'bram': weight_header += 'static ' else: @@ -223,12 +226,12 @@ def write_project_header(self, model): ) as fout: model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() - model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + # model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] - # io_parallel and io_stream instantiate the top-level function differently - io_type = model.config.get_config_value('IOType') - indent = ' ' - brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + # io_parallel and io_stream instantiate the top-level function differently (io_stream not yet supported) + # io_type = model.config.get_config_value('IOType') + # indent = ' ' + # brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) for line in f.readlines(): if 'MYPROJECT' in line: @@ -392,7 +395,7 @@ def write_test_bench(self, model): f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w' ) as fout: for line in f.readlines(): - indent = ' ' * (len(line) - len(line.lstrip(' '))) + indent = ' ' * (len(line) - len(line.lstrip(' '))) if 'myproject' in line: newline = line.replace('myproject', project_name) @@ -439,7 +442,7 @@ def write_bridge(self, model): model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] # model brambs aren't actually supported yet - io_type = model.config.get_config_value('IOType') + # io_type = model.config.get_config_value('IOType') indent = ' ' filedir = os.path.dirname(os.path.abspath(__file__)) @@ -477,7 +480,8 @@ def write_bridge(self, model): newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' newline += ( indent - + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' + + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());' + + '\n' ) newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' @@ -492,7 +496,8 @@ def write_bridge(self, model): newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' newline += ( indent - + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' + + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>' + + f'({o.name}_output.data(), {o.name});\n' ) elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' From 8119029d178e373ca35c9bc0f723cac051e3ad1d Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 21 Jan 2024 12:13:18 -0600 Subject: [PATCH 015/100] snapshot of work before reworking types --- hls4ml/backends/oneapi/oneapi_types.py | 51 +++++++++++++++++-- .../backends/oneapi/passes/transform_types.py | 28 +++++----- 2 files changed, 58 insertions(+), 21 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index f28d697a4..c679f14ae 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -1,7 +1,13 @@ ''' This package includes 
oneAPI-specific customizations to the variable types ''' -from hls4ml.backends.fpga.fpga_types import ArrayVariableConverter, VariableDefinition +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + VariableDefinition, +) +from hls4ml.utils.string_utils import convert_to_pascal_case # region ArrayVarable @@ -51,16 +57,16 @@ def __init__(self, type_converter, prefix, definition_cls): self.prefix = prefix self.definition_cls = definition_cls - def convert(self, tensor_var, pipe_name, pipe_id, array_type, pragma='partition'): + def convert(self, tensor_var, pragma='partition'): if isinstance(tensor_var, self.definition_cls): # Already converted return tensor_var tensor_var.pragma = pragma tensor_var.type = self.type_converter.convert(tensor_var.type) - tensor_var.pipe_name = pipe_name - tensor_var.pipe_id = pipe_id - tensor_var.array_type = array_type + tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' + tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' + tensor_var.array_type = f'{tensor_var.name}_array_t' tensor_var.__class__ = type(self.prefix + 'InterfaceMemberVariable', (type(tensor_var), self.definition_cls), {}) return tensor_var @@ -72,3 +78,38 @@ def __init__(self, type_converter): # endregion + + +# region StreamVariable +class OneAPIStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=True): + return f'{self.name}{name_suffix}' + + def declare_cpp(self, pipe_min_size=0, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.array_type}, {pipe_min_size}>;\n' + ) + return lines + + +class OneAPIInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'using {self.name} = {self.input_var.name}' + + +class OneAPIStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIStreamVariableDefinition) + + +class OneAPIInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceStreamVariableDefinition + ) + + +# endregion diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 2cfcd02c7..71a63585b 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -2,11 +2,14 @@ from hls4ml.backends.oneapi.oneapi_types import ( OneAPIArrayVariableConverter, OneAPIInplaceArrayVariableConverter, + OneAPIInplaceStreamVariableConverter, OneAPIInterfaceVariableConverter, + OneAPIStreamVariableConverter, ) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable -from hls4ml.utils.string_utils import convert_to_pascal_case + +# from hls4ml.utils.string_utils import convert_to_pascal_case class TransformTypes(GlobalOptimizerPass): @@ -15,6 +18,8 @@ def __init__(self): self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) self.inplace_array_var_converter = 
OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) self.interface_var_converter = OneAPIInterfaceVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = OneAPIStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = OneAPIInplaceStreamVariableConverter(type_converter=self.type_converter) self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) def transform(self, model, node): @@ -22,24 +27,15 @@ def transform(self, model, node): for out_name, var in node.variables.items(): if io_type == 'io_stream': - raise NotImplementedError("io_stream is not yet implemented for oneAPI") + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var) + else: + new_var = self.stream_var_converter.convert(var) elif io_type == 'io_parallel': if out_name in node.model.inputs: - new_var = self.interface_var_converter.convert( - var, - pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t', - ) + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') elif out_name in node.model.outputs: - new_var = self.interface_var_converter.convert( - var, - pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t', - ) + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') elif isinstance(var, InplaceTensorVariable): new_var = self.inplace_array_var_converter.convert(var, pragma='') else: From cae1a8a001a66435812d1560ad4754fcced60a17 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 12 Feb 2024 17:31:51 -0600 Subject: [PATCH 016/100] Use using to decide array type, some preliminary updates --- hls4ml/backends/fpga/fpga_types.py | 2 +- hls4ml/backends/oneapi/oneapi_types.py | 2 +- hls4ml/templates/oneapi/CMakeLists.txt | 4 ++-- hls4ml/templates/oneapi/firmware/defines.h | 3 ++- hls4ml/templates/quartus/firmware/defines.h | 2 ++ hls4ml/templates/vivado/firmware/defines.h | 2 ++ 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index ceac0b5e4..41f3cd12e 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -172,7 +172,7 @@ def convert_precision(self, precision_converter): class PackedTypeConverter(TypeDefinition, TypePrecisionConverter): def definition_cpp(self): n_elem_expr = '/' if self.unpack else '*' - return 'typedef nnet::array<{precision}, {n_elem}> {name};\n'.format( + return 'typedef array<{precision}, {n_elem}> {name};\n'.format( name=self.name, precision=self.precision.definition_cpp(), n_elem=str(self.n_elem) + n_elem_expr + str(self.n_pack), diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index c679f14ae..bf445c8f2 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -43,7 +43,7 @@ def definition_cpp(self, name_suffix='', as_reference=False): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.array_type} = array<{self.type.name}, {self.size_cpp()}>;\n' lines += 
indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n' diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index d6b2a4745..1ab0d3748 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -98,8 +98,8 @@ endif() set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) # for debugging need to do this. Not sure why -# set(COMMON_LINK_FLAGS -v -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) -set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) +set(COMMON_LINK_FLAGS -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) +# set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) # A SYCL ahead-of-time (AoT) compile processes the device code in two stages. # 1. The "compile" stage compiles the device code to an intermediate diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 7e6bb6b6e..b88fca49b 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include @@ -11,6 +10,8 @@ // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" +using std::array; + // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h index c3fe4ec40..a465f2716 100644 --- a/hls4ml/templates/quartus/firmware/defines.h +++ b/hls4ml/templates/quartus/firmware/defines.h @@ -36,6 +36,8 @@ template using stream_out = ihc::stream_out; // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" +using nnet::array; + // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/vivado/firmware/defines.h b/hls4ml/templates/vivado/firmware/defines.h index 1f11b0209..e0a75ec64 100644 --- a/hls4ml/templates/vivado/firmware/defines.h +++ b/hls4ml/templates/vivado/firmware/defines.h @@ -7,6 +7,8 @@ #include #include +using nnet::array; + // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision From 06a8c277c11622c8ddc08e2f80af52986ff6a69f Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 13 Feb 2024 19:00:15 -0600 Subject: [PATCH 017/100] snapshot unifying types --- hls4ml/backends/oneapi/oneapi_types.py | 78 ++++++---- .../firmware/nnet_utils/nnet_activation.h | 139 ++++++++---------- .../oneapi/firmware/nnet_utils/nnet_dense.h | 19 ++- hls4ml/writer/oneapi_writer.py | 4 +- 4 files changed, 117 insertions(+), 123 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index bf445c8f2..4559c1f9e 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -1,9 +1,11 @@ ''' This package includes oneAPI-specific customizations to the variable types ''' +import numpy as np + from hls4ml.backends.fpga.fpga_types import ( - ArrayVariableConverter, InplaceStreamVariableConverter, + PackedType, StreamVariableConverter, VariableDefinition, ) @@ -14,7 +16,7 @@ class 
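For each interface tensor the converters attach a pipe_name/pipe_id derived from the variable name via convert_to_pascal_case, and declare_cpp then emits three declarations: a forward-declared pipe ID class, a using alias for the array type, and a using alias for the inter-kernel pipe. A hypothetical example of what that expands to for an input named "dense_input" with 16 elements (the float element type, the capacity of 16, and the include lines are placeholders; real projects use fixed-point element types, and the interface variant additionally appends a PipeProps properties parameter):

    #include <array>
    #include <sycl/sycl.hpp>
    #include <sycl/ext/intel/fpga_extensions.hpp>

    using std::array;

    // Hypothetical output of declare_cpp for a tensor variable "dense_input"
    class DenseInputPipeID;
    using dense_input_t = array<float, 16>;
    using DenseInputPipe = sycl::ext::intel::experimental::pipe<DenseInputPipeID, dense_input_t, 16>;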
OneAPIArrayVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] std::array<{self.type.name}, {self.size_cpp()}> {self.name}{name_suffix}' + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' class OneAPIInplaceArrayVariableDefinition(VariableDefinition): @@ -22,12 +24,47 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class OneAPIArrayVariableConverter(ArrayVariableConverter): +class AggregratedArrayVariableConverter: + """This is a bit of an extension of the standard ArrayVariableConverter""" + + def __init__(self, type_converter, prefix, definition_cls): + self.type_converter = type_converter + self.prefix = prefix + self.definition_cls = definition_cls + + def convert(self, tensor_var, pragma='', depth=0, n_pack=1): + if isinstance(tensor_var, self.definition_cls): # Already converted + return tensor_var + + tensor_var.pragma = pragma + if pragma == 'stream': + if depth == 0: + depth = np.prod(tensor_var.shape) // tensor_var.shape[-1] + self.pragma = ('stream', depth) + n_elem = tensor_var.shape[-1] + else: + self.pragma = pragma + n_elem = tensor_var.size() + n_pack = 1 # ignore any passed value + + tensor_var.type = self.type_converter.convert( + PackedType(tensor_var.type.name, tensor_var.type.precision, n_elem, n_pack) + ) + + # pipe_name and pipe_id are only used for io_stream and interface variables in io_parallel + tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' + tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' + + tensor_var.__class__ = type(self.prefix + 'AggregateArrayVariable', (type(tensor_var), self.definition_cls), {}) + return tensor_var + + +class OneAPIArrayVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) -class OneAPIInplaceArrayVariableConverter(ArrayVariableConverter): +class OneAPIInplaceArrayVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition) @@ -39,40 +76,19 @@ def __init__(self, type_converter): class OneAPIInterfaceVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] {self.array_type} {self.name}{name_suffix}' + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.array_type} = array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.type.name} = array<{self.type.precision.definition_cpp()}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n' + + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' ) return lines -class InterfaceVariableConverter: - def __init__(self, type_converter, prefix, definition_cls): - self.type_converter = type_converter - self.prefix = prefix - self.definition_cls = definition_cls - - def convert(self, tensor_var, pragma='partition'): - if isinstance(tensor_var, self.definition_cls): # Already converted - return tensor_var - - tensor_var.pragma = pragma - tensor_var.type = 
self.type_converter.convert(tensor_var.type) - - tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' - tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' - tensor_var.array_type = f'{tensor_var.name}_array_t' - - tensor_var.__class__ = type(self.prefix + 'InterfaceMemberVariable', (type(tensor_var), self.definition_cls), {}) - return tensor_var - - -class OneAPIInterfaceVariableConverter(InterfaceVariableConverter): +class OneAPIInterfaceVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition) @@ -87,10 +103,10 @@ def definition_cpp(self, name_suffix='', as_reference=True): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.name} = std::array<{self.type.name}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.array_type}, {pipe_min_size}>;\n' + + f'{self.type}, {pipe_min_size}>;\n' ) return lines diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 411d42b09..ef22a6b20 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -2,7 +2,6 @@ #define NNET_ACTIVATION_H_ #include "nnet_common.h" -#include namespace nnet { @@ -24,11 +23,10 @@ struct activ_config { // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template -void linear(const std::array &data, std::array &res) { +template void linear(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; res[ii] = datareg; } } @@ -36,11 +34,10 @@ void linear(const std::array &data, std::array -void relu(const std::array &data, std::array &res) { +template void relu(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > 0) res[ii] = datareg; else @@ -48,11 +45,10 @@ void relu(const std::array &data, std::array -void relu_max(const std::array &data, std::array &res) { +template void relu_max(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg < 0) res[ii] = 0; else if (datareg > MAX_INT) @@ -62,27 +58,24 @@ void relu_max(const std::array &data, std::array -void relu6(const std::array &data, std::array &res) { +template void relu6(const data_T &data, res_T &res) { relu_max(data, res); } -template -void relu1(const std::array &data, std::array &res) { +template void relu1(const data_T &data, res_T &res) { relu_max(data, res); } // ************************************************* // Sigmoid Activation // ************************************************* -template -void sigmoid(const std::array &data, std::array &res) { +template void sigmoid(const data_T &data, res_T &res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - 
[[intel::fpga_register]] data_T absoluteValue; - [[intel::fpga_register]] res_T temp2; + [[intel::fpga_register]] typename data_T::value_type absoluteValue; + [[intel::fpga_register]] typename res_T::value_type temp2; if (data[ii] < 0) { absoluteValue = -data[ii]; } else { @@ -91,7 +84,7 @@ void sigmoid(const std::array &data, std::array MAX_VALUE) index = CONFIG_T::table_size - 1; - temp2 = (res_T)sigmoid_table[index]; + temp2 = static_cast(sigmoid_table[index]); if (data[ii] < 0) { res[ii] = 1 - temp2; } else { @@ -127,18 +120,19 @@ template inline unsigned softmax_latency_idx_f return y.to_uint(); } -template -void softmax_stable(const std::array &data, std::array &res) { +template void softmax_stable(const data_T &data, res_T &res) { // Look-up tables #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" // Find maximum - Op_max op_max; - [[intel::fpga_register]] data_T x_max = reduce>(data.data(), op_max); + Op_max op_max; + [[intel::fpga_register]] auto x_max = + reduce>(data.data(), op_max); // For the diffs, use the same type as the input but force rounding and saturation - [[intel::fpga_register]] ac_fixed d_xi_xmax[CONFIG_T::n_in]; + [[intel::fpga_register]] ac_fixed + d_xi_xmax[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { d_xi_xmax[i] = data[i] - x_max; @@ -148,7 +142,7 @@ void softmax_stable(const std::array &data, std::array(d_xi_xmax[i])]; + exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; } // Explicitly sum previously calculated exponentials with an adder tree @@ -166,8 +160,7 @@ void softmax_stable(const std::array &data, std::array -void softmax_latency(const std::array &data, std::array &res) { +template void softmax_latency(const data_T &data, res_T &res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" @@ -175,7 +168,7 @@ void softmax_latency(const std::array &data, std::array< [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { - exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; + exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; } // Explicitly sum the results with an adder tree. 
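The stable and latency softmax implementations above share one structure: compute a table index from the (fixed-point) input, fetch exp() from a precomputed table, accumulate the exponentials with the adder-tree reduce, and scale by a reciprocal fetched from a second table. A plain-C++ behavioral model of that flow, where float arithmetic, a runtime-built table, and a division stand in for the generated .tb tables, the fixed-point index extraction, and the invert_table lookup:

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <iostream>

    constexpr int TABLE_SIZE = 1024;
    constexpr float MAX_DIFF = 8.0f; // differences x - x_max are clamped to [-8, 0]

    int main() {
        // exp table over [-8, 0): exp_table[i] ~= exp(-8 + 8 * i / TABLE_SIZE)
        std::array<float, TABLE_SIZE> exp_table;
        for (int i = 0; i < TABLE_SIZE; i++)
            exp_table[i] = std::exp(-MAX_DIFF + MAX_DIFF * i / TABLE_SIZE);

        std::array<float, 4> data{0.5f, 1.0f, -0.5f, 2.0f};
        std::array<float, 4> exp_res, res;

        // 1) find the maximum (the kernel uses reduce<> with Op_max)
        float x_max = *std::max_element(data.begin(), data.end());

        // 2) exponentials of (x - x_max) via table lookup, then accumulate
        float exp_sum = 0.0f;
        for (int i = 0; i < 4; i++) {
            float d = data[i] - x_max; // always <= 0, so exp(d) <= 1
            int idx = static_cast<int>((d + MAX_DIFF) / MAX_DIFF * TABLE_SIZE);
            idx = std::clamp(idx, 0, TABLE_SIZE - 1);
            exp_res[i] = exp_table[idx];
            exp_sum += exp_res[i]; // kernel: adder-tree reduce with Op_add
        }

        // 3) multiply by 1/sum (the kernel looks this up in invert_table)
        for (int i = 0; i < 4; i++)
            res[i] = exp_res[i] / exp_sum;

        for (float r : res)
            std::cout << r << " ";
        std::cout << "\n";
    }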
@@ -192,8 +185,7 @@ void softmax_latency(const std::array &data, std::array< } } -template -void softmax_legacy(const std::array &data, std::array &res) { +template void softmax_legacy(const data_T &data, res_T &res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -234,14 +226,13 @@ void softmax_legacy(const std::array &data, std::array -void softmax_argmax(const std::array &data, std::array &res) { +template void softmax_argmax(const data_T &data, res_T &res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { - res[i] = (res_T)0; + res[i] = static_cast(0); } - [[intel::fpga_register]] data_T maximum = data[0]; + [[intel::fpga_register]] auto maximum = data[0]; [[intel::fpga_register]] int idx = 0; #pragma ii 1 @@ -252,11 +243,10 @@ void softmax_argmax(const std::array &data, std::array(1); } -template -inline void softmax(const std::array &data, std::array &res) { +template inline void softmax(const data_T &data, res_T &res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -279,16 +269,15 @@ inline void softmax(const std::array &data, std::array -void dense_tanh(const std::array &data, std::array &res) { +template void dense_tanh(const data_T &data, res_T &res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - [[intel::fpga_register]] data_T temp; - [[intel::fpga_register]] res_T temp2; + [[intel::fpga_register]] typename data_T::value_type temp; + [[intel::fpga_register]] typename res_T::value_type temp2; if (data[ii] < 0) { temp = -data[ii]; } else { @@ -297,7 +286,7 @@ void dense_tanh(const std::array &data, std::array index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int(); if (temp > MAX_VALUE) index = CONFIG_T::table_size - 1; - temp2 = (res_T)tanh_table[index]; + temp2 = static_cast(tanh_table[index]); if (data[ii] < 0) { res[ii] = -temp2; } else { @@ -309,8 +298,7 @@ void dense_tanh(const std::array &data, std::array -void hard_sigmoid(const std::array &data, std::array &res) { +template void hard_sigmoid(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -322,8 +310,7 @@ void hard_sigmoid(const std::array &data, std::array -void hard_tanh(const std::array &data, std::array &res) { +template void hard_tanh(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -339,10 +326,10 @@ void hard_tanh(const std::array &data, std::array -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { +void leaky_relu(const data_T &data, typename data_T::value_type alpha, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > 0) res[ii] = datareg; else @@ -354,10 +341,10 @@ void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n // Thresholded RELU Activation // ************************************************* template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { +void thresholded_relu(const data_T &data, typename data_T::value_type theta, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; 
ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > theta) res[ii] = datareg; else @@ -368,8 +355,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(const std::array &data, std::array &res) { +template void softplus(const data_T &data, res_T &res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -381,15 +367,14 @@ void softplus(const std::array &data, std::array CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; - res[ii] = (res_T)softplus_table[index]; + res[ii] = static_cast(softplus_table[index]); } } // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(const std::array &data, std::array &res) { +template void softsign(const data_T &data, res_T &res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -397,8 +382,8 @@ void softsign(const std::array &data, std::array &data, std::array index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); if (temp > MAX_VALUE) index = CONFIG_T::table_size - 1; - temp2 = (res_T)softsign_table[index]; + temp2 = static_cast(softsign_table[index]); if (data[ii] < 0) { res[ii] = -temp2; } else { @@ -419,14 +404,13 @@ void softsign(const std::array &data, std::array -void elu(const std::array &data, const res_T alpha, std::array &res) { +template void elu(const data_T &data, const res_T alpha, res_T &res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg >= 0) { res[ii] = datareg; } else { @@ -438,24 +422,22 @@ void elu(const std::array &data, const res_T alpha, std: } } -template -void elu(const std::array &data, std::array &res) { +template void elu(const data_T &data, res_T &res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template -void selu(const std::array &data, std::array &res) { +template void selu(const data_T &data, res_T &res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg >= 0) { - res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + res[ii] = static_cast(1.0507009873554804934193349852946) * datareg; } else { ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); if (index > CONFIG_T::table_size - 1) @@ -468,11 +450,10 @@ void selu(const std::array &data, std::array -void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void prelu(const data_T &data, const data_T &alpha, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > 0) res[ii] = datareg; else @@ -483,30 +464,28 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // ************************************************* // Binary 
TanH Activation // ************************************************* -template -void binary_tanh(const std::array &data, std::array &res) { +template void binary_tanh(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; - res_T cache; + auto datareg = data[ii]; + typename res_T::value_type cache; if (datareg > 0) cache = 1; else cache = -1; - res[ii] = (res_T)cache; + res[ii] = cache; } } // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(const std::array &data, std::array &res) { +template void ternary_tanh(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = 2 * data[ii]; - res_T cache; + auto datareg = 2 * data[ii]; + typename res_T::value_type cache; if (datareg > 1) cache = 1; else if (datareg > -1 && datareg <= 1) @@ -514,7 +493,7 @@ void ternary_tanh(const std::array &data, std::array #include namespace nnet { @@ -38,7 +37,7 @@ struct dense_config { }; template -void dense_rf_gt(const std::array &data, std::array &res, +void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -76,8 +75,8 @@ void dense_rf_gt(const std::array &data, std::array::product(data[data_index], weights[w_index]); + tmp_acc[im] = CONFIG_T::template product::product( + data[data_index], weights[w_index]); } [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; ResetMult: @@ -102,11 +101,11 @@ void dense_rf_gt(const std::array &data, std::array(acc[ires]); // acc[jj]; + res[ires] = cast(acc[ires]); // acc[jj]; } } template -void dense_rf_lt(const std::array &data, std::array &res, +void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -131,8 +130,8 @@ void dense_rf_lt(const std::array &data, std::array= CONFIG_T::n_in * CONFIG_T::n_out) continue; // Modified this - mult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); + mult[im] = CONFIG_T::template product::product( + data[in_index], weights[w_index]); in_index += CONFIG_T::reuse_factor; if (in_index >= CONFIG_T::n_in) in_index = ir; @@ -153,12 +152,12 @@ void dense_rf_lt(const std::array &data, std::array(acc[ires]); + res[ires] = cast(acc[ires]); } } template void dense_resource( - const std::array &data, std::array &res, + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 7ff0ccf08..7c58a9af4 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -410,13 +410,13 @@ def write_test_bench(self, model): newline = line # there should really be only one input inp = model_inputs[0] - newline += indent + f'std::vector<{inp.array_type}> 
inputs;\n' + newline += indent + f'std::vector<{inp.type}> inputs;\n' elif '// hls-fpga-machine-learning insert results' in line: newline = line # there should really be only one out out = model_outputs[0] - newline += indent + f'std::vector<{out.array_type}> outputs;\n' + newline += indent + f'std::vector<{out.type}> outputs;\n' elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line inp = model_inputs[0] From 8f5877896d95111dc58927181a5fa3616c401ac7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 13 Feb 2024 22:24:56 -0600 Subject: [PATCH 018/100] fix the testbench and bridge --- hls4ml/writer/oneapi_writer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 7c58a9af4..a31d80b5e 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -410,13 +410,13 @@ def write_test_bench(self, model): newline = line # there should really be only one input inp = model_inputs[0] - newline += indent + f'std::vector<{inp.type}> inputs;\n' + newline += indent + f'std::vector<{inp.type.name}> inputs;\n' elif '// hls-fpga-machine-learning insert results' in line: newline = line # there should really be only one out out = model_outputs[0] - newline += indent + f'std::vector<{out.type}> outputs;\n' + newline += indent + f'std::vector<{out.type.name}> outputs;\n' elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line inp = model_inputs[0] @@ -480,8 +480,8 @@ def write_bridge(self, model): newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' newline += ( indent - + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());' - + '\n' + + f'nnet::convert_data<{dtype}, typename {i.type.name}::value_type, {i.size_cpp()}>' + + f'({i.name}, {i.name}_input.data());\n' ) newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' @@ -496,7 +496,7 @@ def write_bridge(self, model): newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' newline += ( indent - + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>' + + f'nnet::convert_data_back' + f'({o.name}_output.data(), {o.name});\n' ) elif '// hls-fpga-machine-learning insert trace_outputs' in line: From 86b0f4bb4c3c5436777942573d6d8a594c8e4d5f Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 11:17:00 -0600 Subject: [PATCH 019/100] snapshot updating nnet_utils (not finished) --- .../oneapi/firmware/nnet_utils/nnet_conv1d.h | 5 +- .../nnet_utils/nnet_conv1d_resource.h | 69 +++++++------- .../nnet_utils/nnet_conv2d_resource.h | 31 +++--- .../nnet_utils/nnet_dense_compressed.h | 10 +- .../oneapi/firmware/nnet_utils/nnet_pooling.h | 20 ++-- .../firmware/nnet_utils/nnet_recurrent.h | 94 +++++++++---------- 6 files changed, 117 insertions(+), 112 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h index 8897e1315..549cb2c19 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h @@ -44,15 +44,14 @@ struct conv1d_config { }; template -void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const 
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { conv_1d_resource_cl(data, res, weights, biases); } template -void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h index a110d6d42..9690f56e2 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h @@ -12,13 +12,12 @@ enum class conv1d_implementation { combination, im2col, winograd }; // im2col - General-purpose 1D Convolution algorithm // **************************************************************** -template -void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int col) { +template +void im2col_1d_cl(const data_T &data, data_col_T &data_col, const int col) { // im2col can be unrolled fully, since number of parallel executions = filt_w x n_chann ~ O(100) and very little DSP // usage - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; KernelLoop: #pragma unroll @@ -26,7 +25,7 @@ void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], ChannelLoop: #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { - hls_register int index_data = + [[intel::fpga_register]] int index_data = (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { data_col[index++] = data[index_data]; @@ -39,7 +38,7 @@ void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], template void conv_1d_im2col_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { // im2col performs no filter transformations; therefore, filter size remains constant @@ -48,6 +47,9 @@ void conv_1d_im2col_cl( // Unroll factor for loop traversing input image, derived from parallelisation_factor static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + using data_col_T = array; + using res_col_T = array; + ColLoop: #pragma unroll pf #pragma ii CONFIG_T::reuse_factor @@ -56,11 +58,11 @@ void conv_1d_im2col_cl( // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; - im2col_1d_cl(data, data_col, i); + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_cl(data, data_col, i); - hls_register res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); // Unroll fully, since // (1) n_filt is usually low in io_parallel (< 32) @@ -88,7 +90,7 @@ inline void 
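conv_1d_im2col_cl above turns the convolution into one dense matrix-vector product per output column: im2col_1d_cl gathers the receptive field of that column into a contiguous buffer, writing zeros for samples that fall in the padding, and dense_resource is then applied to the buffer. A plain-C++ model of the same data movement with toy sizes (all constants, the all-ones kernel, and the weight layout are illustrative; the real code uses fixed-point types, unroll pragmas, and the shared dense engine):

    #include <array>
    #include <iostream>

    constexpr int IN_WIDTH = 8, N_CHAN = 1, FILT_WIDTH = 3, N_FILT = 2;
    constexpr int PAD_LEFT = 1, STRIDE = 1, OUT_WIDTH = 8;

    int main() {
        std::array<float, IN_WIDTH * N_CHAN> data{1, 2, 3, 4, 5, 6, 7, 8};
        std::array<float, FILT_WIDTH * N_CHAN * N_FILT> weights{};
        std::array<float, N_FILT> biases{};
        weights.fill(1.0f); // toy kernel: each filter computes a moving sum

        std::array<float, OUT_WIDTH * N_FILT> res{};
        for (int col = 0; col < OUT_WIDTH; col++) {
            // im2col: gather the receptive field of this output column,
            // writing 0 for samples that fall into the padding region
            std::array<float, FILT_WIDTH * N_CHAN> data_col{};
            int index = 0;
            for (int k = 0; k < FILT_WIDTH; k++) {
                for (int c = 0; c < N_CHAN; c++) {
                    int idx = (col * STRIDE + k - PAD_LEFT) * N_CHAN + c;
                    data_col[index++] = (idx >= 0 && idx < IN_WIDTH * N_CHAN) ? data[idx] : 0.0f;
                }
            }
            // dense step: one matrix-vector product per output column
            for (int f = 0; f < N_FILT; f++) {
                float acc = biases[f];
                for (int i = 0; i < FILT_WIDTH * N_CHAN; i++)
                    acc += data_col[i] * weights[f * FILT_WIDTH * N_CHAN + i];
                res[col * N_FILT + f] = acc;
            }
        }
        for (int col = 0; col < OUT_WIDTH; col++)
            std::cout << res[col * N_FILT] << " "; // filter 0: 3 6 9 12 15 18 21 15
        std::cout << "\n";
    }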
winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[ template void winograd_conv1d_3x1_kernel_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { // Ensure Winograd conditions are met @@ -106,7 +108,7 @@ void winograd_conv1d_3x1_kernel_cl( int offset = CONFIG_T::n_filt * i; #pragma unroll for (int f = 0; f < CONFIG_T::n_filt; f++) { - res[offset + f] = static_cast(biases[f]); + res[offset + f] = static_cast(biases[f]); } } @@ -117,8 +119,8 @@ void winograd_conv1d_3x1_kernel_cl( #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { // Get current 4x1 tile - hls_register data_T T[16]; - hls_register uint8_t p = 0; + [[intel::fpga_register]] typename data_T::value_type T[16]; + [[intel::fpga_register]] uint8_t p = 0; #pragma unroll for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { @@ -130,24 +132,25 @@ void winograd_conv1d_3x1_kernel_cl( } // Transform input tile - hls_register typename CONFIG_T::accum_t D[4]; - winograd_transform_input_tile_3x1_kernel(T, D); + [[intel::fpga_register]] typename CONFIG_T::accum_t D[4]; + winograd_transform_input_tile_3x1_kernel(T, D); #pragma unroll for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { - hls_register int filter_offset = 4 * (CONFIG_T::n_chan * filter + channel); + [[intel::fpga_register]] int filter_offset = 4 * (CONFIG_T::n_chan * filter + channel); // Hadamard product between transformed input tile and kernel - hls_register typename CONFIG_T::accum_t Y[4]; + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[4]; #pragma unroll for (int i = 0; i < 4; i++) { Y[i] = static_cast(D[i] * weights[filter_offset + i]); } // Explicitly transform intermediate result Z = A'YA and save to output - res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); + res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); if ((col + 1) < CONFIG_T::out_width) - res[CONFIG_T::n_filt * (col + 1) + filter] += static_cast(Y[1] - Y[2] - Y[3]); + res[CONFIG_T::n_filt * (col + 1) + filter] += + static_cast(Y[1] - Y[2] - Y[3]); } } } @@ -157,17 +160,17 @@ void winograd_conv1d_3x1_kernel_cl( // 1D Convolution for 1x1 kernels using optimized im2col // **************************************************************** -template -void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan], - const int col) { +template +void im2col_1d_pointwise_cl(const data_T &data, data_col_T &data_col, const int col) { // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; ChannelLoop: #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { - hls_register int index_data = (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + [[intel::fpga_register]] int index_data = + (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { data_col[index++] = data[index_data]; } else { @@ -177,8 +180,7 @@ void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void pointwise_conv_1d_resource_cl(data_T 
data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); @@ -186,6 +188,9 @@ void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_ // Unroll factor for loop traversing input image, derived from parallelisation_factor static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + using data_col_T = array; + using res_col_T = array; + ColLoop: #pragma unroll pf #pragma ii CONFIG_T::reuse_factor @@ -194,11 +199,11 @@ void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_ // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::n_chan]; - im2col_1d_pointwise_cl(data, data_col, col); + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_pointwise_cl(data, data_col, col); - hls_register res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_T res_col; + dense_resource(data_col, res_col, weights, biases); // Unroll fully, since // (1) n_filt is usually low in io_parallel (< 32) @@ -216,7 +221,7 @@ void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_ // **************************************************************** template void conv_1d_resource_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { static constexpr bool winograd_conditions = diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h index 73ad45592..85c4c78d9 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -20,18 +20,18 @@ void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ // im2col can be unrolled fully, since number of parallel executions = filt_h x filt_w x n_chann ~ O(100) and very little // DSP usage - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; FiltHeightLoop: #pragma unroll for (int kernel_row = 0; kernel_row < CONFIG_T::impl_filt_height; kernel_row++) { - hls_register int input_row = + [[intel::fpga_register]] int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; FiltWidthLoop: #pragma unroll for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { - hls_register int input_col = + [[intel::fpga_register]] int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width; ChannelLoop: @@ -73,10 +73,11 @@ void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CO // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::impl_filt_height * 
CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; + [[intel::fpga_register]] data_T + data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; im2col_2d_cl(data, data_col, i, j); - hls_register res_T res_col[CONFIG_T::n_filt]; + [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; dense_resource(data_col, res_col, weights, biases); // Unroll fully, since @@ -158,9 +159,9 @@ void winograd_conv2d_3x3_kernel_cl( #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { // Get current 4x4 tile - hls_register data_T T[16]; - hls_register typename CONFIG_T::accum_t D[16]; - hls_register uint8_t p = 0; + [[intel::fpga_register]] data_T T[16]; + [[intel::fpga_register]] typename CONFIG_T::accum_t D[16]; + [[intel::fpga_register]] uint8_t p = 0; #pragma unroll for (int r = row - (int)CONFIG_T::pad_top; r < row + 4 - (int)CONFIG_T::pad_top; r++) { @@ -179,10 +180,10 @@ void winograd_conv2d_3x3_kernel_cl( #pragma unroll for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { - hls_register int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel); + [[intel::fpga_register]] int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel); // Hadamard product between transformed input tile and kernel - hls_register typename CONFIG_T::accum_t Y[16]; + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[16]; #pragma unroll for (int i = 0; i < 16; i++) { Y[i] = static_cast(D[i] * weights[filter_offset + i]); @@ -215,14 +216,14 @@ void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width data_T data_col[CONFIG_T::n_chan], const int row, const int col) { // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; ChannelLoop: #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { - hls_register int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; - hls_register int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; + [[intel::fpga_register]] int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; + [[intel::fpga_register]] int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { data_col[index++] = @@ -256,10 +257,10 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::n_chan]; + [[intel::fpga_register]] data_T data_col[CONFIG_T::n_chan]; im2col_2d_pointwise_cl(data, data_col, row, col); - hls_register res_T res_col[CONFIG_T::n_filt]; + [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; dense_resource(data_col, res_col, weights, biases); FiltLoop: diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h index ba50a631b..a66423cef 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h @@ -12,7 +12,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - hls_register typename 
CONFIG_T::accum_t acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; InitAccum: #pragma unroll @@ -20,8 +20,8 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], acc[i] = (typename CONFIG_T::accum_t)(biases[i]); } - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + [[intel::fpga_register]] int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + [[intel::fpga_register]] data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; #pragma unroll for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { @@ -36,7 +36,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma nofusion #pragma speculated_iterations 0 for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; CompressedMultLoop: #pragma unroll for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { @@ -49,7 +49,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], inputs[is][im] = inputs[is + 1][im]; } } - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; ResetMult: #pragma unroll for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h index bbfc0908e..c50c34601 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h @@ -7,7 +7,7 @@ namespace nnet { // Returns the maximum value from an array of size N template T max(T x[N]) { - hls_register T y = x[0]; + [[intel::fpga_register]] T y = x[0]; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -22,7 +22,7 @@ template T max(T x[N]) { // Returns the mean value of an array of size N template T avg(T (&x)[N]) { - hls_register T y = 0; + [[intel::fpga_register]] T y = 0; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -38,7 +38,7 @@ template T avg(T (&x)[N]) { // Returns the mean value of an array of size N // Overload of the above function; using a wider accumulator than the input to avoid overflow template ac_int avg(ac_int (&x)[N]) { - hls_register ac_int tmp = 0; + [[intel::fpga_register]] ac_int tmp = 0; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -57,7 +57,7 @@ template ac_int avg(ac_int (&x)[N]) { // Returns the mean value of an array of size N // Overload of the above function; using a wider accumulator than the input to avoid overflow template ac_fixed avg(ac_fixed (&x)[N]) { - hls_register ac_fixed tmp = 0; + [[intel::fpga_register]] ac_fixed tmp = 0; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -136,10 +136,10 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF #pragma unroll #pragma 
disable_loop_pipelining for (int inp_col = 0; inp_col < padded_width; inp_col += CONFIG_T::stride_width) { - hls_register data_T pool[CONFIG_T::pool_width]; + [[intel::fpga_register]] data_T pool[CONFIG_T::pool_width]; // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling - hls_register unsigned img_overlap = 0; + [[intel::fpga_register]] unsigned img_overlap = 0; PoolWidthLoop: #pragma unroll @@ -178,7 +178,7 @@ void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T r #pragma unroll #pragma disable_loop_pipelining for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { - hls_register data_T pool[CONFIG_T::n_in]; + [[intel::fpga_register]] data_T pool[CONFIG_T::n_in]; InputWidthLoop: #pragma unroll @@ -241,10 +241,10 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ #pragma unroll #pragma disable_loop_pipelining for (int inp_width = 0; inp_width < padded_width; inp_width += CONFIG_T::stride_width) { - hls_register data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + [[intel::fpga_register]] data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling - hls_register unsigned img_overlap = 0; + [[intel::fpga_register]] unsigned img_overlap = 0; PoolHeightLoop: #pragma unroll @@ -301,7 +301,7 @@ void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * #pragma unroll #pragma disable_loop_pipelining for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { - hls_register data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + [[intel::fpga_register]] data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; InputLoop: #pragma unroll diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h index 464c6d415..340a8eda1 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h @@ -104,18 +104,18 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor; // A matrix containing the values of matrix product between input (x) and weights (weights), for update, reset and // candidate state gates, for each of the units - hls_register typename CONFIG_T::accum_t mat_mul_x_w[3 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mat_mul_x_w[3 * CONFIG_T::n_units]; nnet::dense_resource(x, mat_mul_x_w, weights, bias); // A matrix containing the values of matrix product between previou state (h) and recurrent weights (recurrent_weights), // for update, reset and candidate state gates, for each of the units - hls_register typename CONFIG_T::accum_t mat_mul_h_wr[3 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mat_mul_h_wr[3 * CONFIG_T::n_units]; nnet::dense_resource( h, mat_mul_h_wr, recurrent_weights, recurrent_bias); // A vector containing both the values of z(t) and r(t) for every state - hls_register typename CONFIG_T::accum_t z_r[2 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t z_r[2 * CONFIG_T::n_units]; // Add the individual vectors from the multiplication of mat_mul_x_w = Wx*x(t) and mat_mul_h_wr = Wh*h(t-1) // Unrolled fully, no DSPs used @@ -125,12 +125,12 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], } // Activation 
on z(t) and r(t) - hls_register typename CONFIG_T::accum_t z_r_act[2 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t z_r_act[2 * CONFIG_T::n_units]; CONFIG_T::template activation_recr::activation(z_r, z_r_act); // A matrix containing the values of Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h - hls_register typename CONFIG_T::accum_t hadamard_r_h[CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t hadamard_r_h[CONFIG_T::n_units]; #pragma unroll recurrent_unroll_factor for (int i = 0; i < (CONFIG_T::n_units); i++) { hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units]; @@ -145,7 +145,7 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], } // Activation on candidate state - hls_register typename CONFIG_T::accum_t h_cand_act[CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t h_cand_act[CONFIG_T::n_units]; CONFIG_T::template activation::activation(h_cand, h_cand_act); @@ -163,8 +163,8 @@ void gru(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T:: const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { - hls_register data_T x[CONFIG_T::n_in]; - hls_register res_T h[CONFIG_T::n_units]; + [[intel::fpga_register]] data_T x[CONFIG_T::n_in]; + [[intel::fpga_register]] res_T h[CONFIG_T::n_units]; #pragma unroll for (int i = 0; i < CONFIG_T::n_units; i++) { @@ -233,17 +233,17 @@ void simple_rnn_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T: const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { // Weight multiplication - typename CONFIG_T::accum_t afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t afterW[CONFIG_T::n_out] [[intel::fpga_register]]; multiply_W( inputs, afterW, kernel); // Bias addition - typename CONFIG_T::accum_t afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; add_bias( afterW, afterBias, bias); // Hidden state - typename CONFIG_T::accum_t hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; multiply_U(hidden_state, hiddenCand, rec_kernel); @@ -261,10 +261,10 @@ void simple_rnn(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[C const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { - res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; - res_T hidden_state_temp[CONFIG_T::n_out] hls_register; - res_T h[CONFIG_T::n_out] hls_register; - data_T in[CONFIG_T::n_in] hls_register; + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] [[intel::fpga_register]]; + res_T hidden_state_temp[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T h[CONFIG_T::n_out] [[intel::fpga_register]]; + data_T in[CONFIG_T::n_in] [[intel::fpga_register]]; // Set initially hidden state (output) to zero INIT_LOOP: @@ -360,39 +360,39 @@ void lstm_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T::n_out const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { // Internals definitions - typename CONFIG_T::accum_t i_afterW[CONFIG_T::n_out] 
hls_register; - typename CONFIG_T::accum_t i_afterBias[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_afterW[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_afterBias[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_afterW[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_afterBias[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_afterW[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t i_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; // Hidden state Gate candidates, intermediate variables - typename CONFIG_T::accum_t i_hiddenCand[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_hiddenCand[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_hiddenCand[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; // After addition, intermediate variables - typename CONFIG_T::accum_t i_afterAdd[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_afterAdd[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_afterAdd[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; // Gate outputs - typename CONFIG_T::accum_t gate_i[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_f[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_c[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_o[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_ic[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_forget[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t h[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_i[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_f[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_c[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_o[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_ic[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_forget[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t h[CONFIG_T::n_out] [[intel::fpga_register]]; 
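// For illustration only -- a minimal sketch, not part of this patch, of the register-attribute
// migration these hunks perform: the Intel HLS macro hls_register becomes the oneAPI attribute
// [[intel::fpga_register]], which asks the compiler to carry the variable in registers rather
// than in on-chip RAM. The attribute is accepted either before the declaration or after the
// declarator, matching the two placements used in this file. All names below are hypothetical.
template <typename accum_t, int N>
void copy_through_registers(const accum_t (&in)[N], accum_t (&out)[N]) {
    [[intel::fpga_register]] accum_t tmp[N]; // attribute written before the declaration
    accum_t acc[N] [[intel::fpga_register]]; // attribute written after the declarator
    #pragma unroll
    for (int i = 0; i < N; i++) {
        tmp[i] = in[i];  // stage the input in registers
        acc[i] = tmp[i]; // copy through the second register array
        out[i] = acc[i];
    }
}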
// Intermediate variable cell calculation - typename CONFIG_T::accum_t cell_act_multp[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t cell_act_add[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t cell_act_multp[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t cell_act_add[CONFIG_T::n_out] [[intel::fpga_register]]; //-----------Gate I Calculations // Weight multiplication @@ -518,13 +518,13 @@ void lstm(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_ const typename CONFIG_T::weight_t RWO[CONFIG_T::n_out * CONFIG_T::n_out], const typename CONFIG_T::bias_t BI[CONFIG_T::n_out], const typename CONFIG_T::bias_t BF[CONFIG_T::n_out], const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { - res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; - res_T hidden_state_temp[CONFIG_T::n_out] hls_register; - res_T cell_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; - res_T cell_state_temp[CONFIG_T::n_out] hls_register; - res_T h[CONFIG_T::n_out] hls_register; - res_T c[CONFIG_T::n_out] hls_register; - data_T in[CONFIG_T::n_in] hls_register; + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] [[intel::fpga_register]]; + res_T hidden_state_temp[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T cell_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] [[intel::fpga_register]]; + res_T cell_state_temp[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T h[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T c[CONFIG_T::n_out] [[intel::fpga_register]]; + data_T in[CONFIG_T::n_in] [[intel::fpga_register]]; // Set initially hidden state (output) to zero INIT_LOOP: From 62c5ecb3cedeb30b93ae133948aea48aa9c93b33 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 11:39:08 -0600 Subject: [PATCH 020/100] define array in nnet_types for oneAPI --- hls4ml/backends/fpga/fpga_types.py | 2 +- hls4ml/backends/oneapi/oneapi_types.py | 6 ++++-- hls4ml/templates/oneapi/firmware/defines.h | 2 -- hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h | 3 +++ hls4ml/templates/quartus/firmware/defines.h | 2 -- hls4ml/templates/vivado/firmware/defines.h | 2 -- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 41f3cd12e..ceac0b5e4 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -172,7 +172,7 @@ def convert_precision(self, precision_converter): class PackedTypeConverter(TypeDefinition, TypePrecisionConverter): def definition_cpp(self): n_elem_expr = '/' if self.unpack else '*' - return 'typedef array<{precision}, {n_elem}> {name};\n'.format( + return 'typedef nnet::array<{precision}, {n_elem}> {name};\n'.format( name=self.name, precision=self.precision.definition_cpp(), n_elem=str(self.n_elem) + n_elem_expr + str(self.n_pack), diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 4559c1f9e..103f015c4 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -80,7 +80,9 @@ def definition_cpp(self, name_suffix='', as_reference=False): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.type.name} = array<{self.type.precision.definition_cpp()}, {self.size_cpp()}>;\n' + lines += ( + indent + f'using {self.type.name} = 
nnet::array<{self.type.precision.definition_cpp()}, {self.size_cpp()}>;\n' + ) lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' @@ -103,7 +105,7 @@ def definition_cpp(self, name_suffix='', as_reference=True): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.name} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.name} = nnet::array<{self.type.name}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + f'{self.type}, {pipe_min_size}>;\n' diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index b88fca49b..05b98cda2 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -10,8 +10,6 @@ // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" -using std::array; - // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h index 221055938..cd572f0c7 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -1,12 +1,15 @@ #ifndef NNET_TYPES_H_ #define NNET_TYPES_H_ +#include #include #include #include namespace nnet { +template using array = std::array; + /* * HLS Shift Register Implementation * To verify a shift register is used in hardware, go to report.html > Area Analysis of System diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h index a465f2716..c3fe4ec40 100644 --- a/hls4ml/templates/quartus/firmware/defines.h +++ b/hls4ml/templates/quartus/firmware/defines.h @@ -36,8 +36,6 @@ template using stream_out = ihc::stream_out; // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" -using nnet::array; - // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/vivado/firmware/defines.h b/hls4ml/templates/vivado/firmware/defines.h index e0a75ec64..1f11b0209 100644 --- a/hls4ml/templates/vivado/firmware/defines.h +++ b/hls4ml/templates/vivado/firmware/defines.h @@ -7,8 +7,6 @@ #include #include -using nnet::array; - // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision From d203b42af702fc9b14ad47354c742f796306b088 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 13:45:35 -0600 Subject: [PATCH 021/100] fix parallel conv2d --- .../firmware/nnet_utils/nnet_activation.h | 3 +- .../oneapi/firmware/nnet_utils/nnet_conv2d.h | 6 +- .../nnet_utils/nnet_conv2d_resource.h | 65 ++++++++++--------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index ef22a6b20..3fbeeaa66 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -404,7 +404,8 @@ template void softsign(const data // ************************************************* // ELU Activation // 
************************************************* -template void elu(const data_T &data, const res_T alpha, res_T &res) { +template +void elu(const data_T &data, const typename res_T::value_type alpha, res_T &res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h index 3aa71a74b..0038ce7d1 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -50,8 +50,7 @@ struct conv2d_config { }; template -void conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { @@ -59,8 +58,7 @@ void conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T: } template -void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h index 85c4c78d9..8c7fdcad2 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -13,10 +13,8 @@ enum class conv2d_implementation { combination, im2col, winograd }; // im2col - General-purpose 2D Convolution algorithm // **************************************************************** -template -void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int row, - const int col) { +template +void im2col_2d_cl(const data_T &data, data_col_T &data_col, const int row, const int col) { // im2col can be unrolled fully, since number of parallel executions = filt_h x filt_w x n_chann ~ O(100) and very little // DSP usage @@ -49,8 +47,7 @@ void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ } template -void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_2d_im2col_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { @@ -62,6 +59,10 @@ void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CO static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), CONFIG_T::out_height); + using data_col_T = + array; + using res_col_T = array; + HeightLoop: #pragma 
unroll pfr for (int i = 0; i < CONFIG_T::out_height; i++) { @@ -73,12 +74,11 @@ void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CO // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - [[intel::fpga_register]] data_T - data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; - im2col_2d_cl(data, data_col, i, j); + [[intel::fpga_register]] data_col_T data_col; + im2col_2d_cl(data, data_col, i, j); - [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); // Unroll fully, since // (1) n_filt is usually low in io_parallel (< 32) @@ -122,8 +122,7 @@ inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D template void winograd_conv2d_3x3_kernel_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { @@ -145,7 +144,7 @@ void winograd_conv2d_3x3_kernel_cl( int offset = CONFIG_T::n_filt * i; #pragma unroll for (int f = 0; f < CONFIG_T::n_filt; f++) { - res[offset + f] = static_cast(biases[f]); + res[offset + f] = static_cast(biases[f]); } } @@ -159,7 +158,7 @@ void winograd_conv2d_3x3_kernel_cl( #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { // Get current 4x4 tile - [[intel::fpga_register]] data_T T[16]; + [[intel::fpga_register]] typename data_T::value_type T[16]; [[intel::fpga_register]] typename CONFIG_T::accum_t D[16]; [[intel::fpga_register]] uint8_t p = 0; @@ -176,7 +175,7 @@ void winograd_conv2d_3x3_kernel_cl( } // Transform input tile - winograd_transform_input_tile_3x3_kernel(T, D); + winograd_transform_input_tile_3x3_kernel(T, D); #pragma unroll for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { @@ -191,16 +190,20 @@ void winograd_conv2d_3x3_kernel_cl( // Explicitly transform intermediate result Z = A'YA and save to output res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] += - static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10]); + static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + + Y[10]); if ((col + 1) < CONFIG_T::out_height) res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] += - static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11]); + static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - + Y[11]); if ((row + 1) < CONFIG_T::out_width) res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] += - static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14]); + static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - + Y[13] - Y[14]); if ((row + 1) < (CONFIG_T::out_width) && (col + 1) < CONFIG_T::out_height) res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] += - static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - Y[13] + Y[14]); + static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - + Y[13] + Y[14]); } } } @@ -211,9 +214,8 @@ void winograd_conv2d_3x3_kernel_cl( // 2D Convolution 
for 1x1 kernels using optimized im2col // **************************************************************** -template -void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::n_chan], const int row, const int col) { +template +void im2col_2d_pointwise_cl(const data_T &data, data_col_T &data_col, const int row, const int col) { // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations [[intel::fpga_register]] int index = 0; @@ -235,8 +237,7 @@ void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width } template -void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); @@ -246,6 +247,9 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), CONFIG_T::out_height); + using data_col_T = array; + using res_col_T = array; + HeightLoop: #pragma unroll pfr for (int row = 0; row < CONFIG_T::out_height; row++) { @@ -257,11 +261,11 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - [[intel::fpga_register]] data_T data_col[CONFIG_T::n_chan]; - im2col_2d_pointwise_cl(data, data_col, row, col); + [[intel::fpga_register]] data_col_T data_col; + im2col_2d_pointwise_cl(data, data_col, row, col); - [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_T res_col; + dense_resource(data_col, res_col, weights, biases); FiltLoop: #pragma unroll @@ -276,8 +280,7 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i // Top-level function - handles different implementations // **************************************************************** template -void conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { From f983ecea46f760a992c182da83a15b34e49338a7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 17:41:17 -0600 Subject: [PATCH 022/100] add back the streaming versions of algs, most unconverted --- .../firmware/nnet_utils/nnet_activation.h | 3 +- .../nnet_utils/nnet_activation_stream.h | 661 ++++++++++++++++++ .../nnet_utils/nnet_batchnorm_stream.h | 106 +++ .../firmware/nnet_utils/nnet_conv1d_stream.h | 172 +++++ .../firmware/nnet_utils/nnet_conv2d_stream.h | 238 +++++++ .../firmware/nnet_utils/nnet_dense_stream.h | 23 + .../firmware/nnet_utils/nnet_embed_stream.h | 29 + .../firmware/nnet_utils/nnet_merge_stream.h | 341 
+++++++++ .../firmware/nnet_utils/nnet_padding_stream.h | 83 +++ .../firmware/nnet_utils/nnet_pooling_stream.h | 317 +++++++++ .../nnet_utils/nnet_recurrent_stream.h | 65 ++ .../firmware/nnet_utils/nnet_resize_stream.h | 56 ++ .../oneapi/firmware/nnet_utils/nnet_stream.h | 116 +++ .../nnet_utils/nnet_transpose_stream.h | 32 + 14 files changed, 2240 insertions(+), 2 deletions(-) create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 3fbeeaa66..1aceaeb26 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -235,8 +235,7 @@ template void softmax_argmax(cons [[intel::fpga_register]] auto maximum = data[0]; [[intel::fpga_register]] int idx = 0; - #pragma ii 1 - for (int i = 1; i < CONFIG_T::n_in; i++) { + [[intel::initiation_interval(1)]] for (int i = 1; i < CONFIG_T::n_in; i++) { if (data[i] > maximum) { maximum = data[i]; idx = i; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h new file mode 100644 index 000000000..8cb1349fd --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,661 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" + +namespace nnet { + +// ************************************************* +// Linear Activation +// ************************************************* +template void linear() { +LinearActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + out_data[j] = in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ReLU Activation +// ************************************************* +template void relu() { +ReLUActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + ReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if 
(in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(const typename data_pipe::value_type::value_type alpha) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +LeakyReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + LeakyReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(const typename data_pipe::value_type::value_type theta) { +ThresholdedReLUActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + ThresholdedReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(const typename data_pipe::value_type::value_type alpha) { +#include "activation_tables/elu_table.tb" + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +EluActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + EluPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +template void elu() { + elu(data, 1.0, res); +} + +// ************************************************* +// SeLU Activation +// ************************************************* +template void selu() { +#include "activation_tables/selu_table.tb" + +SeluActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SeluPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = typename data_pipe::value_type::value_type(1.0507009873554804934193349852946) * datareg; + } else { + int index = (datareg * 
CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in]) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +PReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + PReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_pipe::value_type::size + j] * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template void softplus() { +#include "activation_tables/softplus_table.tb" + +SoftplusActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SoftplusPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); + [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template void softsign() { +#include "activation_tables/softsign_table.tb" + + static const int MAX_VALUE = 8; + +SoftsignActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SoftsignPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type absValue; + ; + if (in_data[j] < 0) { + absValue = -in_data[j]; + } else { + absValue = in_data[j]; + } + ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data[j] < 0) { + out_data[j] = static_cast(-softsign_table[index]); + } else { + out_data[j] = static_cast(softsign_table[index]); + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +template void softmax_stable() { +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + + [[intel::fpga_register]] typename data_pipe::value_type::value_type 
data_array[data_pipe::value_type::size]; + +SoftmaxArrayLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + auto in_pack = data_pipe::read(); + + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + [[intel::fpga_register]] typename data_pipe::value_type::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed + d_xi_xmax[data_pipe::value_type::size]; + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + exp_res[j] = exp_table[softmax_stable_idx_from_real_val( + d_xi_xmax[j])]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>( + exp_res, op_add); + + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + typename res_pipe::value_type out_pack; + + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + + // TODO - Find Quartus-equivalent pragma + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_latency() { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + +SoftmaxExpLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + auto in_pack = data_pipe::read(); + + SoftmaxExpPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + exp_res[j] = + exp_table_latency[softmax_latency_idx_from_real_val( + in_pack[j])]; + } + + // Explicitly sum the results with an adder tree. 
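// For illustration only -- a sketch, not the reduce<> from nnet_common.h invoked just below, of
// the kind of balanced reduction that Op_add is combined with here: the exponentials are summed
// pairwise so the adder depth grows as O(log N) instead of forming a serial accumulation chain.
// tree_sum and its arguments are hypothetical names.
template <typename T, int N> T tree_sum(const T (&x)[N], int lo = 0, int len = N) {
    if (len == 1)
        return x[lo];
    const int half = len / 2;
    // Sum each half independently, then combine the two partial sums with one adder.
    return tree_sum(x, lo, half) + tree_sum(x, lo + half, len - half);
}
// Under these assumptions, exp_sum would be obtained as tree_sum(exp_res).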
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + + typename res_pipe::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_legacy() { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + + // Index into the lookup table based on data for exponentials + [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; + [[intel::fpga_register]] typename data_pipe::value_type::value_type data_cache[data_pipe::value_type::size]; + +SoftmaxInitLoop: + [[intel::initiation_interval(1)]] for (unsigned s = 0; s < CONFIG_T::n_in / data_pipe::value_type::size; s++) { + auto in_pack = data_pipe::read(); + + SoftmaxInitPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma unroll + for (int i = 0; i < data_pipe::value_type::size; i++) { + SoftmaxExpInner: + #pragma unroll + for (int j = 0; j < data_pipe::value_type::size; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = ((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table_legacy[index]; + } + exp_res[i] += exp_diff_res; + } + } + + typename res_pipe::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_pack[j] = static_cast(invert_table_legacy[exp_res_index]); + } + + res_pipe::write(out_pack); + } +} + +template void softmax_argmax() { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + #pragma unroll + for (int i = 0; i < res_pipe::value_type::size; i++) { + out_data[i] = static_cast(0); + } + + [[intel::fpga_register]] typename data_pipe::value_type::value_type maximum = in_data[0]; + [[intel::fpga_register]] int idx = 0; + + [[intel::initiation_interval(1)]] for (int i = 1; i < res_pipe::value_type::size; i++) { + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = static_cast(1); + res_pipe::write(out_data); + } +} + +template void softmax() { + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + 
softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + default: + softmax_stable(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void dense_tanh() { +#include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +TanHActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + TanHPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = tanh_table[index]; + else + out_data[j] = -tanh_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template void sigmoid() { +#include "activation_tables/sigmoid_table.tb" + static const int MAX_VALUE = 8; + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +SigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = sigmoid_table[index]; + else + out_data[j] = 1 - sigmoid_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +// Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations +template void hard_sigmoid() { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + 
CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res_pipe::write(out_data); + } +} + +template void hard_tanh() { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh() { +BinaryTanHActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename res_pipe::value_type out_data; + + BinaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 0) + out_data[j] = static_cast(1); + else + out_data[j] = static_cast(-1); + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh() { +TernaryTanHActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename res_pipe::value_type out_data; + + TernaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 1) + out_data[j] = static_cast(1); + else if (in_data[j] <= -1) + out_data[j] = static_cast(-1); + else + out_data[j] = static_cast(0); + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 000000000..0f5970bfe --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,106 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** +template +void normalize(stream &data, stream &res, const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; + CONFIG_T::template product::limit(multiplier_limit); + +BatchNormLoop: + #pragma ii pipeline + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + + BatchNormpack: + #pragma 
unroll + for (int j = 0; j < data_T::size; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * data_T::size + j; + else + norm_index = j % CONFIG_T::n_filt; + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(stream &data, stream, CONFIG_T::n_scale_bias>> &res, + const typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { + +BinaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * data_T::size + j; + else + norm_index = j % CONFIG_T::n_filt; + + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(stream &data, stream, CONFIG_T::n_scale_bias>> &res, + const typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], + const typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { + +TernaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * data_T::size + j; + else + norm_index = j % CONFIG_T::n_filt; + + if (in_data[j] > threshold_hi[norm_index]) + out_data[j] = 1; + else if (in_data[j] <= threshold_lo[norm_index]) + out_data[j] = -1; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 000000000..28e9f6b87 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,172 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line the buffer during the shift line buffer operation + * kernel_window - array of values from the input curently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_1d(typename data_T::value_type shift_buffer[CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]) { +/* + * Manually shift kernel_window by one step to the left + * Not possible to use nnet::shift_reg as the kernel window is convolved with the kernel weights using dense matrix + * multiplication Dense matrix multiplication is only implemented for arrays However, provided certain timing constrains are + * met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register To verify, see + * synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for 
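    // Illustrative window layout (filt_width = 3, n_chan = 2, values hypothetical): the window
    // is stored channel-interleaved as {x0c0, x0c1, x1c0, x1c1, x2c0, x2c1}; this loop copies
    // columns 1..filt_width-1 one step to the left, and the loop after it writes the freshly
    // popped channel values into the right-most column, so the window always holds the most
    // recent filt_width pixels of every channel.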
(int col = 0; col < CONFIG_T::filt_width - 1; col++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[col * CONFIG_T::n_chan + channel] = kernel_window[(col + 1) * CONFIG_T::n_chan + channel]; + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + channel] = shift_buffer[channel]; + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels line_buffer - chained array of shift registers, one for each row of the kernel and channel shift_buffer - + * array elements popped from the line the buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[channel] = in_elem[channel]; + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the kernel and channel kernel_window - array of values from the input curently convolved + * with the kernel weights - Conv1D layer weights biases - Conv1D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Matrix mulitplication - performs dense matrix multiplication between the current input window and + * kernel weights (4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_1d( + const data_T &in_elem, stream &res_stream, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan], + const typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::filt_width - 1; + + // X position pixel + static int pX = 0; + + // X strides + static int sX = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_chan]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > 
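    // Worked example of this window-ready test (numbers are illustrative): with filt_width = 3,
    // lShiftX = 2, so the first dense multiplication fires on the third pixel (pX = 2); because
    // sX is rewound by stride_width - 1 below, a stride of 2 makes subsequent outputs fire at
    // pX = 4, 6, ... -- i.e. one output every stride_width input pixels.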
(lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] typename res_T::value_type res_out[CONFIG_T::n_filt]; + dense_resource( + kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_1d_cl(stream &data, stream &res, + const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[CONFIG_T::n_chan]; + [[intel::fpga_register]] static typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + static const data_T padds(0); + +// Input image left-side padding +PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_1d(padds, res, line_buffer, kernel_window, weights, biases); + } + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_1d(data.read(), res, line_buffer, kernel_window, weights, biases); + } + +// Input image right-side padding +PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_1d(padds, res, line_buffer, kernel_window, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 000000000..1090f9bda --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,238 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line the buffer during the shift line buffer operation + * kernel_window - array of values from the input curently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_2d( + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan]) { +/* + * Manually shift kernel_window by one step to the left + * Not possible to use nnet::shift_reg as the kernel window is convolved with the kernel weights using dense matrix + * multiplication Dense matrix multiplication is only implemented for arrays However, provided certain timing constrains are + * met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register To verify, see + * synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_width - 1; col++) { + 
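        // Indexing reminder (illustrative): kernel_window is laid out as
        // [row * filt_width * n_chan + col * n_chan + channel], so this loop nest shifts every
        // column of every row one step to the left before the push loop below writes the newly
        // popped pixels into the right-most column of each row.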
KernelShiftHeight: + #pragma unroll + for (int row = 0; row < CONFIG_T::filt_height; row++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + col * CONFIG_T::n_chan + channel] = + kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + (col + 1) * CONFIG_T::n_chan + channel]; + } + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushHeight: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_height; col++) { + KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + col * CONFIG_T::filt_width * CONFIG_T::n_chan + + channel] = shift_buffer[col][channel]; + } + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels line_buffer - chained array of shift registers, one for each row of the kernel and channel shift_buffer - + * array elements popped from the line the buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_2d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[CONFIG_T::filt_height - 1][channel] = in_elem[channel]; + } + +// Shift line buffer and save popped values to shift buffer +LineBufferDataIn: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + LineBufferShift: + #pragma unroll + for (unsigned col = 1; col < CONFIG_T::filt_height; col++) { + // Shift the line buffer, return the popped pixel + typename data_T::value_type pop = + line_buffer[col - 1][channel].shift(shift_buffer[CONFIG_T::filt_height - col][channel]); + + // Place popped pixed into the shift buffer, one row above + shift_buffer[CONFIG_T::filt_height - col - 1][channel] = pop; + } + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the kernel and channel kernel_window - array of values from the input curently convolved + * with the kernel weights - Conv1D/Conv2D layer weights biases - Conv1D/Conv2D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Matrix mulitplication - performs dense matrix multiplication between the current input window and + * kernel weights 
(4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_2d( + const data_T &in_elem, stream &res_stream, + nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::filt_width - 1; + static constexpr int lShiftY = CONFIG_T::filt_height - 1; + + // X, Y position pixels + static int pX = 0; + static int pY = 0; + + // X, Y strides + static int sX = 0; + static int sY = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]; + nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_2d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] typename res_T::value_type res_out[CONFIG_T::n_filt]; + dense_resource( + kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) && + (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) { + pX = 0; + sX = 0; + pY = 0; + sY = 0; + // Reached end of row + } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + pY++; + sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1); + // Same row, same colum, therefore, move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? 
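        // Position bookkeeping (illustrative summary): pX/pY walk over the padded image of size
        // (in_width + pad_left + pad_right) x (in_height + pad_top + pad_bottom); pX and sX are
        // cleared at the end of every row, everything is cleared at the end of the image, and
        // whenever a stride counter reaches filt_{width,height} - 1 it is pulled back by
        // stride - 1 rather than cleared, which spaces the emitted windows by the stride.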
(sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_2d_cl(stream &data, stream &res, + const typename CONFIG_T::weight_t + weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan]; + [[intel::fpga_register]] static + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + static const data_T padds(0); + +// Padding above input image +PaddingTopHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::pad_top; row++) { + PaddingTopWidth: + for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + } + +ReadInputHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::in_height; row++) { + // Input image left-side padding + PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + + // Read input image + ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_2d(data.read(), res, line_buffer, kernel_window, weights, biases); + } + + // Input image right-side padding + PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + } + +// Padding below input image +PaddingBottomHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::pad_bottom; row++) { + PaddingBottomWidth: + for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h new file mode 100644 index 000000000..85b734624 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,23 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +// Note: DataPack logic removed, at least in the initial version +template +void dense_resource(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + [[intel::fpga_register]] typename res_pipe::value_type res; + [[intel::fpga_register]] auto data = data_pipe::read() + dense_resource(data, res, weights, biases); + res_pipe::write(res); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h new file mode 100644 index 000000000..51e54e991 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h @@ -0,0 +1,29 @@ +#ifndef NNET_EMBED_STREAM_H_ +#define NNET_EMBED_STREAM_H_ + +namespace nnet { + +template +void embedding(stream &data, stream &res, + const typename CONFIG_T::embeddings_t 
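              // (embeddings is the flattened [vocab_size x n_out] lookup table; illustrative
              //  example, values hypothetical: with n_out = 4, an input token index of 7 reads
              //  entries 28..31, i.e. row 7 of the table)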
embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + data_T in_data = data.read(); + +InputSequence: + #pragma ii CONFIG_T::reuse_factor + for (int j = 0; j < data_T::size; j++) { + + res_T res_pack; + + DenseEmbedding: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i]; + } + + res.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h new file mode 100644 index 000000000..aeafc00ca --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h @@ -0,0 +1,341 @@ +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +namespace nnet { + +template +void add(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + AddPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(in_data1[j] + in_data2[j]); + } + + res.write(out_data); + } +} + +template +void subtract(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + SubtractPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(in_data1[j] - in_data2[j]); + } + + res.write(out_data); + } +} + +template +void multiply(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MultLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + MultPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(in_data1[j] * in_data2[j]); + } + + res.write(out_data); + } +} + +template +void average(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AvgLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + AvgPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = + static_cast((in_data1[j] + in_data2[j]) / (typename res_T::value_type)2); + } + + res.write(out_data); + } +} + +template +void maximum(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MaxLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T 
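        // Note on the elementwise merges (illustrative): every iteration consumes exactly one
        // pack from each input stream and writes one output pack, so for n_elem = 16 and a pack
        // size of 4 the loop body runs 4 times; the assert above enforces that both inputs and
        // the output use the same pack size.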
in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + MaxPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(out_data[j] = (in_data1[j] > in_data2[j]) ? in_data1[j] + : in_data2[j]); + } + + res.write(out_data); + } +} + +template +void minimum(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MinLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + MinPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] + : in_data2[j]); + } + + res.write(out_data); + } +} + +template +void concatenate1d(stream &data1, stream &data2, stream &res) { + [[intel::fpga_register]] res_T out_data; + +ConcatLoop1: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + ConcatPack1: + #pragma unroll + for (int j = 0; j < input1_T::size; j++) { + out_data[j + (i * input1_T::size)] = static_cast(in_data1[j]); + } + } + +ConcatLoop2: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + ConcatPack2: + #pragma unroll + for (int j = 0; j < input2_T::size; j++) { + out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = + static_cast(in_data2[j]); + } + } + res.write(out_data); +} + +template +void concatenate2d_0(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight1: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + res.write(out_data); + } + +ConcatLoopHeight2: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[input1_T::size + k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } +} + +template +void concatenate2d(stream &data1, stream &data2, stream &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight1: + for (int i 
= 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] res_T out_data; + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + res.write(out_data); + } + } + +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + res.write(out_data); + } + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[input1_T::size + k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(stream &data1, stream &data2, stream &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h new file mode 100644 index 000000000..8990a3339 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h @@ -0,0 +1,83 @@ +#ifndef NNET_PADDING_STREAM_H_ +#define NNET_PADDING_STREAM_H_ + +namespace nnet { + +template inline void fill_zero(stream &res) { + [[intel::fpga_register]] res_T res_part; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_chan; i++) { + res_part[i] = 0; + } + res.write(res_part); +} + +template inline void fill_data(stream &data, stream &res) { + [[intel::fpga_register]] data_T data_part = data.read(); + 
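    // Illustrative padding trace (numbers hypothetical): with pad_left = pad_right = 1 and
    // in_width = 4, zeropad1d_cl below writes six packs to the output stream -- one all-zero
    // pack from fill_zero, the four input packs copied by fill_data, then one more all-zero
    // pack.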
[[intel::fpga_register]] res_T res_part; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_chan; i++) { + res_part[i] = data_part[i]; + } + res.write(res_part); +} + +template void zeropad1d_cl(stream &data, stream &res) { +PadLeft: + for (int i = 0; i < CONFIG_T::pad_left; i++) { + fill_zero(res); + } + +CopyMain: + for (int i = 0; i < CONFIG_T::in_width; i++) { + fill_data(data, res); + } + +PadRight: + for (int i = 0; i < CONFIG_T::pad_right; i++) { + fill_zero(res); + } +} + +template void zeropad2d_cl(stream &data, stream &res) { +PadTop: + #pragma loop_coalesce 2 + for (int i = 0; i < CONFIG_T::pad_top; i++) { + PadTopWidth: + for (int j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } + +PadMain: + #pragma loop_coalesce 2 + for (int i = 0; i < CONFIG_T::in_height; i++) { + + PadLeft: + for (int j = 0; j < CONFIG_T::pad_left; j++) { + fill_zero(res); + } + + CopyMain: + for (int j = 0; j < CONFIG_T::in_width; j++) { + fill_data(data, res); + } + + PadRight: + for (int j = 0; j < CONFIG_T::pad_right; j++) { + fill_zero(res); + } + } + +PadBottom: + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + PadBottomWidth: + for (int j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 000000000..ffaf74b2f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,317 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "nnet_conv1d_stream.h" +#include "nnet_conv2d_stream.h" +#include "nnet_pooling.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void compute_pool_buffer_1d(in_element, res_stream, line_buffer, kernel_window) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the pool and channel kernel_window - array of values from the input curently being pooled + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Pooling - performs dense matrix multiplication between the current input window and kernel weights (4) + * Counter housekeeping - performs the required pooling operation + * + */ +template +void compute_pool_buffer_1d(const data_T &in_elem, stream &res_stream, + nnet::shift_reg line_buffer[CONFIG_T::n_filt], + typename data_T::value_type kernel_window[CONFIG_T::pool_width * CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::pool_width - 1; + + // X position pixels + static int pX = 0; + + // X strides + static int sX = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_filt]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full pool window + if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) { + [[intel::fpga_register]] res_T res_pack; + + FiltLoop: + #pragma 
unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_width]; + + // Retrieve data for current channel + PoolLoop: + #pragma unroll + for (int i = 0; i < CONFIG_T::pool_width; i++) { + pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter]; + } + + // Step 3 - Pooling + res_pack[filter] = static_cast( + pool_op(pool_window)); + } + + // Write result to output stream + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template void pooling1d_cl(stream &data, stream &res) { + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[CONFIG_T::n_filt]; + [[intel::fpga_register]] static typename data_T::value_type kernel_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_pool_buffer_1d(data.read(), res, line_buffer, kernel_window); + } +} + +/* + * void compute_pool_buffer_2d(in_element, res_stream, line_buffer, kernel_window) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the pool and channel kernel_window - array of values from the input curently being pooled + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Pooling - performs dense matrix multiplication between the current input window and kernel weights (4) + * Counter housekeeping - performs the required pooling operation + * + */ +template +void compute_pool_buffer_2d( + const data_T &in_elem, stream &res_stream, + nnet::shift_reg line_buffer[CONFIG_T::pool_height - 1] + [CONFIG_T::n_filt], + typename data_T::value_type kernel_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::pool_width - 1; + static constexpr int lShiftY = CONFIG_T::pool_height - 1; + + // X, Y position pixels + static int pX = 0; + static int pY = 0; + + // X, Y strides + static int sX = 0; + static int sY = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::pool_height][CONFIG_T::n_filt]; + nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_2d(shift_buffer, kernel_window); + + // Check to see if we have a full pool window + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) { + [[intel::fpga_register]] res_T res_pack; + + FiltLoop: + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_height * 
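            // (pool_window gathers the pool_height x pool_width samples of one channel out of
            //  the interleaved kernel_window; illustrative example: a 2x2 pool with stride 2 on
            //  an 8x8 input -- which satisfies the asserts in pooling2d_cl, pool size equal to
            //  stride and no padding -- produces a 4x4 output per filter)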
CONFIG_T::pool_width]; + + // Retrieve data for current channel + PoolLoop: + #pragma unroll + for (int i = 0; i < CONFIG_T::pool_height * CONFIG_T::pool_width; i++) { + pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter]; + } + + // Step 3 - Pooling + res_pack[filter] = static_cast( + pool_op( + pool_window)); + } + + // Write result to output stream + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) && + (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) { + pX = 0; + sX = 0; + pY = 0; + sY = 0; + // Reached end of row + } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + pY++; + sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1); + // Same row, same colum, therefore, move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template void pooling2d_cl(stream &data, stream &res) { + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt]; + [[intel::fpga_register]] static + typename data_T::value_type kernel_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + +ReadInputHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::in_height; row++) { + // Read input image + ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_pool_buffer_2d(data.read(), res, line_buffer, kernel_window); + } + } +} + +/* + * A function used with Global Pooling + * Returns the value before pooling + * Max : Return the minimal possible value + * Avg : Return 0 + */ +template inline T init_pool_value() { + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + } + case Average: + return 0; + } +} + +/* + * A function used with Global Pooling + * Updates the output pooling value + * Max : Return the maximum between the previous maximum and current input + * Avg : Returns the cumulative sum + */ +template inline T_y reduce_global_pool(T_y y, T_x x) { + if (op == Max) { + return (x > y) ? 
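    // Note on init_pool_value above (illustrative): for Max it builds the most negative
    // representable value by setting only the sign bit (e.g. -128 for an 8-bit signed type),
    // so the first real sample always wins this comparison; for Average the accumulator simply
    // starts at 0 and reduce_global_pool keeps a running sum.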
(T_y)x : y; + } else { + return (T_y)(x + y); + } +} + +/* + * A function used with Global Pooling + * For every filter, it updates the value by summing the current input (Average) or updating the maximum value (Max) + */ +template +void compute_global_pool(const data_T &in_elem, typename CONFIG_T::accum_t data_input[CONFIG_T::n_filt]) { + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = reduce_global_pool( + data_input[i], in_elem[i]); + } +} + +template void global_pooling1d_cl(stream &data, stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + [[intel::fpga_register]] typename CONFIG_T::accum_t data_input[CONFIG_T::n_filt]; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = init_pool_value(); + } + + for (int i = 0; i < CONFIG_T::n_in; i++) { + compute_global_pool(data.read(), data_input); + } + + [[intel::fpga_register]] res_T res_pack; + if (CONFIG_T::pool_op == Average) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i] / CONFIG_T::n_in); + } + } else { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i]); + } + } + + res.write(res_pack); +} + +template void global_pooling2d_cl(stream &data, stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + + [[intel::fpga_register]] typename CONFIG_T::accum_t data_input[CONFIG_T::n_filt]; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = init_pool_value(); + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::in_width; j++) { + compute_global_pool(data.read(), data_input); + } + } + + [[intel::fpga_register]] res_T res_pack; + if (CONFIG_T::pool_op == Average) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = + static_cast(data_input[i] / (CONFIG_T::in_width * CONFIG_T::in_height)); + } + } else { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i]); + } + } + + res.write(res_pack); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h new file mode 100644 index 000000000..9e51d35a0 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h @@ -0,0 +1,65 @@ +#ifndef NNET_RECURRENT_STREAM_H_ +#define NNET_RECURRENT_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recurrent_activation.h" + +namespace nnet { +template +void gru(stream &data_stream, stream &res_stream, + const typename CONFIG_T::weight_t weights[3 * CONFIG_T::n_units * CONFIG_T::n_in], + const typename CONFIG_T::weight_t recurrent_weights[3 * CONFIG_T::n_units * CONFIG_T::n_units], + const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], + const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { + + [[intel::fpga_register]] typename res_T::value_type h[CONFIG_T::n_units]; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_units; i++) { + h[i] = 0; + } + + [[intel::fpga_register]] typename data_T::value_type x[CONFIG_T::n_in]; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_timesteps * CONFIG_T::n_in / data_T::size; i_in++) { + data_T data_pack = data_stream.read(); + + DataPack: + #pragma unroll + for (int i_pack = 0; 
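        // GRU streaming summary (illustrative): the outer loop consumes
        // n_timesteps * n_in / data_T::size packs, unpacks each one into x and calls gru_cell to
        // update the hidden state h (n_units wide); with return_sequences set, h is written out
        // after every timestep, otherwise only the final h is written once the loop finishes.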
i_pack < data_T::size; i_pack++) { + x[i_pack] = data_pack[i_pack]; + } + + nnet::gru_cell(x, h, weights, recurrent_weights, + bias, recurrent_bias); + + if (CONFIG_T::return_sequences) { + res_T res_pack; + + ResPackRetSeq: + #pragma unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = h[i_pack]; + } + + res_stream.write(res_pack); + } + } + + if (!CONFIG_T::return_sequences) { + res_T res_pack; + + ResPackNoRetSeq: + #pragma unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = h[i_pack]; + } + + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h new file mode 100644 index 000000000..c619edb7c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h @@ -0,0 +1,56 @@ +#ifndef NNET_IMAGE_STREAM_H_ +#define NNET_IMAGE_STREAM_H_ + +#include "nnet_common.h" + +namespace nnet { + +template void resize_nearest(stream &image, stream &resized) { + assert(CONFIG_T::new_height % CONFIG_T::height == 0); + assert(CONFIG_T::new_width % CONFIG_T::width == 0); + + constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height; + constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width; + +ImageHeight: + for (unsigned h = 0; h < CONFIG_T::height; h++) { + [[intel::fpga_register]] data_T data_in_row[CONFIG_T::width]; + + ImageWidth: + for (unsigned i = 0; i < CONFIG_T::width; i++) { + [[intel::fpga_register]] data_T in_data = image.read(); + + ImageChan: + #pragma unroll + for (unsigned j = 0; j < CONFIG_T::n_chan; j++) { + data_in_row[i][j] = in_data[j]; + } + } + + ResizeHeight: + for (unsigned i = 0; i < ratio_height; i++) { + + ImageWidth2: + for (unsigned l = 0; l < CONFIG_T::width; l++) { + + ResizeWidth: + for (unsigned j = 0; j < ratio_width; j++) { + + [[intel::fpga_register]] data_T out_data; + + ResizeChan: + #pragma unroll + for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { + out_data[k] = data_in_row[l][k]; + } + + resized.write(out_data); + } + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h new file mode 100644 index 000000000..2bee64476 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h @@ -0,0 +1,116 @@ +#ifndef NNET_CLONE_H +#define NNET_CLONE_H + +#include "nnet_common.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned n_dupl = 2; +}; + +template +void clone_stream(stream &data, stream &res1, stream &res2) { +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + + ClonePack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + } +} + +template +void clone_stream(stream &data, stream &res1, stream &res2, stream &res3) { +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + res_T out_data3; + + ClonePack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = 
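            // (clone_stream fans one tensor out to several consumers: every pack read from the
            //  input is copied element by element into each output stream, so downstream layers
            //  can read the same data independently)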
in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + res3.write(out_data3); + } +} + +template void repack_stream(stream &data, stream &res) { + if (data_T::size == res_T::size) { + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + + [[intel::initiation_interval(1)]] for (int j = 0; j < pack_diff; j++) { + + res_T out_data; + + #pragma unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h new file mode 100644 index 000000000..5fa126890 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h @@ -0,0 +1,32 @@ +#ifndef NNET_TRANSPOSE_STREAM_H_ +#define NNET_TRANSPOSE_STREAM_H_ + +namespace nnet { + +template void transpose_2d(stream &data, stream &res) { + [[intel::fpga_register]] typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { + [[intel::fpga_register]] data_T in_data = data.read(); + + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { + [[intel::fpga_register]] res_T out_data; + + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif From 5dd9282427e9b45375eb683a4caced9d8d0f59bc Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 15 Feb 2024 01:45:07 -0600 Subject: [PATCH 023/100] tentatively complete streaming for dense but not functional --- hls4ml/backends/oneapi/oneapi_template.py | 43 +++++++++ hls4ml/backends/oneapi/oneapi_types.py | 21 ++--- .../backends/oneapi/passes/core_templates.py | 89 +++++++++++++++++-- .../backends/oneapi/passes/transform_types.py | 8 +- hls4ml/templates/oneapi/firmware/defines.h | 1 - .../templates/oneapi/firmware/myproject.cpp | 8 ++ .../nnet_utils/nnet_activation_stream.h | 14 ++- .../oneapi/firmware/nnet_utils/nnet_dense.h | 8 +- .../nnet_utils/nnet_dense_compressed.h | 4 +- .../firmware/nnet_utils/nnet_dense_stream.h | 4 +- hls4ml/writer/oneapi_writer.py | 33 +++++-- 11 files changed, 184 insertions(+), 49 deletions(-) create mode 100644 
hls4ml/backends/oneapi/oneapi_template.py diff --git a/hls4ml/backends/oneapi/oneapi_template.py b/hls4ml/backends/oneapi/oneapi_template.py new file mode 100644 index 000000000..184e319f3 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_template.py @@ -0,0 +1,43 @@ +''' +This package includes oneAPI-specific templates +''' + +from hls4ml.backends.template import Template + + +class StreamFunctionCallTemplate(Template): + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_stream_function_template' + super().__init__(name, layer_class, 'stream_function_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + return params + + def transform(self, model, node): + return super().transform(model, node) + + +class TaskSequenceTemplate(Template): + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_task_sequence_template' + super().__init__(name, layer_class, 'tast_sequence_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['config'] = f'config{layer.index}' + params['input_pipe'] = layer.get_input_variable().pipe_name + params['output_pipe'] = layer.get_output_variable().pipe_name + + return params + + def transform(self, model, node): + return super().transform(model, node) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 103f015c4..640ff3c6b 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -3,12 +3,7 @@ ''' import numpy as np -from hls4ml.backends.fpga.fpga_types import ( - InplaceStreamVariableConverter, - PackedType, - StreamVariableConverter, - VariableDefinition, -) +from hls4ml.backends.fpga.fpga_types import PackedType, VariableDefinition from hls4ml.utils.string_utils import convert_to_pascal_case # region ArrayVarable @@ -40,10 +35,10 @@ def convert(self, tensor_var, pragma='', depth=0, n_pack=1): if pragma == 'stream': if depth == 0: depth = np.prod(tensor_var.shape) // tensor_var.shape[-1] - self.pragma = ('stream', depth) + tensor_var.pragma = ('stream', depth) n_elem = tensor_var.shape[-1] else: - self.pragma = pragma + tensor_var.pragma = pragma n_elem = tensor_var.size() n_pack = 1 # ignore any passed value @@ -103,12 +98,12 @@ class OneAPIStreamVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=True): return f'{self.name}{name_suffix}' - def declare_cpp(self, pipe_min_size=0, indent=''): + def declare_cpp(self, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.name} = nnet::array<{self.type.name}, {self.size_cpp()}>;\n' + # lines += indent + f'using {self.name} = nnet::array<{self.type.name}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type}, {pipe_min_size}>;\n' + + f'{self.type.name}, {self.pragma[-1]}>;\n' ) return lines @@ -118,12 +113,12 @@ def definition_cpp(self): return f'using {self.name} = {self.input_var.name}' -class OneAPIStreamVariableConverter(StreamVariableConverter): +class OneAPIStreamVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): 
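        # Illustrative output of declare_cpp above (all names here are hypothetical): for a
        # stream variable "layer2_out" of element type layer2_t with a pipe depth of 4 it emits
        # roughly
        #   class Layer2OutPipeID;
        #   using Layer2OutPipe = sycl::ext::intel::experimental::pipe<Layer2OutPipeID, layer2_t, 4>;
        # where the depth is taken from the ('stream', depth) pragma attached in convert().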
super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIStreamVariableDefinition) -class OneAPIInplaceStreamVariableConverter(InplaceStreamVariableConverter): +class OneAPIInplaceStreamVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__( type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceStreamVariableDefinition diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index 929b5a8be..ac6e66c48 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -1,4 +1,5 @@ from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax @@ -35,8 +36,11 @@ dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' -# dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] -dense_include_list = ['nnet_utils/nnet_dense.h'] +dense_task_sequence_template = 'task_sequence> {name};' + +dense_stream_function_template = '{name}.async({w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] class DenseConfigTemplate(LayerConfigTemplate): @@ -68,6 +72,30 @@ def format(self, node): return self.template.format(**params) +class DenseTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class DenseStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + # BatchNormalization templates batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ @@ -148,8 +176,11 @@ def format(self, node): activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' -# activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] -activ_include_list = ['nnet_utils/nnet_activation.h'] +activ_task_sequence_template = 'task_sequence> {name};' +activ_stream_function_template = '{name}.async();' +param_activ_stream_function_template = '{name}.async({param});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] class ActivationConfigTemplate(LayerConfigTemplate): @@ -190,7 +221,7 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() - params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" return 
self.template.format(**params) @@ -204,7 +235,7 @@ def format(self, node): params = self._default_function_params(node) params['activation'] = node._get_act_function_name() params['param'] = node.get_attr('activ_param', 1.0) - params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" return self.template.format(**params) @@ -218,6 +249,50 @@ def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() params['param'] = node.get_weights('alpha').name - params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Activation, ParametrizedActivation, PReLU, HardActivation, Softmax, ParametrizedActivation, PReLU)) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + return self.template.format(**params) + + +class ActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax)) + self.template = activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + +class ParametrizedActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation) + self.template = param_activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['param'] = node.get_attr('activ_param', 1.0) + return self.template.format(**params) + + +class PReLUActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(PReLU) + self.template = param_activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['param'] = node.get_weights('alpha').name return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 71a63585b..665857445 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -27,10 +27,14 @@ def transform(self, model, node): for out_name, var in node.variables.items(): if io_type == 'io_stream': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') if isinstance(var, InplaceTensorVariable): - new_var = self.inplace_stream_var_converter.convert(var) + new_var = self.inplace_stream_var_converter.convert(var, pragma='stream') else: - new_var = self.stream_var_converter.convert(var) + new_var = self.stream_var_converter.convert(var, pragma='stream') elif io_type == 'io_parallel': if out_name in node.model.inputs: new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 05b98cda2..05de507dc 100644 --- 
a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -1,7 +1,6 @@ #ifndef DEFINES_H_ #define DEFINES_H_ -#include #include #include #include diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp index 38e18e6ac..06e7d3fe3 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.cpp +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -1,8 +1,14 @@ #include "myproject.h" #include "parameters.h" +#include // hls-fpga-machine-learning insert weights +// The inter-task pipes need to be declared in the global scope +// hls-fpga-machine-learning insert inter-task pipes + +using sycl::ext::intel::experimental::task_sequence; + void MyProject::operator()() const { // **************************************** // NETWORK INSTANTIATION @@ -10,6 +16,8 @@ void MyProject::operator()() const { // hls-fpga-machine-learning read in + // hls-fpga-machine-learning declare task sequences + // hls-fpga-machine-learning insert layers // hls-fpga-machine-learning return diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 8cb1349fd..9989036cb 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -129,9 +129,7 @@ void elu(const typename data_pipe::value_type::value_type alpha) { } } -template void elu() { - elu(data, 1.0, res); -} +template void elu() { elu(1.0); } // ************************************************* // SeLU Activation @@ -452,19 +450,19 @@ template void softmax_argma template void softmax() { switch (CONFIG_T::implementation) { case softmax_implementation::latency: - softmax_latency(data, res); + softmax_latency(); break; case softmax_implementation::stable: - softmax_stable(data, res); + softmax_stable(); break; case softmax_implementation::legacy: - softmax_legacy(data, res); + softmax_legacy(); break; case softmax_implementation::argmax: - softmax_argmax(data, res); + softmax_argmax(); break; default: - softmax_stable(data, res); + softmax_stable(); break; } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index bb5dac59b..2bedac676 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -63,9 +63,7 @@ void dense_rf_gt(const data_T &data, res_T &res, } } Product1: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; Product2: #pragma unroll @@ -119,9 +117,7 @@ void dense_rf_lt(const data_T &data, res_T &res, acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; } ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; MultLoop: #pragma unroll diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h index a66423cef..cb50e4e4b 100644 --- 
a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h @@ -33,9 +33,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], } } ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; CompressedMultLoop: #pragma unroll diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 85b734624..3b7249038 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -13,8 +13,8 @@ void dense_resource(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * C const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { [[intel::fpga_register]] typename res_pipe::value_type res; - [[intel::fpga_register]] auto data = data_pipe::read() - dense_resource(data, res, weights, biases); + [[intel::fpga_register]] auto data = data_pipe::read(); + dense_resource(data, res, weights, biases); res_pipe::write(res); } diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index a31d80b5e..889363e45 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -154,14 +154,23 @@ def write_project_cpp(self, model): elif 'MyProject' in line: newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + # oneAPI pipes need to be declared and passed as template parameters + elif '// hls-fpga-machine-learning insert inter-task pipes' in line: + newline = line + if io_type == 'io_stream': + for layer in model.get_layers(): + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + newline += var.declare_cpp() + # Read in inputs elif '// hls-fpga-machine-learning read in' in line: newline = line if io_type == 'io_parallel': for inp in model_inputs: newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' - else: - raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + # for streaming we don't need to read it in # Insert weights elif '// hls-fpga-machine-learning insert weights' in line: @@ -171,11 +180,18 @@ def write_project_cpp(self, model): if w not in model_brams: newline += f'#include "weights/{w.name}.h"\n' + # Insert task sequences + elif '// hls-fpga-machine-learning declare task sequences' in line: + newline = line + if io_type == 'io_stream': # only need this for io_stream + for layer in model.get_layers(): + ts = layer.get_attr('tast_sequence_cpp') + if ts: + newline += ' ' + ts + '\n' + # Neural net instantiation elif '// hls-fpga-machine-learning insert layers' in line: newline = line + '\n' - model_inputs = model.get_input_variables() - model_outputs = model.get_output_variables() for layer in model.get_layers(): if io_type != 'io_stream': vars = layer.get_variables() @@ -184,7 +200,11 @@ def write_project_cpp(self, model): def_cpp = var.definition_cpp() if def_cpp is not None: newline += ' ' + def_cpp + ';\n' - func = layer.get_attr('function_cpp', None) + func = ( + layer.get_attr('function_cpp') + if io_type == 'io_parallel' + else layer.get_attr('stream_function_cpp') + ) if func: newline += ' ' + func + 
'\n' if model.config.trace_output and layer.get_attr('trace', False): @@ -202,8 +222,7 @@ def write_project_cpp(self, model): if io_type == 'io_parallel': for out in model_outputs: newline += indent + f'{out.pipe_name}::write({out.name});\n' - else: - raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + # don't need to add anything in io_stream # Just copy line else: From 09b95136246fe64d446a4843d3b5f6a311eaac30 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 15 Feb 2024 12:41:41 -0600 Subject: [PATCH 024/100] first version that compiles streaming --- hls4ml/backends/oneapi/oneapi_types.py | 10 +- .../backends/oneapi/passes/core_templates.py | 4 +- .../nnet_utils/nnet_activation_stream.h | 233 ++++++++++-------- .../firmware/nnet_utils/nnet_dense_stream.h | 4 +- hls4ml/writer/oneapi_writer.py | 1 - 5 files changed, 149 insertions(+), 103 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 640ff3c6b..f74e69114 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -11,7 +11,10 @@ class OneAPIArrayVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' class OneAPIInplaceArrayVariableDefinition(VariableDefinition): @@ -71,7 +74,10 @@ def __init__(self, type_converter): class OneAPIInterfaceVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index ac6e66c48..a68600350 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -36,7 +36,7 @@ dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' -dense_task_sequence_template = 'task_sequence> {name};' +dense_task_sequence_template = 'task_sequence> {name};' dense_stream_function_template = '{name}.async({w}, {b});' @@ -176,7 +176,7 @@ def format(self, node): activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' -activ_task_sequence_template = 'task_sequence> {name};' +activ_task_sequence_template = 'task_sequence> {name};' activ_stream_function_template = '{name}.async();' param_activ_stream_function_template = '{name}.async({param});' diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 9989036cb..f9ad60031 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -9,15 +9,16 @@ namespace nnet { // 
************************************************* // Linear Activation // ************************************************* -template void linear() { +template void linear_stream() { LinearActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; LinearPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { out_data[j] = in_data[j]; } @@ -28,15 +29,16 @@ template void linear() { // ************************************************* // ReLU Activation // ************************************************* -template void relu() { +template void relu_stream() { ReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; ReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -52,17 +54,20 @@ template void relu() { // ************************************************* template void leaky_relu(const typename data_pipe::value_type::value_type alpha) { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; LeakyReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; LeakyReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -79,13 +84,14 @@ void leaky_relu(const typename data_pipe::value_type::value_type alpha) { template void thresholded_relu(const typename data_pipe::value_type::value_type theta) { ThresholdedReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; ThresholdedReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > theta) out_data[j] = in_data[j]; else @@ -103,17 +109,20 @@ template void elu(const typename data_pipe::value_type::value_type alpha) { #include "activation_tables/elu_table.tb" - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} 
/ multiplier_limit; EluActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; EluPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; @@ -129,22 +138,25 @@ void elu(const typename data_pipe::value_type::value_type alpha) { } } -template void elu() { elu(1.0); } +template void elu_stream() { + elu_stream(1.0); +} // ************************************************* // SeLU Activation // ************************************************* -template void selu() { +template void selu_stream() { #include "activation_tables/selu_table.tb" SeluActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SeluPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = typename data_pipe::value_type::value_type(1.0507009873554804934193349852946) * datareg; @@ -165,21 +177,24 @@ template void selu() { // ************************************************* template void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in]) { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; PReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; PReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else - out_data[j] = alpha[i * res_pipe::value_type::size + j] * in_data[j]; + out_data[j] = alpha[i * std::tuple_size{} + j] * in_data[j]; } res_pipe::write(out_data); @@ -189,17 +204,18 @@ void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in // ************************************************* // Softplus Activation // ************************************************* -template void softplus() { +template void softplus_stream() { #include "activation_tables/softplus_table.tb" SoftplusActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SoftplusPackLoop: #pragma unroll - for (int j = 0; j < 
res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; if (index < 0) @@ -216,19 +232,20 @@ template void softplus() { // ************************************************* // Softsign Activation // ************************************************* -template void softsign() { +template void softsign_stream() { #include "activation_tables/softsign_table.tb" static const int MAX_VALUE = 8; SoftsignActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SoftsignPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type absValue; ; if (in_data[j] < 0) { @@ -254,44 +271,48 @@ template void softsign() { // Softmax Activation // ************************************************* -template void softmax_stable() { +template void softmax_stable_stream() { #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; - [[intel::fpga_register]] typename data_pipe::value_type::value_type data_array[data_pipe::value_type::size]; + [[intel::fpga_register]] + typename data_pipe::value_type::value_type data_array[std::tuple_size{}]; SoftmaxArrayLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_pack = data_pipe::read(); SoftmaxArrayPackLoop: #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) Op_max op_max; [[intel::fpga_register]] typename data_pipe::value_type::value_type x_max = - reduce{}, Op_max>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation [[intel::fpga_register]] ac_fixed - d_xi_xmax[data_pipe::value_type::size]; + d_xi_xmax[std::tuple_size{}]; #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { d_xi_xmax[j] = data_array[j] - x_max; } // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { exp_res[j] = exp_table[softmax_stable_idx_from_real_val( d_xi_xmax[j])]; } @@ -300,8 +321,8 @@ template void softmax_stabl // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression 
balancing Op_add op_add; [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce>( - exp_res, op_add); + reduce{}, + Op_add>(exp_res, op_add); [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; @@ -309,7 +330,7 @@ template void softmax_stabl SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { // TODO - Find Quartus-equivalent pragma // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -321,23 +342,26 @@ template void softmax_stabl } } -template void softmax_latency() { +template void softmax_latency_stream() { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; SoftmaxExpLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_pack = data_pipe::read(); SoftmaxExpPackLoop: #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val( in_pack[j])]; @@ -356,7 +380,7 @@ template void softmax_laten typename res_pipe::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation out_pack[j] = exp_res[j] * inv_exp_sum; } @@ -365,32 +389,34 @@ template void softmax_laten } } -template void softmax_legacy() { +template void softmax_legacy_stream() { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" // Index into the lookup table based on data for exponentials - [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[std::tuple_size{}]; [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; - [[intel::fpga_register]] typename data_pipe::value_type::value_type data_cache[data_pipe::value_type::size]; + [[intel::fpga_register]] + typename data_pipe::value_type::value_type data_cache[std::tuple_size{}]; SoftmaxInitLoop: - [[intel::initiation_interval(1)]] for (unsigned s = 0; s < CONFIG_T::n_in / data_pipe::value_type::size; s++) { + [[intel::initiation_interval(1)]] for (unsigned s = 0; + s < CONFIG_T::n_in / std::tuple_size{}; s++) { auto in_pack = data_pipe::read(); SoftmaxInitPackLoop: #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { data_cache[j] = in_pack[j]; exp_res[j] = 0; } SoftmaxExpLoop: #pragma unroll - for (int i = 0; i < 
data_pipe::value_type::size; i++) { + for (int i = 0; i < std::tuple_size{}; i++) { SoftmaxExpInner: #pragma unroll - for (int j = 0; j < data_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (i == j) { exp_diff_res = 1; } else { @@ -409,7 +435,7 @@ template void softmax_legac typename res_pipe::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); if (exp_res_index < 0) exp_res_index = 0; @@ -422,20 +448,21 @@ template void softmax_legac } } -template void softmax_argmax() { - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { +template void softmax_argmax_stream() { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; #pragma unroll - for (int i = 0; i < res_pipe::value_type::size; i++) { + for (int i = 0; i < std::tuple_size{}; i++) { out_data[i] = static_cast(0); } [[intel::fpga_register]] typename data_pipe::value_type::value_type maximum = in_data[0]; [[intel::fpga_register]] int idx = 0; - [[intel::initiation_interval(1)]] for (int i = 1; i < res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 1; i < std::tuple_size{}; i++) { if (in_data[i] > maximum) { maximum = in_data[i]; idx = i; @@ -447,22 +474,22 @@ template void softmax_argma } } -template void softmax() { +template void softmax_stream() { switch (CONFIG_T::implementation) { case softmax_implementation::latency: - softmax_latency(); + softmax_latency_stream(); break; case softmax_implementation::stable: - softmax_stable(); + softmax_stable_stream(); break; case softmax_implementation::legacy: - softmax_legacy(); + softmax_legacy_stream(); break; case softmax_implementation::argmax: - softmax_argmax(); + softmax_argmax_stream(); break; default: - softmax_stable(); + softmax_stable_stream(); break; } } @@ -470,22 +497,25 @@ template void softmax() { // ************************************************* // TanH Activation // ************************************************* -template void dense_tanh() { +template void dense_tanh_stream() { #include "activation_tables/tanh_table.tb" static const int MAX_VALUE = 4; - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; TanHActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; TanHPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; if (in_data[j] < 0) @@ -512,21 +542,24 @@ template void dense_tanh() // ************************************************* // Sigmoid Activation // ************************************************* -template void sigmoid() { +template 
void sigmoid_stream() { #include "activation_tables/sigmoid_table.tb" static const int MAX_VALUE = 8; - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; SigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SigmoidPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; if (in_data[j] < 0) @@ -554,20 +587,23 @@ template void sigmoid() { // Hard sigmoid Activation // ************************************************* // Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations -template void hard_sigmoid() { +template void hard_sigmoid_stream() { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; HardSigmoidPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; if (datareg > 1) datareg = 1; @@ -580,20 +616,23 @@ template void hard_sigmoid( } } -template void hard_tanh() { +template void hard_tanh_stream() { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; HardSigmoidPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; if (sigmoid > 1) sigmoid = 1; @@ -609,16 +648,17 @@ template void hard_tanh() { // ************************************************* // Binary TanH Activation // ************************************************* -template void binary_tanh() { +template void binary_tanh_stream() { BinaryTanHActLoop: 
- [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { [[intel::fpga_register]] auto in_data = data_pipe::read(); [[intel::fpga_register]] typename res_pipe::value_type out_data; BinaryTanHPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = static_cast(1); else @@ -632,16 +672,17 @@ template void binary_tanh() // ************************************************* // Ternary TanH Activation // ************************************************* -template void ternary_tanh() { +template void ternary_tanh_stream() { TernaryTanHActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { [[intel::fpga_register]] auto in_data = data_pipe::read(); [[intel::fpga_register]] typename res_pipe::value_type out_data; TernaryTanHPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 1) out_data[j] = static_cast(1); else if (in_data[j] <= -1) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 3b7249038..0572e1810 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -9,8 +9,8 @@ namespace nnet { // Note: DataPack logic removed, at least in the initial version template -void dense_resource(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_resource_stream(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { [[intel::fpga_register]] typename res_pipe::value_type res; [[intel::fpga_register]] auto data = data_pipe::read(); diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 889363e45..91dbaef76 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -214,7 +214,6 @@ def write_project_cpp(self, model): var.type.name, var.name, layer.name, var.size_cpp() ) newline += '#endif\n' - newline += '\n' # Write the output elif '// hls-fpga-machine-learning return' in line: From 0e3f9ba655c80a624c5222372bdc74099f7f5ed6 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 16 Feb 2024 10:51:04 -0600 Subject: [PATCH 025/100] change how the pipe value type is extracted --- .../nnet_utils/nnet_activation_stream.h | 287 ++++++++++-------- .../firmware/nnet_utils/nnet_dense_stream.h | 5 +- .../oneapi/firmware/nnet_utils/nnet_types.h | 12 + 3 files changed, 168 insertions(+), 136 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index f9ad60031..a4f3c6072 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -11,14 +11,14 @@ namespace nnet { // ************************************************* template void linear_stream() { LinearActLoop: - 
[[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; LinearPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { out_data[j] = in_data[j]; } @@ -31,14 +31,14 @@ template void linear_stream // ************************************************* template void relu_stream() { ReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; ReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -53,21 +53,22 @@ template void relu_stream() // Leaky RELU Activation // ************************************************* template -void leaky_relu(const typename data_pipe::value_type::value_type alpha) { +void leaky_relu(const typename ExtractPipeType::value_type::value_type alpha) { constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; LeakyReLUActLoop: [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; LeakyReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -82,16 +83,16 @@ void leaky_relu(const typename data_pipe::value_type::value_type alpha) { // Thresholded RELU Activation // ************************************************* template -void thresholded_relu(const typename data_pipe::value_type::value_type theta) { +void thresholded_relu(const typename ExtractPipeType::value_type::value_type theta) { ThresholdedReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; ThresholdedReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > theta) out_data[j] = in_data[j]; else @@ -106,24 +107,25 @@ void thresholded_relu(const typename data_pipe::value_type::value_type theta) { // ELU Activation // ************************************************* template -void elu(const typename data_pipe::value_type::value_type alpha) { +void elu(const typename ExtractPipeType::value_type::value_type alpha) { #include 
"activation_tables/elu_table.tb" constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; EluActLoop: [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; EluPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { - [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; } else { @@ -149,17 +151,18 @@ template void selu_stream() #include "activation_tables/selu_table.tb" SeluActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; SeluPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { - [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { - out_data[j] = typename data_pipe::value_type::value_type(1.0507009873554804934193349852946) * datareg; + out_data[j] = + typename ExtractPipeType::value_type::value_type(1.0507009873554804934193349852946) * datareg; } else { int index = (datareg * CONFIG_T::table_size / -8).to_int(); if (index > CONFIG_T::table_size - 1) @@ -176,25 +179,26 @@ template void selu_stream() // PReLU Activation // ************************************************* template -void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in]) { +void prelu(const typename ExtractPipeType::value_type::value_type alpha[CONFIG_T::n_in]) { constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; PReLUActLoop: [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; PReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else - out_data[j] = alpha[i * std::tuple_size{} + j] * in_data[j]; + out_data[j] = alpha[i * std::tuple_size::value_type>{} + j] * in_data[j]; } res_pipe::write(out_data); @@ -208,14 +212,14 @@ template void softplus_stre #include "activation_tables/softplus_table.tb" 
SoftplusActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; SoftplusPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; if (index < 0) @@ -238,15 +242,15 @@ template void softsign_stre static const int MAX_VALUE = 8; SoftsignActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; SoftsignPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { - [[intel::fpga_register]] typename data_pipe::value_type::value_type absValue; + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absValue; ; if (in_data[j] < 0) { absValue = -in_data[j]; @@ -257,9 +261,10 @@ template void softsign_stre if (absValue > MAX_VALUE) index = CONFIG_T::table_size - 1; if (in_data[j] < 0) { - out_data[j] = static_cast(-softsign_table[index]); + out_data[j] = + static_cast::value_type::value_type>(-softsign_table[index]); } else { - out_data[j] = static_cast(softsign_table[index]); + out_data[j] = static_cast::value_type::value_type>(softsign_table[index]); } } @@ -276,61 +281,65 @@ template void softmax_stabl #include "activation_tables/invert_table.tb" constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - [[intel::fpga_register]] - typename data_pipe::value_type::value_type data_array[std::tuple_size{}]; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_array[std::tuple_size::value_type>{}]; SoftmaxArrayLoop: [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_pack = data_pipe::read(); SoftmaxArrayPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) - Op_max op_max; - [[intel::fpga_register]] typename data_pipe::value_type::value_type x_max = - reduce{}, - Op_max>(data_array, op_max); + Op_max::value_type::value_type> op_max; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = + reduce::value_type::value_type, + std::tuple_size::value_type>{}, + Op_max::value_type::value_type>>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation - [[intel::fpga_register]] ac_fixed - d_xi_xmax[std::tuple_size{}]; + [[intel::fpga_register]] 
ac_fixed::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> + d_xi_xmax[std::tuple_size::value_type>{}]; #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { d_xi_xmax[j] = data_array[j] - x_max; } // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { - exp_res[j] = exp_table[softmax_stable_idx_from_real_val( - d_xi_xmax[j])]; + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = + exp_table[softmax_stable_idx_from_real_val::value_type::value_type, + CONFIG_T>(d_xi_xmax[j])]; } // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce{}, + reduce::value_type>{}, Op_add>(exp_res, op_add); [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; - typename res_pipe::value_type out_pack; + typename ExtractPipeType::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { // TODO - Find Quartus-equivalent pragma // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -347,24 +356,25 @@ template void softmax_laten #include "activation_tables/invert_table_latency.tb" constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; SoftmaxExpLoop: [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_pack = data_pipe::read(); SoftmaxExpPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { - exp_res[j] = - exp_table_latency[softmax_latency_idx_from_real_val( - in_pack[j])]; + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val< + typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])]; } // Explicitly sum the results with an adder tree. 
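The std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} expressions used throughout this hunk depend on the small ExtractPipeType trait that this same patch adds to nnet_utils/nnet_types.h (the 12-line change in the diffstat): it recovers the packed array type carried by a SYCL pipe, so the pack width no longer has to come from data_pipe::value_type::size. A minimal sketch of such a trait, assuming the sycl::ext::intel::experimental::pipe<Id, DataT, MinCapacity, Properties> template signature (the exact signature and the real definition belong to nnet_types.h, not to this file):

#include <cstdint>
#include <sycl/sycl.hpp> // assumed entry point for the experimental pipe extension

namespace nnet {

// Fallback: a non-pipe type is treated as its own value_type.
template <class T> struct ExtractPipeType {
    typedef T value_type;
};

// Partial specialization that peels the carried data type out of a pipe.
// The four-parameter pipe signature here is an assumption for illustration only.
template <class Id, class DataT, int32_t MinCapacity, class Properties>
struct ExtractPipeType<sycl::ext::intel::experimental::pipe<Id, DataT, MinCapacity, Properties>> {
    typedef DataT value_type;
};

} // namespace nnet

With such a trait in scope, std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{} evaluates to the number of elements packed per pipe transaction, which is what the loop bounds in this file iterate over.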
@@ -377,10 +387,10 @@ template void softmax_laten [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; - typename res_pipe::value_type out_pack; + typename ExtractPipeType::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation out_pack[j] = exp_res[j] * inv_exp_sum; } @@ -394,29 +404,32 @@ template void softmax_legac #include "activation_tables/invert_table_legacy.tb" // Index into the lookup table based on data for exponentials - [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[std::tuple_size{}]; - [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; [[intel::fpga_register]] - typename data_pipe::value_type::value_type data_cache[std::tuple_size{}]; + typename CONFIG_T::table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_cache[std::tuple_size::value_type>{}]; SoftmaxInitLoop: [[intel::initiation_interval(1)]] for (unsigned s = 0; - s < CONFIG_T::n_in / std::tuple_size{}; s++) { + s < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + s++) { auto in_pack = data_pipe::read(); SoftmaxInitPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { data_cache[j] = in_pack[j]; exp_res[j] = 0; } SoftmaxExpLoop: #pragma unroll - for (int i = 0; i < std::tuple_size{}; i++) { + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { SoftmaxExpInner: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (i == j) { exp_diff_res = 1; } else { @@ -432,16 +445,17 @@ template void softmax_legac } } - typename res_pipe::value_type out_pack; + typename ExtractPipeType::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); if (exp_res_index < 0) exp_res_index = 0; if (exp_res_index > CONFIG_T::table_size - 1) exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = static_cast(invert_table_legacy[exp_res_index]); + out_pack[j] = + static_cast::value_type::value_type>(invert_table_legacy[exp_res_index]); } res_pipe::write(out_pack); @@ -449,27 +463,28 @@ template void softmax_legac } template void softmax_argmax_stream() { - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; #pragma unroll - for (int i = 0; i < std::tuple_size{}; i++) { - out_data[i] = static_cast(0); + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { + out_data[i] = static_cast::value_type::value_type>(0); } - [[intel::fpga_register]] typename data_pipe::value_type::value_type maximum = in_data[0]; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type maximum = in_data[0]; [[intel::fpga_register]] int idx = 0; - 
-        [[intel::initiation_interval(1)]] for (int i = 1; i < std::tuple_size<typename data_pipe::value_type>{}; i++) {
+        [[intel::initiation_interval(1)]] for (int i = 1;
+                                               i < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
             if (in_data[i] > maximum) {
                 maximum = in_data[i];
                 idx = i;
             }
         }
 
-        out_data[idx] = static_cast<typename res_pipe::value_type::value_type>(1);
+        out_data[idx] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
 
         res_pipe::write(out_data);
     }
 }
@@ -502,21 +517,22 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void dense_tanh_st
     static const int MAX_VALUE = 4;
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 TanHActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 TanHPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
-            [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue;
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
+            [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type absoluteValue;
 
             if (in_data[j] < 0)
                 absoluteValue = (-1) * in_data[j];
@@ -547,20 +563,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void sigmoid_strea
     static const int MAX_VALUE = 8;
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 SigmoidActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 SigmoidPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
-            [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue;
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
+            [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type absoluteValue;
 
             if (in_data[j] < 0)
                 absoluteValue = (-1) * in_data[j];
@@ -590,20 +607,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void sigmoid_strea
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_sigmoid_stream() {
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 HardSigmoidActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 HardSigmoidPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift;
             if (datareg > 1)
                 datareg = 1;
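The tanh, sigmoid, and hard-sigmoid streams above all derive their initiation interval from the pack width and CONFIG_T::reuse_factor. A minimal sketch of that arithmetic, assuming DIV_ROUNDUP rounds up as (n + d - 1) / d and using an illustrative pack width of 16 with reuse_factor 4:

#include <cstdio>

// Stand-in for the DIV_ROUNDUP macro; numbers below are illustrative only.
constexpr unsigned div_roundup(unsigned n, unsigned d) { return (n + d - 1) / d; }

int main() {
    constexpr unsigned pack_width = 16;   // std::tuple_size of the pipe's array payload
    constexpr unsigned reuse_factor = 4;  // CONFIG_T::reuse_factor
    constexpr unsigned multiplier_limit = div_roundup(pack_width, reuse_factor); // 4 multipliers shared per pack
    constexpr unsigned pipeline = pack_width / multiplier_limit;                 // 4, i.e. initiation_interval(4)
    std::printf("multiplier_limit=%u pipeline=%u\n", multiplier_limit, pipeline);
    return 0;
}

A larger reuse_factor lowers multiplier_limit and raises the initiation interval, trading throughput for fewer multipliers.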
@@ -619,20 +637,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_sigmoid_
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_tanh_stream() {
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 HardSigmoidActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 HardSigmoidPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift;
             if (sigmoid > 1)
                 sigmoid = 1;
@@ -650,19 +669,19 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_tanh_str
 // *************************************************
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void binary_tanh_stream() {
 BinaryTanHActLoop:
-    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
-                                           i++) {
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
 
         [[intel::fpga_register]] auto in_data = data_pipe::read();
-        [[intel::fpga_register]] typename res_pipe::value_type out_data;
+        [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;
 
 BinaryTanHPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             if (in_data[j] > 0)
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
             else
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(-1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
         }
 
         res_pipe::write(out_data);
@@ -674,21 +693,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void binary_tanh_s
 // *************************************************
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void ternary_tanh_stream() {
 TernaryTanHActLoop:
-    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
-                                           i++) {
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
 
         [[intel::fpga_register]] auto in_data = data_pipe::read();
-        [[intel::fpga_register]] typename res_pipe::value_type out_data;
+        [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;
 
 TernaryTanHPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             if (in_data[j] > 1)
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
             else if (in_data[j] <= -1)
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(-1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
             else
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(0);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(0);
         }
 
         res_pipe::write(out_data);
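The binary and ternary tanh streams above reduce each pack element to {-1, 1} or {-1, 0, 1} before casting to the output pipe's element type. A scalar sketch of the same thresholding on plain floats (illustrative only; the streamed versions operate on ac_fixed pack elements):

#include <cstdio>

// Scalar stand-ins for the per-element decisions made inside the packed loops.
int binary_tanh(float x) { return x > 0 ? 1 : -1; }

int ternary_tanh(float x) {
    if (x > 1)
        return 1;
    if (x <= -1)
        return -1;
    return 0;
}

int main() {
    std::printf("%d %d %d\n", ternary_tanh(1.5f), ternary_tanh(-0.25f), ternary_tanh(-3.0f)); // prints: 1 0 -1
    return 0;
}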
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
index 0572e1810..53987a02d 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
@@ -12,9 +12,10 @@ template <class data_pipe, class res_pipe, typename CONFIG_T>
 void dense_resource_stream(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
                            const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
 
-    [[intel::fpga_register]] typename res_pipe::value_type res;
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res;
     [[intel::fpga_register]] auto data = data_pipe::read();
-    dense_resource<typename data_pipe::value_type, typename res_pipe::value_type, CONFIG_T>(data, res, weights, biases);
+    dense_resource<typename ExtractPipeType<data_pipe>::value_type, typename ExtractPipeType<res_pipe>::value_type,
+                   CONFIG_T>(data, res, weights, biases);
     res_pipe::write(res);
 }
 
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
index cd572f0c7..8e48121c1 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
@@ -8,8 +8,20 @@
 
 namespace nnet {
 
+// Define the pipe type that we use
 template <class T, std::size_t N> using array = std::array<T, N>;
 
+// This is a helper to extract the value_type of a pipe
+template <class T> struct ExtractPipeType { typedef T value_type; };
+
+template