From 45185379af2a2aa80883ec99375b6760f7813478 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 25 Oct 2024 02:43:54 -0400 Subject: [PATCH] Beginnings of the oneAPI backend (#955) * snapshot adding oneapi * fix reduce constexpr * further updates * update the bridge and testbench * fix issues discovered when compiling * update bridge writing files * build library (but not tested) * fix a bug in testbench * snapshot after some debugging * remove forgotten debug printing * add build * pre-commit fixes * fix more pre-commit * fix more pre-commit errors * snapshot of work before reworking types * Use using to decide array type, some preliminary updates * snapshot unifying types * fix the testbench and bridge * snapshot updating nnet_utils (not finished) * define array in nnet_types for oneAPI * fix parallel conv2d * add back the streaming versions of algs, most unconverted * tentatively complete streaming for dense but not functional * first version that compiles streaming * change how the pipe value type is extracted * fix pre-commit error * always treat elu as ELU class * fix batchnorm * snapshot towards fixing conv * snapshot fixing test for streaming * fix conv1d * fix conv2d * fix reshape and flatten for oneAPI * initial oneAPI tests * remove nnet_dense_compressed from oneAPI * add merge functionality (untested) * fix merge for oneAPI * fix merge for oneAPI (missing commit) * add zeropadding * standardize parallelization spelling * fix pointwise for oneAPI * remove references to quartus * more replace quartus with oneapi * snapshot on the way towards implementing pooling * fix io_stream pooling for oneAPI * add fix for Conv2DBatchnorm * accidentally committed CMakeLists.txt in my debug setup * reshaping, not fully tested * fix cloning of streams * fix pytest library loading * remove unused template * fix some activation bugs * fix the overwriting of directories in the pytest * update version of test repository * try to fix docker issue * bump hls4ml-testing tag to 0.5.2 * try not restricting tensorflow-model-optimization * Update to 0.5.3 for testing * bump to docker image 0.5.4, suggested by Ben * fix pre-commit warning * dial down N_TESTS_PER_YAML to 4 * revert tensorflow-model-optimization change * fix issue of saving in "obsolete" h5 format * fix embedding for oneAPI * First attempt at adding RNNs to oneAPI * fix bug in array size * fix order of indices * make queues static in bridge * fix logic error in repack stream * changing the style, but functionally identical * update pointwise optimizer for oneAPI * add oneAPI to test_multi_dense.py * fix updating weight types * initial changes of templates, for testing * fix weight naming, product selection * make im2col the default; fix winograd size * fix up streaming dense and convolution * fix prelu, some batchnorm * fix weight array of exponential types * move ACExponentialPrecisionDefinition to oneapi_types * attempt to fix batchnorm and recurrent * fixed BatchNormalizationQuantizedTanhConfigTemplate template selection * fix embedding_stream * fix lstm and simple rnn * fix GRU * fix winograd, and also disable it by default * fix threshold name * split bn_quant to be backend-specific * add type inference to oneAPI * add oneAPI to pytorch tests * fix pooling with padding for oneAPI and Quartus * Compilation for larger models enabled by increasing -fconstexpr-steps * add oneapi clone tests; remove redundant multi_clone test * remove some attributes to avoid overwrite warnings * make extra handling for oneAPI like others (as in PR
#1067) * remove warnings for extra optimizers that are not scheduled on purpose * update parametrized activations * fix reference to alpha that had not been switched to param * add oneapi documentation * add parallelization factor to the attributes for oneAPI --------- Co-authored-by: Lauri Laatu Co-authored-by: Jan-Frederik Schulte --- docs/advanced/oneapi.rst | 35 + docs/index.rst | 1 + hls4ml/backends/__init__.py | 2 + .../{fpga => catapult}/passes/bn_quant.py | 0 hls4ml/backends/oneapi/__init__.py | 0 hls4ml/backends/oneapi/oneapi_backend.py | 376 +++++++ hls4ml/backends/oneapi/oneapi_template.py | 61 ++ hls4ml/backends/oneapi/oneapi_types.py | 267 +++++ hls4ml/backends/oneapi/passes/__init__.py | 0 hls4ml/backends/oneapi/passes/bn_quant.py | 222 ++++ .../backends/oneapi/passes/clone_templates.py | 32 + .../oneapi/passes/convolution_templates.py | 235 +++++ .../oneapi/passes/convolution_winograd.py | 179 ++++ .../backends/oneapi/passes/core_templates.py | 351 +++++++ .../oneapi/passes/embedding_templates.py | 32 + .../backends/oneapi/passes/merge_templates.py | 137 +++ hls4ml/backends/oneapi/passes/pointwise.py | 156 +++ .../oneapi/passes/pooling_templates.py | 153 +++ .../oneapi/passes/quantization_templates.py | 63 ++ .../oneapi/passes/recurrent_templates.py | 369 +++++++ .../oneapi/passes/reshaping_templates.py | 244 +++++ .../oneapi/passes/resource_strategy.py | 77 ++ .../backends/oneapi/passes/transform_types.py | 60 ++ hls4ml/backends/quartus/passes/bn_quant.py | 169 +++ .../quartus/passes/convolution_templates.py | 4 +- hls4ml/backends/template.py | 21 + hls4ml/backends/vivado/passes/bn_quant.py | 169 +++ hls4ml/converters/keras/core.py | 4 + hls4ml/model/layers.py | 2 +- hls4ml/model/optimizer/passes/stamp.py | 8 +- .../objectives/vivado_objectives.py | 4 +- hls4ml/templates/oneapi/CMakeLists.txt | 338 ++++++ hls4ml/templates/oneapi/exception_handler.hpp | 21 + hls4ml/templates/oneapi/firmware/defines.h | 20 + .../templates/oneapi/firmware/myproject.cpp | 24 + hls4ml/templates/oneapi/firmware/myproject.h | 29 + .../firmware/nnet_utils/nnet_activation.h | 499 +++++++++ .../nnet_utils/nnet_activation_stream.h | 712 +++++++++++++ .../firmware/nnet_utils/nnet_batchnorm.h | 104 ++ .../nnet_utils/nnet_batchnorm_stream.h | 107 ++ .../oneapi/firmware/nnet_utils/nnet_common.h | 76 ++ .../oneapi/firmware/nnet_utils/nnet_conv1d.h | 61 ++ .../nnet_utils/nnet_conv1d_resource.h | 237 +++++ .../firmware/nnet_utils/nnet_conv1d_stream.h | 177 ++++ .../oneapi/firmware/nnet_utils/nnet_conv2d.h | 67 ++ .../nnet_utils/nnet_conv2d_resource.h | 297 ++++++ .../firmware/nnet_utils/nnet_conv2d_stream.h | 241 +++++ .../oneapi/firmware/nnet_utils/nnet_dense.h | 164 +++ .../firmware/nnet_utils/nnet_dense_stream.h | 23 + .../oneapi/firmware/nnet_utils/nnet_embed.h | 43 + .../firmware/nnet_utils/nnet_embed_stream.h | 31 + .../oneapi/firmware/nnet_utils/nnet_helpers.h | 118 +++ .../oneapi/firmware/nnet_utils/nnet_merge.h | 232 +++++ .../firmware/nnet_utils/nnet_merge_stream.h | 359 +++++++ .../oneapi/firmware/nnet_utils/nnet_mult.h | 113 ++ .../oneapi/firmware/nnet_utils/nnet_padding.h | 104 ++ .../firmware/nnet_utils/nnet_padding_stream.h | 81 ++ .../oneapi/firmware/nnet_utils/nnet_pooling.h | 257 +++++ .../firmware/nnet_utils/nnet_pooling_stream.h | 322 ++++++ .../oneapi/firmware/nnet_utils/nnet_printf.h | 18 + .../firmware/nnet_utils/nnet_recurrent.h | 566 ++++++++++ .../nnet_utils/nnet_recurrent_activation.h | 47 + .../nnet_utils/nnet_recurrent_stream.h | 68 ++ .../oneapi/firmware/nnet_utils/nnet_resize.h 
| 36 + .../firmware/nnet_utils/nnet_resize_stream.h | 58 ++ .../oneapi/firmware/nnet_utils/nnet_stream.h | 126 +++ .../firmware/nnet_utils/nnet_transpose.h | 48 + .../nnet_utils/nnet_transpose_stream.h | 39 + .../oneapi/firmware/nnet_utils/nnet_types.h | 71 ++ hls4ml/templates/oneapi/firmware/parameters.h | 11 + hls4ml/templates/oneapi/myproject_bridge.cpp | 77 ++ hls4ml/templates/oneapi/myproject_test.cpp | 133 +++ .../quartus/firmware/nnet_utils/nnet_conv1d.h | 2 +- .../nnet_utils/nnet_conv1d_resource.h | 12 +- .../quartus/firmware/nnet_utils/nnet_conv2d.h | 2 +- .../nnet_utils/nnet_conv2d_resource.h | 18 +- .../firmware/nnet_utils/nnet_pooling.h | 34 +- .../quartus/firmware/nnet_utils/nnet_stream.h | 1 + hls4ml/utils/fixed_point_utils.py | 11 +- hls4ml/writer/__init__.py | 2 + hls4ml/writer/oneapi_writer.py | 969 ++++++++++++++++++ test/pytest/test_activations.py | 2 +- test/pytest/test_batchnorm.py | 4 +- test/pytest/test_conv1d.py | 4 + test/pytest/test_embed.py | 4 +- test/pytest/test_globalpooling.py | 4 +- test/pytest/test_keras_api.py | 12 +- test/pytest/test_merge.py | 10 +- test/pytest/test_multi_dense.py | 1 + test/pytest/test_pointwiseconv.py | 8 +- test/pytest/test_pooling.py | 70 +- test/pytest/test_pytorch_api.py | 27 +- test/pytest/test_qkeras.py | 22 +- test/pytest/test_repack_stream.py | 23 +- test/pytest/test_reshape.py | 4 +- test/pytest/test_rnn.py | 39 +- test/pytest/test_stream_clone.py | 4 +- test/pytest/test_stream_multi_clone.py | 48 - test/pytest/test_transpose_concat.py | 4 +- test/pytest/test_upsampling.py | 2 +- test/pytest/test_zeropadding.py | 2 +- 101 files changed, 10764 insertions(+), 169 deletions(-) create mode 100644 docs/advanced/oneapi.rst rename hls4ml/backends/{fpga => catapult}/passes/bn_quant.py (100%) create mode 100644 hls4ml/backends/oneapi/__init__.py create mode 100644 hls4ml/backends/oneapi/oneapi_backend.py create mode 100644 hls4ml/backends/oneapi/oneapi_template.py create mode 100644 hls4ml/backends/oneapi/oneapi_types.py create mode 100644 hls4ml/backends/oneapi/passes/__init__.py create mode 100644 hls4ml/backends/oneapi/passes/bn_quant.py create mode 100644 hls4ml/backends/oneapi/passes/clone_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_winograd.py create mode 100644 hls4ml/backends/oneapi/passes/core_templates.py create mode 100644 hls4ml/backends/oneapi/passes/embedding_templates.py create mode 100644 hls4ml/backends/oneapi/passes/merge_templates.py create mode 100644 hls4ml/backends/oneapi/passes/pointwise.py create mode 100644 hls4ml/backends/oneapi/passes/pooling_templates.py create mode 100644 hls4ml/backends/oneapi/passes/quantization_templates.py create mode 100644 hls4ml/backends/oneapi/passes/recurrent_templates.py create mode 100644 hls4ml/backends/oneapi/passes/reshaping_templates.py create mode 100644 hls4ml/backends/oneapi/passes/resource_strategy.py create mode 100644 hls4ml/backends/oneapi/passes/transform_types.py create mode 100644 hls4ml/backends/quartus/passes/bn_quant.py create mode 100644 hls4ml/backends/vivado/passes/bn_quant.py create mode 100644 hls4ml/templates/oneapi/CMakeLists.txt create mode 100644 hls4ml/templates/oneapi/exception_handler.hpp create mode 100644 hls4ml/templates/oneapi/firmware/defines.h create mode 100644 hls4ml/templates/oneapi/firmware/myproject.cpp create mode 100644 hls4ml/templates/oneapi/firmware/myproject.h create mode 100644 
hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h create mode 100644 hls4ml/templates/oneapi/firmware/parameters.h create mode 100644 hls4ml/templates/oneapi/myproject_bridge.cpp create mode 100644 hls4ml/templates/oneapi/myproject_test.cpp create mode 100644 hls4ml/writer/oneapi_writer.py delete mode 100644 test/pytest/test_stream_multi_clone.py diff --git a/docs/advanced/oneapi.rst b/docs/advanced/oneapi.rst new file mode 100644 index 000000000..ae0e0bc56 --- /dev/null +++ b/docs/advanced/oneapi.rst @@ -0,0 +1,35 @@ +============== +oneAPI Backend +============== + +The ``oneAPI`` backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually +replace the ``Quartus`` backend, which should really have been called the Intel HLS backend. (The actual Quartus +program continues to be used with IP produced by the ``oneAPI`` backend.) +This section discusses details of the ``oneAPI`` backend. 
+ +The ``oneAPI`` code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the +accelerator style of programming. In the IP Component flow, which is currently the only flow supported, the +kernel becomes the IP, and the "host code" becomes the testbench. An accelerator flow, with easier deployment on +PCIe accelerator boards, is planned to be added in the future. + +The produced work areas use cmake to build the projects in a style based on the +`oneAPI-samples `_. +The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` make targets are supported. Additionally, ``make lib`` +produces the library used for calling the ``predict`` function from hls4ml. The ``compile`` and ``build`` commands +in hls4ml interact with the cmake system, so one does not need to use the build system manually, but it is there +if desired. + +The ``oneAPI`` backend, like the ``Quartus`` backend, only implements the ``Resource`` strategy for the layers. There +is no ``Latency`` implementation of any of the layers. + +Note: currently tracing and external weights (i.e. setting BramFactor) are not supported. + +io_parallel and io_stream +========================= + +As mentioned in the :ref:`I/O Types` section, ``io_parallel`` is for small models, while ``io_stream`` is for +larger models. In ``oneAPI``, there is an additional difference: ``io_stream`` implements each layer in its +own ``task_sequence``. Thus, the layers run in parallel, with pipes connecting the inputs and outputs. This +is similar in style to the ``dataflow`` implementation in Vitis, but more explicit. On the other hand, ``io_parallel`` +always uses a single task, relying on pipelining within the task for good performance. In contrast, the Vitis +backend sometimes uses dataflow with ``io_parallel``.
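For orientation, here is a minimal sketch of driving the new backend from Python, assuming the standard hls4ml conversion API; the toy Keras model, input shape, and output directory are placeholders rather than part of this patch::

    import numpy as np
    import hls4ml
    from tensorflow import keras

    # Placeholder model; any supported architecture follows the same flow
    model = keras.Sequential([keras.layers.Dense(8, activation='relu', input_shape=(16,))])

    config = hls4ml.utils.config_from_keras_model(model, granularity='model')
    hls_model = hls4ml.converters.convert_from_keras_model(
        model,
        hls_config=config,
        backend='oneAPI',       # select the backend added by this patch
        io_type='io_parallel',  # or 'io_stream' for larger models
        part='Arria10',         # the default part in create_initial_config
        output_dir='my-oneapi-prj',
    )

    # compile() runs `cmake ..` and `make lib` in the work area and loads the
    # resulting shared library so that predict() can call the compiled model
    hls_model.compile()
    y = hls_model.predict(np.random.rand(100, 16).astype(np.float32))

    # build() drives the other cmake targets; run=True also executes the
    # produced binary for the emulation and simulation flows
    hls_model.build(build_type='fpga_emu', run=True)

The ``fpga_emu`` target is the quickest functional check; ``report`` and ``fpga`` invoke the much longer hardware flows.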
diff --git a/docs/index.rst b/docs/index.rst index c21b90aeb..07fcd217d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ advanced/fifo_depth advanced/extension + advanced/oneapi advanced/accelerator advanced/model_optimization diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 8b3117af7..4a48f072c 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,5 +1,6 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 +from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend @@ -16,3 +17,4 @@ register_backend('Quartus', QuartusBackend) register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) +register_backend('oneAPI', OneAPIBackend) diff --git a/hls4ml/backends/fpga/passes/bn_quant.py b/hls4ml/backends/catapult/passes/bn_quant.py similarity index 100% rename from hls4ml/backends/fpga/passes/bn_quant.py rename to hls4ml/backends/catapult/passes/bn_quant.py diff --git a/hls4ml/backends/oneapi/__init__.py b/hls4ml/backends/oneapi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py new file mode 100644 index 000000000..c85a8c0e9 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -0,0 +1,376 @@ +import subprocess +from pathlib import Path +from warnings import warn + +import numpy as np + +from hls4ml.backends import FPGABackend +from hls4ml.model.attributes import ConfigurableAttribute, TypeAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax +from hls4ml.model.optimizer import get_backend_passes, layer_optimizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType + +# from hls4ml.report import parse_oneapi_report + + +class OneAPIBackend(FPGABackend): + def __init__(self): + super().__init__('oneAPI') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + # Add RNN-specific recurrent_reuse_factor attribute + rnn_layers = [ + SimpleRNN, + LSTM, + GRU, + ] + + for layer in rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('table_size', default=1024)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + # Add ParallelizationFactor to Conv1D/2D + pf_layers = [ + Conv1D, + Conv2D, + ] + + for layer in pf_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + self.attribute_map[layer] = attrs + + def _register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + streaming_passes = ['oneapi:clone_output'] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) + + oneapi_types = [ + 'oneapi:transform_types', + 
'oneapi:register_bram_weights', + 'oneapi:apply_resource_strategy', + 'oneapi:apply_winograd_kernel_transformation', + ] + oneapi_types_flow = register_flow('specific_types', oneapi_types, requires=[init_flow], backend=self.name) + + quantization_passes = [ + 'oneapi:merge_batch_norm_quantized_tanh', + 'oneapi:quantize_dense_output', + 'fuse_consecutive_batch_normalization', + 'oneapi:xnor_pooling', + 'oneapi:generate_conv_im2col', + ] + quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) + + optimization_passes = [ + 'oneapi:remove_final_reshape', + 'oneapi:optimize_pointwise_conv', + 'oneapi:inplace_parallel_reshape', + 'oneapi:skip_softmax', + 'oneapi:fix_softmax_table_size', + 'infer_precision_types', + ] + optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) + + templates = self._get_layer_templates() + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'oneapi:write_hls'] + + self._writer_flow = register_flow('write', writer_passes, requires=['oneapi:ip'], backend=self.name) + + all_passes = get_backend_passes(self.name) + + extras = [ + # Ideally this should be empty + opt_pass + for opt_pass in all_passes + if opt_pass + not in initializers + + streaming_passes + + oneapi_types + + quantization_passes + + templates + + optimization_passes + + writer_passes + + ['oneapi:inplace_stream_flatten', 'oneapi:reshape_stream'] # not needed + + ['oneapi:process_fixed_point_quantizer_layer'] # not yet supported + ] + + if len(extras) > 0: + for opt in extras: + warn(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') + + ip_flow_requirements = [ + 'optimize', + init_flow, + streaming_flow, + quantization_flow, + optimization_flow, + oneapi_types_flow, + template_flow, + ] + ip_flow_requirements = list(filter(None, ip_flow_requirements)) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel'): + config = {} + + config['Part'] = part if part is not None else 'Arria10' + config['ClockPeriod'] = clock_period + config['IOType'] = io_type + config['HLSConfig'] = {} + + return config + + def compile(self, model): + """Compile the generated project that can be linked into Python runtime. + + Args: + model (ModelGraph): Model to compile. + + Raises: + Exception: If the project failed to compile + + Returns: + string: Returns the name of the compiled library. + """ + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run('make lib', shell=True, cwd=builddir, check=True) + + lib_name = builddir / f'lib{model.config.get_project_name()}-{model.config.get_config_value("Stamp")}.so' + return lib_name + + def build(self, model, build_type='fpga_emu', run=False): + """ + Builds the project using Intel DPC++ (oneAPI) compiler. 
+ + Args: + model (ModelGraph): The model to build + build_type, optional: What to build (e.g. fpga_emu, fpga_sim, fpga, report) + run, optional: Whether to run the testbench + Errors raise exceptions + """ + + # Check software needed is present + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run(f'make {build_type}', shell=True, cwd=builddir, check=True) + + if run and build_type in ('fpga_emu', 'fpga_sim', 'fpga'): + executable = builddir / f'{model.config.get_project_name()}.{build_type}' + subprocess.run(f'{str(executable)}', shell=True, cwd=builddir, check=True) + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + target_cycles = layer.model.config.get_target_cycles(layer) + layer.set_attr('target_cycles', target_cycles) + + @layer_optimizer(Dense) + def init_dense(self, layer): + index_t = IntegerPrecisionType(width=1, signed=False) + + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + if layer.model.config.get_compression(layer): + layer.set_attr('strategy', 'compressed') + else: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + + if layer.model.config.is_resource_strategy(layer): + if layer.model.config.get_compression(layer): + index_t = layer.get_weights('weight').type.index_precision + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + + @layer_optimizer(Activation) + def init_activation(self, layer): + if layer.get_attr('activation') == 'tanh': + layer.set_attr('activation', 'dense_tanh') + if layer.get_attr('recurrent_activation') == 'tanh': + layer.set_attr('recurrent_activation', 'dense_tanh') + + @layer_optimizer(Softmax) + def init_softmax(self, layer): + if layer.model.config.get_config_value('IOType') == 'io_parallel': + assert ( + len(layer.get_input_variable().shape) == 1 + ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + + @layer_optimizer(Embedding) + def init_embed(self, layer): + if layer.attributes['n_in'] is None: + raise Exception('Input length of Embedding layer must be specified.') + + @layer_optimizer(GRU) + def init_gru(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # Dense multiplication properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + index_t = IntegerPrecisionType(width=1, signed=False) + layer.set_attr('index_t', index_t) + + if 'table_t' not in layer.attributes: + layer.set_attr( + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=18, integer=8)) + ) + if 'table_size' not in layer.attributes: + layer.set_attr('table_size', 1024) + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + layer.set_attr('index_t', index_t) + + @layer_optimizer(Conv1D) + def init_conv1d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv1D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width determines the filter size post-Winograd transformation + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'im2col')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(Conv2D) + def init_conv2d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv2D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width & impl_filt_height determine the filter size post-Winograd transformation + layer.set_attr('impl_filt_height', layer.get_attr('filt_height')) + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'im2col')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(LSTM) + def init_lstm(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # We don't use RF yet + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + # Split weights for easier storage in on-chip memory and implementation in HLS + weights_data = layer.weights['weight'].data + rec_weights_data = layer.weights['recurrent_weight'].data + bias_data = layer.weights['bias'].data + + weight_types = ['i', 'f', 'c', 'o'] + for i in range(0, 4): + layer.add_weights_variable( + name=f'weight_{weight_types[i]}', + var_name=f'kernel_{weight_types[i]}_{{index}}', + data=weights_data[ + 0 : layer.get_attr('n_in'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'recurrent_weight_{weight_types[i]}', + var_name=f'recurrent_kernel_{weight_types[i]}_{{index}}', + data=rec_weights_data[ + 0 : layer.get_attr('n_out'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'bias_{weight_types[i]}', + var_name=f'bias_{weight_types[i]}_{{index}}', + data=bias_data[i * layer.get_attr('n_out') : (i + 1) * (layer.get_attr('n_out'))], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + + @layer_optimizer(SimpleRNN) + def init_simple_rnn(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # TODO - Consider setting and using RF diff --git a/hls4ml/backends/oneapi/oneapi_template.py b/hls4ml/backends/oneapi/oneapi_template.py new file mode 100644 index 000000000..c86b8f7ea --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_template.py @@ -0,0 +1,61 @@ +''' +This package includes oneAPI-specific templates +''' + +from hls4ml.backends.template import Template + + +class StreamFunctionCallTemplate(Template): + """Base class for the streaming function call templates in oneAPI: provides the 'stream_function_cpp' attribute. + This generally provides the async call to the task sequence that executes the streaming function. + + Note: the include header files are specified in the regular FunctionCallTemplate, not here. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_stream_function_template' + super().__init__(name, layer_class, 'stream_function_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['name'] = layer.name + return params + + def transform(self, model, node): + return super().transform(model, node) + + +class TaskSequenceTemplate(Template): + """Base class for the task sequence definition in oneAPI: provides the 'task_sequence_cpp' attribute. + This defines the task sequence that is then called by the StreamFunctionCallTemplate. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. 
+ """ + + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_task_sequence_template' + super().__init__(name, layer_class, 'tast_sequence_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['name'] = layer.name + params['config'] = f'config{layer.index}' + params['input_pipe'] = layer.get_input_variable().pipe_name + params['output_pipe'] = layer.get_output_variable().pipe_name + + return params + + def transform(self, model, node): + return super().transform(model, node) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py new file mode 100644 index 000000000..3106e1e10 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -0,0 +1,267 @@ +''' +This package includes oneAPI-specific customizations to the variable types +''' + +import numpy as np + +from hls4ml.backends.fpga.fpga_types import ( + ACFixedPrecisionDefinition, + ACIntegerPrecisionDefinition, + FixedPrecisionConverter, + HLSTypeConverter, + NamedTypeConverter, + PrecisionDefinition, + TypeDefinition, + TypePrecisionConverter, + VariableDefinition, +) +from hls4ml.model.types import ( + CompressedType, + ExponentPrecisionType, + ExponentType, + FixedPrecisionType, + IntegerPrecisionType, + NamedType, + PackedType, + XnorPrecisionType, +) +from hls4ml.utils.fixed_point_utils import next_pow2 +from hls4ml.utils.string_utils import convert_to_pascal_case + + +class ACExponentPrecisionDefinition(PrecisionDefinition): + def definition_cpp(self): + typestring = f'std::pair, ac_int<{self.width}, true>>' + return typestring + + +class OneAPIACTypeConverter(FixedPrecisionConverter): + def __init__(self): + super().__init__( + type_map={ + FixedPrecisionType: ACFixedPrecisionDefinition, + IntegerPrecisionType: ACIntegerPrecisionDefinition, + ExponentPrecisionType: ACExponentPrecisionDefinition, + XnorPrecisionType: ACIntegerPrecisionDefinition, + }, + prefix='AC', + ) + + +class OneAPICompressedTypeConverter(TypeDefinition, TypePrecisionConverter): + """Use a tuple for storing a compressed type for oneAPI since it's better supported. 
(Currently unused)""" + + def definition_cpp(self): + """tuple format is row_index, col_index, weight""" + cpp_fmt = 'typedef std::tuple<{index}, {index}, {precision}> {name};\n' + return cpp_fmt.format(name=self.name, index=self.index_precision, precision=self.precision.definition_cpp()) + + def convert_precision(self, precision_converter): + super().convert_precision(precision_converter) + self.index_precision = precision_converter.convert(self.index_precision) + + +class OneAPIExponentTypeConverter(TypeDefinition, TypePrecisionConverter): + """Use a pair for storing a exponent type for oneAPI since it's better supported""" + + def definition_cpp(self): + cpp_fmt = 'typedef std::pair<{sign}, {precision}> {name};\n' + return cpp_fmt.format(name=self.name, precision=self.precision.definition_cpp(), sign=self.sign.definition_cpp()) + + def convert_precision(self, precision_converter): + super().convert_precision(precision_converter) + self.sign = precision_converter.convert(self.sign) + + +class OneAPIPackedTypeConverter(TypeDefinition, TypePrecisionConverter): + def definition_cpp(self): + n_elem_expr = '/' if self.unpack else '*' + return 'typedef nnet::array<{precision}, {n_elem}> {name};\n'.format( + name=self.name, + precision=self.precision.definition_cpp(), + n_elem=str(self.n_elem) + n_elem_expr + str(self.n_pack), + ) + + def convert_precision(self, precision_converter): + self.precision = precision_converter.convert(self.precision) + + +class OneAPIHLSTypeConverter(HLSTypeConverter): + def __init__(self, precision_converter): + self.precision_converter = precision_converter + self.type_map = { + NamedType: NamedTypeConverter, + CompressedType: OneAPICompressedTypeConverter, + ExponentType: OneAPIExponentTypeConverter, + PackedType: OneAPIPackedTypeConverter, + } + + +# region ArrayVarable + + +class OneAPIArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' + + +class OneAPIInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class AggregratedArrayVariableConverter: + """This is a bit of an extension of the standard ArrayVariableConverter""" + + def __init__(self, type_converter, prefix, definition_cls): + self.type_converter = type_converter + self.prefix = prefix + self.definition_cls = definition_cls + + def convert(self, tensor_var, pragma='', depth=0, n_pack=1): + if isinstance(tensor_var, self.definition_cls): # Already converted + return tensor_var + + tensor_var.pragma = pragma + if pragma == 'stream': + if depth == 0: + depth = np.prod(tensor_var.shape) // tensor_var.shape[-1] + tensor_var.pragma = ('stream', depth) + n_elem = tensor_var.shape[-1] + else: + tensor_var.pragma = pragma + n_elem = tensor_var.size() + n_pack = 1 # ignore any passed value + + tensor_var.type = self.type_converter.convert( + PackedType(tensor_var.type.name, tensor_var.type.precision, n_elem, n_pack) + ) + + # pipe_name and pipe_id are only used for io_stream and interface variables in io_parallel + tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' + tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' + + tensor_var.__class__ = type(self.prefix + 'AggregateArrayVariable', (type(tensor_var), self.definition_cls), {}) + return 
tensor_var + + +class OneAPIArrayVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) + + +class OneAPIInplaceArrayVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition) + + +# endregion + +# region InterfaceMemberVariable + + +class OneAPIInterfaceVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' + + def declare_cpp(self, pipe_min_size=0, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' + ) + return lines + + +class OneAPIInterfaceVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition) + + +# endregion + + +# region StreamVariable +class OneAPIStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=True): + return f'{self.name}{name_suffix}' + + def declare_cpp(self, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.type.name}, {self.pragma[-1]}>;\n' + ) + return lines + + +class OneAPIInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'using {self.name} = {self.input_var.name}' + + +class OneAPIStreamVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIStreamVariableDefinition) + + +class OneAPIInplaceStreamVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceStreamVariableDefinition + ) + + +# region WeightsVariable + + +class OneAPIStaticWeightVariableDefinition(VariableDefinition): + def definition_cpp(self, reuse_factor): + """Write the appropriate weight definiiton""" + # first determine whether to store in register or bram (heuristic) + if reuse_factor == 1 or self.data_length < 2048 or self.type.precision.width < 3: + attribute = '[[intel::fpga_register]]' + else: + # revisit this heuristic + nbanks = int(2 ** np.ceil(np.log2(self.data_length)) / 2) + var_width = int(np.ceil(self.type.precision.width / 8)) + bwidth = next_pow2(var_width) + attribute = ( + f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), ' + 'intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + ) + if self.storage == 'register': + return f'{attribute} static constexpr {self.type.name} {self.name}' + else: + return f'{attribute} {self.type.name} {self.name}' + + +class OneAPIStaticWeightVariableConverter: + def __init__(self, type_converter): + self.type_converter = type_converter + + def convert(self, weight_var): + if isinstance(weight_var, 
OneAPIStaticWeightVariableDefinition): # Already converted + return weight_var + + weight_var.weight_class = weight_var.__class__.__name__ + weight_var.storage = 'register' + weight_var.type = self.type_converter.convert( + PackedType(weight_var.name + '_t', weight_var.type.precision, weight_var.data_length, 1) + ) + + weight_var.__class__ = type( + 'OneAPIStaticWeightVariable', (type(weight_var), OneAPIStaticWeightVariableDefinition), {} + ) + return weight_var + + +# endregion + +# endregion diff --git a/hls4ml/backends/oneapi/passes/__init__.py b/hls4ml/backends/oneapi/passes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/passes/bn_quant.py b/hls4ml/backends/oneapi/passes/bn_quant.py new file mode 100644 index 000000000..8425d5da1 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/bn_quant.py @@ -0,0 +1,222 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_binary_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {threshold_t.name} threshold_t; +}};\n""" + +batchnorm_quantized_tanh_ternary_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {threshold_hi_t.name} threshold_hi_t; + typedef {threshold_lo_t.name} threshold_lo_t; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {output_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + +batchnorm_quantized_tanh_task_sequence_template = ( + 'task_sequence<nnet::normalize_{quantize}_tanh_stream<{input_pipe}, {output_pipe}, {config}>> {name};' +) + +batchnorm_quantized_tanh_stream_function_template = '{name}.async({threshold});' + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = (batchnorm_quantized_tanh_binary_config_template, batchnorm_quantized_tanh_ternary_config_template) + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + if node.get_attr('quantize') == 2: + return self.template[0].format(**params) + else: + return self.template[1].format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + backend.register_template(BatchNormalizationQuantizedTanhTaskSequenceTemplate)
+ backend.register_template(BatchNormalizationQuantizedTanhStreamFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. 
Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/backends/oneapi/passes/clone_templates.py b/hls4ml/backends/oneapi/passes/clone_templates.py new file mode 100644 index 000000000..447ae126e --- /dev/null +++ b/hls4ml/backends/oneapi/passes/clone_templates.py @@ -0,0 +1,32 @@ +""" The clone templates in the fpga backend are not enough for oneAPI, so this adds the missing parts +""" + +from hls4ml.backends.fpga.passes.clone import Clone +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate + +clone_stream_function_template = '{name}.async();' + + +class CloneTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Clone) + + def format(self, node): + params = self._default_function_params(node) + for i in range(len(node.outputs)): + params[f'output{i + 1}_pipe'] = node.variables[node.outputs[i]].pipe_name + + output_pipes = ', '.join([f'{{output{i + 1}_pipe}}' for i in range(len(node.outputs))]) + + template = f'task_sequence<nnet::clone_stream<{{input_pipe}}, {output_pipes}, {{config}}>> {{name}};' + return template.format(**params) + + +class CloneStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(Clone) + self.template = clone_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py new file mode 100644 index 000000000..17154559d --- /dev/null +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -0,0 +1,235 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm + +# TODO - Dilation rate ?
+ +''' Shared mutliplication config ''' +conv_mult_config_template = """struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + +''' 1D Conv ''' +conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned filt_width = {filt_width}; + static const unsigned impl_filt_width = {impl_filt_width}; + static const unsigned kernel_size = filt_width; + + static const unsigned n_filt = {n_filt}; + static const unsigned out_width = {out_width}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned parallelization_factor = {parallelization}; + static const bool store_weights_in_bram = false; + + static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation}; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; +}}; +""" + +conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +conv1d_task_sequence_template = ( + 'task_sequence> {name};' +) + +conv_stream_function_template = '{name}.async({w}, {b});' + +conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] + + +class Conv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Conv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + conv_params = self._default_config_params(node) + conv_params['dilation'] = node.get_attr('dilation', 1) + if conv_params['dilation'] != 1: + raise RuntimeError('dilation != 1 not supported yet') + conv_params['config_t'] = f'config{node.index}_mult' + conv_config = self.template.format(**conv_params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Conv1D, include_header=conv1d_include_list) + self.template = conv1d_function_template + + def format(self, node): + params = 
self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class Conv1DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(Conv1D)
+        self.template = conv1d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
+
+
+class ConvStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__((Conv1D, Conv2D, Conv2DBatchnorm))
+        self.template = conv_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+''' 2D Conv '''
+conv2d_config_template = """struct config{index} : nnet::conv2d_config {{
+    static const unsigned in_height = {in_height};
+    static const unsigned in_width = {in_width};
+    static const unsigned n_chan = {n_chan};
+
+    static const unsigned out_height = {out_height};
+    static const unsigned out_width = {out_width};
+
+    static const unsigned n_filt = {n_filt};
+    static const unsigned filt_height = {filt_height};
+    static const unsigned filt_width = {filt_width};
+    static const unsigned impl_filt_height = {impl_filt_height};
+    static const unsigned impl_filt_width = {impl_filt_width};
+    static const unsigned kernel_size = filt_height * filt_width;
+
+    static const unsigned pad_top = {pad_top};
+    static const unsigned pad_bottom = {pad_bottom};
+    static const unsigned pad_left = {pad_left};
+    static const unsigned pad_right = {pad_right};
+    static const unsigned stride_height = {stride_height};
+    static const unsigned stride_width = {stride_width};
+
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned parallelization_factor = {parallelization};
+    static const bool store_weights_in_bram = false;
+
+    static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation};
+
+    typedef {accum_t.name} accum_t;
+    typedef {bias_t.name} bias_t;
+    typedef {weight_t.name} weight_t;
+    typedef {config_t} mult_config;
+}};\n"""
+
+conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+
+conv2d_task_sequence_template = (
+    'task_sequence<nnet::conv_2d_{data_format}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+)
+
+conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h']
+
+
+class Conv2DConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm))
+        self.template = conv2d_config_template
+        self.mult_template = conv_mult_config_template
+
+    def format(self, node):
+        conv_params = self._default_config_params(node)
+        conv_params['dilation'] = node.get_attr('dilation', 1)
+        if conv_params['dilation'] != 1:
+            raise RuntimeError('dilation != 1 not supported yet')
+        conv_params['config_t'] = f'config{node.index}_mult'
+        conv_config = self.template.format(**conv_params)
+
+        mult_params = self._default_config_params(node)
+        mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width')
+        mult_params['n_out'] = node.get_attr('n_filt')
+        mult_params['product_type'] = get_backend('oneAPI').product_type(
+            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
+        )
+        mult_config = self.mult_template.format(**mult_params)
+
+        return mult_config + '\n' + conv_config
+
+
+class Conv2DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm), include_header=conv2d_include_list)
+        self.template = conv2d_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class Conv2DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm))
+        self.template = conv2d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/convolution_winograd.py b/hls4ml/backends/oneapi/passes/convolution_winograd.py
new file mode 100644
index 000000000..fdab408b3
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/convolution_winograd.py
@@ -0,0 +1,179 @@
+import math
+
+import numpy as np
+
+from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class ApplyWinogradKernelTransformation(OptimizerPass):
+    '''
+    Transforms the weights of a Conv2D kernel to a format suitable for Winograd convolution
+    For further information, refer to Lavin & Gray, 2015 - Fast Algorithms for Convolutional Neural Networks
+    '''
+
+    def match(self, node):
+        node_matches = isinstance(node, (Conv1D, Conv2D))
+
+        # This optimizer works only after the Resource Strategy Optimizer, since the order of transposition matters
+        weights_transformed = node.get_attr('_weights_transposed', False) is True
+
+        # User opted for Winograd
+        implementation_is_winograd = (
+            node.get_attr('implementation', 'combination') == 'combination'
+            or node.get_attr('implementation', 'combination') == 'winograd'
+        )
+
+        parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel'
+
+        # Winograd algorithm-specific conditions
+        if isinstance(node, Conv1D):
+            # Winograd only applies to specific kernel sizes
+            # Current implementation only supports fs = 3; easily extendable to other filter sizes
+            filter_size_matches = node.get_attr('filt_width', 3) == 3
+
+            # Winograd's minimal filtering algorithm doesn't work with stride != 1
+            stride_is_one = node.get_attr('stride_width', 1) == 1
+
+            # HLS Compiler fails to pipeline the entire component if the Winograd loop only executes once
+            loop_itr_gt_one = node.get_attr('out_width') > 2
+
+            winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type
+
+        elif isinstance(node, Conv2D):
+            # Winograd only applies to specific kernel sizes
+            # Current implementation only supports fs = 3; easily extendable to other filter sizes
+            filter_size_matches = node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3
+
+            # Winograd's minimal filtering algorithm doesn't work with stride != 1
+            stride_is_one = node.get_attr('stride_height', 1) == 1 and node.get_attr('stride_width', 1) == 1
+
+            # HLS Compiler fails to pipeline the entire component if the Winograd loop only executes once
+            loop_itr_gt_one = node.get_attr('out_height') > 2 and node.get_attr('out_width') > 2
+
+            padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr(
+                'pad_left', 0
+            ) == node.get_attr('pad_right', 0)
+
+            winograd_conditions = (
+                filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type
+            )
+
+        else:
+            winograd_conditions = False
+
+        # Check any previous transformations
+        already_transformed = node.get_attr('_winograd_transformation_applied', False) is True
+
+        if not winograd_conditions and node.get_attr('implementation', 'combination') == 'winograd':
+            raise RuntimeError(
+                'Not possible to use Winograd algorithm with current architecture. '
+                'Please set implementation to im2col or combination'
+            )
+
+        return (
+            node_matches
+            and weights_transformed
+            and winograd_conditions
+            and not already_transformed
+            and implementation_is_winograd
+        )
+
+    def transform(self, model, node):
+        if isinstance(node, Conv1D):
+            if node.get_attr('filt_width', 3) == 3:
+                # First, transpose to a format suitable for the Winograd algorithm (F, C, W)
+                # Note, this assumes a format post-resource strategy optimizer, that is (F, W, C)
+                # Therefore, (F, W, C) => (F, C, W)
+                node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 2, 1])
+
+                # Temporary copy of data
+                weights = node.weights['weight'].data
+
+                # Expand weight dimensionality (3) => (4)
+                node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4))
+
+                # Transformation matrices for 3x1 kernels
+                G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
+
+                # Transformation Gf
+                for filter in range(0, weights.shape[0]):
+                    for channel in range(0, weights.shape[1]):
+                        node.weights['weight'].data[filter][channel] = np.matmul(G, weights[filter][channel])
+                node.weights['weight'].data_length = node.weights['weight'].data.size
+                # need to always be consistent
+                node.weights['weight'].type.n_elem = node.weights['weight'].data_length
+
+                # Winograd's minimal filtering algorithm transforms the weight matrix
+                # This transformation consists of addition and division (by 2 and 4) of the weight matrix
+                # Therefore, increase precision (if needed), to accommodate the new weights
+                # This error is only noticeable for low precisions, such as those used with QKeras
+
+                # Integer precision is only updated if it exceeds the one defined in the hls4ml config
+                maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max()))
+                if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer:
+                    node.weights['weight'].type.precision.width += (
+                        maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer
+                    )
+                    node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1
+
+                # Fractional precision is increased by 2 bits (division by 4),
+                # for low-precision (less than 8) fractional weights
+                if node.weights['weight'].type.precision.fractional < 8:
+                    node.weights['weight'].type.precision.width += 2
+
+                # Modified kernel size
+                node.set_attr('impl_filt_width', 4)
+
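+        # For intuition (illustrative arithmetic, not executed): for a 3-tap kernel
+        # g = [g0, g1, g2], the product G @ g above gives the four transformed taps
+        #     [g0, (g0 + g1 + g2) / 2, (g0 - g1 + g2) / 2, g2]
+        # e.g. g = [1, 2, 3] maps to [1, 3, 1, 3]. The divisions by 2 (and by 4 in the
+        # 2D case below, where G @ g @ G.T is applied) motivate the precision widening.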
+        elif isinstance(node, Conv2D):
+            if node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3:
+                # First, transpose to a format suitable for the Winograd algorithm (F, C, H, W)
+                # Note, this assumes a format post-resource strategy optimizer, that is (F, H, W, C)
+                # Therefore, (F, H, W, C) => (F, C, H, W)
+                node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 3, 1, 2])
+
+                # Temporary copy of data
+                weights = node.weights['weight'].data
+
+                # Expand weight dimensionality (3x3) => (4x4)
+                node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4, 4))
+
+                # Transformation matrices for 3x3 kernels
+                G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
+                GT = np.array([[1, 0.5, 0.5, 0], [0, 0.5, -0.5, 0], [0, 0.5, 0.5, 1]])
+
+                # Transformation GfG'
+                for filter in range(0, weights.shape[0]):
+                    for channel in range(0, weights.shape[1]):
+                        node.weights['weight'].data[filter][channel] = np.matmul(np.matmul(G, weights[filter][channel]), GT)
+                node.weights['weight'].data_length = node.weights['weight'].data.size
+                # need to always be consistent
+                node.weights['weight'].type.n_elem = node.weights['weight'].data_length
+
+                # Winograd's minimal filtering algorithm transforms the weight matrix
+                # This transformation consists of addition and division (by 2 and 4) of the weight matrix
+                # Therefore, increase precision (if needed), to accommodate the new weights
+                # This error is only noticeable for low precisions, such as those used with QKeras
+
+                # Integer precision is only updated if it exceeds the one defined in the hls4ml config
+                maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max()))
+                if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer:
+                    node.weights['weight'].type.precision.width += (
+                        maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer
+                    )
+                    node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1
+
+                # Fractional precision is increased by 2 bits (division by 4),
+                # for low-precision (less than 8) fractional weights
+                if node.weights['weight'].type.precision.fractional < 8:
+                    node.weights['weight'].type.precision.width += 2
+
+                # Modified kernel size
+                node.set_attr('impl_filt_height', 4)
+                node.set_attr('impl_filt_width', 4)
+        else:
+            raise Exception(f'Unexpected layer {node.class_name} with Winograd kernel optimizer')
+
+        node.set_attr('_winograd_transformation_applied', True)
+
+        return False
diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
new file mode 100644
index 000000000..5ccf1a521
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -0,0 +1,351 @@
+from hls4ml.backends.backend import get_backend
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
+
+# Dense templates
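+# Worked example for the resource-sharing constants in the config below (illustrative
+# numbers): with n_in = 16, n_out = 8 and reuse_factor = 4,
+#     block_factor      = DIV_ROUNDUP(16 * 8, 4) = 32  (weight block handled per reuse step)
+#     multiplier_factor = MIN(16, 4)             = 4
+#     multiplier_limit  = DIV_ROUNDUP(16 * 8, 4) = 32  (physical multipliers instantiated)
+#     multiplier_scale  = 32 / 8                 = 4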
+dense_config_template = """struct config{index} : nnet::dense_config {{
+    static constexpr unsigned n_in = {n_in};
+    static constexpr unsigned n_out = {n_out};
+    static constexpr unsigned io_type = nnet::{iotype};
+    static constexpr unsigned n_zeros = {nzeros};
+    static constexpr unsigned n_nonzeros = {nonzeros};
+    static constexpr bool store_weights_in_bram = false;
+
+    static constexpr unsigned rf_pad = {rfpad};
+    static constexpr unsigned bf_pad = {bfpad};
+
+    static constexpr unsigned reuse_factor = {reuse};
+    static constexpr unsigned compressed_block_factor = DIV_ROUNDUP(n_nonzeros, reuse_factor);
+    static constexpr unsigned reuse_factor_rounded = reuse_factor + rf_pad;
+    static constexpr unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor);
+    static constexpr unsigned block_factor_rounded = block_factor + bf_pad;
+    static constexpr unsigned multiplier_factor = MIN(n_in, reuse_factor);
+    static constexpr unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor);
+    static constexpr unsigned multiplier_scale = multiplier_limit/n_out;
+
+    typedef {accum_t.name} accum_t;
+    typedef {bias_t.name} bias_t;
+    typedef {weight_t.name} weight_t;
+    typedef {index_t.name} index_t;
+
+    template <class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
+}};\n"""
+
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+dense_task_sequence_template = 'task_sequence<nnet::dense_resource_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+dense_stream_function_template = '{name}.async({w}, {b});'
+dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
+
+
+class DenseConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__(Dense)
+        self.template = dense_config_template
+
+    def format(self, node):
+        params = self._default_config_params(node)
+        params['nzeros'] = node.get_weights('weight').nzeros
+        params['nonzeros'] = node.get_weights('weight').nonzeros
+        params['product_type'] = get_backend('oneAPI').product_type(
+            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
+        )
+
+        return self.template.format(**params)
+
+
+class DenseFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Dense, include_header=dense_include_list)
+        self.template = dense_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class DenseTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(Dense)
+        self.template = dense_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+
+        return self.template.format(**params)
+
+
+class DenseStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Dense)
+        self.template = dense_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+# BatchNormalization templates
+
+batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{
+    static constexpr unsigned n_in = {n_in};
+    static constexpr unsigned n_filt = {n_filt};
+    static constexpr unsigned io_type = nnet::{iotype};
+    static constexpr unsigned reuse_factor = {reuse};
+    static constexpr bool store_weights_in_bram = false;
+    typedef {bias_t.name} bias_t;
+    typedef {scale_t.name} scale_t;
+    template <class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
+}};\n"""
+
+batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'
+batchnorm_task_sequence_template = 'task_sequence<nnet::normalize_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+batchnorm_stream_function_template = '{name}.async({scale}, {bias});'
+batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h']
+
+
+class BatchNormalizationConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        
super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class BatchNormalizationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class BatchNormalizationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +param_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; + typedef {param_t.name} param_t; +}};\n""" + +hard_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr {slope_t.name} slope = {slope}; + static constexpr {shift_t.name} shift = {shift}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; +}};\n""" + +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {exp_table_t.name} exp_table_t; + typedef {inv_table_t.name} inv_table_t; +}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' +param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' + +activ_task_sequence_template = 'task_sequence> {name};' +activ_stream_function_template = '{name}.async();' +param_activ_stream_function_template = '{name}.async({param});' + +activ_include_list = 
['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Activation) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class ParamActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ParametrizedActivation, PReLU)) + self.template = param_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class SoftmaxConfigTemplate(ActivationConfigTemplate): + def __init__(self): + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + self.template = softmax_config_template + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['param'] = node.get_attr('activ_param', 1.0) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class PReLUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PReLU, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['param'] = node.get_weights('param').name + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax, PReLU)) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + return self.template.format(**params) + + +class ParametrizedActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(ParametrizedActivation) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['config'] = 
f"{node.get_attr('activation')}_config{node.index}"
+        return self.template.format(**params)
+
+
+class ActivationStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__((Activation, HardActivation, Softmax))
+        self.template = activ_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        return self.template.format(**params)
+
+
+class ParametrizedActivationStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(ParametrizedActivation)
+        self.template = param_activ_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['param'] = node.get_attr('activ_param', 1.0)
+        return self.template.format(**params)
+
+
+class PReLUActivationStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(PReLU)
+        self.template = param_activ_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['param'] = node.get_weights('param').name
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/embedding_templates.py b/hls4ml/backends/oneapi/passes/embedding_templates.py
new file mode 100644
index 000000000..6fda678f0
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/embedding_templates.py
@@ -0,0 +1,32 @@
+"""
+These are the oneAPI stream templates for embedding layers. The io_parallel ones are in backends/fpga/passes/embedding.py.
+"""
+
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.model.layers import Embedding
+
+embed_task_sequence_template = 'task_sequence<nnet::embedding_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+embed_stream_function_template = '{name}.async({e});'
+
+
+class EmbeddingTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(Embedding)
+        self.template = embed_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+
+        return self.template.format(**params)
+
+
+class EmbeddingStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Embedding)
+        self.template = embed_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['e'] = node.get_weights('embeddings').name
+
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/merge_templates.py b/hls4ml/backends/oneapi/passes/merge_templates.py
new file mode 100644
index 000000000..c38e1e055
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/merge_templates.py
@@ -0,0 +1,137 @@
+from hls4ml.backends.backend import get_backend
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import Concatenate, Dot, Merge
+
+# TODO - Very similar to vivado/merge_templates.py - only difference is on line 67:
+# TODO - get_backend('vivado').product_type(inp1.type.precision, inp2.type.precision)
+# TODO - Look into ways of sharing similar passes across many backends in a shared folder through inheritance and overriding.
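+# A possible shape for that sharing (hypothetical sketch, not part of this PR): keep the
+# format() logic in a common base class and let each backend override only the backend name
+# used for product selection, e.g.
+#
+#     class DotConfigTemplateBase(LayerConfigTemplate):
+#         backend_name = None  # e.g. 'oneAPI' or 'Vivado', set by the backend's subclass
+#
+#         def format(self, node):
+#             inp1 = node.get_input_variable(node.inputs[0])
+#             inp2 = node.get_input_variable(node.inputs[1])
+#             params = self._default_config_params(node)
+#             params['n_out'] = 1
+#             params['n_in'] = inp1.shape[0]
+#             params['product_type'] = get_backend(self.backend_name).product_type(
+#                 inp1.type.precision, inp2.type.precision
+#             )
+#             return self.template.format(**params)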
+ +# Merge templates +merge_config_template = """struct config{index} : nnet::merge_config {{ + static const unsigned n_elem = {n_elem}; +}};\n""" + +merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' + +merge_task_sequence_template = ( + 'task_sequence> {name};' +) + +merge_stream_function_template = '{name}.async();' + +merge_include_list = ['nnet_utils/nnet_merge.h', 'nnet_utils/nnet_merge_stream.h'] + + +class MergeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Merge) + self.template = merge_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_elem'] = node.get_input_variable(node.inputs[0]).size_cpp() + + return self.template.format(**params) + + +class MergeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot), include_header=merge_include_list) + self.template = merge_function_template + + def format(self, node): + params = self._default_function_params(node) + params['merge'] = node.get_attr('op').lower() + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + + return self.template.format(**params) + + +class MergeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot)) + self.template = merge_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['merge'] = node.get_attr('op').lower() + params['input1_pipe'] = node.get_input_variable(node.inputs[0]).pipe_name + params['input2_pipe'] = node.get_input_variable(node.inputs[1]).pipe_name + return self.template.format(**params) + + +class MergeStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot)) + self.template = merge_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +# Dot templates +dot_config_template = """struct config{index} : nnet::dot_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned reuse_factor = {reuse}; + + typedef {accum_t.name} accum_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + + +class DotConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dot) + self.template = dot_config_template + + def format(self, node): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + params = self._default_config_params(node) + params['n_out'] = 1 + params['n_in'] = inp1.shape[0] + params['product_type'] = get_backend('oneAPI').product_type(inp1.type.precision, inp2.type.precision) + + return self.template.format(**params) + + +# Concatenate templates +concat_config_template = """struct config{index} : nnet::concat_config {{ + static const unsigned n_elem1_0 = {n_elem1_0}; + static const unsigned n_elem1_1 = {n_elem1_1}; + static const unsigned n_elem1_2 = {n_elem1_2}; + static const unsigned n_elem2_0 = {n_elem2_0}; + static const unsigned n_elem2_1 = {n_elem2_1}; + static const unsigned n_elem2_2 = {n_elem2_2}; + + static const int axis = {axis}; +}};\n""" + + +class 
ConcatenateConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__(Concatenate)
+        self.template = concat_config_template
+
+    def format(self, node):
+        params = self._default_config_params(node)
+        for i in range(3):
+            params.setdefault(f'n_elem1_{i}', 0)
+            params.setdefault(f'n_elem2_{i}', 0)
+        inp1 = node.get_input_variable(node.inputs[0])
+        inp2 = node.get_input_variable(node.inputs[1])
+        for i, (s1, s2) in enumerate(zip(inp1.shape, inp2.shape)):
+            params[f'n_elem1_{i}'] = s1
+            params[f'n_elem2_{i}'] = s2
+
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py
new file mode 100644
index 000000000..ccf410d1f
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/pointwise.py
@@ -0,0 +1,156 @@
+from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.oneapi.passes.convolution_templates import (
+    Conv1DConfigTemplate,
+    Conv2DConfigTemplate,
+    conv1d_config_template,
+    conv2d_config_template,
+    conv_mult_config_template,
+)
+from hls4ml.backends.template import FunctionCallTemplate
+from hls4ml.model.layers import register_layer
+from hls4ml.model.optimizer import OptimizerPass
+
+'''
+Custom hls4ml layer implementation for 1x1 Conv filters using im2col
+Allows lower latency and resource usage, due to fewer loop invocations
+'''
+
+pointwise_conv1d_function_template = (
+    'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+)
+pointwise_conv2d_function_template = (
+    'nnet::pointwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+)
+
+pointwise_conv1d_task_sequence_template = (
+    'task_sequence<nnet::pointwise_conv_1d_{data_format}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+)
+
+pointwise_conv2d_task_sequence_template = (
+    'task_sequence<nnet::pointwise_conv_2d_{data_format}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+)
+
+pointwise_conv_stream_function_template = '{name}.async({w}, {b});'
+
+sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h']
+sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h']
+
+
+class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate):
+    def __init__(self):
+        super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D)
+        self.template = conv1d_config_template
+        self.mult_template = conv_mult_config_template
+
+
+class PointwiseConv1DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv1D, include_header=sepconv1d_include_list)
+        self.template = pointwise_conv1d_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class PointwiseConv1DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv1D)
+        self.template = pointwise_conv1d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
+
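+# Illustrative note: OptimizePointwiseConv (bottom of this file) re-creates an io_parallel
+# 1x1 Conv1D/Conv2D node as PointwiseConv1D/PointwiseConv2D, at which point the templates in
+# this file take over from the generic conv ones. A rendered call might look like (sketch
+# only; config4, w4 and b4 are placeholder names depending on the node):
+#
+#     nnet::pointwise_conv_1d_cl<input_t, result_t, config4>(input, output, w4, b4);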
+class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate):
+    def __init__(self):
+        super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D)
+        self.template = conv2d_config_template
+        self.mult_template = conv_mult_config_template
+
+
+class PointwiseConv2DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv2D, include_header=sepconv2d_include_list)
+        self.template = pointwise_conv2d_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class PointwiseConv2DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv2D)
+        self.template = pointwise_conv2d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
+
+
+class PointwiseConvStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__((PointwiseConv1D, PointwiseConv2D))
+        self.template = pointwise_conv_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+def register_pointwise(backend):
+    # Register the layer types to the layer map
+    register_layer('PointwiseConv1D', PointwiseConv1D)
+    register_layer('PointwiseConv2D', PointwiseConv2D)
+
+    # Register the optimization passes
+    backend.register_pass('optimize_pointwise_conv', OptimizePointwiseConv)
+
+    # Register template passes
+    backend.register_template(PointwiseConv1DConfigTemplate)
+    backend.register_template(PointwiseConv1DFunctionTemplate)
+    backend.register_template(PointwiseConv2DConfigTemplate)
+    backend.register_template(PointwiseConv2DFunctionTemplate)
+
+
+class OptimizePointwiseConv(OptimizerPass):
+    def match(self, node):
+        return (
+            node.class_name in ('Conv1D', 'Conv2D')
+            and node.get_attr('filt_height', 1) == 1
+            and node.get_attr('filt_width') == 1
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+        )
+
+    def transform(self, model, node):
+        dim = node.__class__.__name__[-2:]  # '1D' or '2D'
+        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        pw_node = model.make_node(
+            'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
+        )
+        model.replace_node(node, pw_node)
+
+        return True
diff --git a/hls4ml/backends/oneapi/passes/pooling_templates.py b/hls4ml/backends/oneapi/passes/pooling_templates.py
new file mode 100644
index 000000000..97136ed84
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/pooling_templates.py
@@ -0,0 +1,153 @@
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import GlobalPooling1D, GlobalPooling2D, Pooling1D, Pooling2D
+
+pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{
+    static const unsigned stride_width = {stride_width};
+    static const unsigned pool_width = {pool_width};
+
+    static const unsigned n_in = {n_in};
+    static const unsigned n_out = {n_out};
+    static const unsigned filt_width = {pool_width};
+
+    static const 
unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned in_width = {n_in}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned pool_height = {pool_height}; + static const unsigned pool_width = {pool_width}; + static const unsigned filt_height = {pool_height}; + static const unsigned filt_width = {pool_width}; + + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +global_pooling1d_function_template = ( + 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) +global_pooling2d_function_template = ( + 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) + +pooling1d_task_sequence_template = ( + 'task_sequence>({name});' +) +pooling2d_task_sequence_template = ( + 'task_sequence>({name});' +) +global_pooling1d_task_sequence_template = ( + 'task_sequence>({name});' +) +global_pooling2d_task_sequence_template = ( + 'task_sequence>({name});' +) + +pooling_stream_function_template = '{name}.async();' + +pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h'] + + +class PoolingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_config_template, + 'Pooling2D': pooling2d_config_template, + 'GlobalPooling1D': global_pooling1d_config_template, + 'GlobalPooling2D': global_pooling2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class PoolingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + 
super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D), include_header=pooling_include_list) + self.templates = { + 'Pooling1D': pooling1d_function_template, + 'Pooling2D': pooling2d_function_template, + 'GlobalPooling1D': global_pooling1d_function_template, + 'GlobalPooling2D': global_pooling2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for oneAPI') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) + + +class PoolingTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_task_sequence_template, + 'Pooling2D': pooling2d_task_sequence_template, + 'GlobalPooling1D': global_pooling1d_task_sequence_template, + 'GlobalPooling2D': global_pooling2d_task_sequence_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for oneAPI') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) + + +class PoolingStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.template = pooling_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/quantization_templates.py b/hls4ml/backends/oneapi/passes/quantization_templates.py new file mode 100644 index 000000000..c46e17485 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/quantization_templates.py @@ -0,0 +1,63 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.oneapi.passes.core_templates import ( + batchnorm_config_template, + batchnorm_function_template, + batchnorm_include_list, + batchnorm_stream_function_template, + batchnorm_task_sequence_template, +) +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.optimizer.passes.qkeras import ApplyAlpha + + +class ApplyAlphaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class ApplyAlphaFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class ApplyAlphaTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_task_sequence_template + + def format(self, node): + params = 
self._default_function_params(node) + + return self.template.format(**params) + + +class ApplyAlphaStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/recurrent_templates.py b/hls4ml/backends/oneapi/passes/recurrent_templates.py new file mode 100644 index 000000000..00cd16879 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/recurrent_templates.py @@ -0,0 +1,369 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GRU, LSTM, SimpleRNN + +# Note: currently only GRU is supported for stream; lstm and simpleRNN are parallel-only + +recurrent_include_list = ['nnet_utils/nnet_recurrent.h', 'nnet_utils/nnet_recurrent_stream.h'] + +################################################ +# Shared Matrix Multiplication Template (Dense) +################################################ +recr_mult_x_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +recr_mult_h_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {recurrent_bias_t.name} bias_t; + typedef {recurrent_weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +################################################ +# Shared Activation Template +################################################ +activ_config_template = '''struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = 
nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n''' + +################################################ +# GRU Template +################################################ +gru_config_template = '''struct config{index} : nnet::gru_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_units = {n_units}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned n_outputs = {n_outputs}; + static const bool return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + typedef {recurrent_weight_t.name} recurrent_weight_t; + typedef {recurrent_bias_t.name} recurrent_bias_t; + + typedef {config_mult_x} mult_config_x; + typedef {config_mult_h} mult_config_h; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n''' + +gru_function_template = 'nnet::gru<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' +gru_task_sequence_template = 'task_sequence> {name};' +gru_stream_function_template = '{name}.async({w}, {wr}, {b}, {br});' + + +class GRUConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(GRU) + self.gru_template = gru_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + self.mult_x_template = recr_mult_x_config_template + self.mult_h_template = recr_mult_h_config_template + + def format(self, node): + # Input has shape (n_timesteps, inp_dimensionality) + # Output / hidden units has shape (1 if !return_sequences else n_timesteps , n_units) + params = self._default_config_params(node) + params['n_units'] = node.get_attr('n_out') + params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + params['return_sequences'] = 'true' if node.get_attr('return_sequences', False) else 'false' + params['config_mult_x'] = f'config{node.index}_x_mult' + params['config_mult_h'] = f'config{node.index}_h_mult' + params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + params['act_recurrent_t'] = '{}_config{}'.format(node.get_attr('recurrent_activation'), str(node.index) + '_rec_act') + gru_config = self.gru_template.format(**params) + + # Activation is on candidate hidden state, dimensionality (1, n_units) + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + # Recurrent activation is on reset and update gates (therefore x2), dimensionality (1, n_units) + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = str(node.get_attr('n_out')) + ' * 2' + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + # Multiplication config for matrix multiplications of type Wx (reset, update and candidate states) + mult_params_x = self._default_config_params(node) + mult_params_x['n_in'] = 
node.get_attr('n_in') + mult_params_x['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_x['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_params_x['index'] = str(node.index) + '_x' + mult_config_x = self.mult_x_template.format(**mult_params_x) + + # Multiplication config for matrix multiplications of type Wh (reset, update and candidate states) + mult_params_h = self._default_config_params(node) + mult_params_h['n_in'] = node.get_attr('n_out') + mult_params_h['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_h['reuse_factor'] = params['recurrent_reuse_factor'] + mult_params_h['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision + ) + mult_params_h['index'] = str(node.index) + '_h' + mult_config_h = self.mult_h_template.format(**mult_params_h) + + return mult_config_x + '\n' + mult_config_h + '\n' + recr_act_config + '\n' + act_config + '\n' + gru_config + + +class GRUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(GRU, include_header=recurrent_include_list) + self.template = gru_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + return self.template.format(**params) + + +class GRUTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(GRU) + self.template = gru_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class GRUStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(GRU) + self.template = gru_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + + return self.template.format(**params) + + +################################################ +# LSTM Template +################################################ +lstm_config_template = """struct config{index} : nnet::lstm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_i_t.name} weight_i_t; + typedef {bias_i_t.name} bias_i_t; + typedef {weight_f_t.name} weight_f_t; + typedef {bias_f_t.name} bias_f_t; + typedef {weight_c_t.name} weight_c_t; + typedef {bias_c_t.name} bias_c_t; + typedef {weight_o_t.name} weight_o_t; + typedef {bias_o_t.name} bias_o_t; + typedef {recurrent_weight_i_t.name} recurrent_weight_i_t; + typedef {recurrent_weight_f_t.name} recurrent_weight_f_t; + typedef {recurrent_weight_c_t.name} recurrent_weight_c_t; + typedef {recurrent_weight_o_t.name} recurrent_weight_o_t; + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static 
const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +lstm_function_template = 'nnet::lstm<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class LSTMConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(LSTM) + self.template = lstm_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + lstm_params = self._default_config_params(node) + lstm_params['n_in'] = node.get_attr('n_in') + lstm_params['n_out'] = node.get_attr('n_out') + lstm_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + + lstm_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + lstm_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + lstm_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + lstm_config = self.template.format(**lstm_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + lstm_config + + +class LSTMFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(LSTM, include_header=recurrent_include_list) + self.template = lstm_function_template + + def format(self, node): + params = self._default_function_params(node) + + types = ['i', 'f', 'c', 'o'] + params['weights'] = '' + for t in types: + params['weights'] += f'kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += f'recurrent_kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += 'bias_{}_{}{}'.format(t, str(node.index), ',' if t != 'o' else '') + + return self.template.format(**params) + + +################################################ +# SimpleRNN Template +################################################ +simple_rnn_config_template = """struct config{index} : nnet::simpleRNN_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_outputs = {n_outputs}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + typedef {recurrent_weight_t.name} recurrent_weight_t; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +simple_rnn_function_template = 'nnet::simple_rnn<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class SimpleRNNConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SimpleRNN) + self.template = 
simple_rnn_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + simple_rnn_params = self._default_config_params(node) + simple_rnn_params['n_in'] = node.get_attr('n_in') + simple_rnn_params['n_out'] = node.get_attr('n_out') + simple_rnn_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + simple_rnn_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + simple_rnn_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + simple_rnn_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + simple_rnn_params['recurrent_activation'] = 'relu' + + simple_rnn_config = self.template.format(**simple_rnn_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + simple_rnn_config + + +class SimpleRNNFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SimpleRNN, include_header=recurrent_include_list) + self.template = simple_rnn_function_template + + def format(self, node): + params = self._default_function_params(node) + params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index)) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/reshaping_templates.py b/hls4ml/backends/oneapi/passes/reshaping_templates.py new file mode 100644 index 000000000..85357cdb2 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/reshaping_templates.py @@ -0,0 +1,244 @@ +import numpy as np + +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Reshape, Resize, Transpose, ZeroPadding1D, ZeroPadding2D + +# ZeroPadding templates + +zeropad1d_config_template = """struct config{index} : nnet::padding1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad2d_config_template = """struct config{index} : nnet::padding2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' + 
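+# Note: with io_stream each reshaping layer is emitted in two parts: a task_sequence +# declaration (instantiated from the *_task_sequence_template strings below) and an +# asynchronous launch generated from reshaping_stream_function_template ('{name}.async();').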
+zeropad1d_task_sequence_template = ( + 'task_sequence> {name};' +) +zeropad2d_task_sequence_template = ( + 'task_sequence> {name};' +) + +reshaping_stream_function_template = '{name}.async();' + +padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h'] + + +class ZeroPaddingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_config_template, + 'ZeroPadding2D': zeropad2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D), include_header=padding_include_list) + self.templates = { + 'ZeroPadding1D': zeropad1d_function_template, + 'ZeroPadding2D': zeropad2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('oneAPI only supports channels_last data format') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_task_sequence_template, + 'ZeroPadding2D': zeropad2d_task_sequence_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +class ReshapingStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D, Resize, Reshape, Transpose)) + self.template = reshaping_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +# Resize templates + +resize_config_template = """struct config{index} : nnet::resize_config {{ + static const unsigned height = {in_height}; + static const unsigned width = {in_width}; + + static const unsigned new_height = {out_height}; + static const unsigned new_width = {out_width}; + + static const unsigned n_chan = {n_chan}; +}};\n""" + +resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {output_t}, {config}>({input}, {output});' +resize_task_sequence_template = ( + 'task_sequence> {name};' +) +resize_include_list = ['nnet_utils/nnet_resize.h', 'nnet_utils/nnet_resize_stream.h'] + + +class ResizeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Resize) + self.template = resize_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class ResizeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Resize, include_header=resize_include_list) + self.template = resize_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +class ResizeTaskSequenceTemplate(TaskSequenceTemplate): + def 
__init__(self): + super().__init__(Resize) + self.template = resize_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +# Transpose templates + +transpose_config_template = """struct config{index} : nnet::transpose_config {{ + static const unsigned depth = {depth}; + static const unsigned height = {height}; + static const unsigned width = {width}; + static constexpr unsigned perm[3] = {{{perm_str}}}; +}};\n""" + +transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' +transpose_task_sequence_template = ( + 'task_sequence> {name};' +) +transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h'] + + +class TransposeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class TransposeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Transpose, include_header=transpose_include_list) + self.template = transpose_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) + + +class TransposeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) + + +# Reshape template (only used in streaming) +reshape_task_sequence_template = 'task_sequence> {name};' +reshape_include_list = ['nnet_utils/nnet_stream.h'] + + +class ReshapeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Reshape) + + def format(self, node): + return '' + + +class ReshapeFunctionTemplate(FunctionCallTemplate): + """Only used to add the include list""" + + def __init__(self): + super().__init__(Reshape, include_header=reshape_include_list) + + def format(self, node): + return '' + + +class ReshapeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Reshape) + self.template = reshape_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['size'] = np.prod(node.get_output_variable().shape) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/resource_strategy.py b/hls4ml/backends/oneapi/passes/resource_strategy.py new file mode 100644 index 000000000..15af1d197 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/resource_strategy.py @@ -0,0 +1,77 @@ +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SimpleRNN +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyResourceStrategy(OptimizerPass): + '''Transposes the weights to use the dense_resource matrix multiply routine''' + + def match(self, node): + node_matches = isinstance(node, (Dense, Conv1D, Conv2D, GRU, LSTM, SimpleRNN)) + is_resource_strategy = ( + True # node.get_attr('strategy', '').lower() == 'resource' -> oneAPI only supports the Resource strategy + ) + already_transformed =
node.get_attr('_weights_transposed', False) is True + return node_matches and is_resource_strategy and not already_transformed + + def transform(self, model, node): + if isinstance(node, Dense) and not node.model.config.get_compression(node): + rf = node.get_attr('reuse_factor') + bf = int((node.attributes['n_in'] * node.attributes['n_out']) / rf) + bf_rounded = int(pow(2, np.ceil(np.log2(bf)))) + rf_rounded = int(pow(2, np.ceil(np.log2(rf)))) + + node.weights['weight'].data = np.transpose(node.weights['weight'].data).flatten() + + if node.attributes['n_in'] * node.attributes['n_out'] > 2048 and rf_rounded != rf: + node.set_attr('rfpad', rf_rounded - rf) + node.set_attr('bfpad', bf_rounded - bf) + + temp = np.empty([bf_rounded, rf_rounded]) + for i in range(rf_rounded): + for j in range(bf_rounded): + if i < rf and j < bf: + w_index = i + rf * j + temp[j][i] = node.weights['weight'].data[w_index] + else: + temp[j][i] = 0 + node.weights['weight'].data = temp.flatten() + node.weights['weight'].data_length = node.weights['weight'].data.size + + elif isinstance(node, Conv1D): + # (W,C,F) => (F,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[2, 0, 1]) + + elif isinstance(node, Conv2D): + # (H,W,C,F) => (F,H,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[3, 0, 1, 2]) + + elif isinstance(node, GRU): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, SimpleRNN): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, LSTM): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + for weight_type in ['i', 'f', 'c', 'o']: + node.weights[f'weight_{weight_type}'].data = np.transpose(node.weights[f'weight_{weight_type}'].data) + node.weights[f'recurrent_weight_{weight_type}'].data = np.transpose( + node.weights[f'recurrent_weight_{weight_type}'].data + ) + + else: + raise Exception(f'Unexpected layer {node.class_name} with resource strategy') + node.set_attr('_weights_transposed', True) + return False diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py new file mode 100644 index 000000000..8a90bad82 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -0,0 +1,60 @@ +from hls4ml.backends.oneapi.oneapi_types import ( + OneAPIACTypeConverter, + OneAPIArrayVariableConverter, + OneAPIHLSTypeConverter, + OneAPIInplaceArrayVariableConverter, + OneAPIInplaceStreamVariableConverter, + OneAPIInterfaceVariableConverter, + OneAPIStaticWeightVariableConverter, + OneAPIStreamVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + +# from hls4ml.utils.string_utils import convert_to_pascal_case + + 
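+# TransformTypes converts each node's output variables, weights, and named types +# to their oneAPI representations: pipe-backed variables for io_stream interfaces, +# 'intel::fpga_register' arrays for io_parallel, and static weight variables.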
+class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = OneAPIHLSTypeConverter(precision_converter=OneAPIACTypeConverter()) + self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) + self.interface_var_converter = OneAPIInterfaceVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = OneAPIStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = OneAPIInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = OneAPIStaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var, pragma='stream') + else: + new_var = self.stream_var_converter.convert(var, pragma='stream') + elif io_type == 'io_parallel': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='intel::fpga_register') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) diff --git a/hls4ml/backends/quartus/passes/bn_quant.py b/hls4ml/backends/quartus/passes/bn_quant.py new file mode 100644 index 000000000..3224b0002 --- /dev/null +++ b/hls4ml/backends/quartus/passes/bn_quant.py @@ -0,0 +1,169 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and 
input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/backends/quartus/passes/convolution_templates.py b/hls4ml/backends/quartus/passes/convolution_templates.py index 75f8ca687..d1c36fe1b 100644 --- a/hls4ml/backends/quartus/passes/convolution_templates.py +++ b/hls4ml/backends/quartus/passes/convolution_templates.py @@ -46,7 +46,7 @@ static const unsigned dilation = {dilation}; static const unsigned reuse_factor = {reuse}; - static const unsigned parallelisation_factor = {parallelization}; + static const unsigned parallelization_factor = {parallelization}; static const bool store_weights_in_bram = false; static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation}; @@ -127,7 +127,7 @@ def format(self, node): static const unsigned stride_width = {stride_width}; static const unsigned reuse_factor = {reuse}; - static const unsigned parallelisation_factor = {parallelization}; + static const unsigned parallelization_factor = {parallelization}; static const bool store_weights_in_bram = false; static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation}; diff --git 
a/hls4ml/backends/template.py b/hls4ml/backends/template.py index 9638b53ad..f7f6fe313 100644 --- a/hls4ml/backends/template.py +++ b/hls4ml/backends/template.py @@ -2,6 +2,14 @@ class Template(OptimizerPass): + """The Template base class; it should not be instantiated directly. + + Args: + name (str): Name of the template. + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + attribute_name (str): The type of attribute provided. + """ + def __init__(self, name, layer_class, attribute_name): self.name = name self.layer_class = layer_class @@ -36,6 +44,12 @@ def _default_params(self, node): class LayerConfigTemplate(Template): + """Base class for layer config templates: provides the 'config_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + def __init__(self, layer_class): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) @@ -53,6 +67,13 @@ def _default_config_params(self, layer): class FunctionCallTemplate(Template): + """Base class for function call templates: provides the 'function_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + include_header (list, tuple, or set of str, or None): The list of needed include files + """ + def __init__(self, layer_class, include_header=None): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) diff --git a/hls4ml/backends/vivado/passes/bn_quant.py b/hls4ml/backends/vivado/passes/bn_quant.py new file mode 100644 index 000000000..3224b0002 --- /dev/null +++ b/hls4ml/backends/vivado/passes/bn_quant.py @@ -0,0 +1,169 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ?
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and 
input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index aff15808a..67798ae7b 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -62,6 +62,10 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): if layer['class_name'] != 'Activation': layer['activation'] = layer['class_name'] + + if layer['activation'] == 'elu': + layer['class_name'] = 'ELU' # always use ELU type for elu, even if passed as activation + if layer['class_name'] == 'LeakyReLU': layer['activ_param'] = keras_layer['config'].get('alpha', 0.3) elif layer['class_name'] == 'ThresholdedReLU': diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 1ceb6456b..8054f41ee 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -884,7 +884,7 @@ class HardActivation(Activation): def initialize(self): super().initialize() slope_prec = self.get_attr('slope_prec', FixedPrecisionType(width=16, integer=0, signed=False)) - shift_prec = self.get_attr('shift_prec', FixedPrecisionType(width=1, integer=0, signed=False)) + shift_prec = self.get_attr('shift_prec', 
FixedPrecisionType(width=2, integer=0, signed=False)) index = self.get_attr('index') slope_t = NamedType(f'slope{index}_t', precision=slope_prec) shift_t = NamedType(f'shift{index}_t', precision=shift_prec) diff --git a/hls4ml/model/optimizer/passes/stamp.py b/hls4ml/model/optimizer/passes/stamp.py index f29ae2a18..84bb466aa 100644 --- a/hls4ml/model/optimizer/passes/stamp.py +++ b/hls4ml/model/optimizer/passes/stamp.py @@ -1,3 +1,5 @@ +import uuid + from hls4ml.model.optimizer import ModelOptimizerPass @@ -9,11 +11,11 @@ def transform(self, model): def _make_stamp(): """Create a unique identifier for the generated code. This identifier is used to compile a unique library and link it with python.""" - from random import choice - from string import hexdigits length = 8 - return ''.join(choice(hexdigits) for m in range(length)) + + stamp = uuid.uuid4() + return str(stamp)[-length:] model.config.config['Stamp'] = _make_stamp() diff --git a/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py index 798542cfc..9374f4aef 100644 --- a/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py @@ -32,7 +32,7 @@ def layer_resources(self, layer_attributes): if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9: return [0] else: - # TOOD - Extend for parallelisation factor + # TODO - Extend for parallelization factor return [np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor] @classmethod @@ -117,7 +117,7 @@ def layer_resources(self, layer_attributes): if not layer_attributes.weight_shape: return [0] - # TOOD - Extend for parallelisation factor + # TODO - Extend for parallelization factor if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency': return [ int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor), diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt new file mode 100644 index 000000000..e2b386d70 --- /dev/null +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -0,0 +1,338 @@ +# Direct CMake to use icpx rather than the default C++ compiler/linker on Linux +# and icx-cl on Windows +if(UNIX) + set(CMAKE_CXX_COMPILER icpx) +else() # Windows + include (CMakeForceCompiler) + CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP) + include (Platform/Windows-Clang) +endif() + +cmake_minimum_required (VERSION 3.7.2) + +project(myproject CXX) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +############################################################################### +### Customize these build variables +############################################################################### +set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp) +set(LIBRARY_FILES src/firmware/myproject.cpp src/myproject_bridge.cpp) +set(LIB_STAMP mystamp) +set(TARGET_NAME myproject) +set(LIBRARY_NAME myproject-${LIB_STAMP}) + +# Use cmake -DFPGA_DEVICE=<board-support-package>:<board-variant> to choose a +# different device.
Here are a few device examples (this list is not +# exhaustive): +# intel_s10sx_pac:pac_s10 +# intel_s10sx_pac:pac_s10_usm +# intel_a10gx_pac:pac_a10 +# Note that depending on your installation, you may need to specify the full +# path to the board support package (BSP); this is usually in your install +# folder. +# +# You can also specify a device family (e.g. "Arria10" or "Stratix10") or a +# specific part number (e.g. "10AS066N3F40E2SG") to generate a standalone IP. +if(NOT DEFINED FPGA_DEVICE) + set(FPGA_DEVICE "Arria10") +endif() + +# Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend +# compilation. +set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) + +# Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation. +set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS}) + +# Use cmake -DUSER_INCLUDE_PATHS=<paths> to set extra paths for general +# compilation. +set(USER_INCLUDE_PATHS src;src/firmware;${USER_INCLUDE_PATHS}) + +############################################################################### +### no changes after here +############################################################################### + +# Print the device being used for the compiles +message(STATUS "Configuring the design to run on FPGA board ${FPGA_DEVICE}") + +# Set the names of the makefile targets to be generated by cmake +set(EMULATOR_TARGET fpga_emu) +set(SIMULATOR_TARGET fpga_sim) +set(REPORT_TARGET report) +set(FPGA_TARGET fpga) +set(IP_EXPORT_TARGET fpga_ip_export) +set(LIBRARY_TARGET lib) + +# Set the names of the generated files per makefile target +set(EMULATOR_OUTPUT_NAME ${TARGET_NAME}.${EMULATOR_TARGET}) +set(SIMULATOR_OUTPUT_NAME ${TARGET_NAME}.${SIMULATOR_TARGET}) +set(REPORT_OUTPUT_NAME ${TARGET_NAME}.${REPORT_TARGET}) +set(FPGA_OUTPUT_NAME ${TARGET_NAME}.${FPGA_TARGET}) +set(IP_EXPORT_OUTPUT_NAME ${TARGET_NAME}.${IP_EXPORT_TARGET}) + +message(STATUS "Additional USER_FPGA_FLAGS=${USER_FPGA_FLAGS}") +message(STATUS "Additional USER_FLAGS=${USER_FLAGS}") + +include_directories(${USER_INCLUDE_PATHS}) +message(STATUS "Additional USER_INCLUDE_PATHS=${USER_INCLUDE_PATHS}") + +link_directories(${USER_LIB_PATHS}) +message(STATUS "Additional USER_LIB_PATHS=${USER_LIB_PATHS}") + +link_libraries(${USER_LIBS}) +message(STATUS "Additional USER_LIBS=${USER_LIBS}") + +if(WIN32) + # add qactypes for Windows + set(QACTYPES "-Qactypes") + # This is a Windows-specific flag that enables exception handling in host code + set(WIN_FLAG "/EHsc") +else() + # add qactypes for Linux + set(QACTYPES "-qactypes") +endif() + +set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) +# for debugging need to do this. Not sure why +# set(COMMON_LINK_FLAGS -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) +set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) + +# A SYCL ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate +# representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. For +# this reason, FPGA backend flags must be passed as link flags in CMake.
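+# For example (a hypothetical invocation), a backend flag supplied on the command line as +# cmake .. -DFPGA_DEVICE=Arria10 -DUSER_FPGA_FLAGS=-Xsclock=300MHz +# takes effect through the *_LINK_FLAGS variables defined below, not at the compile stage.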
+set(EMULATOR_COMPILE_FLAGS -DFPGA_EMULATOR) +set(LIBRARY_COMPILE_FLAGS -DFPGA_EMULATOR) +set(EMULATOR_LINK_FLAGS ) +set(LIBRARY_LINK_FLAGS -L$ENV{FPGA_VARS_DIR}/host/linux64/lib) +set(REPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(REPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early) +set(SIMULATOR_COMPILE_FLAGS -Xssimulation -DFPGA_SIMULATOR) +set(SIMULATOR_LINK_FLAGS -Xssimulation -Xsghdl -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${SIMULATOR_OUTPUT_NAME}) +set(FPGA_COMPILE_FLAGS -DFPGA_HARDWARE) +set(FPGA_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_OUTPUT_NAME}) +# get rid of this once host pipes work properly +set(IP_EXPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early -fsycl-device-code-split=per_kernel) + +############################################################################### +### FPGA Emulator library +############################################################################### +add_library(${LIBRARY_TARGET} SHARED ${LIBRARY_FILES}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${LIBRARY_COMPILE_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${LIBRARY_LINK_FLAGS}) +set_target_properties(${LIBRARY_TARGET} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) + +############################################################################### +### FPGA Emulator +############################################################################### +add_executable(${EMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${EMULATOR_LINK_FLAGS}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${EMULATOR_OUTPUT_NAME}) + +############################################################################### +### FPGA Simulator +############################################################################### +add_executable(${SIMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${SIMULATOR_COMPILE_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${SIMULATOR_LINK_FLAGS}) +set_target_properties(${SIMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${SIMULATOR_OUTPUT_NAME}) + +############################################################################### +### Generate Report +############################################################################### +add_executable(${REPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${REPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${REPORT_TARGET} PRIVATE ${REPORT_COMPILE_FLAGS}) + +# The report target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_REPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_REPORT ${QACTYPES}) + +target_link_libraries(${REPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_REPORT}) +target_link_libraries(${REPORT_TARGET} ${REPORT_LINK_FLAGS}) +set_target_properties(${REPORT_TARGET} PROPERTIES OUTPUT_NAME 
${REPORT_OUTPUT_NAME}) + +############################################################################### +### FPGA Hardware +############################################################################### +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILES}) +target_compile_options(${FPGA_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${FPGA_TARGET} PRIVATE ${FPGA_COMPILE_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${FPGA_LINK_FLAGS}) +set_target_properties(${FPGA_TARGET} PROPERTIES OUTPUT_NAME ${FPGA_OUTPUT_NAME}) + +############################################################################### +### FPGA IP Export (only necessary until native host pipes) +############################################################################### +add_executable(${IP_EXPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${IP_EXPORT_COMPILE_FLAGS}) + +# The ip export target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_EXPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_EXPORT ${QACTYPES}) + +target_link_libraries(${IP_EXPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_EXPORT}) +target_link_libraries(${IP_EXPORT_TARGET} ${IP_EXPORT_LINK_FLAGS}) +set_target_properties(${IP_EXPORT_TARGET} PROPERTIES OUTPUT_NAME ${IP_EXPORT_OUTPUT_NAME}) + +############################################################################### +### This part only manipulates cmake variables to print the commands to the user +############################################################################### + +# set the correct object file extension depending on the target platform +if(WIN32) + set(OBJ_EXTENSION "obj") +else() + set(OBJ_EXTENSION "o") +endif() + +# Set the source file names in a string +set(SOURCE_FILE_NAME "${SOURCE_FILES}") + +function(getCompileCommands common_compile_flags special_compile_flags common_link_flags special_link_flags target output_name) + + set(file_names ${SOURCE_FILE_NAME}) + set(COMPILE_COMMAND ) + set(LINK_COMMAND ) + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH CURRENT_SOURCE_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${source}) + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Creating a string that contains the compile command + # Start by the compiler invocation + set(COMPILE_COMMAND "${COMPILE_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the potential includes + foreach(INCLUDE ${USER_INCLUDE_PATHS}) + if(NOT IS_ABSOLUTE ${INCLUDE}) + file(RELATIVE_PATH INCLUDE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${INCLUDE}) + endif() + set(COMPILE_COMMAND "${COMPILE_COMMAND} -I${INCLUDE}") + endforeach() + + # Add all the common compile flags + foreach(FLAG ${common_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific compile flags + foreach(FLAG ${special_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Get the location of the object file + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(COMPILE_COMMAND "${COMPILE_COMMAND} -c 
${CURRENT_SOURCE_FILE} -o ${OBJ_FILE}\n") + endforeach() + + set(COMPILE_COMMAND "${COMPILE_COMMAND}" PARENT_SCOPE) + + # Creating a string that contains the link command + # Start by the compiler invocation + set(LINK_COMMAND "${LINK_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the common link flags + foreach(FLAG ${common_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific link flags + foreach(FLAG ${special_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add the output file + set(LINK_COMMAND "${LINK_COMMAND} -o ${output_name}") + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(LINK_COMMAND "${LINK_COMMAND} ${OBJ_FILE}") + endforeach() + + # Add all the potential library paths + foreach(LIB_PATH ${USER_LIB_PATHS}) + if(NOT IS_ABSOLUTE ${LIB_PATH}) + file(RELATIVE_PATH LIB_PATH ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${LIB_PATH}) + endif() + if(NOT WIN32) + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH}") + else() + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH} -Wl,-rpath,${LIB_PATH}") + endif() + endforeach() + + # Add all the potential includes + foreach(LIB ${USER_LIBS}) + set(LINK_COMMAND "${LINK_COMMAND} -l${LIB}") + endforeach() + + set(LINK_COMMAND "${LINK_COMMAND}" PARENT_SCOPE) + +endfunction() + +# Windows executable is going to have the .exe extension +if(WIN32) + set(EXECUTABLE_EXTENSION ".exe") +endif() + +# Display the compile instructions in the emulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${EMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${EMULATOR_LINK_FLAGS}" "${EMULATOR_TARGET}" "${EMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayEmulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${EMULATOR_TARGET} displayEmulationCompileCommands) + +# Display the compile instructions in the simulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${SIMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${SIMULATOR_LINK_FLAGS}" "${SIMULATOR_TARGET}" "${SIMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displaySimulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${SIMULATOR_TARGET} displaySimulationCompileCommands) + +# Display the compile instructions in the report flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${REPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_REPORT}" "${REPORT_LINK_FLAGS}" "${REPORT_TARGET}" "${REPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayReportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${REPORT_TARGET} displayReportCompileCommands) + +# Display the compile instructions in the IP export flow (Remove after native host pipes work properly) +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${IP_EXPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_EXPORT}" "${IP_EXPORT_LINK_FLAGS}" "${IP_EXPORT_TARGET}" 
"${IP_EXPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayExportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${IP_EXPORT_TARGET} displayExportCompileCommands) + +# Display the compile instructions in the fpga flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${FPGA_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${FPGA_LINK_FLAGS}" "${FPGA_TARGET}" "${FPGA_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayFPGACompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${FPGA_TARGET} displayFPGACompileCommands) diff --git a/hls4ml/templates/oneapi/exception_handler.hpp b/hls4ml/templates/oneapi/exception_handler.hpp new file mode 100644 index 000000000..bb7976f61 --- /dev/null +++ b/hls4ml/templates/oneapi/exception_handler.hpp @@ -0,0 +1,21 @@ +#ifndef __EXCEPTIONHANDLER_HPP__ +#define __EXCEPTIONHANDLER_HPP__ +#include +#include +#include + +namespace fpga_tools { + +void exception_handler(sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl; + } + } +} + +} // namespace fpga_tools + +#endif //__EXCEPTIONHANDLER_HPP__ diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h new file mode 100644 index 000000000..05de507dc --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -0,0 +1,20 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include +#include +#include +#include + +// Include nnet::array - a custom array-like struct, mainly used with io_stream +#include "nnet_utils/nnet_types.h" + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert layer-precision + +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n < d ? 
d : n) + +#endif diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp new file mode 100644 index 000000000..06e7d3fe3 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -0,0 +1,24 @@ +#include "myproject.h" +#include "parameters.h" +#include + +// hls-fpga-machine-learning insert weights + +// The inter-task pipes need to be declared in the global scope +// hls-fpga-machine-learning insert inter-task pipes + +using sycl::ext::intel::experimental::task_sequence; + +void MyProject::operator()() const { + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning read in + + // hls-fpga-machine-learning declare task sequences + + // hls-fpga-machine-learning insert layers + + // hls-fpga-machine-learning return +} diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h new file mode 100644 index 000000000..082ae5dc8 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -0,0 +1,29 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "defines.h" + +// This file defines the interface to the kernel + +// currently this is fixed +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); + +// Need to declare the input and output pipes + +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs + +class MyProjectID; + +struct MyProject { + + // kernel property method to config invocation interface + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, + sycl::ext::intel::experimental::pipelined<>}; + } + + SYCL_EXTERNAL void operator()() const; +}; + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h new file mode 100644 index 000000000..ab1874ec1 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -0,0 +1,499 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "nnet_common.h" + +namespace nnet { + +struct activ_config { + // IO size + static constexpr unsigned n_in = 10; + + // Internal info + static constexpr unsigned table_size = 512; + + // Resource reuse info + static constexpr unsigned io_type = io_parallel; + static constexpr unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ac_fixed<16, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + res[ii] = datareg; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template void relu_max(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = 
MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(const data_T &data, res_T &res) { + relu_max(data, res); +} + +template void relu1(const data_T &data, res_T &res) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template void sigmoid(const data_T &data, res_T &res) { + static constexpr int MAX_VALUE = 8; +#include "activation_tables/sigmoid_table.tb" + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + [[intel::fpga_register]] typename data_T::value_type absoluteValue; + [[intel::fpga_register]] typename res_T::value_type temp2; + if (data[ii] < 0) { + absoluteValue = -data[ii]; + } else { + absoluteValue = data[ii]; + } + int index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (absoluteValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(sigmoid_table[index]); + if (data[ii] < 0) { + res[ii] = 1 - temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +template inline unsigned softmax_stable_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N - 1); + // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness + if (x != 0 && y == 0) + y[0] = 1; + return y.to_uint(); +} + +template inline unsigned softmax_latency_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N); + return y.to_uint(); +} + +template void softmax_stable(const data_T &data, res_T &res) { +// Look-up tables +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + // Find maximum + Op_max op_max; + [[intel::fpga_register]] auto x_max = + reduce>(data.data(), op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed + d_xi_xmax[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; + } + + // Explicitly sum previously calculated exponentials with an adder tree + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +// TODO - Improve accuracy +template void softmax_latency(const data_T &data, res_T &res) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + // Calculate all the e^x's 
+    // Calculate all the e^x's
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val<typename data_T::value_type, CONFIG_T>(data[i])];
+    }
+
+    // Explicitly sum the results with an adder tree.
+    Op_add<typename CONFIG_T::exp_table_t> op_add;
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
+        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+
+    // Multiply previously calculated exponentials with the reciprocal of the sum
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
+        invert_table_latency[softmax_latency_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        res[i] = exp_res[i] * inv_exp_sum;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void softmax_legacy(const data_T &data, res_T &res) {
+#include "activation_tables/exp_table_legacy.tb"
+#include "activation_tables/invert_table_legacy.tb"
+
+    [[intel::fpga_register]] int data_round[CONFIG_T::n_in];
+New_loop:
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        data_round[ii] = (data[ii] * CONFIG_T::table_size / 16).to_int();
+    }
+NN_Outer:
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        typename CONFIG_T::exp_table_t exp_res_temp = 0;
+    NN_Inner:
+        #pragma unroll
+        for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
+            if (ii == jj) {
+                exp_res_temp += 1;
+            } else {
+                int _data_cache = (data_round[jj] - data_round[ii]);
+                int index = _data_cache + 8 * CONFIG_T::table_size / 16;
+
+                if (index < 0)
+                    index = 0;
+                if (index > CONFIG_T::table_size - 1)
+                    index = CONFIG_T::table_size - 1;
+
+                typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index];
+                exp_res_temp += temp_exp;
+            }
+        }
+        int exp_res_index = (exp_res_temp * CONFIG_T::table_size / 64).to_int();
+        if (exp_res_index < 0)
+            exp_res_index = 0;
+        if (exp_res_index > CONFIG_T::table_size - 1)
+            exp_res_index = CONFIG_T::table_size - 1;
+        res[ii] = invert_table_legacy[exp_res_index];
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void softmax_argmax(const data_T &data, res_T &res) {
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_in; i++) {
+        res[i] = static_cast<typename res_T::value_type>(0);
+    }
+
+    [[intel::fpga_register]] auto maximum = data[0];
+    [[intel::fpga_register]] int idx = 0;
+
+    [[intel::initiation_interval(1)]] for (int i = 1; i < CONFIG_T::n_in; i++) {
+        if (data[i] > maximum) {
+            maximum = data[i];
+            idx = i;
+        }
+    }
+
+    res[idx] = static_cast<typename res_T::value_type>(1);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax(const data_T &data, res_T &res) {
+    switch (CONFIG_T::implementation) {
+    case softmax_implementation::stable:
+        softmax_stable<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::latency:
+        softmax_latency<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::legacy:
+        softmax_legacy<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::argmax:
+        softmax_argmax<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    default:
+        softmax_stable<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    }
+}
+
+// *************************************************
+// TanH Activation
+// *************************************************
+template <class data_T, class res_T, typename CONFIG_T> void dense_tanh(const data_T &data, res_T &res) {
+    static constexpr int MAX_VALUE = 4;
+// Initialize the lookup table
+#include "activation_tables/tanh_table.tb"
+    // Index into the lookup table based on data
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        [[intel::fpga_register]] typename data_T::value_type temp;
+        [[intel::fpga_register]] typename res_T::value_type temp2;
+        if (data[ii] < 0) {
+            temp = -data[ii];
+        } else {
+            temp = data[ii];
+        }
+        ac_int<16> index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int();
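+        // Worked example of the indexing above, with illustrative values: for
+        // table_size = 1024 and MAX_VALUE = 4 the scale factor is 256, so an
+        // input magnitude of 1.5 reads tanh_table[384], while any magnitude of
+        // 4 or more saturates to the last table entry via the check below.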
if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(tanh_table[index]); + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template void hard_sigmoid(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template void hard_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(const data_T &data, const typename CONFIG_T::param_t alpha, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(const data_T &data, const typename CONFIG_T::param_t theta, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template void softplus(const data_T &data, res_T &res) { +// Initialize the lookup table +#include "activation_tables/softplus_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<16> data_round = (data[ii] * CONFIG_T::table_size / 16).to_int(); + ac_int<16> index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = static_cast(softplus_table[index]); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template void softsign(const data_T &data, res_T &res) { + static constexpr int MAX_VALUE = 8; +// Initialize the lookup table +#include "activation_tables/softsign_table.tb" + + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + [[intel::fpga_register]] typename data_T::value_type temp; + [[intel::fpga_register]] typename res_T::value_type temp2; + if (data[ii] < 0) { + temp = -data[ii]; + } else { + temp = data[ii]; + } + ac_int<16> index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(softsign_table[index]); + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(const data_T &data, const typename CONFIG_T::param_t alpha, res_T &res) { +// Initialize the lookup 
table +#include "activation_tables/elu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +// ************************************************* +// SELU Activation +// ************************************************* +template void selu(const data_T &data, res_T &res) { +// Initialize the lookup table +#include "activation_tables/selu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg >= 0) { + res[ii] = static_cast(1.0507009873554804934193349852946) * datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(const data_T &data, const typename CONFIG_T::param_t &alpha, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + typename res_T::value_type cache; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = 2 * data[ii]; + typename res_T::value_type cache; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h new file mode 100644 index 000000000..13de5ab3b --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,712 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" + +namespace nnet { + +// ************************************************* +// Linear Activation +// ************************************************* +template void linear_stream() { +LinearActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + out_data[j] = in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ReLU Activation +// 
************************************************* +template void relu_stream() { +ReLUActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + ReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template void leaky_relu_stream(typename CONFIG_T::param_t alpha) { + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +LeakyReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + LeakyReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu_stream(typename CONFIG_T::param_t theta) { +ThresholdedReLUActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + ThresholdedReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template void elu_stream(typename CONFIG_T::param_t alpha) { +#include "activation_tables/elu_table.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +EluActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + EluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// SeLU Activation +// ************************************************* +template void selu_stream() { +#include "activation_tables/selu_table.tb" + +SeluActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + 
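+        // The pipes these *_stream() kernels read from and write to are assumed
+        // to be declared in the generated project roughly as sketched below
+        // (names and widths here are illustrative only):
+        //
+        //   class Layer1PipeID;
+        //   using layer1_arr_t = nnet::array<ac_fixed<16, 6>, 4>;
+        //   using layer1_pipe = sycl::ext::intel::experimental::pipe<Layer1PipeID, layer1_arr_t>;
+        //
+        // ExtractPipeType<layer1_pipe>::value_type then recovers layer1_arr_t,
+        // and std::tuple_size<layer1_arr_t>{} gives the pack size (4 here).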
typename ExtractPipeType::value_type out_data; + + SeluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = + typename ExtractPipeType::value_type::value_type(1.0507009873554804934193349852946) * datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template void prelu_stream(typename CONFIG_T::param_t alpha) { + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +PReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + PReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * std::tuple_size::value_type>{} + j] * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template void softplus_stream() { +#include "activation_tables/softplus_table.tb" + +SoftplusActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + SoftplusPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); + [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template void softsign_stream() { +#include "activation_tables/softsign_table.tb" + + static const int MAX_VALUE = 8; + +SoftsignActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + SoftsignPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absValue; + ; + if (in_data[j] < 0) { + absValue = -in_data[j]; + } else { + absValue = in_data[j]; + } + ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data[j] < 0) { + out_data[j] = + static_cast::value_type::value_type>(-softsign_table[index]); + } else { + out_data[j] = static_cast::value_type::value_type>(softsign_table[index]); + } + } + + res_pipe::write(out_data); + } +} + +// 
************************************************* +// Softmax Activation +// ************************************************* + +template void softmax_stable_stream() { +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_array[std::tuple_size::value_type>{}]; + +SoftmaxArrayLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_pack = data_pipe::read(); + + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max::value_type::value_type> op_max; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = + reduce::value_type::value_type, + std::tuple_size::value_type>{}, + Op_max::value_type::value_type>>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> + d_xi_xmax[std::tuple_size::value_type>{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = + exp_table[softmax_stable_idx_from_real_val::value_type::value_type, + CONFIG_T>(d_xi_xmax[j])]; + } + + // Explicitly sum the results with an adder tree. 
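+        // As a sketch of what reduce() computes here (reduce itself is defined
+        // in nnet_common.h): for a pack size of 8 it evaluates
+        //
+        //   op(op(op(x0, x1), op(x2, x3)), op(op(x4, x5), op(x6, x7)))
+        //
+        // i.e. a balanced binary tree of depth 3 rather than a serial chain,
+        // keeping the adder latency logarithmic in the pack size.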
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce::value_type>{}, + Op_add>(exp_res, op_add); + + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + typename ExtractPipeType::value_type out_pack; + + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + + // TODO - Find Quartus-equivalent pragma + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_latency_stream() { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + + // Calculate all the e^x's + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; + +SoftmaxExpLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_pack = data_pipe::read(); + + SoftmaxExpPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val< + typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + + typename ExtractPipeType::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_legacy_stream() { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + + // Index into the lookup table based on data for exponentials + [[intel::fpga_register]] + typename CONFIG_T::table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_cache[std::tuple_size::value_type>{}]; + +SoftmaxInitLoop: + [[intel::initiation_interval(1)]] for (unsigned s = 0; + s < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + s++) { + auto in_pack = data_pipe::read(); + + SoftmaxInitPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma unroll + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { + SoftmaxExpInner: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = 
((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table_legacy[index]; + } + exp_res[i] += exp_diff_res; + } + } + + typename ExtractPipeType::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_pack[j] = + static_cast::value_type::value_type>(invert_table_legacy[exp_res_index]); + } + + res_pipe::write(out_pack); + } +} + +template void softmax_argmax_stream() { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + #pragma unroll + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { + out_data[i] = static_cast::value_type::value_type>(0); + } + + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type maximum = in_data[0]; + [[intel::fpga_register]] int idx = 0; + + [[intel::initiation_interval(1)]] for (int i = 1; + i < std::tuple_size::value_type>{}; i++) { + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = static_cast::value_type::value_type>(1); + res_pipe::write(out_data); + } +} + +template void softmax_stream() { + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency_stream(); + break; + case softmax_implementation::stable: + softmax_stable_stream(); + break; + case softmax_implementation::legacy: + softmax_legacy_stream(); + break; + case softmax_implementation::argmax: + softmax_argmax_stream(); + break; + default: + softmax_stable_stream(); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void dense_tanh_stream() { +#include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +TanHActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + TanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = tanh_table[index]; + else + out_data[j] = -tanh_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template void sigmoid_stream() { +#include "activation_tables/sigmoid_table.tb" + static 
const int MAX_VALUE = 8; + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +SigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + SigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = sigmoid_table[index]; + else + out_data[j] = 1 - sigmoid_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +// Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations +template void hard_sigmoid_stream() { + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res_pipe::write(out_data); + } +} + +template void hard_tanh_stream() { + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh_stream() { +BinaryTanHActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + BinaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = static_cast::value_type::value_type>(1); + 
else
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+// *************************************************
+// Ternary TanH Activation
+// *************************************************
+template <class data_pipe, class res_pipe, typename CONFIG_T> void ternary_tanh_stream() {
+TernaryTanHActLoop:
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
+
+        [[intel::fpga_register]] auto in_data = data_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;
+
+    TernaryTanHPackLoop:
+        #pragma unroll
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+            if (in_data[j] > 1)
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
+            else if (in_data[j] <= -1)
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
+            else
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(0);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h
new file mode 100644
index 000000000..f8e5bcb79
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h
@@ -0,0 +1,104 @@
+#ifndef NNET_BATCHNORM_H_
+#define NNET_BATCHNORM_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct batchnorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+    // partitioning arrays cyclically to go with roll factors?
+
+    // Default multiplication
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize(const data_T &data, res_T &res, const typename CONFIG_T::scale_t &scale,
+               const typename CONFIG_T::bias_t &bias) {
+// Calculate result
+Result:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
+        if (CONFIG_T::n_filt == -1) {
+            res[ires] =
+                CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t::value_type>::product(
+                    data[ires], scale[ires]) +
+                bias[ires];
+        } else {
+            int norm_index = ires % CONFIG_T::n_filt;
+            res[ires] =
+                CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t::value_type>::product(
+                    data[ires], scale[norm_index]) +
+                bias[norm_index];
+        }
+    }
+}
+
+// ****************************************************
+// Merged Batch Normalization and Quantized Tanh
+// ****************************************************
+struct batchnorm_quantized_tanh_config {
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize_binary_tanh(const data_T &data, res_T &res, const typename CONFIG_T::threshold_t &threshold) {
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        ac_int<1, false> cache;
+        auto datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ?
ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(const data_T &data, res_T &res, const typename CONFIG_T::threshold_hi_t &threshold_hi, + const typename CONFIG_T::threshold_lo_t &threshold_lo) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<2, true> cache; + auto datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 000000000..128b3ac1a --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,107 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** +template +void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_t bias) { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; + constexpr auto datasize = std::tuple_size::value_type>{}; + CONFIG_T::template product::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::limit(multiplier_limit); + +BatchNormLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + BatchNormpack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + out_data[j] = + CONFIG_T::template product::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::product(in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res_pipe::write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh_stream(typename CONFIG_T::threshold_t threshold) { + constexpr auto datasize = std::tuple_size::value_type>{}; + +BinaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res_pipe::write(out_data); + } +} + +template +void normalize_ternary_tanh_stream(typename CONFIG_T::threshold_hi_t threshold_hi, + typename CONFIG_T::threshold_lo_t threshold_lo) { + constexpr auto datasize = std::tuple_size::value_type>{}; + +TernaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + if (in_data[j] > threshold_hi[norm_index]) + out_data[j] = 1; + else if (in_data[j] <= threshold_lo[norm_index]) + out_data[j] = -1; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h new file mode 100644 index 000000000..f37a61cb0 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -0,0 +1,76 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "nnet_helpers.h" +#include +#include +#include + +typedef ac_fixed<16, 6> table_default_t; + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; + +// Default data types (??) TODO: Deprecate +typedef ac_fixed<16, 4> weight_t_def; +typedef ac_fixed<16, 4> bias_t_def; +typedef ac_fixed<32, 10> accum_t_def; + +template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { + #pragma unroll + for (int ii = 0; ii < NIN1; ii++) { + res[ii] = data1[ii]; + } + #pragma unroll + for (int ii = 0; ii < NIN2; ii++) { + res[NIN1 + ii] = data2[ii]; + } +} + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if constexpr (N == 1) { + return x[0]; + } else if constexpr (N == 2) { + return op(x[0], x[1]); + } else { + return op(reduce(x, op), reduce(x + leftN, op)); + } +} + +// alternate reduce - basic +// template T reduce(const T *x, Op op) { +// if (N == 1) { +// return x[0]; +// } +// auto val = op(x[0], x[1]); +// for (int i = 2; i < N; i++) { +// val = op(val, x[i]); +// } +// return val; +// } + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? 
a : b; }
+};
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h
new file mode 100644
index 000000000..38560f120
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h
@@ -0,0 +1,61 @@
+#ifndef NNET_CONV1D_H_
+#define NNET_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d_resource.h"
+
+namespace nnet {
+
+struct conv1d_config {
+    // I/O sizes
+    static const unsigned in_width = 10;
+    static const unsigned out_width = 10;
+
+    // Number of channels, filters
+    static const unsigned n_chan = 1;
+    static const unsigned n_filt = 1;
+
+    // Original filter size
+    static const unsigned filt_width = 1;
+    static const unsigned kernel_size = filt_width;
+
+    // Modified filter size (post-Winograd transformation, if applied)
+    static const unsigned impl_filt_height = 1;
+    static const unsigned impl_filt_width = 1;
+
+    // Padding, stride, dilation
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const unsigned stride_width = 1;
+    static const unsigned dilation = 1;
+
+    // Run-time Configuration
+    static const unsigned n_zeros = 0;
+    static const unsigned reuse_factor = 1;
+    static const unsigned parallelization_factor = 1;
+
+    // TODO: BRAM Storage on Quartus
+    static const bool store_weights_in_bram = false;
+
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float weight_t;
+    typedef float accum_t;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                const typename CONFIG_T::bias_t &biases) {
+    conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                          const typename CONFIG_T::bias_t &biases) {
+    assert(CONFIG_T::filt_width == 1);
+    pointwise_conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h
new file mode 100644
index 000000000..85009d4a3
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h
@@ -0,0 +1,237 @@
+#ifndef NNET_CONV1D_RESOURCE_H_
+#define NNET_CONV1D_RESOURCE_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+
+namespace nnet {
+
+enum class conv1d_implementation { combination, im2col, winograd };
+
+// ****************************************************************
+// im2col - General-purpose 1D Convolution algorithm
+// ****************************************************************
+
+template <class data_T, class data_col_T, typename CONFIG_T>
+void im2col_1d_cl(const data_T &data, data_col_T &data_col, const int col) {
+    // im2col can be unrolled fully, since number of parallel executions = filt_w x n_chan ~ O(100) and very little DSP
+    // usage
+
+    [[intel::fpga_register]] int index = 0;
+
+KernelLoop:
+    #pragma unroll
+    for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) {
+    ChannelLoop:
+        #pragma unroll
+        for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+            [[intel::fpga_register]] int index_data =
+                (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel;
+            if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) {
+                data_col[index++] = data[index_data];
+            } else {
+                data_col[index++] = 0;
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_im2col_cl(const data_T
&data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + using data_col_T = array; + using res_col_T = array; + +ColLoop: + #pragma unroll pf + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int i = 0; i < CONFIG_T::out_width; i++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_cl(data, data_col, i); + + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int j = 0; j < CONFIG_T::n_filt; j++) { + res[i * CONFIG_T::n_filt + j] = res_col[j]; + } + } +} + +// **************************************************************** +// 1D Convolution for 3x1 kernels from Winograd's algoirithm +// **************************************************************** + +// Explicity transofrmed input (B'dB) needed for Winograd convolution, as explained by Lavin & Gray (2015) +template +inline void winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[4]) { + D[0] = I[0] - I[2]; + D[1] = I[1] + I[2]; + D[2] = -I[1] + I[2]; + D[3] = I[1] - I[3]; +} + +template +void winograd_conv1d_3x1_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_width == 1); + assert(CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +WidthLoop: + #pragma unroll pf + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x1 tile + [[intel::fpga_register]] typename data_T::value_type T[16]; + [[intel::fpga_register]] uint8_t p = 0; + + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + + // Transform input tile + [[intel::fpga_register]] typename CONFIG_T::accum_t D[4]; + winograd_transform_input_tile_3x1_kernel(T, D); + + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] int filter_offset = 4 * 
(CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[4]; + #pragma unroll + for (int i = 0; i < 4; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); + if ((col + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * (col + 1) + filter] += + static_cast(Y[1] - Y[2] - Y[3]); + } + } + } +} + +// **************************************************************** +// 1D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_1d_pointwise_cl(const data_T &data, data_col_T &data_col, const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + [[intel::fpga_register]] int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + [[intel::fpga_register]] int index_data = + (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_width == 1); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + using data_col_T = array; + using res_col_T = array; + +ColLoop: + #pragma unroll pf + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_pointwise_cl(data, data_col, col); + + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[col * CONFIG_T::n_filt + k] = res_col[k]; + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + (CONFIG_T::implementation == nnet::conv1d_implementation::combination || + CONFIG_T::implementation == nnet::conv1d_implementation::winograd); + + if 
(CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv1d_3x1_kernel_cl(data, res, weights, biases); + } else { + conv_1d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 000000000..1ffd11774 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,177 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line the buffer during the shift line buffer operation + * kernel_window - array of values from the input curently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_1d(typename data_T::value_type shift_buffer[CONFIG_T::n_chan], data_window_T &kernel_window) { +/* + * Manually shift kernel_window by one step to the left + * Not possible to use nnet::shift_reg as the kernel window is convolved with the kernel weights using dense matrix + * multiplication Dense matrix multiplication is only implemented for arrays However, provided certain timing constrains are + * met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register To verify, see + * synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_width - 1; col++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[col * CONFIG_T::n_chan + channel] = kernel_window[(col + 1) * CONFIG_T::n_chan + channel]; + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + channel] = shift_buffer[channel]; + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels line_buffer - chained array of shift registers, one for each row of the kernel and channel shift_buffer - + * array elements popped from the line the buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[channel] = in_elem[channel]; + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, 
passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the kernel and channel kernel_window - array of values from the input curently convolved + * with the kernel weights - Conv1D layer weights biases - Conv1D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Matrix mulitplication - performs dense matrix multiplication between the current input window and + * kernel weights (4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases, + int &pX, int &sX) { + + using res_T = typename ExtractPipeType::value_type; + + // Thresholds + constexpr int lShiftX = CONFIG_T::filt_width - 1; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_chan]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] res_T res_out; + dense_resource(kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_pipe::write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? 
(sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_1d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { + + using data_arr_T = typename ExtractPipeType::value_type; + using data_element_T = typename data_arr_T::value_type; + using data_window_T = array; + + // Line buffer and kernel window + [[intel::fpga_register]] nnet::shift_reg + line_buffer[CONFIG_T::n_chan]; + [[intel::fpga_register]] data_window_T kernel_window; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + constexpr auto padds = zero_array(); + + // move former static variables outside the function calls + // X position pixel + int pX = 0; + // X strides + int sX = 0; + +// Input image left-side padding +PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, + biases, pX, sX); + } + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_1d(data_pipe::read(), line_buffer, + kernel_window, weights, biases, pX, sX); + } + +// Input image right-side padding +PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, + biases, pX, sX); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h new file mode 100644 index 000000000..79b1508c5 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -0,0 +1,67 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_conv2d_resource.h" + +namespace nnet { + +struct conv2d_config { + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + + // Modified filter size (post-Wionograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + // Run-time configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelization_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + conv_2d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_height == 1 && 
+
+template
+void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                          const typename CONFIG_T::bias_t &biases) {
+    assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1);
+    pointwise_conv_2d_resource_cl(data, res, weights, biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h
new file mode 100644
index 000000000..7265d90e1
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h
@@ -0,0 +1,297 @@
+#ifndef NNET_CONV2D_RESOURCE_H_
+#define NNET_CONV2D_RESOURCE_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_helpers.h"
+
+namespace nnet {
+
+enum class conv2d_implementation { combination, im2col, winograd };
+
+// ****************************************************************
+// im2col - General-purpose 2D Convolution algorithm
+// ****************************************************************
+
+template
+void im2col_2d_cl(const data_T &data, data_col_T &data_col, const int row, const int col) {
+    // im2col can be unrolled fully, since the number of parallel executions = filt_h x filt_w x n_chan ~ O(100),
+    // with very little DSP usage
+
+    [[intel::fpga_register]] int index = 0;
+
+FiltHeightLoop:
+    #pragma unroll
+    for (int kernel_row = 0; kernel_row < CONFIG_T::impl_filt_height; kernel_row++) {
+        [[intel::fpga_register]] int input_row =
+            -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height;
+
+    FiltWidthLoop:
+        #pragma unroll
+        for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) {
+            [[intel::fpga_register]] int input_col =
+                -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width;
+
+        ChannelLoop:
+            #pragma unroll
+            for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+                if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) {
+                    data_col[index++] =
+                        data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel];
+                } else {
+                    data_col[index++] = 0;
+                }
+            }
+        }
+    }
+}
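+/*
+ * Worked example of the window size (values assumed): for a 3x3 kernel over 4 input channels,
+ * data_col holds impl_filt_height * impl_filt_width * n_chan = 3 * 3 * 4 = 36 elements; the
+ * convolution at each output position then reduces to a single dense_resource call with
+ * n_in = 36 and n_out = n_filt.
+ */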
+
+template
+void conv_2d_im2col_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                       const typename CONFIG_T::bias_t &biases) {
+    // im2col performs no filter transformations; therefore, filter size remains constant
+    assert(CONFIG_T::filt_height == CONFIG_T::impl_filt_height && CONFIG_T::filt_width == CONFIG_T::impl_filt_width);
+
+    // Unroll factors for loop traversing input image, derived from parallelization_factor
+    // Outer loop only gets unrolled after inner loop is fully unrolled
+    static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width);
+    static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), CONFIG_T::out_height);
+
+    using data_col_T = array;
+    using res_col_T = array;
+
+HeightLoop:
+    #pragma unroll pfr
+    for (int i = 0; i < CONFIG_T::out_height; i++) {
+    WidthLoop:
+        #pragma unroll pfc
+        [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < CONFIG_T::out_width; j++) {
+            // Loop variables should always be declared in the deepest scope available
+            // See Intel's HLS - Loop Best Practices
+            // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html
+
+            [[intel::fpga_register]] data_col_T data_col;
+            im2col_2d_cl(data, data_col, i, j);
+
+            [[intel::fpga_register]] res_col_T res_col;
+            dense_resource(data_col, res_col, weights, biases);
+
+            // Unroll fully, since
+            // (1) n_filt is usually low in io_parallel (< 32)
+            // (2) no complex operations are handled in this loop; it performs a simple register write
+        FiltLoop:
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_filt; k++) {
+                res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k];
+            }
+        }
+    }
+}
+
+// ****************************************************************
+// 2D Convolution for 3x3 kernels using Winograd's algorithm
+// ****************************************************************
+
+// Explicitly transformed input (B'dB) needed for Winograd calculation, as explained by Lavin & Gray, 2015
+template
+inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D[16]) {
+    D[0] = I[0] - I[2] - I[8] + I[10];
+    D[1] = I[1] + I[2] - I[9] - I[10];
+    D[2] = -I[1] + I[2] + I[9] - I[10];
+    D[3] = I[1] - I[3] - I[9] + I[11];
+
+    D[4] = I[4] - I[6] + I[8] - I[10];
+    D[5] = I[5] + I[6] + I[9] + I[10];
+    D[6] = -I[5] + I[6] - I[9] + I[10];
+    D[7] = I[5] - I[7] + I[9] - I[11];
+
+    D[8] = -I[4] + I[6] + I[8] - I[10];
+    D[9] = -I[5] - I[6] + I[9] + I[10];
+    D[10] = I[5] - I[6] - I[9] + I[10];
+    D[11] = -I[5] + I[7] + I[9] - I[11];
+
+    D[12] = I[4] - I[6] - I[12] + I[14];
+    D[13] = I[5] + I[6] - I[13] - I[14];
+    D[14] = I[6] - I[5] + I[13] - I[14];
+    D[15] = I[5] - I[7] - I[13] + I[15];
+}
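+/*
+ * For reference (Lavin & Gray, 2015, F(2x2, 3x3)): the sums above are the expansion of D = B'dB,
+ * where d is the 4x4 input tile (I[0..15], row-major) and
+ *
+ *        [ 1   0  -1   0 ]
+ *   B' = [ 0   1   1   0 ]
+ *        [ 0  -1   1   0 ]
+ *        [ 0   1   0  -1 ]
+ *
+ * e.g. D[0] = (I[0] - I[8]) - (I[2] - I[10]), matching the first line of the function.
+ */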
+
+template
+void winograd_conv2d_3x3_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                                   const typename CONFIG_T::bias_t &biases) {
+    // Ensure Winograd conditions are met
+    assert(CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3);
+    assert(CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1);
+    assert(CONFIG_T::pad_left == CONFIG_T::pad_right && CONFIG_T::pad_top == CONFIG_T::pad_bottom);
+    assert(CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2);
+
+    // Unroll factor for loop traversing input image, derived from parallelization_factor
+    // Outer loop only gets unrolled after inner loop is fully unrolled
+    static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, DIV_ROUNDUP(CONFIG_T::out_width, 2));
+    static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), DIV_ROUNDUP(CONFIG_T::out_height, 2));
+
+    // Initialise result to bias
+    // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::out_height * CONFIG_T::out_width; i++) {
+        int offset = CONFIG_T::n_filt * i;
+        #pragma unroll
+        for (int f = 0; f < CONFIG_T::n_filt; f++) {
+            res[offset + f] = static_cast(biases[f]);
+        }
+    }
+
+HeightLoop:
+    #pragma unroll pfr
+    for (int row = 0; row < CONFIG_T::out_height; row += 2) {
+    WidthLoop:
+        #pragma unroll pfc
+        for (int col = 0; col < CONFIG_T::out_width; col += 2) {
+        ChannelLoop:
+            #pragma unroll
+            for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+                // Get current 4x4 tile
+                [[intel::fpga_register]] typename data_T::value_type T[16];
+                [[intel::fpga_register]] typename CONFIG_T::accum_t D[16];
+                [[intel::fpga_register]] uint8_t p = 0;
+
+                #pragma unroll
+                for (int r = row - (int)CONFIG_T::pad_top; r < row + 4 - (int)CONFIG_T::pad_top; r++) {
+                    #pragma unroll
+                    for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) {
+                        if (r < CONFIG_T::in_height && r >= 0 && c < CONFIG_T::in_width && c >= 0) {
+                            T[p++] = data[r * CONFIG_T::in_width * CONFIG_T::n_chan + c * CONFIG_T::n_chan + channel];
+                        } else {
+                            T[p++] = 0;
+                        }
+                    }
+                }
+
+                // Transform input tile
+                winograd_transform_input_tile_3x3_kernel(T, D);
+
+                #pragma unroll
+                for (int filter = 0; filter < CONFIG_T::n_filt; filter++) {
+                    [[intel::fpga_register]] int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel);
+
+                    // Hadamard product between transformed input tile and kernel
+                    [[intel::fpga_register]] typename CONFIG_T::accum_t Y[16];
+                    #pragma unroll
+                    for (int i = 0; i < 16; i++) {
+                        Y[i] = static_cast(D[i] * weights[filter_offset + i]);
+                    }
+
+                    // Explicitly transform intermediate result Z = A'YA and save to output
+                    res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] +=
+                        static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10]);
+                    if ((col + 1) < CONFIG_T::out_width)
+                        res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] +=
+                            static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11]);
+                    if ((row + 1) < CONFIG_T::out_height)
+                        res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] +=
+                            static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14]);
+                    if ((row + 1) < CONFIG_T::out_height && (col + 1) < CONFIG_T::out_width)
+                        res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] +=
+                            static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - Y[13] + Y[14]);
+                }
+            }
+        }
+    }
+}
+
+// ****************************************************************
+// 2D Convolution for 1x1 kernels using optimized im2col
+// ****************************************************************
+
+template
+void im2col_2d_pointwise_cl(const data_T &data, data_col_T &data_col, const int row, const int col) {
+    // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations
+
+    [[intel::fpga_register]] int index = 0;
+
+ChannelLoop:
+    #pragma unroll
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+
+        [[intel::fpga_register]] int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height;
+        [[intel::fpga_register]] int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width;
+
+        if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) {
+            data_col[index++] =
+                data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel];
+        } else {
+            data_col[index++] = 0;
+        }
+    }
+}
+
+template
+void pointwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                                   const typename CONFIG_T::bias_t &biases) {
+    assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1);
+
+    // Unroll factors for loop traversing input image, derived from parallelization_factor
+    // Outer loop only gets unrolled after inner loop is fully unrolled
+    static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width);
+    static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), CONFIG_T::out_height);
+
+    using data_col_T = array;
+    using res_col_T = array;
+
+HeightLoop:
+    #pragma unroll pfr
+    for (int row = 0; row < CONFIG_T::out_height; row++) {
+    WidthLoop:
+        #pragma unroll pfc
+        [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int col = 0; col < CONFIG_T::out_width; col++) {
+            // Loop variables should always be declared in the deepest scope available
+            // See Intel's HLS - Loop Best Practices
+            // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html
+
+            [[intel::fpga_register]] data_col_T data_col;
+            im2col_2d_pointwise_cl(data, data_col, row, col);
+
+            [[intel::fpga_register]] res_col_T res_col;
+            dense_resource(data_col, res_col, weights, biases);
+
+        FiltLoop:
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_filt; k++) {
+                res[row * CONFIG_T::out_width * CONFIG_T::n_filt + col * CONFIG_T::n_filt + k] = res_col[k];
+            }
+        }
+    }
+}
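+/*
+ * Worked example of the unroll factors (values assumed): with parallelization_factor = 8 over a
+ * 10 x 10 output, pfc = MIN(8, 10) = 8 and pfr = MIN(8 / 8, 10) = 1, i.e. the width loop is
+ * unrolled 8-way while the height loop stays rolled until the width loop is fully unrolled.
+ */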
+
+// ****************************************************************
+// Top-level function - handles different implementations
+// ****************************************************************
+template
+void conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                         const typename CONFIG_T::bias_t &biases) {
+    static constexpr bool winograd_conditions =
+        // Winograd's minimal filtering algorithm not applicable to stride != 1
+        CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1 &&
+
+        // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once
+        CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2 &&
+
+        // Verify user opted for Winograd
+        (CONFIG_T::implementation == nnet::conv2d_implementation::combination ||
+         CONFIG_T::implementation == nnet::conv2d_implementation::winograd);
+
+    if (CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3 && winograd_conditions) {
+        winograd_conv2d_3x3_kernel_cl(data, res, weights, biases);
+    } else {
+        conv_2d_im2col_cl(data, res, weights, biases);
+    }
+}
+
+} // namespace nnet
+
+#endif
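+/*
+ * Dispatch sketch (hypothetical config; field values are placeholders): a layer config selects the
+ * implementation through the enum defined above, e.g.
+ *
+ *   struct conv2_config : nnet::conv2d_config {
+ *       static constexpr auto implementation = nnet::conv2d_implementation::winograd;
+ *       static const unsigned filt_height = 3;
+ *       static const unsigned filt_width = 3;
+ *   };
+ *
+ * With a 3x3 filter, unit strides and an output larger than 2x2, conv_2d_resource_cl then picks the
+ * Winograd kernel; in all other cases it falls back to im2col.
+ */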
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h
new file mode 100644
index 000000000..08f0eaa87
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h
@@ -0,0 +1,241 @@
+#ifndef NNET_CONV2D_STREAM_H_
+#define NNET_CONV2D_STREAM_H_
+
+#include "nnet_dense.h"
+#include "nnet_types.h"
+
+namespace nnet {
+
+/*
+ * void kernel_shift(shift_buffer, kernel_window)
+ *
+ * Args:
+ *   shift_buffer - array elements popped from the line buffer during the shift line buffer operation
+ *   kernel_window - array of values from the input currently being convolved with the kernel
+ *
+ * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved
+ */
+template
+void kernel_shift_2d(typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan],
+                     data_window_T &kernel_window) {
+/*
+ * Manually shift kernel_window by one step to the left
+ * Not possible to use nnet::shift_reg, as the kernel window is convolved with the kernel weights using dense matrix
+ * multiplication, which is only implemented for arrays. However, provided certain timing constraints are met,
+ * Intel HLS automatically infers a shift operation and implements kernel_window as a shift register.
+ * To verify, see the synthesis report in report.html > Area Analysis of System
+ */
+KernelShiftWidth:
+    #pragma unroll
+    for (int col = 0; col < CONFIG_T::filt_width - 1; col++) {
+    KernelShiftHeight:
+        #pragma unroll
+        for (int row = 0; row < CONFIG_T::filt_height; row++) {
+        KernelShiftChannel:
+            #pragma unroll
+            for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+                kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + col * CONFIG_T::n_chan + channel] =
+                    kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + (col + 1) * CONFIG_T::n_chan + channel];
+            }
+        }
+    }
+
+// Insert shift_buffer values into the last column of the kernel window
+KernelPushHeight:
+    #pragma unroll
+    for (int col = 0; col < CONFIG_T::filt_height; col++) {
+    KernelPushChannel:
+        #pragma unroll
+        for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+            kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + col * CONFIG_T::filt_width * CONFIG_T::n_chan +
+                          channel] = shift_buffer[col][channel];
+        }
+    }
+}
+
+/*
+ * void shift_line_buffer(in_element, line_buffer, shift_buffer)
+ *
+ * Args:
+ *   in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to the number of channels
+ *   line_buffer - chained array of shift registers, one for each row of the kernel and channel
+ *   shift_buffer - array elements popped from the line buffer during the shift operation
+ *
+ * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one
+ * Popped elements are later used to update the kernel window, during the kernel_shift operation
+ */
+template
+void shift_line_buffer_2d(
+    const data_T &in_elem,
+    nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan],
+    typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]) {
+// For every channel, insert the incoming pixel at end of the shift buffer
+UpdateBuffer:
+    #pragma unroll
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+        shift_buffer[CONFIG_T::filt_height - 1][channel] = in_elem[channel];
+    }
+
+// Shift line buffer and save popped values to shift buffer
+LineBufferDataIn:
+    #pragma unroll
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+    LineBufferShift:
+        #pragma unroll
+        for (unsigned col = 1; col < CONFIG_T::filt_height; col++) {
+            // Shift the line buffer, return the popped pixel
+            typename data_T::value_type pop =
+                line_buffer[col - 1][channel].shift(shift_buffer[CONFIG_T::filt_height - col][channel]);
+
+            // Place popped pixel into the shift buffer, one row above
+            shift_buffer[CONFIG_T::filt_height - col - 1][channel] = pop;
+        }
+    }
+}
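+/*
+ * Data-movement sketch (illustrative, filt_height = 3): the incoming pixel enters shift_buffer row 2
+ * and is pushed into line_buffer[0]; the pixel popped from line_buffer[0] becomes row 1 and is pushed
+ * into line_buffer[1], whose popped pixel becomes row 0. For each channel, shift_buffer therefore
+ * holds one vertically aligned column of the 3-row kernel window.
+ */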
+
+/*
+ * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases)
+ *
+ * Args:
+ *   in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to the number of channels
+ *   res_stream - output stream, passed by reference to allow direct writing
+ *   line_buffer - chained array of shift registers, one for each row of the kernel and channel
+ *   kernel_window - array of values from the input currently convolved with the kernel
+ *   weights - Conv1D/Conv2D layer weights
+ *   biases - Conv1D/Conv2D layer biases
+ *
+ * Function executes 4 steps:
+ *   (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements
+ *   (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer
+ *   (3) Matrix multiplication - performs dense matrix multiplication between the current input window and kernel weights
+ *   (4) Counter housekeeping - keeps track of current pixel and stride
+ */
+template
+void compute_output_buffer_2d(
+    const data_T &in_elem,
+    nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan],
+    data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases,
+    int &pX, int &pY, int &sX, int &sY) {
+
+    using res_T = typename ExtractPipeType::value_type;
+
+    // Thresholds
+    constexpr int lShiftX = CONFIG_T::filt_width - 1;
+    constexpr int lShiftY = CONFIG_T::filt_height - 1;
+
+    // Step 1 - Shift line buffer
+    [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan];
+    nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer);
+
+    // Step 2 - Kernel shift
+    nnet::kernel_shift_2d(shift_buffer, kernel_window);
+
+    // Check to see if we have a full kernel
+    if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) {
+        // Step 3 - Dense matrix multiplication
+        [[intel::fpga_register]] res_T res_out;
+        dense_resource(kernel_window, res_out, weights, biases);
+
+        // Write result to output stream
+        [[intel::fpga_register]] res_T res_pack;
+    CastLoop:
+        #pragma unroll
+        for (int channel = 0; channel < CONFIG_T::n_filt; channel++) {
+            res_pack[channel] = res_out[channel];
+        }
+        res_pipe::write(res_pack);
+    }
+
+    // Reached end of image
+    if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) &&
+        (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) {
+        pX = 0;
+        sX = 0;
+        pY = 0;
+        sY = 0;
+        // Reached end of row
+    } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
+        pX = 0;
+        sX = 0;
+        pY++;
+        sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1);
+        // Same row, same column; therefore, move to the right
+    } else {
+        pX++;
+        sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
+    }
+}
+
+template
+void conv_2d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
+
+    using data_arr_T = typename ExtractPipeType::value_type;
+    using data_element_T = typename data_arr_T::value_type;
+    using data_window_T = array;
+
+    // Line buffer and kernel window
+    [[intel::fpga_register]] nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan];
+    [[intel::fpga_register]] data_window_T kernel_window;
+
+    // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel)
+    constexpr auto padds = zero_array();
+
+    // move former static variables outside the function calls
+    // X position pixel
+    int pX = 0;
+    // Y position pixel
+    int pY = 0;
+    // X strides
+    int sX = 0;
+    // Y strides
+    int sY = 0;
+
+// Padding above input image
+PaddingTopHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_top; row++) {
+    PaddingTopWidth:
+        for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) {
+            compute_output_buffer_2d(padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+    }
+
+ReadInputHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::in_height; row++) {
+        // Input image left-side padding
+    PaddingLeftWidth:
+        for (int col = 0; col < CONFIG_T::pad_left; col++) {
+            compute_output_buffer_2d(padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+
+        // Read input image
+    ReadInputWidth:
+        for (int col = 0; col < CONFIG_T::in_width; col++) {
+            compute_output_buffer_2d(data_pipe::read(), line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+
+        // Input image right-side padding
+    PaddingRightWidth:
+        for (int col = 0; col < CONFIG_T::pad_right; col++) {
+            compute_output_buffer_2d(padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+    }
+
+// Padding below input image
+PaddingBottomHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_bottom; row++) {
+    PaddingBottomWidth:
+        for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width +
CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, line_buffer, kernel_window, + weights, biases, pX, pY, sX, sY); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h new file mode 100644 index 000000000..dc7618908 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -0,0 +1,164 @@ +#ifndef NNET_DENSE_LARGE_H_ +#define NNET_DENSE_LARGE_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + static const unsigned reuse_factor = 1; + static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) + static const unsigned multiplier_factor = 1; // min n_in, rf + static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; + static const unsigned reciprocal = 1; // 2^35 / 25 + static const unsigned rf_pad = 0; + static const unsigned bf_pad = 0; + // Resource reuse info + static const unsigned io_type = io_parallel; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + //#pragma ii CONFIG_T::reuse_factor + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + [[intel::fpga_register]] int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + [[intel::fpga_register]] int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product( + data[data_index], weights[w_index]); + } + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + #pragma unroll + for (int 
imult = 0; imult < CONFIG_T::multiplier_limit; imult++) {
+            mult[imult] = 0;
+        }
+    AccumLoop1:
+        #pragma unroll
+        for (int im = 0; im < CONFIG_T::block_factor; im++) {
+            int o_index = out_index[ir][im];
+            if (o_index >= CONFIG_T::n_out)
+                continue; // check out of bounds
+            mult[o_index] += tmp_acc[im];
+        }
+    AccumLoop2:
+        #pragma unroll
+        for (int im = 0; im < CONFIG_T::multiplier_limit; im++) {
+            acc[im] += mult[im];
+        }
+    }
+Store:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        res[ires] = cast(acc[ires]);
+    }
+}
+template
+void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                 const typename CONFIG_T::bias_t &biases) {
+    assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) &&
+           "The current Reuse Factor is not allowed");
+    assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN");
+
+    [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+InitAccum:
+    #pragma unroll
+    for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) {
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+ReuseLoop:
+    [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) {
+        [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::block_factor];
+    MultLoop:
+        #pragma unroll
+        for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) {
+            uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im;
+            if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out)
+                continue;
+            // Modified this
+            mult[im] =
+                CONFIG_T::template product::product(data[in_index], weights[w_index]);
+            in_index += CONFIG_T::reuse_factor;
+            if (in_index >= CONFIG_T::n_in)
+                in_index = ir;
+        }
+    AccumLoop:
+        #pragma unroll
+        for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) {
+            acc[out_index] += mult[im];
+            if (acc_step + 1 >= CONFIG_T::multiplier_scale) {
+                acc_step = 0;
+                out_index++;
+            } else {
+                acc_step++;
+            }
+        }
+    }
+// Cast to "res_T" type
+Result:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        res[ires] = cast(acc[ires]);
+    }
+}
+template
+void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                    const typename CONFIG_T::bias_t &biases) {
+    if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
+        dense_rf_lt(data, res, weights, biases);
+    } else {
+        dense_rf_gt(data, res, weights, biases);
+    }
+}
+} // namespace nnet
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
new file mode 100644
index 000000000..92c9adc3b
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
@@ -0,0 +1,23 @@
+#ifndef NNET_DENSE_STREAM_H_
+#define NNET_DENSE_STREAM_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_types.h"
+
+namespace nnet {
+
+// Note: DataPack logic removed, at least in the initial version
+template
+void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
+
+    [[intel::fpga_register]] typename ExtractPipeType::value_type res;
+    [[intel::fpga_register]] auto data = data_pipe::read();
+    dense_resource::value_type, typename ExtractPipeType::value_type, CONFIG_T>(data, res, weights, biases);
+    res_pipe::write(res);
+}
+
+} // namespace nnet
+
+#endif
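+/*
+ * Worked example of the reuse-factor bookkeeping (numbers assumed): for n_in = 16, n_out = 4 and
+ * reuse_factor = 8, block_factor = DIV_ROUNDUP(16 * 4, 8) = 8 multiplications per cycle,
+ * multiplier_factor = MIN(16, 8) = 8, multiplier_limit = DIV_ROUNDUP(16 * 4, 8) = 8 and
+ * multiplier_scale = 8 / 4 = 2. Since reuse_factor <= n_in, dense_resource dispatches to
+ * dense_rf_lt, which iterates 8 times and performs 8 parallel products per iteration.
+ */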
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h
new file mode 100644
index 000000000..1188fe3ec
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h
@@ -0,0 +1,43 @@
+#ifndef NNET_EMBED_H_
+#define NNET_EMBED_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+
+namespace nnet {
+
+struct embed_config {
+    // Internal data type definitions
+    typedef float embeddings_t;
+
+    // Default layer sizes, overwritten from the backend
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 16;
+    static const unsigned vocab_size = 50;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+};
+
+template
+void embedding(const data_T &data, res_T &res, const typename CONFIG_T::embeddings_t &embeddings) {
+
+    /*
+     * Can store embeddings[] in a register, but a large multiplexer
+     * is created due to a non-constant access pattern
+     */
+
+InputSequence:
+    #pragma unroll
+    [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < CONFIG_T::n_in; j++) {
+    DenseEmbedding:
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_out; i++) {
+            res[j * CONFIG_T::n_out + i] = embeddings[data[j].to_uint() * CONFIG_T::n_out + i];
+        }
+    }
+}
+
+} // namespace nnet
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h
new file mode 100644
index 000000000..0f2acb098
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h
@@ -0,0 +1,31 @@
+#ifndef NNET_EMBED_STREAM_H_
+#define NNET_EMBED_STREAM_H_
+
+namespace nnet {
+
+template
+void embedding_stream(typename CONFIG_T::embeddings_t embeddings) {
+
+    using res_T = typename ExtractPipeType::value_type;
+    constexpr auto datasize = std::tuple_size::value_type>{};
+
+    auto in_data = data_pipe::read();
+
+InputSequence:
+    [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < datasize; j++) {
+
+        res_T res_pack;
+
+    DenseEmbedding:
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_out; i++) {
+            res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i];
+        }
+
+        res_pipe::write(res_pack);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h
new file mode 100644
index 000000000..c7af2e7a6
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h
@@ -0,0 +1,118 @@
+#ifndef NNET_HELPERS_H
+#define NNET_HELPERS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace nnet {
+
+template void convert_data(sycl::queue &q, srcType *src) {
+    constexpr auto dstTypeSize = std::tuple_size::value_type>{};
+    for (size_t i = 0; i < SIZE / dstTypeSize; i++) {
+        typename ExtractPipeType::value_type ctype;
+        for (size_t j = 0; j < dstTypeSize; j++) {
+            ctype[j] = src[i * dstTypeSize + j];
+        }
+        dest_pipe::write(q, ctype);
+    }
+}
+
+template void convert_data_back(sycl::queue &q, dstType *dst) {
+    constexpr auto srcTypeSize = std::tuple_size::value_type>{};
+    for (size_t i = 0; i < SIZE / srcTypeSize; i++) {
+        auto ctype = src_pipe::read(q);
+        for (size_t j = 0; j < srcTypeSize; j++) {
+            dst[i * srcTypeSize + j] = ctype[j].to_double();
+        }
+    }
+}
+
+extern bool trace_enabled;
+extern std::map *trace_outputs;
+extern size_t trace_type_size;
+
+// constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); }
+// replace with template metaprogramming
+template struct ceillog2 {
+    enum { val = 1 + ceillog2<((n + 1) / 2)>::val };
+};
+
+template <> struct ceillog2<2> {
+    enum { val = 1 };
+};
+
+template <> struct ceillog2<1> {
+    enum { val = 0 };
+};
+
+// constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); }
+// replace with template metaprogramming
+template struct floorlog2 {
+    enum { val = 1 + floorlog2<(n / 2)>::val };
+};
+
+template <> struct floorlog2<1> {
+    enum { val = 0 };
+};
+
+template <> struct floorlog2<0> {
+    enum { val = 0 };
+};
+
+// constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); }
+// replace with template metaprogramming
+template struct pow2 {
+    enum { val = 2 * pow2<(n - 1)>::val };
+};
+
+template <> struct pow2<0> {
+    enum { val = 1 };
+};
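+// Usage sketch (illustrative): these helpers evaluate at compile time, e.g.
+//   static_assert(nnet::ceillog2<10>::val == 4, "ceil(log2(10)) == 4");
+//   static_assert(nnet::floorlog2<10>::val == 3, "floor(log2(10)) == 3");
+//   static_assert(nnet::pow2<4>::val == 16, "2^4 == 16");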
+
+template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) {
+    for (int i = 0; i < layer_size; i++) {
+        ptr[i] = static_cast(data[i].to_double());
+    }
+}
+
+// We don't want to include save_T in this function because it will be inserted into myproject.cpp
+// so a workaround with element size is used
+template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) {
+    if (!trace_enabled)
+        return;
+
+    if (trace_outputs) {
+        if (trace_outputs->count(layer_name) > 0) {
+            if (trace_type_size == 4) {
+                save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size);
+            } else if (trace_type_size == 8) {
+                save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size);
+            } else {
+                std::cout << "Unknown trace type!" << std::endl;
+            }
+        } else {
+            std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl;
+        }
+    } else {
+        std::ostringstream filename;
+        filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data
+        std::fstream out;
+        out.open(filename.str(), std::ios::app);
+        assert(out.is_open());
+        for (int i = 0; i < layer_size; i++) {
+            out << data[i] << " "; // We don't care about precision in text files
+        }
+        out << std::endl;
+        out.close();
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h
new file mode 100644
index 000000000..550663b88
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h
@@ -0,0 +1,232 @@
+#ifndef NNET_MERGE_H_
+#define NNET_MERGE_H_
+
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct merge_config {
+    static const unsigned n_elem = 10;
+};
+
+struct dot_config {
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 1;
+
+    static const unsigned reuse_factor = 1;
+
+    typedef float accum_t;
+
+    template using product = nnet::product::mult;
+};
+
+struct concat_config {
+    static const unsigned n_elem1_0 = 10;
+    static const unsigned n_elem1_1 = 10;
+    static const unsigned n_elem1_2 = 10;
+    static const unsigned n_elem2_0 = 10;
+    static const unsigned n_elem2_1 = 10;
+    static const unsigned n_elem2_2 = 10;
+
+    static const unsigned axis = -1;
+};
+
+template
+void add(const input1_T &data1, const input2_T &data2, res_T &res) {
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast(data1[i] + data2[i]);
+    }
+}
+
+template
+void subtract(const input1_T &data1, const input2_T &data2, res_T &res) {
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast(data1[i] - data2[i]);
+ } +} + +template +void multiply(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] * data2[i]); + } +} + +template +void average(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] + data2[i]) / 2); + } +} + +template +void maximum(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] > data2[i]) ? data1[i] : data2[i]); + } +} + +template +void minimum(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] < data2[i]) ? data1[i] : data2[i]); + } +} + +template +void dot1d(const input1_T &data1, const input2_T &data2, res_T &res) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; +Product: + #pragma unroll multiplier_limit + for (int i = 0; i < CONFIG_T::n_in; i++) { + mult[i] = CONFIG_T::template product::product( + data1[i], data2[i]); + } + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc = 0; +Accum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + acc += mult[i]; + } + + res[0] = static_cast(acc); +} + +template +void concatenate1d(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_0(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_1(const input1_T &data1, const input2_T &data2, res_T &res) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = + static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); + } + + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = + static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); + } + } +} + +template +void concatenate2d(const input1_T &data1, const input2_T &data2, res_T &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = + static_cast(data2[i]); + } +} + +template +void concatenate3d_1(const input1_T &data1, const input2_T &data2, res_T &res) { + for 
(int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem1_2; k++) {
+                int res_idx =
+                    i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                res[res_idx] = static_cast(data1[data_idx]);
+            }
+        }
+
+        for (int j = 0; j < CONFIG_T::n_elem2_1; j++) {
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem2_2; k++) {
+                int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 +
+                              (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k;
+                int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k;
+                res[res_idx] = static_cast(data2[data_idx]);
+            }
+        }
+    }
+}
+
+template
+void concatenate3d_2(const input1_T &data1, const input2_T &data2, res_T &res) {
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem1_2; k++) {
+                int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) +
+                              j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k;
+                int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                res[res_idx] = static_cast(data1[data_idx]);
+            }
+
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem2_2; k++) {
+                int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) +
+                              j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2;
+                int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k;
+                res[res_idx] = static_cast(data2[data_idx]);
+            }
+        }
+    }
+}
+
+template
+void concatenate3d(const input1_T &data1, const input2_T &data2, res_T &res) {
+    if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) {
+        concatenate3d_2(data1, data2, res);
+    } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) {
+        concatenate3d_1(data1, data2, res);
+    } else {
+        concatenate3d_0(data1, data2, res);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h
new file mode 100644
index 000000000..60028ea52
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h
@@ -0,0 +1,359 @@
+#ifndef NNET_MERGE_STREAM_H_
+#define NNET_MERGE_STREAM_H_
+
+namespace nnet {
+
+template void add_stream() {
+    // both inputs are the same size
+    constexpr auto inputSize = std::tuple_size::value_type>{};
+    constexpr auto outputSize = std::tuple_size::value_type>{};
+
+AddLoop:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) {
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    AddPack:
+        #pragma unroll
+        for (int j = 0; j < outputSize; j++) {
+            out_data[j] = static_cast::value_type::value_type>(in_data1[j] + in_data2[j]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void subtract_stream() {
+    // both inputs are the same size
+    constexpr auto inputSize = std::tuple_size::value_type>{};
+    constexpr auto outputSize = std::tuple_size::value_type>{};
+
+SubtractLoop:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) {
+
[[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + SubtractPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>(in_data1[j] - in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void multiply_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MultLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MultPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>(in_data1[j] * in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void average_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +AvgLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + AvgPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] + in_data2[j]) / (typename ExtractPipeType::value_type::value_type)2); + } + + res_pipe::write(out_data); + } +} + +template void maximum_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MaxLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MaxPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void minimum_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MinLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MinPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] < in_data2[j]) ? 
in_data1[j] : in_data2[j]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void concatenate1d_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+    [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+ConcatLoop1:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0 / input1Size; i++) {
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+    ConcatPack1:
+        #pragma unroll
+        for (int j = 0; j < input1Size; j++) {
+            out_data[j + (i * input1Size)] =
+                static_cast::value_type::value_type>(in_data1[j]);
+        }
+    }
+
+ConcatLoop2:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0 / input2Size; i++) {
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+    ConcatPack2:
+        #pragma unroll
+        for (int j = 0; j < input2Size; j++) {
+            out_data[j + (i * input2Size) + (CONFIG_T::n_elem1_0)] =
+                static_cast::value_type::value_type>(in_data2[j]);
+        }
+    }
+    res_pipe::write(out_data);
+}
+
+template void concatenate2d_0_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+ConcatLoopHeight1:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    ConcatPackInput1:
+        #pragma unroll
+        for (int k = 0; k < input1Size; k++) {
+            out_data[k] = static_cast::value_type::value_type>(in_data1[k]);
+        }
+
+        res_pipe::write(out_data);
+    }
+
+ConcatLoopHeight2:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0; i++) {
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    ConcatPackInput2:
+        #pragma unroll
+        for (int k = 0; k < input2Size; k++) {
+            out_data[k] = static_cast::value_type::value_type>(in_data2[k]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void concatenate2d_1_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+ConcatLoopHeight:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    ConcatPackInput1:
+        #pragma unroll
+        for (int k = 0; k < input1Size; k++) {
+            out_data[k] = static_cast::value_type::value_type>(in_data1[k]);
+        }
+
+    ConcatPackInput2:
+        #pragma unroll
+        for (int k = 0; k < input2Size; k++) {
+            out_data[input1Size + k] = static_cast::value_type::value_type>(in_data2[k]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void concatenate2d_stream() {
+    if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) {
+        concatenate2d_1_stream();
+    } else {
+        concatenate2d_0_stream();
+    }
+}
+
+template void concatenate3d_0_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+ConcatLoopHeight1:
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+    ConcatLoopWidth1:
+        [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+
+            [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+            [[intel::fpga_register]] typename
ExtractPipeType::value_type out_data; + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + res_pipe::write(out_data); + } + } + +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_1_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + res_pipe::write(out_data); + } + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_2_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[input1Size + k] = + static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_stream() { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2_stream(); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1_stream(); + } else { + concatenate3d_0_stream(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h new file mode 100644 index 000000000..c7dfc2d7c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h @@ -0,0 +1,113 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Different methods to perform the product of input and weight, depending on their types. 
+namespace product { + +class Product { + public: + static void limit(unsigned multiplier_limit) {} +}; + +template class both_binary : public Product { + public: + inline static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + return a == w; + } +}; + +template class weight_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + return a * w; + } + static void limit(unsigned multiplier_limit) { + // TODO: Implement for Quartus + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS + // pragma + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ac_fixed<2 * (w_T::second_type::width + x_T::width), (w_T::second_type::width + x_T::width), true>; + inline static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.second; + + // Negate or not depending on weight sign + return w.first == 1 ? 
y : static_cast(-y); + } +}; +} // namespace product + +// TO-DO: These may need extra variants if ac_int types are used in more places +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h new file mode 100644 index 000000000..e8e3d6509 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h @@ -0,0 +1,104 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +namespace nnet { + +struct padding1d_config { + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned n_chan = 10; + + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template void zeropad1d_cl(const data_T &data, res_T &res) { + + auto resIter = res.begin(); + auto dataIter = data.cbegin(); + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = static_cast(*(dataIter++)); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; + + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template void zeropad2d_cl(const data_T &data, res_T &res) { + + auto resIter = res.begin(); + auto dataIter = data.cbegin(); + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = static_cast(*(dataIter++)); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h new file mode 100644 index 000000000..adb2efee2 --- /dev/null +++ 
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h
new file mode 100644
index 000000000..adb2efee2
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h
@@ -0,0 +1,81 @@
+#ifndef NNET_PADDING_STREAM_H_
+#define NNET_PADDING_STREAM_H_
+
+namespace nnet {
+
+template <class res_pipe, typename CONFIG_T> inline void fill_zero() {
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res_part;
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_chan; i++) {
+        res_part[i] = 0;
+    }
+    res_pipe::write(res_part);
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> inline void fill_data() {
+    [[intel::fpga_register]] auto data_part = data_pipe::read();
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res_part;
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_chan; i++) {
+        res_part[i] = data_part[i];
+    }
+    res_pipe::write(res_part);
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void zeropad1d_cl_stream() {
+PadLeft:
+    for (int i = 0; i < CONFIG_T::pad_left; i++) {
+        fill_zero<res_pipe, CONFIG_T>();
+    }
+
+CopyMain:
+    for (int i = 0; i < CONFIG_T::in_width; i++) {
+        fill_data<data_pipe, res_pipe, CONFIG_T>();
+    }
+
+PadRight:
+    for (int i = 0; i < CONFIG_T::pad_right; i++) {
+        fill_zero<res_pipe, CONFIG_T>();
+    }
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void zeropad2d_cl_stream() {
+PadTop:
+    [[intel::loop_coalesce(2)]] for (int i = 0; i < CONFIG_T::pad_top; i++) {
+    PadTopWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+    }
+
+PadMain:
+    [[intel::loop_coalesce(2)]] for (int i = 0; i < CONFIG_T::in_height; i++) {
+
+    PadLeft:
+        for (int j = 0; j < CONFIG_T::pad_left; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+
+    CopyMain:
+        for (int j = 0; j < CONFIG_T::in_width; j++) {
+            fill_data<data_pipe, res_pipe, CONFIG_T>();
+        }
+
+    PadRight:
+        for (int j = 0; j < CONFIG_T::pad_right; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+    }
+
+PadBottom:
+    for (int i = 0; i < CONFIG_T::pad_bottom; i++) {
+    PadBottomWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
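The streaming padder never buffers the image: each pipe beat carries n_chan values, and padding rows/columns are synthesized as all-zero beats. A host model with std::queue standing in for the pipe (all sizes hypothetical):

    #include <array>
    #include <cstdio>
    #include <queue>

    int main() {
        const int n_chan = 2, in_width = 3, pad_left = 1, pad_right = 1;
        std::queue<std::array<float, n_chan>> pipe; // models res_pipe
        auto fill_zero = [&] { pipe.push({0.0f, 0.0f}); };
        for (int i = 0; i < pad_left; i++) fill_zero();
        for (int i = 0; i < in_width; i++) pipe.push({1.0f * i, 1.0f * i});
        for (int i = 0; i < pad_right; i++) fill_zero();
        std::printf("beats in stream: %zu\n", pipe.size()); // 5 = 1 + 3 + 1
    }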
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h
new file mode 100644
index 000000000..d4ae91533
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h
@@ -0,0 +1,257 @@
+#ifndef NNET_POOLING_H_
+#define NNET_POOLING_H_
+
+#include "nnet_common.h"
+
+namespace nnet {
+
+// Returns the maximum value from an array of size N
+template <class T, int N, class accum_t> accum_t max(T x[N]) {
+    [[intel::fpga_register]] T y = x[0];
+
+    // Due to loop dependencies, pipelining & unrolling is not possible
+    // Explicitly disabling the pipeline significantly reduces resource usage
+    [[intel::disable_loop_pipelining]] for (int i = 1; i < N; i++) {
+        if (x[i] > y)
+            y = x[i];
+    }
+
+    return y;
+}
+
+// Returns the mean value of an array of size N
+template <class T, int N, class accum_t> accum_t avg(T x[N], unsigned length) {
+    [[intel::fpga_register]] accum_t y = 0;
+
+    // Due to loop dependencies, pipelining & unrolling is not possible
+    // Explicitly disabling the pipeline significantly reduces resource usage
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < N; i++) { y += x[i]; }
+
+    y /= length;
+    return y;
+}
+
+// Enumeration for pooling functions
+enum Pool_Op { Max, Average };
+template <class T, int N, Pool_Op op, class accum_t> accum_t pool_op(T x[N], unsigned length) {
+    switch (op) {
+    case Max:
+        return max<T, N, accum_t>(x);
+    case Average:
+        return avg<T, N, accum_t>(x, length);
+    }
+}
+
+template <class T, int N, Pool_Op op, class accum_t> accum_t pool_op(T (&x)[N]) {
+    return pool_op<T, N, op, accum_t>(x, N);
+}
+
+/*
+ * In TensorFlow, pooling ignores the values in the padded cells
+ * For Average pooling, return 0 (the divisor is modified to the area overlapping the unpadded image.)
+ * For Max pooling, return the most negative value for the type.
+ */
+template <class T, Pool_Op op> inline T pad_val() {
+    switch (op) {
+    case Max: {
+        T x = 0;
+        x[x.width - 1] = 1;
+        return x;
+    }
+    case Average:
+        return 0;
+    }
+}
+
+struct pooling1d_config {
+    // Pooling parameters
+    static const unsigned pool_width = 2;
+    static const unsigned stride_width = 2;
+
+    // I/O sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_out = (n_in - pool_width) / stride_width + 1;
+    static const unsigned n_filt = 4;
+
+    // Padding
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const bool count_pad = false;
+
+    // Pooling function
+    static const Pool_Op pool_op = Max;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void pooling1d_cl(const data_T &data, res_T &res) {
+    // Add padding and reduce input width to area covered by pooling function
+    static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right;
+    static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width;
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+    InputWidthLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int inp_col = 0; inp_col < restricted_padded_width;
+                                                inp_col += CONFIG_T::stride_width) {
+            [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::pool_width];
+
+            // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling
+            [[intel::fpga_register]] unsigned img_overlap = 0;
+
+        PoolWidthLoop:
+            #pragma unroll
+            [[intel::disable_loop_pipelining]] for (int pool_col = 0; pool_col < CONFIG_T::stride_width; pool_col++) {
+                if (inp_col + pool_col < CONFIG_T::pad_left ||
+                    inp_col + pool_col >= (full_padded_width - CONFIG_T::pad_right)) {
+                    // Add padding
+                    pool[pool_col] = pad_val<typename data_T::value_type, CONFIG_T::pool_op>();
+                    if (CONFIG_T::count_pad)
+                        img_overlap++;
+                } else {
+                    // Current element is from input image
+                    pool[pool_col] = data[(inp_col + pool_col - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt];
+                    img_overlap++;
+                }
+            }
+
+            // Pooling operation
+            res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] =
+                static_cast<typename res_T::value_type>(
+                    pool_op<typename data_T::value_type, CONFIG_T::pool_width, CONFIG_T::pool_op,
+                            typename CONFIG_T::accum_t>(pool, img_overlap));
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void global_pooling1d_cl(const data_T &data, res_T &res) {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+        [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::n_in];
+
+    InputWidthLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int col = 0; col < CONFIG_T::n_in; col++) {
+            pool[col] = data[col * CONFIG_T::n_filt + filt];
+        }
+
+        res[filt] = static_cast<typename res_T::value_type>(
+            pool_op<typename data_T::value_type, CONFIG_T::n_in, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool));
+    }
+}
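The count_pad flag decides the Average divisor: TensorFlow-style pooling divides by the number of real (non-padded) pixels, which is what img_overlap tracks. A small worked check with hypothetical sizes:

    #include <cassert>

    int main() {
        // pool_width = 2, pad_left = 1: the first window covers [pad, x0].
        // With count_pad = false only x0 counts, so the divisor is 1.
        float window_sum = 6.0f;  // the padded cell contributes 0
        unsigned img_overlap = 1; // only one real pixel in the window
        assert(window_sum / img_overlap == 6.0f); // not 6.0f / 2 == 3.0f
    }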
+
+struct pooling2d_config {
+    // Pooling parameters
+    static const unsigned stride_height = 2;
+    static const unsigned stride_width = 2;
+    static const unsigned pool_height = 2;
+    static const unsigned pool_width = 2;
+
+    // I/O sizes
+    static const unsigned in_height = 10;
+    static const unsigned in_width = 10;
+    static const unsigned n_filt = 4;
+
+    static const unsigned out_height = (in_height - pool_height) / stride_height + 1;
+    static const unsigned out_width = (in_width - pool_width) / stride_width + 1;
+
+    // Padding
+    static const unsigned pad_top = 0;
+    static const unsigned pad_bottom = 0;
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const bool count_pad = false;
+
+    // Pooling function
+    static const Pool_Op pool_op = Max;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void pooling2d_cl(const data_T &data, res_T &res) {
+    // Add padding and reduce input width to area covered by pooling function
+    static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right;
+    static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom;
+    static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width;
+    static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height;
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+    InputHeightLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int inp_col = 0; inp_col < restricted_padded_height;
+                                                inp_col += CONFIG_T::stride_height) {
+        InputWidthLoop:
+            #pragma unroll
+            [[intel::disable_loop_pipelining]] for (int inp_width = 0; inp_width < restricted_padded_width;
+                                                    inp_width += CONFIG_T::stride_width) {
+                [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::pool_height * CONFIG_T::pool_width];
+
+                // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling
+                [[intel::fpga_register]] unsigned img_overlap = 0;
+
+            PoolHeightLoop:
+                #pragma unroll
+                [[intel::disable_loop_pipelining]] for (int pool_col = 0; pool_col < CONFIG_T::stride_height;
+                                                        pool_col++) {
+                PoolWidthLoop:
+                    #pragma unroll
+                    [[intel::disable_loop_pipelining]] for (int pool_row = 0; pool_row < CONFIG_T::stride_width;
+                                                            pool_row++) {
+                        if (inp_col + pool_col < CONFIG_T::pad_top ||
+                            inp_col + pool_col >= (full_padded_height - CONFIG_T::pad_bottom) ||
+                            inp_width + pool_row < CONFIG_T::pad_left ||
+                            inp_width + pool_row >= (full_padded_width - CONFIG_T::pad_right)) {
+                            // Add padding
+                            pool[pool_col * CONFIG_T::stride_width + pool_row] =
+                                pad_val<typename data_T::value_type, CONFIG_T::pool_op>();
+                            if (CONFIG_T::count_pad)
+                                img_overlap++;
+                        } else {
+                            // Current element is from input image
+                            pool[pool_col * CONFIG_T::stride_width + pool_row] =
+                                data[(inp_col + pool_col - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt +
+                                     (inp_width + pool_row - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt];
+                            img_overlap++;
+                        }
+                    }
+                }
+
+                // Pooling operation
+                res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt +
+                    (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] =
+                    static_cast<typename res_T::value_type>(
+                        pool_op<typename data_T::value_type, CONFIG_T::pool_height * CONFIG_T::pool_width,
+                                CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool, img_overlap));
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void global_pooling2d_cl(const data_T &data, res_T &res) {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+    assert(CONFIG_T::pool_height == CONFIG_T::stride_height);
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+        [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::in_height * CONFIG_T::in_width];
+
+    InputLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) {
+            pool[i] = data[i * CONFIG_T::n_filt + filt];
+        }
+
+        res[filt] = static_cast<typename res_T::value_type>(
+            pool_op<typename data_T::value_type, CONFIG_T::in_height * CONFIG_T::in_width, CONFIG_T::pool_op,
+                    typename CONFIG_T::accum_t>(pool));
+    }
+}
+
+} // namespace nnet
+
+#endif
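The out_height/out_width defaults follow the usual valid-pooling size formula, (in - pool) / stride + 1. A hypothetical config can check the arithmetic at compile time:

    // Hypothetical sizes; the static_assert just restates the formula above.
    struct my_pool_cfg {
        static const unsigned in_height = 8, in_width = 8;
        static const unsigned pool_height = 2, pool_width = 2;
        static const unsigned stride_height = 2, stride_width = 2;
    };
    static_assert((my_pool_cfg::in_height - my_pool_cfg::pool_height) / my_pool_cfg::stride_height + 1 == 4,
                  "expected a 4x4 output");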
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h
new file mode 100644
index 000000000..9c30aab67
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h
@@ -0,0 +1,322 @@
+#ifndef NNET_POOLING_STREAM_H_
+#define NNET_POOLING_STREAM_H_
+
+#include "nnet_conv1d_stream.h"
+#include "nnet_conv2d_stream.h"
+#include "nnet_pooling.h"
+#include "nnet_types.h"
+
+namespace nnet {
+
+/*
+ * void compute_pool_buffer_1d(in_element, res_stream, line_buffer, kernel_window)
+ *
+ * Args:
+ *   in_element    - current elements from the input image; data_T is usually nnet::array, with the array size equal
+ *                   to the number of channels
+ *   res_stream    - output stream, passed by reference to allow direct writing
+ *   line_buffer   - chained array of shift registers, one for each row of the pool and channel
+ *   kernel_window - array of values from the input currently being pooled
+ *
+ * The function executes 4 steps:
+ *   (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and
+ *       removing the last elements
+ *   (2) Kernel shift - updates the elements of the kernel window, storing the new inputs and the elements popped
+ *       from the line buffer
+ *   (3) Pooling - performs the required pooling operation on the current window
+ *   (4) Counter housekeeping - keeps track of the current pixel and stride
+ */
+template <class data_T, class res_pipe, class data_window_T, typename CONFIG_T>
+void compute_pool_buffer_1d(const data_T &in_elem,
+                            nnet::shift_reg<typename data_T::value_type, CONFIG_T::pool_width>
+                                line_buffer[CONFIG_T::n_filt],
+                            data_window_T &kernel_window, int &pX, int &sX) {
+
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    // Thresholds
+    constexpr int lShiftX = CONFIG_T::pool_width - 1;
+
+    // Step 1 - Shift line buffer
+    [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_filt];
+    nnet::shift_line_buffer_1d<data_T, CONFIG_T>(in_elem, line_buffer, shift_buffer);
+
+    // Step 2 - Kernel shift
+    nnet::kernel_shift_1d<data_T, data_window_T, CONFIG_T>(shift_buffer, kernel_window);
+
+    // Check to see if we have a full pool window
+    if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) {
+        [[intel::fpga_register]] res_T res_pack;
+
+    FiltLoop:
+        #pragma unroll
+        for (int filter = 0; filter < CONFIG_T::n_filt; filter++) {
+            [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_width];
+
+            // Retrieve data for current channel
+        PoolLoop:
+            #pragma unroll
+            for (int i = 0; i < CONFIG_T::pool_width; i++) {
+                pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter];
+            }
+
+            // Step 3 - Pooling
+            res_pack[filter] = static_cast<typename res_T::value_type>(
+                pool_op<typename data_T::value_type, CONFIG_T::pool_width, CONFIG_T::pool_op,
+                        typename CONFIG_T::accum_t>(pool_window));
+        }
+
+        // Write result to output stream
+        res_pipe::write(res_pack);
+    }
+
+    // Reached end of image
+    if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
+        pX = 0;
+        sX = 0;
+        // Move to the right
+    } else {
+        pX++;
+        sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
+    }
+}
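The sX counter is the phase within a stride: it climbs to pool_width - 1, a window is emitted, and it rewinds by stride_width - 1. A scalar sketch of the sequence it produces, assuming pool_width == stride_width == 2 and a 6-pixel row:

    #include <cstdio>

    int main() {
        const int pool_width = 2, stride_width = 2, in_width = 6;
        const int lShiftX = pool_width - 1;
        int sX = 0;
        for (int pX = 0; pX < in_width; pX++) {
            bool emit = (sX - lShiftX) == 0 && pX > lShiftX - 1;
            std::printf("pX=%d sX=%d emit=%d\n", pX, sX, emit);
            sX = ((sX - lShiftX) == 0) ? (sX - stride_width + 1) : (sX + 1);
        }
        // Windows are emitted at pX = 1, 3, 5: exactly one per stride.
    }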
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void pooling1d_cl_stream() {
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+
+    using data_arr_T = typename ExtractPipeType<data_pipe>::value_type;
+    using data_element_T = typename data_arr_T::value_type;
+    using data_window_T = array<data_element_T, CONFIG_T::pool_width * CONFIG_T::n_filt>;
+
+    // Line buffer and kernel window
+    [[intel::fpga_register]] nnet::shift_reg<data_element_T, CONFIG_T::pool_width> line_buffer[CONFIG_T::n_filt];
+    [[intel::fpga_register]] data_window_T kernel_window;
+
+    // move former static variables outside the function calls
+    // X position pixel
+    int pX = 0;
+    // X strides
+    int sX = 0;
+
+// Read input image
+ReadInputWidth:
+    for (int col = 0; col < CONFIG_T::in_width; col++) {
+        compute_pool_buffer_1d<data_arr_T, res_pipe, data_window_T, CONFIG_T>(data_pipe::read(), line_buffer,
+                                                                              kernel_window, pX, sX);
+    }
+}
+
+/*
+ * void compute_pool_buffer_2d(in_element, res_stream, line_buffer, kernel_window)
+ *
+ * Args:
+ *   in_element    - current elements from the input image; data_T is usually nnet::array, with the array size equal
+ *                   to the number of channels
+ *   res_stream    - output stream, passed by reference to allow direct writing
+ *   line_buffer   - chained array of shift registers, one for each row of the pool and channel
+ *   kernel_window - array of values from the input currently being pooled
+ *
+ * The function executes 4 steps:
+ *   (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and
+ *       removing the last elements
+ *   (2) Kernel shift - updates the elements of the kernel window, storing the new inputs and the elements popped
+ *       from the line buffer
+ *   (3) Pooling - performs the required pooling operation on the current window
+ *   (4) Counter housekeeping - keeps track of the current pixel and stride
+ */
+template <class data_T, class res_pipe, class data_window_T, typename CONFIG_T>
+void compute_pool_buffer_2d(const data_T &in_elem,
+                            nnet::shift_reg<typename data_T::value_type, CONFIG_T::in_width>
+                                line_buffer[CONFIG_T::pool_height - 1][CONFIG_T::n_filt],
+                            data_window_T &kernel_window, int &pX, int &pY, int &sX, int &sY) {
+
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    // Thresholds
+    static constexpr int lShiftX = CONFIG_T::pool_width - 1;
+    static constexpr int lShiftY = CONFIG_T::pool_height - 1;
+
+    // Step 1 - Shift line buffer
+    [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::pool_height][CONFIG_T::n_filt];
+    nnet::shift_line_buffer_2d<data_T, CONFIG_T>(in_elem, line_buffer, shift_buffer);
+
+    // Step 2 - Kernel shift
+    nnet::kernel_shift_2d<data_T, data_window_T, CONFIG_T>(shift_buffer, kernel_window);
+
+    // Check to see if we have a full pool window
+    if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) {
+        [[intel::fpga_register]] res_T res_pack;
+
+    FiltLoop:
+        #pragma unroll
+        for (int filter = 0; filter < CONFIG_T::n_filt; filter++) {
+            [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width];
+
+            // Retrieve data for current channel
+        PoolLoop:
+            #pragma unroll
+            for (int i = 0; i < CONFIG_T::pool_height * CONFIG_T::pool_width; i++) {
+                pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter];
+            }
+
+            // Step 3 - Pooling
+            res_pack[filter] = static_cast<typename res_T::value_type>(
+                pool_op<typename data_T::value_type, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op,
+                        typename CONFIG_T::accum_t>(pool_window));
+        }
+
+        // Write result to output stream
+        res_pipe::write(res_pack);
+    }
+
+    // Reached end of image
+    if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) &&
+        (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) {
+        pX = 0;
+        sX = 0;
+        pY = 0;
+        sY = 0;
+        // Reached end of row
+    } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
+        pX = 0;
+        sX = 0;
+        pY++;
+        sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1);
+        // Same row, same column, therefore, move to the right
+    } else {
+        pX++;
+        sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
+    }
+}
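In 2-D the same phase counters run in x and y, so a window is written only when both sX and sY rewind. A quick sketch of which pixels trigger a write, assuming a 4x4 input with 2x2 pool and stride (a simplification of the housekeeping above):

    #include <cstdio>

    int main() {
        const int pool = 2, stride = 2, H = 4, W = 4;
        const int lShift = pool - 1;
        int sX = 0, sY = 0;
        for (int pY = 0; pY < H; pY++) {
            for (int pX = 0; pX < W; pX++) {
                if ((sX - lShift) == 0 && (sY - lShift) == 0 && pX > lShift - 1 && pY > lShift - 1)
                    std::printf("window written at (x=%d, y=%d)\n", pX, pY);
                sX = ((sX - lShift) == 0) ? (sX - stride + 1) : (sX + 1);
            }
            sX = 0; // a new row restarts the x phase
            sY = ((sY - lShift) == 0) ? (sY - stride + 1) : (sY + 1);
        }
        // Expected: (1,1), (3,1), (1,3), (3,3) - one write per 2x2 tile.
    }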
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void pooling2d_cl_stream() {
+    assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width);
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
+
+    using data_arr_T = typename ExtractPipeType<data_pipe>::value_type;
+    using data_element_T = typename data_arr_T::value_type;
+    using data_window_T = array<data_element_T, CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt>;
+
+    // Line buffer and kernel window
+    [[intel::fpga_register]] nnet::shift_reg<data_element_T, CONFIG_T::in_width>
+        line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt];
+    [[intel::fpga_register]] data_window_T kernel_window;
+
+    // former static variables
+    // X, Y position pixels
+    int pX = 0;
+    int pY = 0;
+
+    // X, Y strides
+    int sX = 0;
+    int sY = 0;
+
+ReadInputHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::in_height; row++) {
+        // Read input image
+    ReadInputWidth:
+        for (int col = 0; col < CONFIG_T::in_width; col++) {
+            compute_pool_buffer_2d<data_arr_T, res_pipe, data_window_T, CONFIG_T>(data_pipe::read(), line_buffer,
+                                                                                  kernel_window, pX, pY, sX, sY);
+        }
+    }
+}
+
+/*
+ * A function used with Global Pooling
+ * Updates the output pooling value
+ * Max : Return the maximum between the previous maximum and current input
+ * Avg : Returns the cumulative sum
+ */
+template <class T_y, class T_x, Pool_Op op> inline T_y reduce_global_pool(T_y y, T_x x) {
+    if (op == Max) {
+        return (x > y) ? (T_y)x : y;
+    } else {
+        return (T_y)(x + y);
+    }
+}
+
+/*
+ * A function used with Global Pooling
+ * For every filter, it updates the value by summing the current input (Average) or updating the maximum value (Max)
+ */
+template <class data_T, class res_T, typename CONFIG_T> void compute_global_pool(const data_T &in_elem, res_T &data_input) {
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_filt; i++) {
+        data_input[i] = reduce_global_pool<typename res_T::value_type, typename data_T::value_type, CONFIG_T::pool_op>(
+            data_input[i], in_elem[i]);
+    }
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void global_pooling1d_cl_stream() {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    using accum_arr_t = array<typename CONFIG_T::accum_t, CONFIG_T::n_filt>;
+
+    [[intel::fpga_register]] accum_arr_t data_input;
+
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_filt; i++) {
+        data_input[i] = pad_val<typename accum_arr_t::value_type, CONFIG_T::pool_op>();
+    }
+
+    for (int i = 0; i < CONFIG_T::n_in; i++) {
+        compute_global_pool<data_T, accum_arr_t, CONFIG_T>(data_pipe::read(), data_input);
+    }
+
+    [[intel::fpga_register]] res_T res_pack;
+    if (CONFIG_T::pool_op == Average) {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] = static_cast<typename res_T::value_type>(data_input[i] / CONFIG_T::n_in);
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] = static_cast<typename res_T::value_type>(data_input[i]);
+        }
+    }
+
+    res_pipe::write(res_pack);
+}
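reduce_global_pool folds one element into a running value, so global pooling completes in a single pass over the stream. A host-side analogue of the Max case, with hypothetical data:

    #include <cassert>

    // Stands in for reduce_global_pool<T_y, T_x, Max>
    template <typename T> T reduce_max(T y, T x) { return (x > y) ? x : y; }

    int main() {
        float data[4] = {0.5f, -1.0f, 2.0f, 1.5f};
        float acc = -1e30f; // stands in for pad_val<T, Max>()
        for (float v : data)
            acc = reduce_max(acc, v);
        assert(acc == 2.0f);
    }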
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void global_pooling2d_cl_stream() {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    using accum_arr_t = array<typename CONFIG_T::accum_t, CONFIG_T::n_filt>;
+
+    [[intel::fpga_register]] accum_arr_t data_input;
+
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_filt; i++) {
+        data_input[i] = pad_val<typename accum_arr_t::value_type, CONFIG_T::pool_op>();
+    }
+
+    for (int i = 0; i < CONFIG_T::in_height; i++) {
+        for (int j = 0; j < CONFIG_T::in_width; j++) {
+            compute_global_pool<data_T, accum_arr_t, CONFIG_T>(data_pipe::read(), data_input);
+        }
+    }
+
+    [[intel::fpga_register]] res_T res_pack;
+    if (CONFIG_T::pool_op == Average) {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] =
+                static_cast<typename res_T::value_type>(data_input[i] / (CONFIG_T::in_width * CONFIG_T::in_height));
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] = static_cast<typename res_T::value_type>(data_input[i]);
+        }
+    }
+
+    res_pipe::write(res_pack);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h
new file mode 100644
index 000000000..5fec90d1a
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h
@@ -0,0 +1,18 @@
+#ifndef NNET_PRINTF_H_
+#define NNET_PRINTF_H_
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define CL_CONSTANT __attribute__((opencl_constant))
+#else
+#define CL_CONSTANT
+#endif
+
+using namespace sycl;
+
+#define PRINTF(format, ...) \
+    { \
+        static const CL_CONSTANT char _format[] = format; \
+        ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \
+    }
+
+#endif
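The macro places the format string in the address space device-side printf expects. A minimal usage sketch (the queue setup is the caller's responsibility; on the host path CL_CONSTANT expands to nothing, so the same call compiles in emulation):

    #include <sycl/sycl.hpp>
    #include "nnet_printf.h" // the header above, assumed on the include path

    void debug_print(sycl::queue &q) {
        q.single_task([=] { PRINTF("kernel alive, beat %d\n", 0); }).wait();
    }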
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h
new file mode 100644
index 000000000..4c20f28d1
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h
@@ -0,0 +1,566 @@
+#ifndef NNET_RECURRENT_H_
+#define NNET_RECURRENT_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recurrent_activation.h"
+
+namespace nnet {
+
+//----------------------
+// Utils
+//----------------------
+
+template
+void multiply_W(const data_T &input, res_T &out, const weight_t &weight) {
+MULTIPLY_W_LOOP_I:
+    #pragma unroll
+    for (int i = 0; i < N_OUT; i++) {
+        out[i] = 0;
+
+    MULTIPLY_W_LOOP_J:
+        #pragma unroll
+        for (int j = 0; j < N_IN; j++) {
+            out[i] += input[j] * weight[i * N_IN + j];
+        }
+    }
+}
+
+template
+void multiply_U(const data_T &input, res_T &out, const weight_t &weight) {
+MULTIPLY_U_LOOP_I:
+    #pragma unroll
+    for (int i = 0; i < N_OUT; i++) {
+        out[i] = 0;
+
+    MULTIPLY_U_LOOP_J:
+        #pragma unroll
+        for (int j = 0; j < N_OUT; j++) {
+            out[i] += input[j] * weight[i * N_OUT + j];
+        }
+    }
+}
+
+template
+void add_bias(const data_T &inputs, res_T &out, const bias_t &bias) {
+ADD_BIAS_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = inputs[i] + bias[i];
+    }
+}
+
+template
+void multiply_vectors(const data1_T &in1, const data2_T &in2, res_T &out) {
+MULTIPLY_VECT_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = in1[i] * in2[i];
+    }
+}
+
+template
+void add_vectors(const data1_T &in1, const data2_T &in2, res_T &out) {
+ADD_VECTOR_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = in1[i] + in2[i];
+    }
+}
+
+//----------------------
+// GRU
+//----------------------
+
+struct gru_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_units = 1;
+    static const unsigned n_timesteps = 1;
+    static const unsigned n_outputs = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template using activation_recr = nnet::activation::relu;
+
+    template using activation = nnet::activation::relu;
+};
+
+template
+void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weights,
+              const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias,
+              const typename CONFIG_T::recurrent_bias_t &recurrent_bias) {
+    static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor;
+    // A matrix containing the values of the matrix product between the input (x) and the weights (weights), for the
+    // update, reset and candidate state gates, for each of the units
+
+    using accum_array_T = array;
+
+    [[intel::fpga_register]] accum_array_T mat_mul_x_w;
+    nnet::dense_resource(x, mat_mul_x_w, weights, bias);
+
+    // A matrix containing the values of the matrix product between the previous state (h) and the recurrent weights
+    // (recurrent_weights), for the update, reset and candidate state gates, for each of the units
+    [[intel::fpga_register]] accum_array_T mat_mul_h_wr;
+    nnet::dense_resource(h, mat_mul_h_wr, recurrent_weights, recurrent_bias);
+
+    // A vector containing both the values of z(t) and r(t) for every state
+    using z_activ_array_T = array;
+    [[intel::fpga_register]] z_activ_array_T z_r;
+
+    // Add the individual vectors from the multiplication of mat_mul_x_w = Wx*x(t) and mat_mul_h_wr = Wh*h(t-1)
+    // Unrolled fully, no DSPs used
+    #pragma unroll
+    for (int i = 0; i < (2 * CONFIG_T::n_units); i++) {
+        z_r[i] = mat_mul_x_w[i] + mat_mul_h_wr[i];
+    }
+
+    // Activation on z(t) and r(t)
+    [[intel::fpga_register]] z_activ_array_T z_r_act;
+    CONFIG_T::template activation_recr::activation(z_r, z_r_act);
+
+    // A matrix containing the values of the Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h
+    using h_activ_array_T = array;
+    [[intel::fpga_register]] h_activ_array_T hadamard_r_h;
+    #pragma unroll recurrent_unroll_factor
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units];
+    }
+
+    // The candidate state; X * W_{hx} + hadamard(r(t), h_(t-1)) * W_{hh} + b_{h}
+    [[intel::fpga_register]] h_activ_array_T h_cand;
+    // Addition - can unroll fully; no DSPs used here
+    #pragma unroll
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        h_cand[i] = mat_mul_x_w[i + 2 * CONFIG_T::n_units] + hadamard_r_h[i];
+    }
+
+    // Activation on candidate state
+    [[intel::fpga_register]] h_activ_array_T h_cand_act;
+    CONFIG_T::template activation::activation(h_cand, h_cand_act);
+
+    // Update state
+    #pragma unroll recurrent_unroll_factor
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        h[i] = static_cast(h_cand_act[i] * (1 - z_r_act[i]) + h[i] * z_r_act[i]);
+    }
+}
+
+template
+void gru(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+         const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias,
+         const typename CONFIG_T::recurrent_bias_t &recurrent_bias) {
+
+    using h_T = array;
+    [[intel::fpga_register]] data_T x;
+    [[intel::fpga_register]] h_T h;
+
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_units; i++) {
+        h[i] = 0;
+    }
+
+    // Loop dependency - cannot pipeline
+    [[intel::disable_loop_pipelining]] for (int t = 0; t < CONFIG_T::n_timesteps; t++) {
+        // Get data at current time step
+        #pragma unroll
+        for (int j = 0; j < CONFIG_T::n_in; j++) {
+            x[j] = data[j + t * CONFIG_T::n_in];
+        }
+
+        nnet::gru_cell(x, h, weights, recurrent_weights, bias, recurrent_bias);
+
+        if (CONFIG_T::return_sequences) {
+            #pragma unroll
+            for (int i = 0; i < CONFIG_T::n_units; i++) {
+                res[CONFIG_T::n_units * t + i] = h[i];
+            }
+        }
+    }
+
+    if (!CONFIG_T::return_sequences) {
+        #pragma unroll
+        for (int i = 0; i < (CONFIG_T::n_units); i++) {
+            res[i] = h[i];
+        }
+    }
+}
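For one unit, the cell above reduces to the standard GRU update. A scalar reference in plain float (weights hypothetical, biases omitted) that mirrors the z/r/candidate flow and the final update loop:

    #include <cmath>
    #include <cstdio>

    float sigmoidf(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    // Scalar GRU step: z and r gate the previous state h, n is the candidate.
    float gru_step(float x, float h, float wz, float uz, float wr, float ur, float wn, float un) {
        float z = sigmoidf(wz * x + uz * h);
        float r = sigmoidf(wr * x + ur * h);
        float n = std::tanh(wn * x + r * (un * h)); // hadamard_r_h analogue
        return n * (1.0f - z) + h * z;              // matches the update-state loop
    }

    int main() {
        std::printf("h1 = %f\n", gru_step(0.5f, 0.0f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f));
    }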
+
+//----------------------
+// SimpleRNN
+//----------------------
+
+struct simpleRNN_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_outputs = 1;
+    static const unsigned n_timesteps = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template using activation_recr = nnet::activation::relu;
+
+    template using activation = nnet::activation::relu;
+};
+
+template
+void simple_rnn_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, const typename CONFIG_T::weight_t &kernel,
+                     const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) {
+
+    using accum_array_T = array;
+    // Weight multiplication
+    [[intel::fpga_register]] accum_array_T afterW;
+    multiply_W(inputs, afterW, kernel);
+
+    // Bias addition
+    [[intel::fpga_register]] accum_array_T afterBias;
+    add_bias(afterW, afterBias, bias);
+
+    // Hidden state
+    [[intel::fpga_register]] accum_array_T hiddenCand;
+    multiply_U(hidden_state, hiddenCand, rec_kernel);
+
+    // Vector addition
+    [[intel::fpga_register]] accum_array_T afterAdd;
+    add_vectors(afterBias, hiddenCand, afterAdd);
+
+    // Activation
+    CONFIG_T::template activation::activation(afterAdd, hidden_state_o);
+}
+
+template
+void simple_rnn(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &kernel,
+                const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) {
+
+    using in_T = array;
+    using h_T = array;
+
+    [[intel::fpga_register]] h_T hidden_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T hidden_state_temp;
+    [[intel::fpga_register]] h_T h;
+    [[intel::fpga_register]] in_T in;
+
+// Initially set the hidden state (output) to zero
+INIT_LOOP:
+    #pragma unroll
+    for (int x = 0; x < CONFIG_T::n_out; x++) {
+        hidden_state[0][x] = 0;
+    }
+
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
+
+        // Data at current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_in; x++) {
+            in[x] = data[x + i * CONFIG_T::n_in];
+        }
+
+        // Hidden state at current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state_temp[x] = hidden_state[i][x];
+        }
+
+        // Do SimpleRNN
+        simple_rnn_cell(in, hidden_state_temp, h, kernel, rec_kernel, bias);
+
+        // Write result
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state[i + 1][x] = h[x];
+        }
+    }
+
+    if (CONFIG_T::return_sequences == 0) {
+        // Output when return_sequences is false
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            res[x] = hidden_state[CONFIG_T::n_timesteps][x];
+        }
+    } else {
+        // Output when return_sequences is true
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
+            #pragma unroll
+            for (int h = 0; h < CONFIG_T::n_out; h++) {
+                res[x * CONFIG_T::n_out + h] = hidden_state[x + 1][h];
+            }
+        }
+    }
+}
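The SimpleRNN recurrence is just h(t) = act(W*x(t) + U*h(t-1) + b). A scalar sketch (tanh standing in for the configurable activation; all coefficients hypothetical):

    #include <cmath>
    #include <cstdio>

    float simple_rnn_step(float x, float h, float W, float U, float b) {
        return std::tanh(W * x + U * h + b);
    }

    int main() {
        float h = 0.0f; // hidden_state[0]
        const float xs[3] = {0.1f, 0.2f, 0.3f};
        for (float x : xs)
            h = simple_rnn_step(x, h, 0.5f, 0.9f, 0.0f);
        std::printf("final h = %f\n", h); // hidden_state[n_timesteps]
    }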
+
+//----------------------
+// LSTM
+//----------------------
+
+struct lstm_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_outputs = 1;
+
+    static const unsigned n_timesteps = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template using activation_recr = nnet::activation::relu;
+
+    template using activation = nnet::activation::relu;
+};
+
+template
+void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T &cell_state, h_T &cell_state_o,
+               const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF,
+               const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO,
+               const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF,
+               const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO,
+               const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t &BF,
+               const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t &BO) {
+
+    using accum_array_T = array;
+
+    // Internal definitions
+    [[intel::fpga_register]] accum_array_T i_afterW;
+    [[intel::fpga_register]] accum_array_T i_afterBias;
+    [[intel::fpga_register]] accum_array_T c_afterW;
+    [[intel::fpga_register]] accum_array_T c_afterBias;
+    [[intel::fpga_register]] accum_array_T o_afterW;
+    [[intel::fpga_register]] accum_array_T o_afterBias;
+    [[intel::fpga_register]] accum_array_T f_afterW;
+    [[intel::fpga_register]] accum_array_T f_afterBias;
+
+    // Hidden state gate candidates, intermediate variables
+    [[intel::fpga_register]] accum_array_T i_hiddenCand;
+    [[intel::fpga_register]] accum_array_T f_hiddenCand;
+    [[intel::fpga_register]] accum_array_T c_hiddenCand;
+    [[intel::fpga_register]] accum_array_T o_hiddenCand;
+
+    // After addition, intermediate variables
+    [[intel::fpga_register]] accum_array_T i_afterAdd;
+    [[intel::fpga_register]] accum_array_T f_afterAdd;
+    [[intel::fpga_register]] accum_array_T c_afterAdd;
+    [[intel::fpga_register]] accum_array_T o_afterAdd;
+
+    // Gate outputs
+    [[intel::fpga_register]] accum_array_T gate_i;
+    [[intel::fpga_register]] accum_array_T gate_f;
+    [[intel::fpga_register]] accum_array_T gate_c;
+    [[intel::fpga_register]] accum_array_T gate_o;
+    [[intel::fpga_register]] accum_array_T gate_ic;
+    [[intel::fpga_register]] accum_array_T gate_forget;
+    [[intel::fpga_register]] accum_array_T h;
+
+    // Intermediate variables for the cell calculation
+    [[intel::fpga_register]] accum_array_T cell_act_multp;
+    [[intel::fpga_register]] accum_array_T cell_act_add;
+
+    //-----------Gate I Calculations
+    // Weight multiplication
+    multiply_W(inputs, i_afterW, WI);
+
+    // Bias addition
+    add_bias(i_afterW, i_afterBias, BI);
+
+    // Hidden candidate
+    multiply_U(hidden_state, i_hiddenCand, RWI);
+
+    // Vector addition
+    add_vectors(i_afterBias, i_hiddenCand, i_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr::activation(i_afterAdd, gate_i);
+
+    //-----------Gate F Calculations
+    // Weight multiplication
+    multiply_W(inputs, f_afterW, WF);
+
+    // Bias addition
+    add_bias(f_afterW, f_afterBias, BF);
+
+    // Hidden candidate
+    multiply_U(hidden_state, f_hiddenCand, RWF);
+
+    // Vector addition
+    add_vectors(f_afterBias, f_hiddenCand, f_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr::activation(f_afterAdd, gate_f);
+
+    //-----------Gate C Calculations
+    // Weight multiplication
+    multiply_W(inputs, c_afterW, WC);
+
+    // Bias addition
+    add_bias(c_afterW, c_afterBias, BC);
+
+    // Hidden candidate
+    multiply_U(hidden_state, c_hiddenCand, RWC);
+
+    // Vector addition
+    add_vectors(c_afterBias, c_hiddenCand, c_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation::activation(c_afterAdd, gate_c);
+
+    //-----------Gate I and C multiply
+    // Vector multiplication
+    multiply_vectors(gate_i, gate_c, gate_ic);
+
+    //-----------Gate O Calculations
+    // Weight multiplication
+    multiply_W(inputs, o_afterW, WO);
+
+    // Bias addition
+    add_bias(o_afterW, o_afterBias, BO);
+
+    // Hidden candidate
+    multiply_U(hidden_state, o_hiddenCand, RWO);
+
+    // Vector addition
+    add_vectors(o_afterBias, o_hiddenCand, o_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr::activation(o_afterAdd, gate_o);
+
+    //-----------Cell State Calculation
+    // Vector multiplication
+    multiply_vectors(gate_f, cell_state, cell_act_multp);
+
+    // Vector addition
+    add_vectors(gate_ic, cell_act_multp, cell_act_add);
+
+    //-----------Cell state activation
+    // Activation
+    CONFIG_T::template activation::activation(cell_act_add, gate_forget);
+
+    // Vector multiplication
+    multiply_vectors(gate_o, gate_forget, h);
+
+OUTPUT_WRITE_LOOP:
+    #pragma unroll
+    for (int x = (CONFIG_T::n_out - 1); x >= 0; x--) {
+        hidden_state_o[x] = h[x];
+        cell_state_o[x] = cell_act_add[x];
+    }
+}
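Scalar LSTM equations matching the gate order above (i, f, c, o); the recurrent contributions are folded into single coefficients here and all weights are hypothetical:

    #include <cmath>
    #include <cstdio>

    float sigmoid_(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    void lstm_step(float x, float &h, float &c,
                   float wi, float ui, float wf, float uf,
                   float wc, float uc, float wo, float uo) {
        float i = sigmoid_(wi * x + ui * h);  // input gate
        float f = sigmoid_(wf * x + uf * h);  // forget gate
        float g = std::tanh(wc * x + uc * h); // candidate (gate_c)
        float o = sigmoid_(wo * x + uo * h);  // output gate
        c = f * c + i * g;                    // cell_act_add
        h = o * std::tanh(c);                 // gate_o times activated cell state
    }

    int main() {
        float h = 0.0f, c = 0.0f;
        lstm_step(0.5f, h, c, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f);
        std::printf("h = %f, c = %f\n", h, c);
    }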
+
+template
+void lstm(const data_T &data, res_T &res, const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF,
+          const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO,
+          const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF,
+          const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO,
+          const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t &BF,
+          const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t &BO) {
+
+    // Note: currently this does not support recurrent bias
+
+    using in_T = array;
+    using h_T = array;
+
+    [[intel::fpga_register]] h_T hidden_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T hidden_state_temp;
+    [[intel::fpga_register]] h_T cell_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T cell_state_temp;
+    [[intel::fpga_register]] h_T h;
+    [[intel::fpga_register]] h_T c;
+    [[intel::fpga_register]] in_T in;
+
+// Initially set the hidden state (output) to zero
+INIT_LOOP:
+    #pragma unroll
+    for (int x = 0; x < CONFIG_T::n_out; x++) {
+        hidden_state[0][x] = 0;
+        cell_state[0][x] = 0;
+    }
+
+    // Input dimension
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
+        // Data at current time step
+        for (int x = 0; x < CONFIG_T::n_in; x++) {
+            in[x] = data[x + i * CONFIG_T::n_in];
+        }
+
+        // Hidden state at current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state_temp[x] = hidden_state[i][x];
+            cell_state_temp[x] = cell_state[i][x];
+        }
+
+        // Do LSTM
+        lstm_cell(in, hidden_state_temp, h, cell_state_temp, c, WI, WF, WC, WO, RWI, RWF, RWC, RWO, BI, BF, BC, BO);
+
+        // Write result
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state[i + 1][x] = h[x];
+            cell_state[i + 1][x] = c[x];
+        }
+    }
+
+    if (CONFIG_T::return_sequences == 0) {
+        // Output when return_sequences is false
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            res[x] = hidden_state[CONFIG_T::n_timesteps][x];
+        }
+    } else {
+        // Output when return_sequences is true
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
+            for (int h = 0; h < CONFIG_T::n_out; h++) {
+                res[x * CONFIG_T::n_out + h] = hidden_state[x + 1][h];
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h
new file mode 100644
index 000000000..893fd027c
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h
@@ -0,0 +1,47 @@
+#ifndef NNET_RECR_ACTIVATION_H_
+#define NNET_RECR_ACTIVATION_H_
+
+#include "nnet_activation.h"
+#include "nnet_common.h"
+
+namespace nnet {
+
+namespace activation {
+
+template <class data_T, class res_T, typename CONFIG_T> class Activation {
+  public:
+    // *************************************************
+    // Blank Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) {}
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class relu : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    // Relu Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::relu<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class sigmoid : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    // Sigmoid Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::sigmoid<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class tanh : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    // TanH Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::dense_tanh<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+} // namespace activation
+
+} // namespace nnet
+
+#endif
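These wrappers let a layer config select its activations at compile time through template aliases, which is how the gru/lstm configs above consume them. A hypothetical config fragment choosing sigmoid for the recurrent gates might look like:

    // Hypothetical fragment of a generated CONFIG_T; x_T/y_T are the I/O array
    // types and config_T the activation config, as in the classes above.
    struct my_rnn_config_fragment {
        template <class x_T, class y_T, class config_T>
        using activation_recr = nnet::activation::sigmoid<x_T, y_T, config_T>;
    };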
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h
new file mode 100644
index 000000000..7429419cd
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h
@@ -0,0 +1,68 @@
+#ifndef NNET_RECURRENT_STREAM_H_
+#define NNET_RECURRENT_STREAM_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recurrent_activation.h"
+
+namespace nnet {
+template
+void gru_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::recurrent_weight_t recurrent_weights,
+                typename CONFIG_T::bias_t bias, typename CONFIG_T::recurrent_bias_t recurrent_bias) {
+
+    using data_T = typename ExtractPipeType::value_type;
+    using res_T = typename ExtractPipeType::value_type;
+    using h_T = array;
+
+    constexpr auto datasize = std::tuple_size{};
+    constexpr auto ressize = std::tuple_size{};
+
+    [[intel::fpga_register]] h_T h;
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_units; i++) {
+        h[i] = 0;
+    }
+
+    [[intel::fpga_register]] data_T x;
+
+DataPropagation:
+    for (int i_in = 0; i_in < CONFIG_T::n_timesteps * CONFIG_T::n_in / datasize; i_in++) {
+        auto data_pack = data_pipe::read();
+
+    DataPack:
+        #pragma unroll
+        for (int i_pack = 0; i_pack < datasize; i_pack++) {
+            x[i_pack] = data_pack[i_pack];
+        }
+
+        nnet::gru_cell(x, h, weights, recurrent_weights, bias, recurrent_bias);
+
+        if (CONFIG_T::return_sequences) {
+            res_T res_pack;
+
+        ResPackRetSeq:
+            #pragma unroll
+            for (int i_pack = 0; i_pack < ressize; i_pack++) {
+                res_pack[i_pack] = h[i_pack];
+            }
+
+            res_pipe::write(res_pack);
+        }
+    }
+
+    if (!CONFIG_T::return_sequences) {
+        res_T res_pack;
+
+    ResPackNoRetSeq:
+        #pragma unroll
+        for (int i_pack = 0; i_pack < ressize; i_pack++) {
+            res_pack[i_pack] = h[i_pack];
+        }
+
+        res_pipe::write(res_pack);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h
new file mode 100644
index 000000000..c461e337d
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h
@@ -0,0 +1,36 @@
+#ifndef NNET_IMAGE_H_
+#define NNET_IMAGE_H_
+
+namespace nnet {
+
+struct resize_config {
+    static const unsigned height = 10;
+    static const unsigned width = 10;
+
+    static const unsigned new_height = 10;
+    static const unsigned new_width = 10;
+
+    static const unsigned n_chan = 10;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void resize_nearest(const data_T &image, res_T &resized) {
+    int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1;
+    int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1;
+
+    for (int i = 0; i < CONFIG_T::new_height; i++) {
+        for (int j = 0; j < CONFIG_T::new_width; j++) {
+            int x = ((j * x_ratio) >> 16);
+            int y = ((i * y_ratio) >> 16);
+
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] =
+                    image[(y * CONFIG_T::width * CONFIG_T::n_chan) + x * CONFIG_T::n_chan + k];
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h
new file mode 100644
index 000000000..9a37f098e
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h
@@ -0,0 +1,58 @@
+#ifndef NNET_IMAGE_STREAM_H_
+#define NNET_IMAGE_STREAM_H_
+
+#include "nnet_common.h"
+
+namespace nnet {
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void resize_nearest_stream() {
+    assert(CONFIG_T::new_height % CONFIG_T::height == 0);
+    assert(CONFIG_T::new_width % CONFIG_T::width == 0);
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+
+    constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height;
+    constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width;
+
+ImageHeight:
+    for (unsigned h = 0; h < CONFIG_T::height; h++) {
+        [[intel::fpga_register]] data_T data_in_row[CONFIG_T::width];
+
+    ImageWidth:
+        for (unsigned i = 0; i < CONFIG_T::width; i++) {
+            [[intel::fpga_register]] auto in_data = data_pipe::read();
+
+        ImageChan:
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_chan; j++) {
+                data_in_row[i][j] = in_data[j];
+            }
+        }
+
+    ResizeHeight:
+        for (unsigned i = 0; i < ratio_height; i++) {
+
+        ImageWidth2:
+            for (unsigned l = 0; l < CONFIG_T::width; l++) {
+
+            ResizeWidth:
+                for (unsigned j = 0; j < ratio_width; j++) {
+
+                    [[intel::fpga_register]] data_T out_data;
+
+                ResizeChan:
+                    #pragma unroll
+                    for (unsigned k = 0; k < CONFIG_T::n_chan; k++) {
+                        out_data[k] = data_in_row[l][k];
+                    }
+
+                    res_pipe::write(out_data);
+                }
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h
new file mode 100644
index 000000000..6e5e86a58
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h
@@ -0,0 +1,126 @@
+#ifndef NNET_CLONE_H
+#define NNET_CLONE_H
+
+#include
"nnet_common.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned n_dupl = 2; +}; + +template void clone_stream() { + using data_T = typename ExtractPipeType::value_type; + using res1_T = typename ExtractPipeType::value_type; + using res2_T = typename ExtractPipeType::value_type; + constexpr auto datasize = std::tuple_size{}; +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + data_T in_data = data_pipe::read(); + res1_T out_data1; + res2_T out_data2; + + ClonePack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1_pipe::write(out_data1); + res2_pipe::write(out_data2); + } +} + +template void clone_stream() { + using data_T = typename ExtractPipeType::value_type; + using res1_T = typename ExtractPipeType::value_type; + using res2_T = typename ExtractPipeType::value_type; + using res3_T = typename ExtractPipeType::value_type; + constexpr auto datasize = std::tuple_size{}; +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + data_T in_data = data_pipe::read(); + res1_T out_data1; + res2_T out_data2; + res3_T out_data3; + + ClonePack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1_pipe::write(out_data1); + res2_pipe::write(out_data2); + res3_pipe::write(out_data3); + } +} + +template void repack_stream() { + using data_T = typename ExtractPipeType::value_type; + using res_T = typename ExtractPipeType::value_type; + constexpr auto datasize = std::tuple_size{}; + constexpr auto ressize = std::tuple_size{}; + + if constexpr (datasize == ressize) { + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + + [[intel::fpga_memory]] auto in_data = data_pipe::read(); + [[intel::fpga_memory]] res_T out_data; + + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data[j] = in_data[j]; + } + + res_pipe::write(out_data); + } + } else if constexpr (datasize > ressize) { + constexpr unsigned pack_diff = datasize / ressize; + + for (int i = 0; i < N / datasize; i++) { + + [[intel::fpga_memory]] auto in_data = data_pipe::read(); + [[intel::fpga_memory]] res_T out_data; + + [[intel::initiation_interval(1)]] for (int j = 0; j < pack_diff; j++) { + + #pragma unroll + for (int k = 0; k < ressize; k++) { + out_data[k] = in_data[j * ressize + k]; + } + res_pipe::write(out_data); + } + } + } else { // datasize < ressize + [[intel::fpga_memory]] res_T out_data; + constexpr unsigned pack_diff = ressize / datasize; + unsigned pack_cnt = 0; + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + + [[intel::fpga_memory]] auto in_data = data_pipe::read(); + + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data[pack_cnt * datasize + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res_pipe::write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h new file mode 100644 index 000000000..2c4991a13 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h @@ -0,0 +1,48 @@ +#ifndef NNET_TRANSPOSE_H_ +#define NNET_TRANSPOSE_H_ + +namespace nnet { + 
+struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template void transpose_2d(const data_T &data, res_T &res) { + for (int i = 0; i < CONFIG_T::height; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::width; j++) { + res[j * CONFIG_T::height + i] = static_cast(data[i * CONFIG_T::width + j]); + } + } +} + +template void transpose_3d(const data_T &data, res_T &res) { + static constexpr unsigned dim_data[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + static constexpr unsigned dim_res[3] = {dim_data[CONFIG_T::perm[0]], dim_data[CONFIG_T::perm[1]], + dim_data[CONFIG_T::perm[2]]}; + + int index_data[3] = {0}, index_res[3] = {0}; + + for (index_data[0] = 0; index_data[0] < dim_data[0]; index_data[0]++) { + #pragma unroll + for (index_data[1] = 0; index_data[1] < dim_data[1]; index_data[1]++) { + #pragma unroll + for (index_data[2] = 0; index_data[2] < dim_data[2]; index_data[2]++) { + index_res[0] = index_data[CONFIG_T::perm[0]]; + index_res[1] = index_data[CONFIG_T::perm[1]]; + index_res[2] = index_data[CONFIG_T::perm[2]]; + + res[index_res[0] * dim_res[1] * dim_res[2] + index_res[1] * dim_res[2] + index_res[2]] = + static_cast( + data[index_data[0] * dim_data[1] * dim_data[2] + index_data[1] * dim_data[2] + index_data[2]]); + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h new file mode 100644 index 000000000..e15f63c13 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h @@ -0,0 +1,39 @@ +#ifndef NNET_TRANSPOSE_STREAM_H_ +#define NNET_TRANSPOSE_STREAM_H_ + +namespace nnet { + +template void transpose_2d_stream() { + + using data_T = typename ExtractPipeType::value_type; + using res_T = typename ExtractPipeType::value_type; + + constexpr auto data_size = std::tuple_size::value_type>{}; + constexpr auto res_size = std::tuple_size::value_type>{}; + + [[intel::fpga_register]] typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_size; i++) { + [[intel::fpga_register]] data_T in_data = data_pipe::read(); + + #pragma unroll + for (int j = 0; j < data_size; j++) { + data_array[i * data_size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_size; i++) { + [[intel::fpga_register]] res_T out_data; + + #pragma unroll + for (int j = 0; j < res_size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_size + i]); + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h new file mode 100644 index 000000000..8cf883c1d --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -0,0 +1,71 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include +#include +#include +#include + +namespace nnet { + +// Define the pipe type that we use +template using array = std::array; + +// T should be an array +template constexpr T zero_array() { + T ar; + #pragma unroll + for (auto &a : ar) { + a = 0; + } + return ar; +} + +// This is a helper to extract the value_type of a pipe +template struct ExtractPipeType { typedef T 
value_type; }; + +template