From ce287f050529a788aebae942c3cd148ca0ad41ea Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 20 Dec 2023 23:17:44 -0600 Subject: [PATCH 001/100] snapshot adding oneapi --- hls4ml/backends/__init__.py | 2 + hls4ml/backends/oneapi/__init__.py | 0 hls4ml/backends/oneapi/oneapi_backend.py | 338 ++++++++++ hls4ml/backends/oneapi/passes/__init__.py | 0 .../oneapi/passes/convolution_templates.py | 183 ++++++ .../oneapi/passes/convolution_winograd.py | 177 ++++++ .../backends/oneapi/passes/core_templates.py | 221 +++++++ .../backends/oneapi/passes/merge_templates.py | 108 ++++ hls4ml/backends/oneapi/passes/pointwise.py | 95 +++ .../oneapi/passes/pooling_templates.py | 111 ++++ .../oneapi/passes/quantization_templates.py | 36 ++ .../oneapi/passes/recurrent_templates.py | 305 +++++++++ .../oneapi/passes/reshaping_templates.py | 138 +++++ .../oneapi/passes/resource_strategy.py | 77 +++ .../backends/oneapi/passes/transform_types.py | 54 ++ hls4ml/templates/oneapi/CMakeLists.txt | 320 ++++++++++ hls4ml/templates/oneapi/exception_handler.hpp | 22 + hls4ml/templates/oneapi/firmware/defines.h | 21 + .../templates/oneapi/firmware/myproject.cpp | 20 + hls4ml/templates/oneapi/firmware/myproject.h | 36 ++ .../firmware/nnet_utils/nnet_activation.h | 516 ++++++++++++++++ .../firmware/nnet_utils/nnet_batchnorm.h | 104 ++++ .../oneapi/firmware/nnet_utils/nnet_common.h | 78 +++ .../oneapi/firmware/nnet_utils/nnet_conv1d.h | 64 ++ .../nnet_utils/nnet_conv1d_resource.h | 241 ++++++++ .../oneapi/firmware/nnet_utils/nnet_conv2d.h | 72 +++ .../nnet_utils/nnet_conv2d_resource.h | 303 +++++++++ .../oneapi/firmware/nnet_utils/nnet_dense.h | 170 +++++ .../nnet_utils/nnet_dense_compressed.h | 81 +++ .../oneapi/firmware/nnet_utils/nnet_embed.h | 45 ++ .../oneapi/firmware/nnet_utils/nnet_helpers.h | 119 ++++ .../oneapi/firmware/nnet_utils/nnet_merge.h | 249 ++++++++ .../oneapi/firmware/nnet_utils/nnet_mult.h | 113 ++++ .../oneapi/firmware/nnet_utils/nnet_padding.h | 99 +++ .../oneapi/firmware/nnet_utils/nnet_pooling.h | 319 ++++++++++ .../firmware/nnet_utils/nnet_recurrent.h | 583 ++++++++++++++++++ .../nnet_utils/nnet_recurrent_activation.h | 53 ++ .../oneapi/firmware/nnet_utils/nnet_resize.h | 38 ++ .../firmware/nnet_utils/nnet_transpose.h | 50 ++ .../oneapi/firmware/nnet_utils/nnet_types.h | 44 ++ hls4ml/templates/oneapi/firmware/parameters.h | 11 + hls4ml/templates/oneapi/myproject_test.cpp | 167 +++++ 42 files changed, 5783 insertions(+) create mode 100644 hls4ml/backends/oneapi/__init__.py create mode 100644 hls4ml/backends/oneapi/oneapi_backend.py create mode 100644 hls4ml/backends/oneapi/passes/__init__.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_winograd.py create mode 100644 hls4ml/backends/oneapi/passes/core_templates.py create mode 100644 hls4ml/backends/oneapi/passes/merge_templates.py create mode 100644 hls4ml/backends/oneapi/passes/pointwise.py create mode 100644 hls4ml/backends/oneapi/passes/pooling_templates.py create mode 100644 hls4ml/backends/oneapi/passes/quantization_templates.py create mode 100644 hls4ml/backends/oneapi/passes/recurrent_templates.py create mode 100644 hls4ml/backends/oneapi/passes/reshaping_templates.py create mode 100644 hls4ml/backends/oneapi/passes/resource_strategy.py create mode 100644 hls4ml/backends/oneapi/passes/transform_types.py create mode 100644 hls4ml/templates/oneapi/CMakeLists.txt create mode 100644 hls4ml/templates/oneapi/exception_handler.hpp create mode 100644 
hls4ml/templates/oneapi/firmware/defines.h create mode 100644 hls4ml/templates/oneapi/firmware/myproject.cpp create mode 100644 hls4ml/templates/oneapi/firmware/myproject.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h create mode 100644 hls4ml/templates/oneapi/firmware/parameters.h create mode 100644 hls4ml/templates/oneapi/myproject_test.cpp diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 6396d7815..cbd39813f 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -3,6 +3,7 @@ from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend +from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 @@ -13,3 +14,4 @@ register_backend('Vitis', VitisBackend) register_backend('Quartus', QuartusBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) +register_backend('oneAPI', OneAPIBackend) diff --git a/hls4ml/backends/oneapi/__init__.py b/hls4ml/backends/oneapi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py new file mode 100644 index 000000000..799c28963 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -0,0 +1,338 @@ +import os +from contextlib import contextmanager + +import numpy as np + +from hls4ml.backends import FPGABackend +from hls4ml.model.attributes import ConfigurableAttribute, TypeAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax +from hls4ml.model.optimizer import get_backend_passes, 
layer_optimizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType +#from hls4ml.report import parse_oneapi_report + + +@contextmanager +def chdir(newdir): + prevdir = os.getcwd() + os.chdir(os.path.expanduser(newdir)) + try: + yield + finally: + os.chdir(prevdir) + + +class OneAPIBackend(FPGABackend): + def __init__(self): + super().__init__('oneAPI') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + # Add RNN-specific recurrent_reuse_factor attribute + rnn_layers = [ + SimpleRNN, + LSTM, + GRU, + ] + + for layer in rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('table_size', default=1024)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + def _register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + streaming_passes = ['oneapi:reshape_stream', 'oneapi:clone_output'] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) + + oneapi_types = [ + 'oneapi:transform_types', + 'oneapi:register_bram_weights', + 'oneapi:apply_resource_strategy', + 'oneapi:apply_winograd_kernel_transformation', + ] + oneapi_types_flow = register_flow('specific_types', oneapi_types, requires=[init_flow], backend=self.name) + + quantization_passes = [ + 'oneapi:merge_batch_norm_quantized_tanh', + 'oneapi:quantize_dense_output', + 'fuse_consecutive_batch_normalization', + 'oneapi:xnor_pooling', + ] + quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) + + optimization_passes = [ + 'oneapi:remove_final_reshape', + 'oneapi:optimize_pointwise_conv', + 'oneapi:inplace_parallel_reshape', + 'oneapi:inplace_stream_flatten', + 'oneapi:skip_softmax', + 'oneapi:fix_softmax_table_size', + ] + optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) + + templates = self._get_layer_templates() + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'oneapi:write_hls'] + + self._writer_flow = register_flow('write', writer_passes, requires=['oneapi:ip'], backend=self.name) + + all_passes = get_backend_passes(self.name) + + extras = [ + # Ideally this should be empty + opt_pass + for opt_pass in all_passes + if opt_pass + not in initializers + + streaming_passes + + oneapi_types + + quantization_passes + + templates + + optimization_passes + + writer_passes + ] + + if len(extras) > 0: + extras_flow = register_flow('extras', extras, requires=[init_flow], backend=self.name) + else: + extras_flow = None + + ip_flow_requirements = [ + 'optimize', + init_flow, + streaming_flow, + quantization_flow, + optimization_flow, + oneapi_types_flow, + extras_flow, + template_flow, + ] + ip_flow_requirements = list(filter(None, ip_flow_requirements)) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel'): + config = {} + + config['Part'] = part if part is not 
None else 'Arria10' + config['ClockPeriod'] = clock_period + config['IOType'] = io_type + config['HLSConfig'] = {} + + return config + + def build(self, model, synth=True, fpgasynth=False, log_level=1, cont_if_large_area=False): + """ + Builds the project using Intel DPC++ (oneAPI) compiler. + + Args: + model (ModelGraph): The model to build + synth, optional: Whether to run HLS synthesis + fpgasynth, optional: Whether to run FPGA synthesis (oneAPI Compile) + log_level, optional: Logging level to be displayed during HLS synthesis (0, 1, 2) + cont_if_large_area: Instruct the HLS compiler to continue synthesis if the estimated resource usage exceeds + device resources + Errors raise exceptions + """ + + # Check software needed is present + pass + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + target_cycles = layer.model.config.get_target_cycles(layer) + layer.set_attr('target_cycles', target_cycles) + + @layer_optimizer(Dense) + def init_dense(self, layer): + index_t = IntegerPrecisionType(width=1, signed=False) + + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + if layer.model.config.get_compression(layer): + layer.set_attr('strategy', 'compressed') + else: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + + if layer.model.config.is_resource_strategy(layer): + if layer.model.config.get_compression(layer): + index_t = layer.get_weights('weight').type.index_precision + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + + @layer_optimizer(Activation) + def init_activation(self, layer): + if layer.get_attr('activation') == 'tanh': + layer.set_attr('activation', 'dense_tanh') + if layer.get_attr('recurrent_activation') == 'tanh': + layer.set_attr('recurrent_activation', 'dense_tanh') + + @layer_optimizer(Softmax) + def init_softmax(self, layer): + if layer.model.config.get_config_value('IOType') == 'io_parallel': + assert ( + len(layer.get_input_variable().shape) == 1 + ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + + @layer_optimizer(Embedding) + def init_embed(self, layer): + if layer.attributes['n_in'] is None: + raise Exception('Input length of Embedding layer must be specified.') + + @layer_optimizer(GRU) + def init_gru(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # Dense multiplication properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + index_t = IntegerPrecisionType(width=1, signed=False) + layer.set_attr('index_t', index_t) + + if 'table_t' not in layer.attributes: + layer.set_attr( + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=18, integer=8)) + ) + if 'table_size' not in layer.attributes: + layer.set_attr('table_size', 1024) + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + layer.set_attr('index_t', index_t) + + @layer_optimizer(Conv1D) + def init_conv1d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv1D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width determines the filter size post-Winograd transformation + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'combination')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(Conv2D) + def init_conv2d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv2D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width & impl_filt_height determine the filter size post-Winograd transformation + layer.set_attr('impl_filt_height', layer.get_attr('filt_height')) + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'combination')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(LSTM) + def init_lstm(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # We don't use RF yet + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + # Split weights for easier storage in on-chip memory and implementation in HLS + weights_data = layer.weights['weight'].data + rec_weights_data = layer.weights['recurrent_weight'].data + bias_data = layer.weights['bias'].data + + weight_types = ['i', 'f', 'c', 'o'] + for i in range(0, 4): + layer.add_weights_variable( + name=f'weight_{weight_types[i]}', + var_name=f'kernel_{weight_types[i]}_{{index}}', + data=weights_data[ + 0 : layer.get_attr('n_in'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'recurrent_weight_{weight_types[i]}', + var_name=f'recurrent_kernel_{weight_types[i]}_{{index}}', + data=rec_weights_data[ + 0 : layer.get_attr('n_out'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'bias_{weight_types[i]}', + var_name=f'bias_{weight_types[i]}_{{index}}', + data=bias_data[i * layer.get_attr('n_out') : (i + 1) * (layer.get_attr('n_out'))], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + + @layer_optimizer(SimpleRNN) + def init_simple_rnn(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # TODO - Consider setting and using RF diff --git a/hls4ml/backends/oneapi/passes/__init__.py b/hls4ml/backends/oneapi/passes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py new file mode 100644 index 000000000..75f8ca687 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -0,0 +1,183 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm + +# TODO - Dilation rate ? 
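Editor's aside (not part of the diff): the `init_lstm` optimizer shown above splits the combined LSTM kernel, which stores the four gates stacked along the output axis in i, f, c, o order, into one (n_in, n_out) block per gate before registering the per-gate weight variables. A minimal numpy sketch of that slicing, using hypothetical sizes, is:

import numpy as np

n_in, n_out = 8, 16                       # hypothetical layer sizes
kernel = np.random.rand(n_in, 4 * n_out)  # combined kernel, gates stacked along the last axis

# Same slicing as init_lstm above: one (n_in, n_out) block per gate, in i, f, c, o order
gate_weights = {
    gate: kernel[:, i * n_out:(i + 1) * n_out]
    for i, gate in enumerate(['i', 'f', 'c', 'o'])
}
assert all(w.shape == (n_in, n_out) for w in gate_weights.values())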
+ +''' Shared multiplication config ''' +conv_mult_config_template = """struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + +''' 1D Conv ''' +conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned filt_width = {filt_width}; + static const unsigned impl_filt_width = {impl_filt_width}; + static const unsigned kernel_size = filt_width; + + static const unsigned n_filt = {n_filt}; + static const unsigned out_width = {out_width}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned parallelisation_factor = {parallelization}; + static const bool store_weights_in_bram = false; + + static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation}; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; +}}; +""" + +conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] + + +class Conv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Conv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + conv_params = self._default_config_params(node) + conv_params['dilation'] = node.get_attr('dilation', 1) + if conv_params['dilation'] != 1: + raise Exception('dilation != 1 not supported yet') + conv_params['config_t'] = f'config{node.index}_mult' + conv_config = self.template.format(**conv_params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Conv1D, include_header=conv1d_include_list) + self.template = conv1d_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not
supported on Quartus') + params['data_format'] = 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +''' 2D Conv ''' +conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned impl_filt_height = {impl_filt_height}; + static const unsigned impl_filt_width = {impl_filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned parallelisation_factor = {parallelization}; + static const bool store_weights_in_bram = false; + + static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation}; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; +}};\n""" + +conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h'] + + +class Conv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm)) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + conv_params = self._default_config_params(node) + conv_params['dilation'] = node.get_attr('dilation', 1) + if conv_params['dilation'] != 1: + raise Exception('dilation != 1 not supported yet') + conv_params['config_t'] = f'config{node.index}_mult' + conv_config = self.template.format(**conv_params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm), include_header=conv2d_include_list) + self.template = conv2d_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for Quartus') + params['data_format'] = 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/convolution_winograd.py b/hls4ml/backends/oneapi/passes/convolution_winograd.py new file mode 100644 index 000000000..9a6686412 
--- /dev/null +++ b/hls4ml/backends/oneapi/passes/convolution_winograd.py @@ -0,0 +1,177 @@ +import math + +import numpy as np + +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyWinogradKernelTransformation(OptimizerPass): + ''' + Transforms the weights of a Conv1D/Conv2D kernel to a format suitable for Winograd convolution + For further information, refer to Lavin & Gray, 2015 - Fast Algorithms for Convolutional Neural Networks + ''' + + def match(self, node): + node_matches = isinstance(node, (Conv1D, Conv2D)) + + # This optimizer works only after the Resource Strategy Optimizer, since order of transposition matters + weights_transformed = node.get_attr('_weights_transposed', False) is True + + # User opted for Winograd + implementation_is_winograd = ( + node.get_attr('implementation', 'combination') == 'combination' + or node.get_attr('implementation', 'combination') == 'winograd' + ) + + parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel' + + # Winograd algorithm-specific conditions + if isinstance(node, Conv1D): + # Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_width') > 2 + + winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type + + elif isinstance(node, (Conv2D)): + # Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_height', 1) == 1 and node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_height') > 2 and node.get_attr('out_width') > 2 + + padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr( + 'pad_left', 0 + ) == node.get_attr('pad_right', 0) + + winograd_conditions = ( + filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type + ) + + else: + winograd_conditions = False + + # Check any previous transformations + already_transformed = node.get_attr('_winograd_transformation_applied', False) is True + + if not winograd_conditions and node.get_attr('implementation', 'combination') == 'winograd': + raise RuntimeError( + 'Not possible to use Winograd algorithm with current architecture.
' + 'Please set implementation to im2col or combination' + ) + + return ( + node_matches + and weights_transformed + and winograd_conditions + and not already_transformed + and implementation_is_winograd + ) + + def transform(self, model, node): + if isinstance(node, Conv1D): + if node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, W, C) + # Therefore, (F, W, C) => (F, C, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 2, 1]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3) => (4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4)) + + # Transformation matrices for 3x1 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(G, weights[filter][channel]) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accomodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.fractional += 2 + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_width', 4) + + elif isinstance(node, Conv2D): + if node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, H, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, H, W, C) + # Therefore, (F, H, W, C) => (F, C, H, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 3, 1, 2]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3x3) => (4x4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4, 4)) + + # Transformation matrices for 3x3 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + GT = np.array([[1, 0.5, 0.5, 0], [0, 0.5, -0.5, 0], [0, 0.5, 0.5, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(np.matmul(G, weights[filter][channel]), GT) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm 
transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accomodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.fractional += 2 + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_height', 4) + node.set_attr('impl_filt_width', 4) + else: + raise Exception(f'Unexpected layer {node.class_name} with Winograd kernel optimizer') + + node.set_attr('_winograd_transformation_applied', True) + + return False diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py new file mode 100644 index 000000000..aece9fc22 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -0,0 +1,221 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax + +# Dense templates + +dense_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const bool store_weights_in_bram = false; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned compressed_block_factor = DIV_ROUNDUP(n_nonzeros, reuse_factor); + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {index_t.name} index_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + +dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + + +class DenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_config_template + + def format(self, node): + params = self._default_config_params(node) + 
params['nzeros'] = node.get_weights('weight').nzeros + params['nonzeros'] = node.get_weights('weight').nonzeros + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + + return self.template.format(**params) + + +class DenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Dense, include_header=dense_include_list) + self.template = dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# BatchNormalization templates + +batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' + +batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift};\n""" + +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {exp_table_t.name} exp_table_t; + typedef {inv_table_t.name} inv_table_t; 
+}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' +param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Activation, ParametrizedActivation, PReLU)) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class SoftmaxConfigTemplate(ActivationConfigTemplate): + def __init__(self): + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + self.template = softmax_config_template + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['param'] = node.get_attr('activ_param', 1.0) + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class PReLUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PReLU, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['param'] = node.get_weights('alpha').name + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/merge_templates.py b/hls4ml/backends/oneapi/passes/merge_templates.py new file mode 100644 index 000000000..0cf612166 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/merge_templates.py @@ -0,0 +1,108 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Dot, Merge + +# TODO - Very similar to vivado/merge_templates.py - only difference is on line 67: +# TODO - get_backend('vivado').product_type(inp1.type.precision, inp2.type.precision) +# TODO - Look into ways of having passes similar accross many backends in a shared folder thorugh inheritance and overriding. 
+ +# Merge templates +merge_config_template = """struct config{index} : nnet::merge_config {{ + static const unsigned n_elem = {n_elem}; +}};\n""" + +merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' +merge_include_list = ['nnet_utils/nnet_merge.h', 'nnet_utils/nnet_merge_stream.h'] + + +class MergeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Merge) + self.template = merge_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_elem'] = node.get_input_variable(node.inputs[0]).size_cpp() + + return self.template.format(**params) + + +class MergeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot), include_header=merge_include_list) + self.template = merge_function_template + + def format(self, node): + params = {} + params['merge'] = node.get_attr('op').lower() + params['config'] = f'config{node.index}' + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name + params['output_t'] = node.get_output_variable().type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + params['output'] = node.get_output_variable().name + + return self.template.format(**params) + + +# Dot templates +dot_config_template = """struct config{index} : nnet::dot_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned reuse_factor = {reuse}; + + typedef {accum_t.name} accum_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + + +class DotConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dot) + self.template = dot_config_template + + def format(self, node): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + params = self._default_config_params(node) + params['n_out'] = 1 + params['n_in'] = inp1.shape[0] + params['product_type'] = get_backend('quartus').product_type(inp1.type.precision, inp2.type.precision) + + return self.template.format(**params) + + +# Concatenate templates +concat_config_template = """struct config{index} : nnet::concat_config {{ + static const unsigned n_elem1_0 = {n_elem1_0}; + static const unsigned n_elem1_1 = {n_elem1_1}; + static const unsigned n_elem1_2 = {n_elem1_2}; + static const unsigned n_elem2_0 = {n_elem2_0}; + static const unsigned n_elem2_1 = {n_elem2_1}; + static const unsigned n_elem2_2 = {n_elem2_2}; + + static const int axis = {axis}; +}};\n""" + + +class ConcatenateConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Concatenate) + self.template = concat_config_template + + def format(self, node): + params = self._default_config_params(node) + for i in range(3): + params.setdefault(f'n_elem1_{i}', 0) + params.setdefault(f'n_elem2_{i}', 0) + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + for i, (s1, s2) in enumerate(zip(inp1.shape, inp2.shape)): + params[f'n_elem1_{i}'] = s1 + params[f'n_elem2_{i}'] = s2 + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py new file mode 100644 index 000000000..84ae79e49 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/pointwise.py @@ -0,0 +1,95 @@ +from copy 
import copy + +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D +from hls4ml.backends.quartus.passes.convolution_templates import ( + Conv1DConfigTemplate, + Conv1DFunctionTemplate, + Conv2DConfigTemplate, + Conv2DFunctionTemplate, + conv1d_config_template, + conv2d_config_template, + conv_mult_config_template, +) +from hls4ml.model.layers import register_layer +from hls4ml.model.optimizer import OptimizerPass + +''' +Custom hls4ml layer implementation for 1x1 Conv filters using im2col +Allows lower latency and resource usage, due to fewer loop invocations +''' + +pointwise_conv1d_function_template = ( + 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +pointwise_conv2d_function_template = ( + 'nnet::pointwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h'] + + +class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate): + def __init__(self): + super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(PointwiseConv1D, include_header=sepconv1d_include_list) + self.template = pointwise_conv1d_function_template + + +class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): + def __init__(self): + super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(PointwiseConv2D, include_header=sepconv2d_include_list) + self.template = pointwise_conv2d_function_template + + +def register_pointwise(backend): + # Register the layer types to the layer map + register_layer('PointwiseConv1D', PointwiseConv1D) + register_layer('PointwiseConv2D', PointwiseConv2D) + + # Register the optimization passes + backend.register_pass('optimize_pointwise_conv', OptimizePointwiseConv) + + # Register template passes + backend.register_template(PointwiseConv1DConfigTemplate) + backend.register_template(PointwiseConv1DFunctionTemplate) + backend.register_template(PointwiseConv2DConfigTemplate) + backend.register_template(PointwiseConv2DFunctionTemplate) + + +class OptimizePointwiseConv(OptimizerPass): + def match(self, node): + return ( + node.class_name in ('Conv1D', 'Conv2D') + and node.get_attr('filt_height', 1) == 1 + and node.get_attr('filt_width') == 1 + and node.model.config.get_config_value('IOType') == 'io_parallel' + ) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + pw_node = model.make_node( + 'PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy(), outputs=node.outputs.copy() + ) + if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D + expand_axis = tuple(range(int(dim[0]))) + pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=expand_axis) + pw_node.weights['bias'].data = node.weights['bias'].data + model.replace_node(node, pw_node) + + return True diff --git a/hls4ml/backends/oneapi/passes/pooling_templates.py
b/hls4ml/backends/oneapi/passes/pooling_templates.py new file mode 100644 index 000000000..9a3ee4192 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/pooling_templates.py @@ -0,0 +1,111 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GlobalPooling1D, GlobalPooling2D, Pooling1D, Pooling2D + +# TODO - Move to ../fpga/passes, once streaming is supported on Quartus (should be identical to Vivado) + +pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned stride_width = {stride_width}; + static const unsigned pool_width = {pool_width}; + + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned filt_width = {pool_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned in_width = {n_in}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned pool_height = {pool_height}; + static const unsigned pool_width = {pool_width}; + static const unsigned filt_height = {pool_height}; + static const unsigned filt_width = {pool_width}; + + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +global_pooling1d_function_template = ( + 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) +global_pooling2d_function_template = ( + 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) + +pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h'] + + +class PoolingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + 
self.templates = { + 'Pooling1D': pooling1d_config_template, + 'Pooling2D': pooling2d_config_template, + 'GlobalPooling1D': global_pooling1d_config_template, + 'GlobalPooling2D': global_pooling2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class PoolingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D), include_header=pooling_include_list) + self.templates = { + 'Pooling1D': pooling1d_function_template, + 'Pooling2D': pooling2d_function_template, + 'GlobalPooling1D': global_pooling1d_function_template, + 'GlobalPooling2D': global_pooling2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for Quartus') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) diff --git a/hls4ml/backends/oneapi/passes/quantization_templates.py b/hls4ml/backends/oneapi/passes/quantization_templates.py new file mode 100644 index 000000000..d6cf2d2da --- /dev/null +++ b/hls4ml/backends/oneapi/passes/quantization_templates.py @@ -0,0 +1,36 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.quartus.passes.core_templates import ( + batchnorm_config_template, + batchnorm_function_template, + batchnorm_include_list, +) +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.optimizer.passes.qkeras import ApplyAlpha + + +class ApplyAlphaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class ApplyAlphaFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/recurrent_templates.py b/hls4ml/backends/oneapi/passes/recurrent_templates.py new file mode 100644 index 000000000..2bf45351b --- /dev/null +++ b/hls4ml/backends/oneapi/passes/recurrent_templates.py @@ -0,0 +1,305 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GRU, LSTM, SimpleRNN + +recurrent_include_list = ['nnet_utils/nnet_recurrent.h', 'nnet_utils/nnet_recurrent_stream.h'] + +################################################ +# Shared Matrix Multiplication Template (Dense) +################################################ +recr_mult_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static 
const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +################################################ +# Shared Activation Template +################################################ +activ_config_template = '''struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n''' + +################################################ +# GRU Template +################################################ +gru_config_template = '''struct config{index} : nnet::gru_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_units = {n_units}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned n_outputs = {n_outputs}; + static const bool return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + + typedef {config_mult_x} mult_config_x; + typedef {config_mult_h} mult_config_h; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n''' + +gru_function_template = 'nnet::gru<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' + + +class GRUConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(GRU) + self.gru_template = gru_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + self.mult_x_template = recr_mult_config_template + self.mult_h_template = recr_mult_config_template + + def format(self, node): + # Input has shape (n_timesteps, inp_dimensionality) + # Output / hidden units has shape (1 if !return_sequences else n_timesteps , n_units) + params = self._default_config_params(node) + params['n_units'] = node.get_attr('n_out') + params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + params['return_sequences'] = 'true' if node.get_attr('return_sequences', False) else 'false' + params['config_mult_x'] = f'config{node.index}_x_mult' + params['config_mult_h'] = f'config{node.index}_h_mult' + params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + params['act_recurrent_t'] = '{}_config{}'.format(node.get_attr('recurrent_activation'), str(node.index) + '_rec_act') + gru_config = self.gru_template.format(**params) + + # Activation is on candidate hidden state, dimensionality (1, n_units) + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = 
node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + # Recurrent activation is on reset and update gates (therefore x2), dimensionality (1, n_units) + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = str(node.get_attr('n_out')) + ' * 2' + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + # Multiplication config for matrix multiplications of type Wx (reset, update and candidate states) + mult_params_x = self._default_config_params(node) + mult_params_x['n_in'] = node.get_attr('n_in') + mult_params_x['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_x['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_params_x['index'] = str(node.index) + '_x' + mult_config_x = self.mult_x_template.format(**mult_params_x) + + # Multiplication config for matrix multiplications of type Wh (reset, update and candidate states) + mult_params_h = self._default_config_params(node) + mult_params_h['n_in'] = node.get_attr('n_out') + mult_params_h['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_h['reuse_factor'] = params['recurrent_reuse_factor'] + mult_params_h['product_type'] = get_backend('quartus').product_type( + node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision + ) + mult_params_h['index'] = str(node.index) + '_h' + mult_config_h = self.mult_h_template.format(**mult_params_h) + + return mult_config_x + '\n' + mult_config_h + '\n' + recr_act_config + '\n' + act_config + '\n' + gru_config + + +class GRUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(GRU, include_header=recurrent_include_list) + self.template = gru_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + return self.template.format(**params) + + +################################################ +# LSTM Template +################################################ +lstm_config_template = """struct config{index} : nnet::lstm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +lstm_function_template = 'nnet::lstm<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class LSTMConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(LSTM) + self.template = lstm_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + lstm_params = 
self._default_config_params(node) + lstm_params['n_in'] = node.get_attr('n_in') + lstm_params['n_out'] = node.get_attr('n_out') + lstm_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + + lstm_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + lstm_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + lstm_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + lstm_config = self.template.format(**lstm_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + lstm_config + + +class LSTMFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(LSTM, include_header=recurrent_include_list) + self.template = lstm_function_template + + def format(self, node): + params = self._default_function_params(node) + + types = ['i', 'f', 'c', 'o'] + params['weights'] = '' + for t in types: + params['weights'] += f'kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += f'recurrent_kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += 'bias_{}_{}{}'.format(t, str(node.index), ',' if t != 'o' else '') + + return self.template.format(**params) + + +################################################ +# SimpleRNN Template +################################################ +simple_rnn_config_template = """struct config{index} : nnet::simpleRNN_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_outputs = {n_outputs}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +simple_rnn_function_template = 'nnet::simple_rnn<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class SimpleRNNConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SimpleRNN) + self.template = simple_rnn_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + simple_rnn_params = self._default_config_params(node) + simple_rnn_params['n_in'] = node.get_attr('n_in') + simple_rnn_params['n_out'] = node.get_attr('n_out') + simple_rnn_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + simple_rnn_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + simple_rnn_params['act_t'] = 
'{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + simple_rnn_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + simple_rnn_params['recurrent_activation'] = 'relu' + + simple_rnn_config = self.template.format(**simple_rnn_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + simple_rnn_config + + +class SimpleRNNFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SimpleRNN, include_header=recurrent_include_list) + self.template = simple_rnn_function_template + + def format(self, node): + params = self._default_function_params(node) + params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index)) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/reshaping_templates.py b/hls4ml/backends/oneapi/passes/reshaping_templates.py new file mode 100644 index 000000000..0db01e654 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/reshaping_templates.py @@ -0,0 +1,138 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Resize, Transpose, ZeroPadding1D, ZeroPadding2D + +# ZeroPadding templates + +zeropad1d_config_template = """struct config{index} : nnet::padding1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad2d_config_template = """struct config{index} : nnet::padding2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' + +padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h'] + + +class ZeroPaddingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_config_template, + 'ZeroPadding2D': zeropad2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D), include_header=padding_include_list) + self.templates = { + 
'ZeroPadding1D': zeropad1d_function_template, + 'ZeroPadding2D': zeropad2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('Quartus only supports channels_last data format') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +# Resize templates + +resize_config_template = """struct config{index} : nnet::resize_config {{ + static const unsigned height = {in_height}; + static const unsigned width = {in_width}; + + static const unsigned new_height = {out_height}; + static const unsigned new_width = {out_width}; + + static const unsigned n_chan = {n_chan}; +}};\n""" + +resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {config}>({input}, {output});' +resize_include_list = ['nnet_utils/nnet_resize.h', 'nnet_utils/nnet_resize_stream.h'] + + +class ResizeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Resize) + self.template = resize_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class ResizeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Resize, include_header=resize_include_list) + self.template = resize_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +# Transpose templates + +transpose_config_template = """struct config{index} : nnet::transpose_config {{ + static const unsigned depth = {depth}; + static const unsigned height = {height}; + static const unsigned width = {width}; + static constexpr unsigned perm[3] = {{{perm_str}}}; +}};\n""" + +transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' +transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h'] + + +class TransposeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class TransposeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Transpose, include_header=transpose_include_list) + self.template = transpose_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/resource_strategy.py b/hls4ml/backends/oneapi/passes/resource_strategy.py new file mode 100644 index 000000000..00fe89038 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/resource_strategy.py @@ -0,0 +1,77 @@ +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SimpleRNN +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyResourceStrategy(OptimizerPass): + '''Transposes the weights to use the dense_resource matrix multiply routine''' + + def match(self, node): + node_matches = isinstance(node, (Dense, Conv1D, Conv2D, GRU, LSTM, SimpleRNN)) + is_resource_strategy = ( + True # node.get_attr('strategy', '').lower() == 'resource' -> Quartus only supportr Resource strategy + ) + 
already_transformed = node.get_attr('_weights_transposed', False) is True + return node_matches and is_resource_strategy and not already_transformed + + def transform(self, model, node): + if isinstance(node, Dense) and not node.model.config.get_compression(node): + rf = node.get_attr('reuse_factor') + bf = int((node.attributes['n_in'] * node.attributes['n_out']) / rf) + bf_rounded = int(pow(2, np.ceil(np.log2(bf)))) + rf_rounded = int(pow(2, np.ceil(np.log2(rf)))) + + node.weights['weight'].data = np.transpose(node.weights['weight'].data).flatten() + + if node.attributes['n_in'] * node.attributes['n_out'] > 2048 and rf_rounded != rf: + node.set_attr('rfpad', rf_rounded - rf) + node.set_attr('bfpad', bf_rounded - bf) + + temp = np.empty([bf_rounded, rf_rounded]) + for i in range(rf_rounded): + for j in range(bf_rounded): + if i < rf and j < bf: + w_index = i + rf * j + temp[j][i] = node.weights['weight'].data[w_index] + else: + temp[j][i] = 0 + node.weights['weight'].data = temp.flatten() + node.weights['weight'].data_length = node.weights['weight'].data.size + + elif isinstance(node, Conv1D): + # (W,C,F) => (F,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[2, 0, 1]) + + elif isinstance(node, Conv2D): + # (H,W,C,F) => (F,H,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[3, 0, 1, 2]) + + elif isinstance(node, GRU): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, SimpleRNN): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, LSTM): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + for weight_type in ['i', 'f', 'c', 'o']: + node.weights[f'weight_{weight_type}'].data = np.transpose(node.weights[f'weight_{weight_type}'].data) + node.weights[f'recurrent_weight_{weight_type}'].data = np.transpose( + node.weights[f'recurrent_weight_{weight_type}'].data + ) + + else: + raise Exception(f'Unexpected layer {node.class_name} with resource strategy') + node.set_attr('_weights_transposed', True) + return False diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py new file mode 100644 index 000000000..67de32ab6 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -0,0 +1,54 @@ +from hls4ml.backends.fpga.fpga_types import ( + ACTypeConverter, + HLSTypeConverter, + QuartusArrayVariableConverter, + QuartusInplaceArrayVariableConverter, + QuartusInplaceStreamVariableConverter, + QuartusStreamVariableConverter, + QuartusStructMemberVariableConverter, + StaticWeightVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + + +class TransformTypes(GlobalOptimizerPass): + def 
__init__(self):
+ self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter())
+ self.array_var_converter = QuartusArrayVariableConverter(type_converter=self.type_converter)
+ self.inplace_array_var_converter = QuartusInplaceArrayVariableConverter(type_converter=self.type_converter)
+ self.struct_var_converter = QuartusStructMemberVariableConverter(type_converter=self.type_converter)
+ self.stream_var_converter = QuartusStreamVariableConverter(type_converter=self.type_converter)
+ self.inplace_stream_var_converter = QuartusInplaceStreamVariableConverter(type_converter=self.type_converter)
+ self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter)
+
+ def transform(self, model, node):
+ io_type = node.model.config.get_config_value('IOType')
+
+ for out_name, var in node.variables.items():
+ if io_type == 'io_stream':
+ if isinstance(var, InplaceTensorVariable):
+ new_var = self.inplace_stream_var_converter.convert(var)
+ else:
+ new_var = self.stream_var_converter.convert(var)
+ elif io_type == 'io_parallel':
+ if out_name in node.model.inputs:
+ new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='inputs')
+ elif out_name in node.model.outputs:
+ new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='outputs')
+ elif isinstance(var, InplaceTensorVariable):
+ new_var = self.inplace_array_var_converter.convert(var, pragma='')
+ else:
+ new_var = self.array_var_converter.convert(var, pragma='hls_register')
+ else:
+ raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})')
+
+ node.set_attr(out_name, new_var)
+
+ for w_name, weight in node.weights.items():
+ new_weight = self.weight_var_converter.convert(weight)
+ node.set_attr(w_name, new_weight)
+
+ for t_name, type in node.types.items():
+ new_type = self.type_converter.convert(type)
+ node.set_attr(t_name, new_type)
diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt
new file mode 100644
index 000000000..a3a6e5c4a
--- /dev/null
+++ b/hls4ml/templates/oneapi/CMakeLists.txt
@@ -0,0 +1,320 @@
+# Direct CMake to use icpx rather than the default C++ compiler/linker on Linux
+# and icx-cl on Windows
+if(UNIX)
+ set(CMAKE_CXX_COMPILER icpx)
+else() # Windows
+ include (CMakeForceCompiler)
+ CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
+ include (Platform/Windows-Clang)
+endif()
+
+cmake_minimum_required (VERSION 3.7.2)
+
+project(fpga_template CXX)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+###############################################################################
+### Customize these build variables
+###############################################################################
+set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp)
+set(TARGET_NAME fpga_template)
+
+# Use cmake -DFPGA_DEVICE=<board-support-package>:<board-variant> to choose a
+# different device. Here are a few device examples (this list is not
+# exhaustive):
+# intel_s10sx_pac:pac_s10
+# intel_s10sx_pac:pac_s10_usm
+# intel_a10gx_pac:pac_a10
+# Note that depending on your installation, you may need to specify the full
+# path to the board support package (BSP); this is usually in your install
+# folder.
+#
+# You can also specify a device family (E.g. "Arria10" or "Stratix10") or a
+# specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
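+#
+# For illustration only (the board names below are simply the examples listed
+# above; substitute whatever BSPs or device families are installed on your
+# system), a configure step might look like:
+#   cmake .. -DFPGA_DEVICE=intel_a10gx_pac:pac_a10   # full-system flow against a BSP variant
+#   cmake .. -DFPGA_DEVICE=Arria10                   # device family only, standalone IP
+#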
+if(NOT DEFINED FPGA_DEVICE) + set(FPGA_DEVICE "Arria10") +endif() + +# Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend +# compilation. +set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) + +# Use cmake -DUSER_FLAGS= to set extra flags for general compilation. +set(USER_FLAGS -Wno-unused-label ${USER_FLAGS}) + +# Use cmake -DUSER_INCLUDE_PATHS= to set extra paths for general +# compilation. +set(USER_INCLUDE_PATHS src;src/firmware;${USER_INCLUDE_PATHS}) + +############################################################################### +### no changes after here +############################################################################### + +# Print the device being used for the compiles +message(STATUS "Configuring the design to run on FPGA board ${FPGA_DEVICE}") + +# Set the names of the makefile targets to be generated by cmake +set(EMULATOR_TARGET fpga_emu) +set(SIMULATOR_TARGET fpga_sim) +set(REPORT_TARGET report) +set(FPGA_TARGET fpga) +set(IP_EXPORT_TARGET fpga_ip_export) + +# Set the names of the generated files per makefile target +set(EMULATOR_OUTPUT_NAME ${TARGET_NAME}.${EMULATOR_TARGET}) +set(SIMULATOR_OUTPUT_NAME ${TARGET_NAME}.${SIMULATOR_TARGET}) +set(REPORT_OUTPUT_NAME ${TARGET_NAME}.${REPORT_TARGET}) +set(FPGA_OUTPUT_NAME ${TARGET_NAME}.${FPGA_TARGET}) +set(IP_EXPORT_OUTPUT_NAME ${TARGET_NAME}.${IP_EXPORT_TARGET}) + +message(STATUS "Additional USER_FPGA_FLAGS=${USER_FPGA_FLAGS}") +message(STATUS "Additional USER_FLAGS=${USER_FLAGS}") + +include_directories(${USER_INCLUDE_PATHS}) +message(STATUS "Additional USER_INCLUDE_PATHS=${USER_INCLUDE_PATHS}") + +link_directories(${USER_LIB_PATHS}) +message(STATUS "Additional USER_LIB_PATHS=${USER_LIB_PATHS}") + +link_libraries(${USER_LIBS}) +message(STATUS "Additional USER_LIBS=${USER_LIBS}") + +if(WIN32) + # add qactypes for Windows + set(QACTYPES "-Qactypes") + # This is a Windows-specific flag that enables exception handling in host code + set(WIN_FLAG "/EHsc") +else() + # add qactypes for Linux + set(QACTYPES "-qactypes") +endif() + +set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) +set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) + +# A SYCL ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate +# representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. For +# this reason, FPGA backend flags must be passed as link flags in CMake. 
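+# Illustration only: a manual ahead-of-time build of the hardware target would
+# follow the same two-stage split, using the source and flag variables defined in
+# this file (abbreviated here to one source file; the exact commands for each
+# flow are printed by the display*CompileCommands targets further down):
+#   icpx -fsycl -fintelfpga -qactypes -Wall -DFPGA_HARDWARE -c src/firmware/myproject.cpp -o myproject.o
+#   icpx -fsycl -fintelfpga -qactypes -Xshardware -Xstarget=${FPGA_DEVICE} myproject.o -o fpga_template.fpga
+#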
+set(EMULATOR_COMPILE_FLAGS -DFPGA_EMULATOR) +set(EMULATOR_LINK_FLAGS ) +set(REPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(REPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early) +set(SIMULATOR_COMPILE_FLAGS -Xssimulation -DFPGA_SIMULATOR) +set(SIMULATOR_LINK_FLAGS -Xssimulation -Xsghdl -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${SIMULATOR_OUTPUT_NAME}) +set(FPGA_COMPILE_FLAGS -DFPGA_HARDWARE) +set(FPGA_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_OUTPUT_NAME}) +# get rid of this once host pipes work properly +set(IP_EXPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early -fsycl-device-code-split=per_kernel) + +############################################################################### +### FPGA Emulator +############################################################################### +add_executable(${EMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${EMULATOR_LINK_FLAGS}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${EMULATOR_OUTPUT_NAME}) + +############################################################################### +### FPGA Simulator +############################################################################### +add_executable(${SIMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${SIMULATOR_COMPILE_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${SIMULATOR_LINK_FLAGS}) +set_target_properties(${SIMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${SIMULATOR_OUTPUT_NAME}) + +############################################################################### +### Generate Report +############################################################################### +add_executable(${REPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${REPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${REPORT_TARGET} PRIVATE ${REPORT_COMPILE_FLAGS}) + +# The report target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_REPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_REPORT ${QACTYPES}) + +target_link_libraries(${REPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_REPORT}) +target_link_libraries(${REPORT_TARGET} ${REPORT_LINK_FLAGS}) +set_target_properties(${REPORT_TARGET} PROPERTIES OUTPUT_NAME ${REPORT_OUTPUT_NAME}) + +############################################################################### +### FPGA Hardware +############################################################################### +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILES}) +target_compile_options(${FPGA_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${FPGA_TARGET} PRIVATE ${FPGA_COMPILE_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${FPGA_LINK_FLAGS}) +set_target_properties(${FPGA_TARGET} PROPERTIES OUTPUT_NAME ${FPGA_OUTPUT_NAME}) + +############################################################################### +### FPGA IP Export 
(only necessary until native host pipes) +############################################################################### +add_executable(${IP_EXPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${IP_EXPORT_COMPILE_FLAGS}) + +# The ip export target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_EXPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_EXPORT ${QACTYPES}) + +target_link_libraries(${IP_EXPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_EXPORT}) +target_link_libraries(${IP_EXPORT_TARGET} ${IP_EXPORT_LINK_FLAGS}) +set_target_properties(${IP_EXPORT_TARGET} PROPERTIES OUTPUT_NAME ${IP_EXPORT_OUTPUT_NAME}) + +############################################################################### +### This part only manipulates cmake variables to print the commands to the user +############################################################################### + +# set the correct object file extension depending on the target platform +if(WIN32) + set(OBJ_EXTENSION "obj") +else() + set(OBJ_EXTENSION "o") +endif() + +# Set the source file names in a string +set(SOURCE_FILE_NAME "${SOURCE_FILES}") + +function(getCompileCommands common_compile_flags special_compile_flags common_link_flags special_link_flags target output_name) + + set(file_names ${SOURCE_FILE_NAME}) + set(COMPILE_COMMAND ) + set(LINK_COMMAND ) + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH CURRENT_SOURCE_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${source}) + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Creating a string that contains the compile command + # Start by the compiler invocation + set(COMPILE_COMMAND "${COMPILE_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the potential includes + foreach(INCLUDE ${USER_INCLUDE_PATHS}) + if(NOT IS_ABSOLUTE ${INCLUDE}) + file(RELATIVE_PATH INCLUDE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${INCLUDE}) + endif() + set(COMPILE_COMMAND "${COMPILE_COMMAND} -I${INCLUDE}") + endforeach() + + # Add all the common compile flags + foreach(FLAG ${common_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific compile flags + foreach(FLAG ${special_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Get the location of the object file + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(COMPILE_COMMAND "${COMPILE_COMMAND} -c ${CURRENT_SOURCE_FILE} -o ${OBJ_FILE}\n") + endforeach() + + set(COMPILE_COMMAND "${COMPILE_COMMAND}" PARENT_SCOPE) + + # Creating a string that contains the link command + # Start by the compiler invocation + set(LINK_COMMAND "${LINK_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the common link flags + foreach(FLAG ${common_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific link flags + foreach(FLAG ${special_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add the output file + set(LINK_COMMAND "${LINK_COMMAND} -o ${output_name}") + + foreach(source ${file_names}) + # Get the relative path to the source and object files + 
file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(LINK_COMMAND "${LINK_COMMAND} ${OBJ_FILE}") + endforeach() + + # Add all the potential library paths + foreach(LIB_PATH ${USER_LIB_PATHS}) + if(NOT IS_ABSOLUTE ${LIB_PATH}) + file(RELATIVE_PATH LIB_PATH ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${LIB_PATH}) + endif() + if(NOT WIN32) + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH}") + else() + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH} -Wl,-rpath,${LIB_PATH}") + endif() + endforeach() + + # Add all the potential includes + foreach(LIB ${USER_LIBS}) + set(LINK_COMMAND "${LINK_COMMAND} -l${LIB}") + endforeach() + + set(LINK_COMMAND "${LINK_COMMAND}" PARENT_SCOPE) + +endfunction() + +# Windows executable is going to have the .exe extension +if(WIN32) + set(EXECUTABLE_EXTENSION ".exe") +endif() + +# Display the compile instructions in the emulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${EMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${EMULATOR_LINK_FLAGS}" "${EMULATOR_TARGET}" "${EMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayEmulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${EMULATOR_TARGET} displayEmulationCompileCommands) + +# Display the compile instructions in the simulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${SIMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${SIMULATOR_LINK_FLAGS}" "${SIMULATOR_TARGET}" "${SIMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displaySimulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${SIMULATOR_TARGET} displaySimulationCompileCommands) + +# Display the compile instructions in the report flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${REPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_REPORT}" "${REPORT_LINK_FLAGS}" "${REPORT_TARGET}" "${REPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayReportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${REPORT_TARGET} displayReportCompileCommands) + +# Display the compile instructions in the IP export flow (Remove after native host pipes work properly) +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${IP_EXPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_EXPORT}" "${IP_EXPORT_LINK_FLAGS}" "${IP_EXPORT_TARGET}" "${IP_EXPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayExportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${IP_EXPORT_TARGET} displayExportCompileCommands) + +# Display the compile instructions in the fpga flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${FPGA_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${FPGA_LINK_FLAGS}" "${FPGA_TARGET}" "${FPGA_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayFPGACompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") 
+add_dependencies(${FPGA_TARGET} displayFPGACompileCommands) diff --git a/hls4ml/templates/oneapi/exception_handler.hpp b/hls4ml/templates/oneapi/exception_handler.hpp new file mode 100644 index 000000000..f5b9c8433 --- /dev/null +++ b/hls4ml/templates/oneapi/exception_handler.hpp @@ -0,0 +1,22 @@ +#ifndef __EXCEPTIONHANDLER_HPP__ +#define __EXCEPTIONHANDLER_HPP__ +#include +#include +#include + +namespace fpga_tools { + +void exception_handler(sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" + << e.what() << std::endl; + } + } +} + +} // namespace fpga_tools + +#endif //__EXCEPTIONHANDLER_HPP__ diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h new file mode 100644 index 000000000..622d9f2bf --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -0,0 +1,21 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include +#include +#include +#include +#include + +// Include nnet::array - a custom array-like struct, mainly used with io_stream +#include "nnet_utils/nnet_types.h" + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert layer-precision + +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n < d ? d : n) + +#endif diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp new file mode 100644 index 000000000..93f11c837 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -0,0 +1,20 @@ +#include "myproject.h" +#include "parameters.h" + +// hls-fpga-machine-learning insert weights + +void MyProject::operator()() const { + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + auto inputsArr = InPipe::read(); + +// hls-fpga-machine-learning insert layers + +// hls-fpga-machine-learning return + + OutPipe::write(outData); +} + + diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h new file mode 100644 index 000000000..f01b5978c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -0,0 +1,36 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "defines.h" + +// This file defines the interface to the kernel + + +using input_data_t = std::array; +using output_data_t = std::array; + +class InPipeID; +class OutPipeID; + +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties( + sycl::ext::intel::experimental::ready_latency<0>)); + +using InPipe = sycl::ext::intel::experimental::pipe; +using OutPipe = sycl::ext::intel::experimental::pipe; + +class MyProjectID; + +struct MyProject { + + // kernel property method to config invocation interface + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::streaming_interface<>, + sycl::ext::intel::experimental::pipelined<>}; + } + + SYCL_EXTERNAL void operator()() const; +}; + + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h new file mode 100644 index 000000000..d874741ec --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -0,0 +1,516 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include 
"nnet_common.h" + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 512; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ac_fixed<16, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + res[ii] = datareg; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +#include "activation_tables/sigmoid_table.tb" + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T absoluteValue hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + absoluteValue = -data[ii]; + } else { + absoluteValue = data[ii]; + } + int index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (absoluteValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)sigmoid_table[index]; + if (data[ii] < 0) { + res[ii] = 1 - temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +template inline unsigned softmax_stable_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + hls_register ac_int y = x.template slc(x.width - N - 1); + // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness + if (x != 0 && y == 0) + y[0] = 1; + return y.to_uint(); +} + +template inline unsigned softmax_latency_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + hls_register ac_int y = x.template slc(x.width - N); + return y.to_uint(); +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Look-up tables +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + // Find maximum + Op_max 
op_max; + hls_register data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; + } + + // Explicitly sum previously calculated exponentials with an adder tree + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +// TODO - Improve accuracy +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; + } + + // Explicitly sum the results with an adder tree. + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + + hls_register int data_round[CONFIG_T::n_in]; +New_loop: + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round[ii] = (data[ii] * CONFIG_T::table_size / 16).to_int(); + } +NN_Outer: + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + typename CONFIG_T::exp_table_t exp_res_temp = 0; + NN_Inner: + #pragma unroll + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) { + exp_res_temp += 1; + } else { + int _data_cache = (data_round[jj] - data_round[ii]); + int index = _data_cache + 8 * CONFIG_T::table_size / 16; + + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + + typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index]; + exp_res_temp += temp_exp; + } + } + int exp_res_index = (exp_res_temp * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + res[ii] = invert_table_legacy[exp_res_index]; + } +} + +template +void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + res[i] = (res_T)0; + } + + hls_register data_T maximum = data[0]; + hls_register int idx = 0; + + #pragma ii 1 + for (int i = 1; i < CONFIG_T::n_in; i++) { + if 
(data[i] > maximum) { + maximum = data[i]; + idx = i; + } + } + + res[idx] = (res_T)1; +} + +template +inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + switch (CONFIG_T::implementation) { + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + default: + softmax_stable(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template +void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 4; +// Initialize the lookup table +#include "activation_tables/tanh_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T temp hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + temp = -data[ii]; + } else { + temp = data[ii]; + } + ac_int<16> index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)tanh_table[index]; + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/softplus_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<16> data_round = (data[ii] * CONFIG_T::table_size / 16).to_int(); + ac_int<16> index = data_round + 8 * CONFIG_T::table_size 
/ 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + static const int MAX_VALUE = 8; +// Initialize the lookup table +#include "activation_tables/softsign_table.tb" + + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T temp hls_register; + res_T temp2 hls_register; + if (data[ii] < 0) { + temp = -data[ii]; + } else { + temp = data[ii]; + } + ac_int<16> index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = (res_T)softsign_table[index]; + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/elu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +// Initialize the lookup table +#include "activation_tables/selu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = data[ii]; + res_T cache; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_in]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_T datareg = 2 * data[ii]; + res_T cache; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h new file mode 100644 index 000000000..7b84a9c0f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h @@ -0,0 +1,104 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { +// Calcuate result +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = + CONFIG_T::template product::product(data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], + const data_T threshold[CONFIG_T::n_scale_bias]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<1, false> cache; + data_T datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in], + const data_T threshold_hi[CONFIG_T::n_scale_bias], + const data_T threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<2, true> cache; + data_T datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h new file mode 100644 index 000000000..0c2e94e02 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -0,0 +1,78 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + + +#include "nnet_helpers.h" +#include +#include +#include + +typedef ac_fixed<16, 6> table_default_t; + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; + +// Default data types (??) TODO: Deprecate +typedef ac_fixed<16, 4> weight_t_def; +typedef ac_fixed<16, 4> bias_t_def; +typedef ac_fixed<32, 10> accum_t_def; + +template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { + #pragma unroll + for (int ii = 0; ii < NIN1; ii++) { + res[ii] = data1[ii]; + } + #pragma unroll + for (int ii = 0; ii < NIN2; ii++) { + res[NIN1 + ii] = data2[ii]; + } +} + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +// template T reduce(const T *x, Op op) { +// static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; +// static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; +// if (N == 1) { +// return x[0]; +// } +// if (N == 2) { +// return op(x[0], x[1]); +// } +// return op(reduce(x, op), reduce(x + leftN, op)); +// } + +// alternate reduce - basic +template T reduce(const T *x, Op op) { + if (N == 1) { + return x[0]; + } + auto val = op(x[0], x[1]); + for (int i = 2; i < N; i++) { + val = op(val, x[i]); + } + return val; +} + + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h new file mode 100644 index 000000000..8897e1315 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h @@ -0,0 +1,64 @@ +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_resource.h" + +namespace nnet { + +struct conv1d_config { + // I/O sizes + static const unsigned in_width = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + + // Modified filter size (post-Wionograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + + // Run-time Configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelisation_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + conv_1d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + pointwise_conv_1d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 000000000..a110d6d42 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,241 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +enum class conv1d_implementation { combination, im2col, winograd }; + +// **************************************************************** +// im2col - General-purpose 1D Convolution algorithm +// **************************************************************** + +template +void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int col) { + // im2col can be unrolled fully, since number of parallel executions = filt_w x n_chann ~ O(100) and very little DSP + // usage + + hls_register int index = 0; + +KernelLoop: + #pragma unroll + for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + hls_register int 
index_data = + (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } + } +} + +template +void conv_1d_im2col_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + +ColLoop: + #pragma unroll pf + #pragma ii CONFIG_T::reuse_factor + for (int i = 0; i < CONFIG_T::out_width; i++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; + im2col_1d_cl(data, data_col, i); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int j = 0; j < CONFIG_T::n_filt; j++) { + res[i * CONFIG_T::n_filt + j] = res_col[j]; + } + } +} + +// **************************************************************** +// 1D Convolution for 3x1 kernels from Winograd's algoirithm +// **************************************************************** + +// Explicity transofrmed input (B'dB) needed for Winograd convolution, as explained by Lavin & Gray (2015) +template +inline void winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[4]) { + D[0] = I[0] - I[2]; + D[1] = I[1] + I[2]; + D[2] = -I[1] + I[2]; + D[3] = I[1] - I[3]; +} + +template +void winograd_conv1d_3x1_kernel_cl( + data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_width == 1); + assert(CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +WidthLoop: + #pragma unroll pf + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x1 tile + hls_register 
data_T T[16]; + hls_register uint8_t p = 0; + + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + + // Transform input tile + hls_register typename CONFIG_T::accum_t D[4]; + winograd_transform_input_tile_3x1_kernel(T, D); + + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + hls_register int filter_offset = 4 * (CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + hls_register typename CONFIG_T::accum_t Y[4]; + #pragma unroll + for (int i = 0; i < 4; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); + if ((col + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * (col + 1) + filter] += static_cast(Y[1] - Y[2] - Y[3]); + } + } + } +} + +// **************************************************************** +// 1D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan], + const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + hls_register int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + hls_register int index_data = (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + +ColLoop: + #pragma unroll pf + #pragma ii CONFIG_T::reuse_factor + for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T data_col[CONFIG_T::n_chan]; + im2col_1d_pointwise_cl(data, data_col, col); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[col * CONFIG_T::n_filt + k] = res_col[k]; + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_1d_resource_cl( + data_T 
data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + (CONFIG_T::implementation == nnet::conv1d_implementation::combination || + CONFIG_T::implementation == nnet::conv1d_implementation::winograd); + + if (CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv1d_3x1_kernel_cl(data, res, weights, biases); + } else { + conv_1d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h new file mode 100644 index 000000000..3aa71a74b --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -0,0 +1,72 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_conv2d_resource.h" + +namespace nnet { + +struct conv2d_config { + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + + // Modified filter size (post-Winograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + // Run-time configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelisation_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t + weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + conv_2d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); +
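// Illustrative sketch, not part of this patch: under the assumption that the backend emits one
// config struct per layer, a generated network might specialise conv2d_config as below. The name
// config2 and the ac_fixed widths are hypothetical and shown only to indicate which fields the
// templates read.
//
// struct config2 : nnet::conv2d_config {
//     static const unsigned in_height = 28, in_width = 28, n_chan = 1;
//     static const unsigned out_height = 26, out_width = 26, n_filt = 4;
//     static const unsigned filt_height = 3, filt_width = 3;
//     static const unsigned impl_filt_height = 3, impl_filt_width = 3;
//     static const unsigned parallelisation_factor = 1;
//     static constexpr nnet::conv2d_implementation implementation = nnet::conv2d_implementation::combination;
//     typedef ac_fixed<18, 8> accum_t;
//     typedef ac_fixed<16, 6> weight_t;
//     typedef ac_fixed<16, 6> bias_t;
// };
//
// With a 3x3 filter, unit stride and out_height/out_width > 2, conv_2d_resource_cl would take the
// Winograd path; any other shape falls back to the im2col implementation.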
pointwise_conv_2d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 000000000..73ad45592 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,303 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_helpers.h" + +namespace nnet { + +enum class conv2d_implementation { combination, im2col, winograd }; + +// **************************************************************** +// im2col - General-purpose 2D Convolution algorithm +// **************************************************************** + +template +void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int row, + const int col) { + // im2col can be unrolled fully, since number of parallel executions = filt_h x filt_w x n_chann ~ O(100) and very little + // DSP usage + + hls_register int index = 0; + +FiltHeightLoop: + #pragma unroll + for (int kernel_row = 0; kernel_row < CONFIG_T::impl_filt_height; kernel_row++) { + hls_register int input_row = + -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; + + FiltWidthLoop: + #pragma unroll + for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { + hls_register int input_col = + -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width; + + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } + } + } +} + +template +void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * + CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_height == CONFIG_T::impl_filt_height && CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factors for loop traversing input image, derived from parallelisation_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), CONFIG_T::out_height); + +HeightLoop: + #pragma unroll pfr + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + #pragma unroll pfc + #pragma ii CONFIG_T::reuse_factor + for (int j = 0; j < CONFIG_T::out_width; j++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T 
data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; + im2col_2d_cl(data, data_col, i, j); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +// **************************************************************** +// 2D Convolution for 3x3 kernels from Winograd's algoirithm +// **************************************************************** + +// Explicity transofrmed input (B'dB) needed for Winograd calculation, as explained by Lavin & Gray, 2015 +template +inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D[16]) { + D[0] = I[0] - I[2] - I[8] + I[10]; + D[1] = I[1] + I[2] - I[9] - I[10]; + D[2] = -I[1] + I[2] + I[9] - I[10]; + D[3] = I[1] - I[3] - I[9] + I[11]; + + D[4] = I[4] - I[6] + I[8] - I[10]; + D[5] = I[5] + I[6] + I[9] + I[10]; + D[6] = -I[5] + I[6] - I[9] + I[10]; + D[7] = I[5] - I[7] + I[9] - I[11]; + + D[8] = -I[4] + I[6] + I[8] - I[10]; + D[9] = -I[5] - I[6] + I[9] + I[10]; + D[10] = I[5] - I[6] - I[9] + I[10]; + D[11] = -I[5] + I[7] + I[9] - I[11]; + + D[12] = I[4] - I[6] - I[12] + I[14]; + D[13] = I[5] + I[6] - I[13] - I[14]; + D[14] = I[6] - I[5] + I[13] - I[14]; + D[15] = I[5] - I[7] - I[13] + I[15]; +} + +template +void winograd_conv2d_3x3_kernel_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t + weights[CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1); + assert(CONFIG_T::pad_left == CONFIG_T::pad_right && CONFIG_T::pad_top == CONFIG_T::pad_bottom); + assert(CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelisation_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, DIV_ROUNDUP(CONFIG_T::out_width, 2)); + static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), DIV_ROUNDUP(CONFIG_T::out_height, 2)); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_height * CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +HeightLoop: + #pragma unroll pfr + for (int row = 0; row < CONFIG_T::out_height; row += 2) { + WidthLoop: + #pragma unroll pfc + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x4 tile + hls_register data_T T[16]; + hls_register typename CONFIG_T::accum_t D[16]; + hls_register uint8_t p = 0; + + #pragma unroll 
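// The two nested loops below gather the 4x4 input patch for this output position into T,
// zero-filling any element that falls outside the image; winograd_transform_input_tile_3x3_kernel
// then computes the transformed tile D = B'dB. Each pass of the row/col loops produces a 2x2
// block of outputs, which is why those loops advance in steps of 2: F(2x2, 3x3) needs only
// 16 multiplications per channel where a direct 3x3 convolution would need 36 for the same four outputs.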
+ for (int r = row - (int)CONFIG_T::pad_top; r < row + 4 - (int)CONFIG_T::pad_top; r++) { + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (r < CONFIG_T::in_height && r >= 0 && c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[r * CONFIG_T::in_width * CONFIG_T::n_chan + c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + } + + // Transform input tile + winograd_transform_input_tile_3x3_kernel(T, D); + + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + hls_register int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + hls_register typename CONFIG_T::accum_t Y[16]; + #pragma unroll + for (int i = 0; i < 16; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] += + static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10]); + if ((col + 1) < CONFIG_T::out_height) + res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] += + static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11]); + if ((row + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] += + static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14]); + if ((row + 1) < (CONFIG_T::out_width) && (col + 1) < CONFIG_T::out_height) + res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] += + static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - Y[13] + Y[14]); + } + } + } + } +} + +// **************************************************************** +// 2D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan], const int row, const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + hls_register int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + + hls_register int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; + hls_register int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; + + if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + // Unroll factors for loop traversing input image, derived from parallelisation_factor + // Outer loop only gets unrolled after inner loop is fully unrolled + static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), 
CONFIG_T::out_height); + +HeightLoop: + #pragma unroll pfr + for (int row = 0; row < CONFIG_T::out_height; row++) { + WidthLoop: + #pragma unroll pfc + #pragma ii CONFIG_T::reuse_factor + for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + hls_register data_T data_col[CONFIG_T::n_chan]; + im2col_2d_pointwise_cl(data, data_col, row, col); + + hls_register res_T res_col[CONFIG_T::n_filt]; + dense_resource(data_col, res_col, weights, biases); + + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[row * CONFIG_T::out_width * CONFIG_T::n_filt + col * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * + CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + CONFIG_T::implementation == nnet::conv2d_implementation::combination || + CONFIG_T::implementation == nnet::conv2d_implementation::winograd; + + if (CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv2d_3x3_kernel_cl(data, res, weights, biases); + } else { + conv_2d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h new file mode 100644 index 000000000..c1786ef78 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -0,0 +1,170 @@ +#ifndef NNET_DENSE_LARGE_H_ +#define NNET_DENSE_LARGE_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + static const unsigned reuse_factor = 1; + static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) + static const unsigned multiplier_factor = 1; // min n_in, rf + static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; + static const unsigned reciprocal = 1; // 2^35 / 25 + static const unsigned rf_pad = 0; + static const unsigned bf_pad = 0; + // Resource reuse info + static const unsigned io_type = io_parallel; + static const bool store_weights_in_bram = false; + 
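// Illustrative worked example, not part of this patch, following the formulas noted in the
// comments above: with n_in = 16, n_out = 8 and reuse_factor = 4, the backend would be expected
// to set block_factor = DIV_ROUNDUP(16 * 8, 4) = 32, multiplier_factor = min(n_in, reuse_factor) = 4,
// multiplier_limit = DIV_ROUNDUP(16 * 8, 4) = 32 and multiplier_scale = 32 / 8 = 4, so each
// reuse iteration contributes four partial products to every output accumulator.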
static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + //#pragma ii CONFIG_T::reuse_factor + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product(data[data_index], weights[w_index]); + } + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + #pragma unroll + for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { + mult[imult] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + int o_index = out_index[ir][im]; + if (o_index >= CONFIG_T::n_out) + continue; // check out of bounds + mult[o_index] += tmp_acc[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { + acc[im] += mult[im]; + } + } +Store: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); // acc[jj]; + } +} +template +void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +InitAccum: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register 
typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + MultLoop: + #pragma unroll + for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; + // Modified this + mult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) + in_index = ir; + } + AccumLoop: + #pragma unroll + for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { + acc[out_index] += mult[im]; + if (acc_step + 1 >= CONFIG_T::multiplier_scale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } +// Cast to "res_t" type +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} +template +void dense_resource( + const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_rf_lt(data, res, weights, biases); + } else { + dense_rf_gt(data, res, weights, biases); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 000000000..ba50a631b --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,81 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32_t w = ir + CONFIG_T::reuse_factor * im; + inputs[ir][im] = data[weights[w].row_index]; + out_index[ir][im] = weights[w].col_index; + } + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; + CompressedMultLoop: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32_t w = ir + CONFIG_T::reuse_factor * im; + // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; + typename CONFIG_T::accum_t prod = mult[im] = + CONFIG_T::template product::product(inputs[0][im], weights[w].weight); + #pragma unroll + for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { + inputs[is][im] = inputs[is + 1][im]; + } + } + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; + ResetMult: + #pragma unroll + for (int tacc = 0; tacc < CONFIG_T::n_out; 
tacc++) { + tmp_acc[tacc] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + int col = out_index[ir][im]; + tmp_acc[col] += mult[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += tmp_acc[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h new file mode 100644 index 000000000..5191239b6 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h @@ -0,0 +1,45 @@ +#ifndef NNET_EMBED_H_ +#define NNET_EMBED_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +struct embed_config { + // Internal data type definitions + typedef float embeddings_t; + + // (Default layer sizes, overwritten form the backend + static const unsigned n_in = 10; + static const unsigned n_out = 16; + static const unsigned vocab_size = 50; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; +}; + +template +void embedding(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + + /* + * Can store embeddings[] in a register, but a large multiiplexer + * is created due to a non-constant access pattern + */ + +InputSequence: + #pragma ii CONFIG_T::reuse_factor + #pragma unroll + for (int j = 0; j < CONFIG_T::n_in; j++) { + DenseEmbedding: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + res[j * CONFIG_T::n_out + i] = embeddings[data[j].to_uint() * CONFIG_T::n_out + i]; + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h new file mode 100644 index 000000000..888ea4a6f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -0,0 +1,119 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data_back(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = static_cast(src[i].to_double()); + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +// constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } +// replace with template metaprogramming +template struct ceillog2 +{ + enum { val = 1 + ceillog2<((n + 1) / 2)>::val }; +}; + +template<> struct ceillog2<2> +{ + enum { val = 1 }; +}; + +template<> struct ceillog2<1> +{ + enum { val = 0 }; +}; + + +// constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } +// replace with template metaprogramming +template struct floorlog2 +{ + enum { val = 1 + floorlog2<(n / 2)>::val }; +}; + +template<> struct floorlog2<1> +{ + enum { val = 0 }; +}; + +template<> struct floorlog2<0> +{ + enum { val = 0 }; +}; + +// constexpr int pow2(int x) { return x == 0 ? 
1 : 2 * pow2(x - 1); } +// replace with template metaprogramming +template struct pow2 +{ + enum { val = 2 * pow2<(n - 1)>::val }; +}; + +template<> struct pow2<0> +{ + enum { val = 1 }; +}; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << data[i] << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h new file mode 100644 index 000000000..766ef2e20 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h @@ -0,0 +1,249 @@ +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "nnet_mult.h" + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + + static const unsigned reuse_factor = 1; + + typedef float accum_t; + + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] + data2[i]); + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] - data2[i]); + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] * data2[i]); + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] + data2[i]) / (res_T)2); + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T 
data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] > data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] < data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; +Product: + #pragma unroll multiplier_limit + for (int i = 0; i < CONFIG_T::n_in; i++) { + mult[i] = CONFIG_T::template product::product(data1[i], data2[i]); + } + + hls_register typename CONFIG_T::accum_t acc = 0; +Accum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + acc += mult[i]; + } + + res[0] = static_cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = + static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); + } + + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = + static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma unroll + for (int i = 0; i < 
CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast(data2[i]); + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = + i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + } + + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { + int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h new file mode 100644 index 000000000..5be772832 --- /dev/null +++ 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h @@ -0,0 +1,113 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Different methods to perform the product of input and weight, depending on their types. +namespace product { + +class Product { + public: + static void limit(unsigned multiplier_limit) {} +}; + +template class both_binary : public Product { + public: + inline static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + return a == w; + } +}; + +template class weight_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + return a * w; + } + static void limit(unsigned multiplier_limit) { + // TODO: Implement for Quartus + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS + // pragma + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>; + inline static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? 
y : static_cast(-y); + } +}; +} // namespace product + +// TO-DO: These may need extra variants if ac_int types are used in more places +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h new file mode 100644 index 000000000..a95f9ab00 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h @@ -0,0 +1,99 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +namespace nnet { + +struct padding1d_config { + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned n_chan = 10; + + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; + + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h new file mode 100644 index 000000000..bbfc0908e --- /dev/null +++ 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h @@ -0,0 +1,319 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include "nnet_common.h" + +namespace nnet { + +// Returns the maximum value from an array of size N +template T max(T x[N]) { + hls_register T y = x[0]; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 1; i < N; i++) { + if (x[i] > y) + y = x[i]; + } + + return y; +} + +// Returns the mean value of an array of size N +template T avg(T (&x)[N]) { + hls_register T y = 0; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 0; i < N; i++) { + y += x[i]; + } + + y /= N; + return y; +} + +// Returns the mean value of an array of size N +// Overload of the above function; using a wider accumulator than the input to avoid overflow +template ac_int avg(ac_int (&x)[N]) { + hls_register ac_int tmp = 0; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + + tmp /= N; + + // Cast back to original type + ac_int y = static_cast>(tmp); + return y; +} + +// Returns the mean value of an array of size N +// Overload of the above function; using a wider accumulator than the input to avoid overflow +template ac_fixed avg(ac_fixed (&x)[N]) { + hls_register ac_fixed tmp = 0; + + // Due to loop dependencies, pipelining & unrolling is not possible + // Explicitly disabling pipelining significantly reduces resource usage + #pragma disable_loop_pipelining + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + + tmp /= N; + + // Cast back to original type + ac_fixed y = tmp; + return y; +} + +// Enumeration for pooling functions +enum Pool_Op { Max, Average }; +template T pool_op(T (&x)[N]) { + switch (op) { + case Max: + return max(x); + case Average: + return avg(x); + } +} + +/* + * In TensorFlow, pooling ignores the value in the padded cells + * For Avg pooling, return 0 (the divisor is modified to the area overlapping the unpadded image.) + * For Max pooling, return the most negative value for the type. + */ +template inline T pad_val() { + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // Pooling parameters + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + + // I/O sizes + static const unsigned n_in = 10; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned n_filt = 4; + + // Padding + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { + // For 'same' padding, increase input width by left- and right-side padding + // For 'valid' padding, reduce input width to area covered by pooling function + static constexpr int padded_width = (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) + ?
(CONFIG_T::n_in / CONFIG_T::stride_width * CONFIG_T::stride_width) + : (CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + InputWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int inp_col = 0; inp_col < padded_width; inp_col += CONFIG_T::stride_width) { + hls_register data_T pool[CONFIG_T::pool_width]; + + // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling + hls_register unsigned img_overlap = 0; + + PoolWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int pool_col = 0; pool_col < CONFIG_T::stride_width; pool_col++) { + if (inp_col + pool_col < CONFIG_T::pad_left || inp_col + pool_col >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[pool_col] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + // Current element is from input image + pool[pool_col] = data[(inp_col + pool_col - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; + img_overlap++; + } + } + + // Pooling operation + res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = + static_cast(pool_op(pool)); + + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) + res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] *= + (static_cast(CONFIG_T::pool_width) / img_overlap); + } + } +} + +template +void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + hls_register data_T pool[CONFIG_T::n_in]; + + InputWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int col = 0; col < CONFIG_T::n_in; col++) { + pool[col] = data[col * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +struct pooling2d_config { + // Pooling parameters + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + // For 'same' padding, increase input width by left- and right-side padding + // For 'valid' padding, reduce input width to area covered by pooling function + static constexpr int padded_width = (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) + ? 
(CONFIG_T::in_width / CONFIG_T::stride_width * CONFIG_T::stride_width) + : (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right); + static constexpr int padded_height = (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0) + ? (CONFIG_T::in_height / CONFIG_T::stride_height * CONFIG_T::stride_height) + : (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + InputHeightLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int inp_col = 0; inp_col < padded_height; inp_col += CONFIG_T::stride_height) { + InputWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int inp_width = 0; inp_width < padded_width; inp_width += CONFIG_T::stride_width) { + hls_register data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + + // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling + hls_register unsigned img_overlap = 0; + + PoolHeightLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int pool_col = 0; pool_col < CONFIG_T::stride_height; pool_col++) { + PoolWidthLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int pool_row = 0; pool_row < CONFIG_T::stride_width; pool_row++) { + if (inp_col + pool_col < CONFIG_T::pad_top || + inp_col + pool_col >= (padded_height - CONFIG_T::pad_bottom) || + inp_width + pool_row < CONFIG_T::pad_left || + inp_width + pool_row >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[pool_col * CONFIG_T::stride_width + pool_row] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + // Current element is from input image + pool[pool_col * CONFIG_T::stride_width + pool_row] = + data[(inp_col + pool_col - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (inp_width + pool_row - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt]; + img_overlap++; + } + } + } + + // Pooling operation + res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] = + static_cast( + pool_op(pool)); + + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) + res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] *= + (static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / + img_overlap); + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + +FiltLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + hls_register data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + #pragma unroll + #pragma disable_loop_pipelining + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h 
b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h new file mode 100644 index 000000000..464c6d415 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h @@ -0,0 +1,583 @@ +#ifndef NNET_RECURRENT_H_ +#define NNET_RECURRENT_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recurrent_activation.h" + +namespace nnet { + +//---------------------- +// Utils +//---------------------- + +template +void multiply_W(data_T input[N_IN], res_T out[N_OUT], const weight_t weight[N_IN * N_OUT]) { +MULTIPLY_W_LOOP_I: + #pragma unroll + for (int i = 0; i < N_OUT; i++) { + out[i] = 0; + + MULTIPLY_W_LOOP_J: + #pragma unroll + for (int j = 0; j < N_IN; j++) { + out[i] += input[j] * weight[i * N_IN + j]; + } + } +} + +template +void multiply_U(data_T input[N_OUT], res_T out[N_OUT], const weight_t weight[N_OUT * N_OUT]) { +MULTIPLY_U_LOOP_I: + #pragma unroll + for (int i = 0; i < N_OUT; i++) { + out[i] = 0; + + MULTIPLY_U_LOOP_J: + #pragma unroll + for (int j = 0; j < N_OUT; j++) { + out[i] += input[j] * weight[i * N_OUT + j]; + } + } +} + +template +void add_bias(data_T inputs[N], res_T out[N], const bias_t bias[N]) { +ADD_BIAS_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = inputs[i] + bias[i]; + } +} + +template void multiply_vectors(data_T in1[N], data_T in2[N], res_T out[N]) { +MULTIPLY_VECT_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = in1[i] * in2[i]; + } +} + +template void add_vectors(data_T in1[N], data_T in2[N], res_T out[N]) { +ADD_VECTOR_LOOP: + #pragma unroll + for (int i = 0; i < N; i++) { + out[i] = in1[i] + in2[i]; + } +} + +//---------------------- +// GRU +//---------------------- + +struct gru_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_units = 1; + static const unsigned n_timesteps = 1; + static const unsigned n_outputs = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = nnet::activation::relu; + + template using activation = nnet::activation::relu; +}; + +template +void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], + const typename CONFIG_T::weight_t weights[3 * CONFIG_T::n_units * CONFIG_T::n_in], + const typename CONFIG_T::weight_t recurrent_weights[3 * CONFIG_T::n_units * CONFIG_T::n_units], + const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], + const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { + static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor; + // A matrix containing the values of matrix product between input (x) and weights (weights), for update, reset and + // candidate state gates, for each of the units + hls_register typename CONFIG_T::accum_t mat_mul_x_w[3 * CONFIG_T::n_units]; + nnet::dense_resource(x, mat_mul_x_w, weights, + bias); + + // A matrix containing the values of matrix product between previou state (h) and recurrent weights (recurrent_weights), + // for update, reset and candidate state gates, for each of the units + hls_register typename CONFIG_T::accum_t mat_mul_h_wr[3 * CONFIG_T::n_units]; + nnet::dense_resource( + h, mat_mul_h_wr, recurrent_weights, recurrent_bias); + + // A 
vector containing both the values of z(t) and r(t) for every state + hls_register typename CONFIG_T::accum_t z_r[2 * CONFIG_T::n_units]; + + // Add the individual vectors from the multiplication of mat_mul_x_w = Wx*x(t) and mat_mul_h_wr = Wh*h(t-1) + // Unrolled fully, no DSPs used + #pragma unroll + for (int i = 0; i < (2 * CONFIG_T::n_units); i++) { + z_r[i] = mat_mul_x_w[i] + mat_mul_h_wr[i]; + } + + // Activation on z(t) and r(t) + hls_register typename CONFIG_T::accum_t z_r_act[2 * CONFIG_T::n_units]; + CONFIG_T::template activation_recr::activation(z_r, z_r_act); + + // A matrix containing the values of Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h + hls_register typename CONFIG_T::accum_t hadamard_r_h[CONFIG_T::n_units]; + #pragma unroll recurrent_unroll_factor + for (int i = 0; i < (CONFIG_T::n_units); i++) { + hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units]; + } + + // The candidate state; X * W_{hx} + hadamard(r(t), h_(t-1)) * W_{hh} + b_{h} + typename CONFIG_T::accum_t h_cand[CONFIG_T::n_units]; + // Addition - can unroll fully; no DSPs used here + #pragma unroll + for (int i = 0; i < (CONFIG_T::n_units); i++) { + h_cand[i] = mat_mul_x_w[i + 2 * CONFIG_T::n_units] + hadamard_r_h[i]; + } + + // Activation on candidate state + hls_register typename CONFIG_T::accum_t h_cand_act[CONFIG_T::n_units]; + CONFIG_T::template activation::activation(h_cand, h_cand_act); + + // Update state + #pragma unroll recurrent_unroll_factor + for (int i = 0; i < (CONFIG_T::n_units); i++) { + h[i] = static_cast(h_cand_act[i] * (1 - z_r_act[i]) + h[i] * z_r_act[i]); + } +} + +template +void gru(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T::n_units], + const typename CONFIG_T::weight_t weights[3 * CONFIG_T::n_units * CONFIG_T::n_in], + const typename CONFIG_T::weight_t recurrent_weights[3 * CONFIG_T::n_units * CONFIG_T::n_units], + const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], + const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { + + hls_register data_T x[CONFIG_T::n_in]; + hls_register res_T h[CONFIG_T::n_units]; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_units; i++) { + h[i] = 0; + } + + // Loop dependency - cannot pipeline + #pragma disable_loop_pipelining + for (int t = 0; t < CONFIG_T::n_timesteps; t++) { + // Get data at current time step + #pragma unroll + for (int j = 0; j < CONFIG_T::n_in; j++) { + x[j] = data[j + t * CONFIG_T::n_in]; + } + + nnet::gru_cell(x, h, weights, recurrent_weights, bias, recurrent_bias); + + if (CONFIG_T::return_sequences) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_units; i++) { + res[CONFIG_T::n_units * t + i] = h[i]; + } + } + } + + if (!CONFIG_T::return_sequences) { + #pragma unroll + for (int i = 0; i < (CONFIG_T::n_units); i++) { + res[i] = h[i]; + } + } +} + +//---------------------- +// SimpleRNN +//---------------------- + +struct simpleRNN_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_outputs = 1; + static const unsigned n_timesteps = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = nnet::activation::relu; +
template using activation = nnet::activation::relu; +}; + +template +void simple_rnn_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T::n_out], + res_T hidden_state_o[CONFIG_T::n_out], + const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { + // Weight multiplication + typename CONFIG_T::accum_t afterW[CONFIG_T::n_out] hls_register; + multiply_W( + inputs, afterW, kernel); + + // Bias addition + typename CONFIG_T::accum_t afterBias[CONFIG_T::n_out] hls_register; + add_bias( + afterW, afterBias, bias); + + // Hidden state + typename CONFIG_T::accum_t hiddenCand[CONFIG_T::n_out] hls_register; + multiply_U(hidden_state, hiddenCand, + rec_kernel); + + // Vector addition + typename CONFIG_T::accum_t afterAdd[CONFIG_T::n_out]; + add_vectors(afterBias, hiddenCand, afterAdd); + + // Activation + CONFIG_T::template activation::activation( + afterAdd, hidden_state_o); +} + +template +void simple_rnn(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T::n_out], + const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; + res_T hidden_state_temp[CONFIG_T::n_out] hls_register; + res_T h[CONFIG_T::n_out] hls_register; + data_T in[CONFIG_T::n_in] hls_register; + +// Set initially hidden state (output) to zero +INIT_LOOP: + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][0] = 0; + } + + #pragma disable_loop_pipelining + for (int i = 0; i < CONFIG_T::n_timesteps; i++) { + + // Data at current time step + #pragma unroll + for (int x = 0; x < CONFIG_T::n_in; x++) { + in[x] = data[x + i * CONFIG_T::n_in]; + } + + // Hidden state at current time step + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state_temp[x] = hidden_state[x][i]; + } + + // Do SimpleRNN + simple_rnn_cell(in, hidden_state_temp, h, kernel, rec_kernel, bias); + + // Write result + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][i + 1] = h[x]; + } + } + + if (CONFIG_T::return_sequences == 0) { + // Output when return_sequences is false + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + res[x] = hidden_state[x][CONFIG_T::n_timesteps]; + } + } else { + // Output when return_sequences is true + #pragma unroll + for (int x = 0; x < CONFIG_T::n_timesteps; x++) { + #pragma unroll + for (int h = 0; h < CONFIG_T::n_out; h++) { + res[x * CONFIG_T::n_out + h] = hidden_state[h][x + 1]; + } + } + } +} + +//---------------------- +// LSTM +//---------------------- + +struct lstm_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 1; + static const unsigned n_out = 1; + static const unsigned n_outputs = 1; + + static const unsigned n_timesteps = 1; + static const bool return_sequences = false; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + // Activation + template using activation_recr = nnet::activation::relu; + + template using activation = nnet::activation::relu; +}; + 
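// --- Illustrative note (editorial, not part of the patch) ----------------------------------
// The gru_config / simpleRNN_config / lstm_config structs above only provide defaults: for
// each recurrent layer the backend emits a concrete specialization in parameters.h and passes
// it as CONFIG_T to nnet::gru / nnet::simple_rnn / nnet::lstm. Below is a minimal hand-written
// sketch of what such a specialization could look like. The struct name, the ac_fixed widths,
// the layer sizes and the exact template-parameter lists of the activation aliases are
// assumptions for illustration only (the angle-bracketed parameter lists are elided in the
// patch text); the sigmoid/tanh wrappers used here are the ones defined in
// nnet_recurrent_activation.h later in this patch.

struct lstm_config_1 : nnet::lstm_config {
    // Fixed-point types (widths are assumptions)
    typedef ac_fixed<16, 6, true> weight_t;
    typedef ac_fixed<16, 6, true> bias_t;
    typedef ac_fixed<18, 8, true> accum_t;

    // Layer sizes (example values)
    static const unsigned n_in = 4;        // features per timestep
    static const unsigned n_out = 8;       // hidden units
    static const unsigned n_timesteps = 10;
    static const unsigned n_outputs = 1;   // would equal n_timesteps if return_sequences were true
    static const bool return_sequences = false;

    static const unsigned reuse_factor = 1;

    // Recurrent (gate) activation and output activation, using the wrappers from
    // nnet_recurrent_activation.h; the three-parameter form is an assumption.
    template <class x_T, class y_T, class config_T>
    using activation_recr = nnet::activation::sigmoid<x_T, y_T, config_T>;
    template <class x_T, class y_T, class config_T>
    using activation = nnet::activation::tanh<x_T, y_T, config_T>;
};

// The generated network body would then call, roughly (type and argument names are placeholders):
//   nnet::lstm<input_t, result_t, lstm_config_1>(input, output, w_i, w_f, w_c, w_o,
//                                                rw_i, rw_f, rw_c, rw_o, b_i, b_f, b_c, b_o);
// --------------------------------------------------------------------------------------------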
+template +void lstm_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T::n_out], res_T hidden_state_o[CONFIG_T::n_out], + res_T cell_state[CONFIG_T::n_out], res_T cell_state_o[CONFIG_T::n_out], + const typename CONFIG_T::weight_t WI[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WF[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WC[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WO[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWI[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWF[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWC[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWO[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t BI[CONFIG_T::n_out], const typename CONFIG_T::bias_t BF[CONFIG_T::n_out], + const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { + + // Internals definitions + typename CONFIG_T::accum_t i_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_afterBias[CONFIG_T::n_out] hls_register; + + // Hidden state Gate candidates, intermediate variables + typename CONFIG_T::accum_t i_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_hiddenCand[CONFIG_T::n_out] hls_register; + + // After addition, intermediate variables + typename CONFIG_T::accum_t i_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t f_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t c_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t o_afterAdd[CONFIG_T::n_out] hls_register; + + // Gate outputs + typename CONFIG_T::accum_t gate_i[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_f[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_c[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_o[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_ic[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_forget[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t h[CONFIG_T::n_out] hls_register; + + // Intermediate variable cell calculation + typename CONFIG_T::accum_t cell_act_multp[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t cell_act_add[CONFIG_T::n_out] hls_register; + + //-----------Gate I Calculations + // Weight multiplication + multiply_W( + inputs, i_afterW, WI); + + // Bias addition + add_bias( + i_afterW, i_afterBias, BI); + + // Hidden Candidate + multiply_U(hidden_state, i_hiddenCand, + RWI); + + // Vector addition + add_vectors(i_afterBias, i_hiddenCand, + i_afterAdd); + + // Activation + CONFIG_T::template activation_recr::activation(i_afterAdd, gate_i); + + //-----------Gate F Calculations + // Weight multiplication + multiply_W( + inputs, f_afterW, WF); + + // Bias addition + add_bias( + f_afterW, f_afterBias, 
BF); + + // Hidden Candidate + multiply_U(hidden_state, f_hiddenCand, + RWF); + + // Vector addition + add_vectors(f_afterBias, f_hiddenCand, + f_afterAdd); + + // Activation + CONFIG_T::template activation_recr::activation(f_afterAdd, gate_f); + + //-----------Gate C Calculations + // Weight multiplication + multiply_W( + inputs, c_afterW, WC); + + // Bias addition + add_bias( + c_afterW, c_afterBias, BC); + + // Hidden Candidate + multiply_U(hidden_state, c_hiddenCand, + RWC); + + // Vector addition + add_vectors(c_afterBias, c_hiddenCand, + c_afterAdd); + + // Activation + CONFIG_T::template activation::activation(c_afterAdd, gate_c); + + //-----------gate I and C multiply + // Vector multiplication + multiply_vectors(gate_i, gate_c, gate_ic); + + //-----------Gate O Calculations + // Weight multiplication + multiply_W( + inputs, o_afterW, WO); + + // Bias addition + add_bias( + o_afterW, o_afterBias, BO); + + // Hidden Candidate + multiply_U(hidden_state, o_hiddenCand, + RWO); + + // Vector addition + add_vectors(o_afterBias, o_hiddenCand, + o_afterAdd); + + // Activation + CONFIG_T::template activation_recr::activation(o_afterAdd, gate_o); + + //-----------Cell State Calculation + // Vector multiplication + multiply_vectors(gate_f, cell_state, cell_act_multp); + + // Vector addition + add_vectors(gate_ic, cell_act_multp, + cell_act_add); + + //-----------Forget gate Calculation + // Activation + CONFIG_T::template activation::activation(cell_act_add, gate_forget); + + // Vector multiplication + multiply_vectors(gate_o, gate_forget, h); + +OUTPUT_WRITE_LOOP: + #pragma unroll + for (int x = (CONFIG_T::n_out - 1); x >= 0; x--) { + hidden_state_o[x] = h[x]; + cell_state_o[x] = cell_act_add[x]; + } +} + +template +void lstm(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WI[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WF[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WC[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t WO[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWI[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWF[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWC[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::weight_t RWO[CONFIG_T::n_out * CONFIG_T::n_out], + const typename CONFIG_T::bias_t BI[CONFIG_T::n_out], const typename CONFIG_T::bias_t BF[CONFIG_T::n_out], + const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; + res_T hidden_state_temp[CONFIG_T::n_out] hls_register; + res_T cell_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; + res_T cell_state_temp[CONFIG_T::n_out] hls_register; + res_T h[CONFIG_T::n_out] hls_register; + res_T c[CONFIG_T::n_out] hls_register; + data_T in[CONFIG_T::n_in] hls_register; + +// Set initially hidden state (output) to zero +INIT_LOOP: + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][0] = 0; + cell_state[x][0] = 0; + } + + // Input dimension + #pragma disable_loop_pipelining + for (int i = 0; i < CONFIG_T::n_timesteps; i++) { + // Data at current time step + for (int x = 0; x < CONFIG_T::n_in; x++) { + in[x] = data[x + i * CONFIG_T::n_in]; + } + + // Hidden state at current time step + #pragma unroll + for (int x = 0; x < 
CONFIG_T::n_out; x++) { + hidden_state_temp[x] = hidden_state[x][i]; + cell_state_temp[x] = cell_state[x][i]; + } + + // Do LSTM + lstm_cell(in, hidden_state_temp, h, cell_state_temp, c, WI, WF, WC, WO, RWI, RWF, RWC, RWO, + BI, BF, BC, BO); + + // Write result + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + hidden_state[x][i + 1] = h[x]; + cell_state[x][i + 1] = c[x]; + } + } + + if (CONFIG_T::return_sequences == 0) { + // Output when return_sequences is false + #pragma unroll + for (int x = 0; x < CONFIG_T::n_out; x++) { + res[x] = hidden_state[x][CONFIG_T::n_timesteps]; + } + } else { + // Output when return_sequences is true + #pragma unroll + for (int x = 0; x < CONFIG_T::n_timesteps; x++) { + for (int h = 0; h < CONFIG_T::n_out; h++) { + res[x * CONFIG_T::n_out + h] = hidden_state[h][x + 1]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h new file mode 100644 index 000000000..e5896e6da --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h @@ -0,0 +1,53 @@ +#ifndef NNET_RECR_ACTIVATION_H_ +#define NNET_RECR_ACTIVATION_H_ + +#include "nnet_activation.h" +#include "nnet_common.h" + +namespace nnet { + +namespace activation { + +template class Activation { + public: + // ************************************************* + // Blank Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} +}; + +template class relu : public Activation { + public: + // ************************************************* + // Relu Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::relu(data, res); + } +}; + +template class sigmoid : public Activation { + public: + // ************************************************* + // Sigmoid Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::sigmoid(data, res); + } +}; + +template class tanh : public Activation { + public: + // ************************************************* + // TanH Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::dense_tanh(data, res); + } +}; + +} // namespace activation + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h new file mode 100644 index 000000000..a8e3ffe85 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h @@ -0,0 +1,38 @@ +#ifndef NNET_IMAGE_H_ +#define NNET_IMAGE_H_ + +namespace nnet { + +struct resize_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; +}; + +template +void resize_nearest(data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], + data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan]) { + int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1; + int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1; + + for (int i = 0; i < 
CONFIG_T::new_height; i++) { + for (int j = 0; j < CONFIG_T::new_width; j++) { + int x = ((j * x_ratio) >> 16); + int y = ((i * y_ratio) >> 16); + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] = + image[(y * CONFIG_T::width * CONFIG_T::n_chan) + x * CONFIG_T::n_chan + k]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h new file mode 100644 index 000000000..05fd5fe76 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h @@ -0,0 +1,50 @@ +#ifndef NNET_TRANSPOSE_H_ +#define NNET_TRANSPOSE_H_ + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T res[CONFIG_T::height * CONFIG_T::width]) { + for (int i = 0; i < CONFIG_T::height; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::width; j++) { + res[j * CONFIG_T::height + i] = static_cast(data[i * CONFIG_T::width + j]); + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T res[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + static constexpr unsigned dim_data[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + static constexpr unsigned dim_res[3] = {dim_data[CONFIG_T::perm[0]], dim_data[CONFIG_T::perm[1]], + dim_data[CONFIG_T::perm[2]]}; + + int index_data[3] = {0}, index_res[3] = {0}; + + for (index_data[0] = 0; index_data[0] < dim_data[0]; index_data[0]++) { + #pragma unroll + for (index_data[1] = 0; index_data[1] < dim_data[1]; index_data[1]++) { + #pragma unroll + for (index_data[2] = 0; index_data[2] < dim_data[2]; index_data[2]++) { + index_res[0] = index_data[CONFIG_T::perm[0]]; + index_res[1] = index_data[CONFIG_T::perm[1]]; + index_res[2] = index_data[CONFIG_T::perm[2]]; + + res[index_res[0] * dim_res[1] * dim_res[2] + index_res[1] * dim_res[2] + index_res[2]] = static_cast( + data[index_data[0] * dim_data[1] * dim_data[2] + index_data[1] * dim_data[2] + index_data[2]]); + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h new file mode 100644 index 000000000..221055938 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -0,0 +1,44 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include + +namespace nnet { + +/* + * HLS Shift Register Implementation + * To verify a shift register is used in hardware, go to report.html > Area Analysis of System + * Unrolling the shift loop minimizes resource usage and latency at the same time + * The shift loop should be either fully unrolled or not unrolled at all + * Unrolling with a specific unroll factor or pipelining with certain ii's, can cause an irregular access pattern, which + * wouldn't allow shift register usage in RTL + */ +template struct shift_reg { + private: + T data[N]; + + public: + // Default constructor + shift_reg() {} + + // Shift queue, insert new element and return element from the front + T shift(T inp) { + T out = data[N - 1]; + + #pragma unroll + for (int i = N - 1; i > 0; i--) { + data[i] = data[i - 1]; + } + data[0] = 
inp; + + return out; + } + + T read(int pos) { return data[pos]; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/parameters.h b/hls4ml/templates/oneapi/firmware/parameters.h new file mode 100644 index 000000000..e23ca9770 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/parameters.h @@ -0,0 +1,11 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include "defines.h" + +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// hls-fpga-machine-learning insert layer-config + +#endif diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp new file mode 100644 index 000000000..6c7ae68fb --- /dev/null +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -0,0 +1,167 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/parameters.h" + +#include +#include + +#include "exception_handler.hpp" +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + + +int main(int argc, char **argv) { + +#if FPGA_SIMULATOR + auto selector = sycl::ext::intel::fpga_simulator_selector_v; +#elif FPGA_HARDWARE + auto selector = sycl::ext::intel::fpga_selector_v; +#else // #if FPGA_EMULATOR + auto selector = sycl::ext::intel::fpga_emulator_selector_v; +#endif + + sycl::queue q(selector, fpga_tools::exception_handler, + sycl::property::queue::enable_profiling{}); + + auto device = q.get_device(); + + // make sure the device supports USM host allocations + if (!device.has(sycl::aspect::usm_host_allocations)) { + std::cerr << "This design must either target a board that supports USM " + "Host/Shared allocations, or IP Component Authoring. " + << std::endl; + std::terminate(); + } + + std::cout << "Running on device: " + << device.get_info().c_str() + << std::endl; + + + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + + std::string RESULTS_LOG = "tb_data/results.log"; + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + + std::vector inputs; + std::vector outputs; + + if (fin.is_open() && fpr.is_open()) { + std::vector> predictions; + unsigned int num_iterations = 0; + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::stringstream ssin(iline); + while (ssin >> current) { + in.push_back(current); + } + + std::stringstream sspred(pline); + while (sspred >> current) { + pr.push_back(current); + } + if (in.size() != N_INPUT_1_1) { + throw std::runtime_error("The input size does not match"); + } + if (pr.size() != N_LAYER_11) { + throw std::runtime_error("The output size does not match"); + } + + // hls-fpga-machine-learning insert data + inputs.emplace_back(); + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + outputs.emplace_back(); + predictions.push_back(std::move(pr)); + } + // Do this separately to avoid vector reallocation + // hls-fpga-machine-learning insert top-level-function + for(int i = 0; i < num_iterations; i++) { + InPipe::write(q, inputs[i]); + q.single_task(MyProject{}); // once or once for each + } + q.wait(); + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert tb-output + outputs[j] = 
OutPipe::read(q); + for(int i = 0; i < N_LAYER_11; i++) { + fout << outputs[j][i] << " "; + } + fout << std::endl; + if (j % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for(int i = 0; i < N_LAYER_11; i++) { + std::cout << predictions[j][i] << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + for(int i = 0; i < N_LAYER_11; i++) { + std::cout << outputs[j][i] << " "; + } + std::cout << std::endl; + } + } + fin.close(); + fpr.close(); + } else { + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; + // hls-fpga-machine-learning insert zero + for(int i = 0; i < num_iterations; i++) { + inputs.emplace_back(); + outputs.emplace_back(); + outputs.back().fill(0.0); + } + + // hls-fpga-machine-learning insert top-level-function + for(int i = 0; i < num_iterations; i++) { + InPipe::write(q, inputs[i]); + q.single_task(MyProject{}); + } + q.wait(); + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert output + outputs[j] = OutPipe::read(q); + for(int i = 0; i < N_LAYER_11; i++) { + std::cout << outputs[j][i] << " "; + } + std::cout << std::endl; + + // hls-fpga-machine-learning insert tb-output + for(int i = 0; i < N_LAYER_11; i++) { + fout << outputs[j][i] << " "; + } + fout << std::endl; + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} From cd0a2b8eb401e8b57f1b6d1e2c309804087af085 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 21 Dec 2023 14:47:34 -0600 Subject: [PATCH 002/100] fix reduce constexpr --- .../oneapi/firmware/nnet_utils/nnet_common.h | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h index 0c2e94e02..abefd87b8 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -38,30 +38,32 @@ template void merge(data_T data1[NIN1], data_ * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section * before applying and accumulate the result over the rolled dimension. * --- */ -// template T reduce(const T *x, Op op) { -// static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; -// static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; -// if (N == 1) { -// return x[0]; -// } -// if (N == 2) { -// return op(x[0], x[1]); -// } -// return op(reduce(x, op), reduce(x + leftN, op)); -// } - -// alternate reduce - basic template T reduce(const T *x, Op op) { - if (N == 1) { + static constexpr int leftN = pow2::val>::val > 0 ? + pow2::val>::val : + 0; + static constexpr int rightN = N - leftN > 0 ? 
N - leftN : 0; + if constexpr (N == 1) { return x[0]; } - auto val = op(x[0], x[1]); - for (int i = 2; i < N; i++) { - val = op(val, x[i]); + else if constexpr (N == 2) { + return op(x[0], x[1]); + } else { + return op(reduce(x, op), reduce(x + leftN, op)); } - return val; } +// alternate reduce - basic +// template T reduce(const T *x, Op op) { +// if (N == 1) { +// return x[0]; +// } +// auto val = op(x[0], x[1]); +// for (int i = 2; i < N; i++) { +// val = op(val, x[i]); +// } +// return val; +// } template class Op_add { public: From 3b3d40d8570b8914f5fa0a61ae87b71eb5b64a27 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 22 Dec 2023 18:36:13 -0600 Subject: [PATCH 003/100] further updates --- hls4ml/backends/oneapi/oneapi_types.py | 71 ++ .../backends/oneapi/passes/transform_types.py | 37 +- hls4ml/templates/oneapi/CMakeLists.txt | 4 +- hls4ml/templates/oneapi/firmware/defines.h | 1 + .../templates/oneapi/firmware/myproject.cpp | 3 +- hls4ml/templates/oneapi/firmware/myproject.h | 14 +- hls4ml/templates/oneapi/myproject_test.cpp | 3 + hls4ml/writer/__init__.py | 2 + hls4ml/writer/oneapi_writer.py | 967 ++++++++++++++++++ 9 files changed, 1071 insertions(+), 31 deletions(-) create mode 100644 hls4ml/backends/oneapi/oneapi_types.py create mode 100644 hls4ml/writer/oneapi_writer.py diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py new file mode 100644 index 000000000..d76449f1e --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -0,0 +1,71 @@ +''' +This package includes oneAPI-specific customizations to the variable types +''' +from hls4ml.backends.fpga.fpga_types import VariableDefinition, ArrayVariableConverter + +# region ArrayVarable + +class OneAPIArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return f'[[{self.pragma}]] std::array<{self.type.name}, {self.size_cpp()}> {self.name}{name_suffix}' + +class OneAPIInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + +class OneAPIArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) + +class OneAPIInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition + ) + +# endregion + +# region InterfaceMemberVariable + + +class OneAPIInterfaceVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return f'[[{self.pragma}]] {self.array_type} {self.name}{name_suffix}' + + def declare_cpp(self, pipe_min_size=0, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + (f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n') + return lines + + +class InterfaceVariableConverter: + def __init__(self, type_converter, prefix, definition_cls): + self.type_converter = type_converter + self.prefix = prefix + self.definition_cls = definition_cls + + def convert(self, tensor_var, pipe_name, pipe_id, array_type, pragma='partition'): + if isinstance(tensor_var, self.definition_cls): # Already converted + return 
tensor_var + + tensor_var.pragma = pragma + tensor_var.type = self.type_converter.convert(tensor_var.type) + + tensor_var.pipe_name = pipe_name + tensor_var.pipe_id = pipe_id + tensor_var.array_type = array_type + + tensor_var.__class__ = type(self.prefix + 'InterfaceMemberVariable', (type(tensor_var), self.definition_cls), {}) + return tensor_var + + +class OneAPIInterfaceVariableConverter(InterfaceVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition + ) + + +# endregion diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 67de32ab6..121392eda 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -1,25 +1,23 @@ from hls4ml.backends.fpga.fpga_types import ( ACTypeConverter, HLSTypeConverter, - QuartusArrayVariableConverter, - QuartusInplaceArrayVariableConverter, - QuartusInplaceStreamVariableConverter, - QuartusStreamVariableConverter, - QuartusStructMemberVariableConverter, StaticWeightVariableConverter, ) +from hls4ml.backends.oneapi.oneapi_types import ( + OneAPIArrayVariableConverter, + OneAPIInplaceArrayVariableConverter, + OneAPIInterfaceVariableConverter +) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable - +from hls4ml.utils.string_utils import convert_to_pascal_case class TransformTypes(GlobalOptimizerPass): def __init__(self): self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter()) - self.array_var_converter = QuartusArrayVariableConverter(type_converter=self.type_converter) - self.inplace_array_var_converter = QuartusInplaceArrayVariableConverter(type_converter=self.type_converter) - self.struct_var_converter = QuartusStructMemberVariableConverter(type_converter=self.type_converter) - self.stream_var_converter = QuartusStreamVariableConverter(type_converter=self.type_converter) - self.inplace_stream_var_converter = QuartusInplaceStreamVariableConverter(type_converter=self.type_converter) + self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) + self.interface_var_converter = OneAPIInterfaceVariableConverter(type_converter=self.type_converter) self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) def transform(self, model, node): @@ -27,19 +25,22 @@ def transform(self, model, node): for out_name, var in node.variables.items(): if io_type == 'io_stream': - if isinstance(var, InplaceTensorVariable): - new_var = self.inplace_stream_var_converter.convert(var) - else: - new_var = self.stream_var_converter.convert(var) + raise NotImplementedError("io_stream is not yet implemented for oneAPI") elif io_type == 'io_parallel': if out_name in node.model.inputs: - new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='inputs') + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t') elif out_name in node.model.outputs: - new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='outputs') + new_var = self.interface_var_converter.convert(var, 
pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t') elif isinstance(var, InplaceTensorVariable): new_var = self.inplace_array_var_converter.convert(var, pragma='') else: - new_var = self.array_var_converter.convert(var, pragma='hls_register') + new_var = self.array_var_converter.convert(var, pragma='intel::fpga_register') else: raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index a3a6e5c4a..66c505450 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -10,7 +10,7 @@ endif() cmake_minimum_required (VERSION 3.7.2) -project(fpga_template CXX) +project(myproject CXX) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) @@ -20,7 +20,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) ### Customize these build variables ############################################################################### set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp) -set(TARGET_NAME fpga_template) +set(TARGET_NAME myproject) # Use cmake -DFPGA_DEVICE=: to choose a # different device. Here are a few device examples (this list is not diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 622d9f2bf..04dc640a1 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -6,6 +6,7 @@ #include #include #include +#include // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp index 93f11c837..0dc79a21c 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.cpp +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -8,13 +8,12 @@ void MyProject::operator()() const { // NETWORK INSTANTIATION // **************************************** - auto inputsArr = InPipe::read(); +// hls-fpga-machine-learning read in // hls-fpga-machine-learning insert layers // hls-fpga-machine-learning return - OutPipe::write(outData); } diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index f01b5978c..52f457344 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -5,18 +5,14 @@ // This file defines the interface to the kernel - -using input_data_t = std::array; -using output_data_t = std::array; - -class InPipeID; -class OutPipeID; - +// currently this is fixed using PipeProps = decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::intel::experimental::ready_latency<0>)); -using InPipe = sycl::ext::intel::experimental::pipe; -using OutPipe = sycl::ext::intel::experimental::pipe; +// Need to declare the input and output pipes + +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs class MyProjectID; diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 6c7ae68fb..a9830245a 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -57,6 +57,9 @@ int main(int argc, char **argv) { std::string iline; std::string pline; + // hls-fpga-machine-learning 
insert inputs + // hls-fpga-machine-learning insert results + std::vector inputs; std::vector outputs; diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index f4eed945a..942964fc8 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,4 +1,5 @@ from hls4ml.writer.quartus_writer import QuartusWriter +from hls4ml.writer.oneapi_writer import OneAPIWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter @@ -9,4 +10,5 @@ register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) register_writer('Quartus', QuartusWriter) +register_writer('oneAPI', OneAPIWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py new file mode 100644 index 000000000..39a5f14c4 --- /dev/null +++ b/hls4ml/writer/oneapi_writer.py @@ -0,0 +1,967 @@ +import glob +import os +import tarfile +from collections import OrderedDict +from shutil import copyfile, copytree, rmtree + +import numpy as np +import yaml + +from hls4ml.backends import get_backend +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm, Dense +from hls4ml.utils.fixed_point_utils import FixedPointEmulator, ceil_log2, uint_to_binary +from hls4ml.utils.string_utils import convert_to_pascal_case +from hls4ml.writer.writers import Writer + +config_filename = 'hls4ml_config.yml' + + +class OneAPIWriter(Writer): + def next_pow2(self, x): + return 1 << (x - 1).bit_length() + + def __make_dat_file(self, original_path, project_path): + """ + Convert other input/output data types into a dat file, which is + a text file with the falttened matrix printed out. Note that ' ' is + assumed to be the delimiter. + """ + + # Take in data from current supported data files + if original_path[-3:] == "npy": + data = np.load(original_path) + else: + raise Exception("Unsupported input/output data files.") + + # Faltten data, just keep first dimension + data = data.reshape(data.shape[0], -1) + + def print_data(f): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + f.write(str(data[i][j]) + " ") + f.write("\n") + + # Print out in dat file + with open(project_path, "w") as f: + print_data(f) + + def get_max_reuse_factor(self, model): + max_rf = 0 + for layer in model.get_layers(): + rf = int(layer.get_attr('reuse_factor')) + if rf > max_rf: + max_rf = rf + return max_rf + + def print_array_to_cpp(self, var, layer, odir): + """Write a weights array to C++ header files. 
+ + Args: + var (WeightVariable): Weight to write + layer (Layer): Instance of the layer to which the weights belong + odir (str): Output directory + """ + with open(f"{odir}/firmware/weights/{var.name}.h", "w") as h_file: + + # meta data + h_file.write(f"//Numpy array shape {var.shape}\n") + h_file.write(f"//Min {np.min(var.min):.12f}\n") + h_file.write(f"//Max {np.max(var.max):.12f}\n") + h_file.write(f"//Number of zeros {var.nzeros}\n") + h_file.write("\n") + + h_file.write(f"#ifndef {var.name.upper()}_H_\n") + h_file.write(f"#define {var.name.upper()}_H_\n") + h_file.write("\n") + + rf = int(layer.get_attr('reuse_factor', 1)) + weight_header = '' + + weight_size = 0 + if isinstance(layer, (Conv2D, Conv2DBatchnorm)): + weight_size = ( + layer.get_attr('impl_filt_height') + * layer.get_attr('impl_filt_width') + * layer.get_attr('n_filt') + * layer.get_attr('n_chan') + ) + elif isinstance(layer, (Conv1D)): + weight_size = layer.get_attr('impl_filt_width') * layer.get_attr('n_filt') * layer.get_attr('n_chan') + elif isinstance(layer, (Dense)): + weight_size = layer.get_attr('n_in') * layer.get_attr('n_out') + + if rf == 1 or var.name[0] == 'b' or weight_size <= 2048 or (var.name[0] == 'w' and var.type.precision.width < 3): + pass # might want to modify this + else: + block_factor = (layer.get_attr('n_in') * layer.get_attr('n_out')) / rf + nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2) + var_width = int(np.ceil(var.type.precision.width / 8)) + bwidth = self.next_pow2(var_width) + weight_header += ( + f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + ) + if var.storage.lower() == 'bram': + weight_header += 'static ' + else: + weight_header += 'static const ' + h_file.write(weight_header + var.definition_cpp() + " = {") + + # fill c++ array. + # not including internal brackets for multidimensional case + sep = '' + for x in var: + h_file.write(sep + x) + sep = ", " + h_file.write("};\n") + h_file.write("\n#endif\n") + + + def write_project_dir(self, model): + """Write the base project directory + + Args: + model (ModelGraph): the hls4ml model. + """ + if not os.path.isdir(f"{model.config.get_output_dir()}/src/firmware/weights"): + os.makedirs(f"{model.config.get_output_dir()}/src/firmware/weights") + + def write_project_cpp(self, model): + """Write the main architecture source file (myproject.cpp) + + Args: + model (ModelGraph): the hls4ml model. """ + project_name = model.config.get_project_name() + + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w') as fout: + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + if len(model_brams) != 0: + raise NotImplementedError("Weights on the interface are currently not supported") + + io_type = model.config.get_config_value('IOType') + indent = ' ' + + for line in f.readlines(): + # Add headers to weights and biases + if 'myproject' in line: + newline = line.replace('myproject', project_name) + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + # Read in inputs + elif '// hls-fpga-machine-learning read in' in line: + newline = line + if io_type == 'io_parallel': + for inp in model_inputs: + newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' + else: + raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + + # Insert weights + elif '// hls-fpga-machine-learning insert weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w not in model_brams: + newline += f'#include "weights/{w.name}.h"\n' + + + # Neural net instantiation + elif '// hls-fpga-machine-learning insert layers' in line: + newline = line + '\n' + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + for layer in model.get_layers(): + if io_type != 'io_stream': + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + def_cpp = var.definition_cpp() + if def_cpp is not None: + newline += ' ' + def_cpp + ';\n' + func = layer.get_attr('function_cpp', None) + if func: + newline += ' ' + func + '\n' + if model.config.trace_output and layer.get_attr('trace', False): + newline += '#ifndef HLS_SYNTHESIS\n' + for var in vars: + newline += ' nnet::save_layer_output<{}>({}, "{}", {});\n'.format( + var.type.name, var.name, layer.name, var.size_cpp() + ) + newline += '#endif\n' + newline += '\n' + + # Write the output + elif '// hls-fpga-machine-learning return' in line: + newline = line + if io_type == 'io_parallel': + for out in model_outputs: + newline += indent + f'{out.pipe_name}::write({out.name});\n' + else: + raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + + # Just copy line + else: + newline = line + + fout.write(newline) + + + def write_project_header(self, model): + """Write the main architecture header file (myproject.h) + + Args: + model (ModelGraph): the hls4ml model.
+ """ + + project_name = model.config.get_project_name() + + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w') as fout: + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + # io_parallel and io_stream instantiate the top-level function differently + io_type = model.config.get_config_value('IOType') + indent = ' ' + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(project_name.upper())) + + elif 'myproject' in line: + newline = line.replace('myproject', project_name) + + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + # Declarations for the inputs. May need modification when io_stream is supported + elif '// hls-fpga-machine-learning insert inputs' in line: + newline = line + for inp in model_inputs: + newline += inp.declare_cpp() + + # and declareations for the outputs + elif '// hls-fpga-machine-learning insert outputs' in line: + newline = line + for out in model_outputs: + newline += out.declare_cpp() + + # Simply copy line, if no inserts are required + else: + newline = line + + fout.write(newline) + + def write_defines(self, model): + """Write the C++ type definitions file (defines.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w') as fout: + + for line in f.readlines(): + # Insert numbers + if '// hls-fpga-machine-learning insert numbers' in line: + newline = line + + defines_list = [] + for layer in model.get_layers(): + defines = '' + # Note: this assumes all the layers have one ouput + # (or in clones, one type of output) + for k, v in layer.get_output_variable().get_shape(): + defines += f'#define {k} {v}\n' + + defines_list.append(defines) + + newline += ''.join(defines_list) + + elif '// hls-fpga-machine-learning insert layer-precision' in line: + newline = line + all_precision = OrderedDict() + for layer in model.get_layers(): + layer_precision = layer.get_layer_precision() + for type_name, type_var in layer_precision.items(): + # Ensure that layer's types doesn't override existing types + # This can happen in case of InplaceVariable types + if type_name not in all_precision: + all_precision[type_name] = type_var + for used_type in all_precision.values(): + newline += used_type.definition_cpp() + + else: + newline = line + fout.write(newline) + + def write_parameters(self, model): + """Write the C++ layer config file (parameters.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, \ + open(f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w') as fout: + + for line in f.readlines(): + if '// hls-fpga-machine-learning insert includes' in line: + newline = line + for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + newline += '#include "%s"\n' % include + + elif "// hls-fpga-machine-learning insert layer-config" in line: + newline = line + for layer in model.get_layers(): + config = layer.get_attr('config_cpp', None) + if config: + newline += config + '\n' + else: + newline = line + fout.write(newline) + + def write_weights(self, model): + """Write the weights into header files + + Args: + model (ModelGraph): the hls4ml model. + """ + for layer in model.get_layers(): + for weights in layer.get_weights(): + self.print_array_to_cpp(weights, layer, model.config.get_output_dir()) + + def write_test_bench(self, model): + """Write the testbench + + Args: + model (ModelGraph): the hls4ml model. + """ + # TODO - This function only works with one model input + # (NOT one data point - it works as expected with multiple data points) + + # copy the exception handler + filedir = os.path.dirname(os.path.abspath(__file__)) + srcpath = os.path.join(filedir, '../templates/oneapi/exception_handler.hpp') + dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' + copyfile(srcpath, dstpath) + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + if len(model_brams != 0): + raise NotImplementedError("Weights on the interface is currently not supported") + + if not os.path.exists(f'{model.config.get_output_dir()}/tb_data/'): + os.mkdir(f'{model.config.get_output_dir()}/tb_data/') + + input_data = model.config.get_config_value('InputData') + output_predictions = model.config.get_config_value('OutputPredictions') + + if input_data: + if input_data[-3:] == "dat": + copyfile(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + else: + self.__make_dat_file(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + + if output_predictions: + if output_predictions[-3:] == "dat": + copyfile(output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat') + else: + self.__make_dat_file( + output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' + ) + + with open(os.path.join(filedir, '../templates/oneapi/myproject_test_parallel.cpp')) as f, \ + open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp', 'w') as fout: + + for line in f.readlines(): + indent = ' ' * (len(line) - len(line.lstrip(' '))) + + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert inputs': + newline = line + for inp in model_inputs: + newline += indent + f'std::vector<{inp.array_type}> {inp.name};\n' + newline += indent + f'input_counts.push_back({inp.size_cpp()});\n' + elif '// 
hls-fpga-machine-learning insert results': + newline = line + for out in model_outputs: + newline += indent + f'std::vector<{out.array_type}> {out.name};\n' + newline += indent + f'output_counts.push_back({out.size_cpp()});\n' + elif '// hls-fpga-machine-learning insert data' in line: + newline = line + newline += ' std::vector::const_iterator in_begin = in.cbegin();\n' + newline += ' std::vector::const_iterator in_end;\n' + newline += ' inputs.emplace_back();\n' + for inp in model.get_input_variables(): + newline += f' in_end = in_begin + ({inp.size_cpp()});\n' + newline += f' std::copy(in_begin, in_end, inputs.back().{inp.member_name});\n' + newline += ' in_begin = in_end;\n' + newline += ' outputs.emplace_back();\n' + elif '// hls-fpga-machine-learning insert zero' in line: + newline = line + newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' + for inp in model.get_input_variables(): + newline += indent + ' inputs.emplace_back();\n' + newline += indent + ' outputs.emplace_back();\n' + newline += indent + f' std::fill_n(inputs[i].{inp.member_name}, {inp.size_cpp()}, 0.0);\n' + newline += indent + '}\n' + + elif '// hls-fpga-machine-learning insert top-level-function' in line: + newline = line + newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' + newline += indent + f' ihc_hls_enqueue(&outputs[i], {model.config.get_project_name()}, inputs[i]' + if model_brams: + bram_vars = ','.join([b.name for b in model_brams]) + newline += f', {bram_vars});\n' + else: + newline += ');\n' + newline += indent + '}\n' + elif 'hls-fpga-machine-learning insert run' in line: + newline = line + newline += ' ' + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' + elif '// hls-fpga-machine-learning insert predictions' in line: + newline = line + newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' + newline += indent + ' std::cout << predictions[j][i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + elif '// hls-fpga-machine-learning insert tb-output' in line: + newline = line + newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' + newline += indent + f' fout << outputs[j].{outvar.member_name}[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'fout << std::endl;\n' + elif ( + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line + ): + newline = line + newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' + newline += indent + f' std::cout << outputs[j].{outvar.member_name}[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + else: + newline = line + + fout.write(newline) + + def write_test_bench(self, model): + """Write the testbench + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + # TODO - This function only works with one model input + # (NOT one data point - it works as expected with multiple data points) + + # copy the exception handler + filedir = os.path.dirname(os.path.abspath(__file__)) + srcpath = os.path.join(filedir, '../templates/oneapi/exception_handler.hpp') + dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' + copyfile(srcpath, dstpath) + + io_type = model.config.get_config_value('IOType') + if io_type == 'io_parallel': + self.write_testbench_parallel(model) + elif io_type == 'io_stream': + self.write_testbench_stream(model) + + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + + Args: + model (ModelGraph): the hls4ml model. + """ + pass + + def write_build_script(self, model): + """Write the build scripts (Makefile, build_lib.sh) + + Args: + model (ModelGraph): the hls4ml model. + """ + + # Makefile + filedir = os.path.dirname(os.path.abspath(__file__)) + device = model.config.get_config_value('Part') + with open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, \ + open(f'{model.config.get_output_dir()}/CMakeLists.txt', 'w') as fout: + + for line in f.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + + if 'set(FPGA_DEVICE' in line: + line = f' set(FPGA_DEVICE "{device}")' + + fout.write(line) + + + def write_nnet_utils(self, model): + """Copy the nnet_utils, AP types headers and any custom source to the project output directory + + Args: + model (ModelGraph): the hls4ml model. + """ + + # nnet_utils + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir, '../templates/oneapi/firmware/nnet_utils/') + dstpath = f'{model.config.get_output_dir()}/src/firmware/nnet_utils/' + + if not os.path.exists(dstpath): + os.mkdir(dstpath) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + for h in headers: + copyfile(srcpath + h, dstpath + h) + + + # custom source + filedir = os.path.dirname(os.path.abspath(__file__)) + + custom_source = get_backend('oneAPI').get_custom_source() + for dst, srcpath in custom_source.items(): + dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}' + copyfile(srcpath, dstpath) + + def __get_table_size(self, model, activation): + for layer in model.get_layers(): + if ( + layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation + ) and layer.get_attr('table_size') is not None: + return int(layer.get_attr('table_size')) + return 1024 + + def __get_table_header(self, table_name, table_size): + table_header += f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{' + return table_header + + def __write_elu_table(self, model, path): + table_name = 'elu_table' + table_size = self.__get_table_size(model, 'elu') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = -8.0 * i / float(table_size) + real_val = np.exp(in_val) - 1.0 + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_sigmoid_table(self, model, path): + MAX_VALUE = 8 + MIN_VALUE = 0 + + table_name = 'sigmoid_table' + table_size = self.__get_table_size(model, 'sigmoid') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(int(table_size)): + in_val = ( + i * (MAX_VALUE - MIN_VALUE) / 
float(table_size) + + (MAX_VALUE - MIN_VALUE) / (float(table_size) * 2) + + MIN_VALUE + ) + real_val = 1.0 / (1 + np.exp(-in_val)) + if real_val >= 0.5: + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_tanh_table(self, model, path): + MAX_VALUE = 4 + MIN_VALUE = 0 + + table_name = 'tanh_table' + table_size = self.__get_table_size(model, 'tanh') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = ( + i * (MAX_VALUE - MIN_VALUE) / float(table_size) + + (MAX_VALUE - MIN_VALUE) / (float(table_size) * 2) + + MIN_VALUE + ) + real_val = np.tanh(in_val) + if real_val >= 0: + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_softplus_table(self, model, path): + table_name = 'softplus_table' + table_size = self.__get_table_size(model, 'softplus') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) + real_val = np.log(np.exp(in_val) + 1.0) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_softsign_table(self, model, path): + MAX_VALUE = 8 + MIN_VALUE = 0 + table_name = 'softsign_table' + table_size = self.__get_table_size(model, 'softsign') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = ( + i * (MAX_VALUE - MIN_VALUE) / float(table_size) + + (MAX_VALUE - MIN_VALUE) / (float(table_size) * 2) + + MIN_VALUE + ) + + real_val = in_val / (np.fabs(in_val) + 1.0) + if real_val >= 0: + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_selu_table(self, model, path): + table_name = 'selu_table' + table_size = self.__get_table_size(model, 'selu') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = -8.0 * i / float(table_size) + real_val = 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (np.exp(in_val) - 1.0)) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_exp_table(self, model, path): + table_name = 'exp_table' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + if fp_signed is False: + raise Exception('Softmax types need to be signed') + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = 
FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + b = uint_to_binary(i, N) + if i == 0: + b.insert(0, 0) + else: + b.insert(0, 1) + f.set_msb_bits(b) + real_val = f.exp_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_invert_table(self, model, path): + table_name = 'invert_table' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + if fp_signed is False: + raise Exception('Softmax types need to be signed') + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + b = uint_to_binary(i, N) + b.insert(0, 0) + f.set_msb_bits(b) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_exp_table_latency(self, model, path): + table_name = 'exp_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.exp_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_invert_table_latency(self, model, path): + table_name = 'invert_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = 
layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except Exception: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_exp_table_legacy(self, model, path): + table_name = 'exp_table_legacy' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) + real_val = np.exp(in_val) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def __write_invert_table_legacy(self, model, path): + table_name = 'invert_table_legacy' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open(f'{path}/{table_name}.tb', 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + real_val = 0 + in_val = 64.0 * i / float(table_size) + if in_val > 0.0: + real_val = 1.0 / in_val + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.close() + + def write_activation_tables(self, model): + """Write the lookup tables for activation functions + + Args: + model (ModelGraph): the hls4ml model. + """ + # Output path + dstpath = f'{model.config.get_output_dir()}/src/firmware/nnet_utils/activation_tables' + if not os.path.exists(dstpath): + os.mkdir(dstpath) + + # Tables + # TODO - Only write tables needed by model, not all of them + self.__write_elu_table(model, dstpath) + self.__write_sigmoid_table(model, dstpath) + self.__write_tanh_table(model, dstpath) + self.__write_softplus_table(model, dstpath) + self.__write_softsign_table(model, dstpath) + self.__write_selu_table(model, dstpath) + self.__write_exp_table(model, dstpath) + self.__write_invert_table(model, dstpath) + self.__write_exp_table_latency(model, dstpath) + self.__write_invert_table_latency(model, dstpath) + self.__write_exp_table_legacy(model, dstpath) + self.__write_invert_table_legacy(model, dstpath) + + def write_yml(self, model): + """Write the config to the YAML file + + Args: + model (ModelGraph): the hls4ml model. + """ + + def keras_model_representer(dumper, keras_model): + model_path = model.config.get_output_dir() + '/keras_model.h5' + keras_model.save(model_path) + return dumper.represent_scalar('!keras_model', model_path) + + try: + from tensorflow.keras import Model as KerasModel + + yaml.add_multi_representer(KerasModel, keras_model_representer) + except Exception: + pass + + with open(model.config.get_output_dir() + '/' + config_filename, 'w') as file: + yaml.dump(model.config.config, file) + + def write_tar(self, model): + """Write the generated project as a .tar.gz archive + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive: + archive.add(model.config.get_output_dir(), recursive=True) + + def write_hls(self, model): + print('Writing HLS project') + self.write_project_dir(model) + self.write_project_cpp(model) + self.write_project_header(model) + self.write_weights(model) + self.write_defines(model) + self.write_parameters(model) + self.write_test_bench(model) + self.write_bridge(model) + self.write_build_script(model) + self.write_nnet_utils(model) + self.write_activation_tables(model) + self.write_yml(model) + self.write_tar(model) + print('Done') From b7429015f285d8cd1cf51727331d3f4572646ffe Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 26 Dec 2023 17:26:08 -0600 Subject: [PATCH 004/100] update the bridge and testbench --- hls4ml/templates/oneapi/myproject_bridge.cpp | 74 ++++++++ hls4ml/templates/oneapi/myproject_test.cpp | 51 +++--- hls4ml/writer/oneapi_writer.py | 178 +++++++++++-------- 3 files changed, 199 insertions(+), 104 deletions(-) create mode 100644 hls4ml/templates/oneapi/myproject_bridge.cpp diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp new file mode 100644 index 000000000..4b7a6b170 --- /dev/null +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -0,0 +1,74 @@ +#ifndef MYPROJECT_BRIDGE_H_ +#define MYPROJECT_BRIDGE_H_ + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" +#include +#include + +#include "exception_handler.hpp" +/ +// hls-fpga-machine-learning insert bram + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + // hls-fpga-machine-learning insert trace_outputs +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void myproject_float( + // hls-fpga-machine-learning insert header #float +) { + auto selector = sycl::ext::intel::fpga_emulator_selector_v; + sycl::queue q(selector, fpga_tools::exception_handler, + sycl::property::queue::enable_profiling{}); + + // hls-fpga-machine-learning insert wrapper #float +} + +void myproject_double( + // hls-fpga-machine-learning insert header #double +) { + auto selector = sycl::ext::intel::fpga_emulator_selector_v; + sycl::queue q(selector, fpga_tools::exception_handler, + sycl::property::queue::enable_profiling{}); + // hls-fpga-machine-learning insert wrapper #double +} +} + +#endif diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index a9830245a..39b2e17c1 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -60,9 +60,6 @@ int main(int argc, 
char **argv) { // hls-fpga-machine-learning insert inputs // hls-fpga-machine-learning insert results - std::vector inputs; - std::vector outputs; - if (fin.is_open() && fpr.is_open()) { std::vector> predictions; unsigned int num_iterations = 0; @@ -84,45 +81,46 @@ int main(int argc, char **argv) { while (sspred >> current) { pr.push_back(current); } - if (in.size() != N_INPUT_1_1) { - throw std::runtime_error("The input size does not match"); - } - if (pr.size() != N_LAYER_11) { - throw std::runtime_error("The output size does not match"); - } // hls-fpga-machine-learning insert data inputs.emplace_back(); + if (in.size() != inputs[0].size()) { + throw std::runtime_error("The input size does not match"); + } + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + outputs.emplace_back(); - predictions.push_back(std::move(pr)); + if (pr.size() != outputs[0].size()) { + throw std::runtime_error("The output size does not match"); + } + std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); + } // Do this separately to avoid vector reallocation - // hls-fpga-machine-learning insert top-level-function for(int i = 0; i < num_iterations; i++) { - InPipe::write(q, inputs[i]); + // hls-fpga-machine-learning insert tb-input q.single_task(MyProject{}); // once or once for each } q.wait(); for (int j = 0; j < num_iterations; j++) { // hls-fpga-machine-learning insert tb-output - outputs[j] = OutPipe::read(q); - for(int i = 0; i < N_LAYER_11; i++) { - fout << outputs[j][i] << " "; + for(auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; if (j % CHECKPOINT == 0) { std::cout << "Predictions" << std::endl; // hls-fpga-machine-learning insert predictions - for(int i = 0; i < N_LAYER_11; i++) { - std::cout << predictions[j][i] << " "; + for(auto predval : predictions[j]) { + std::cout << predval << " "; } std::cout << std::endl; std::cout << "Quantized predictions" << std::endl; // hls-fpga-machine-learning insert quantized - for(int i = 0; i < N_LAYER_11; i++) { - std::cout << outputs[j][i] << " "; + for(auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; } @@ -137,27 +135,26 @@ int main(int argc, char **argv) { for(int i = 0; i < num_iterations; i++) { inputs.emplace_back(); outputs.emplace_back(); - outputs.back().fill(0.0); + inputs.back().fill(0.0); } // hls-fpga-machine-learning insert top-level-function for(int i = 0; i < num_iterations; i++) { - InPipe::write(q, inputs[i]); + // hls-fpga-machine-learning insert tb-input q.single_task(MyProject{}); } q.wait(); for (int j = 0; j < num_iterations; j++) { - // hls-fpga-machine-learning insert output - outputs[j] = OutPipe::read(q); - for(int i = 0; i < N_LAYER_11; i++) { - std::cout << outputs[j][i] << " "; + // hls-fpga-machine-learning insert tb-output + for(auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; // hls-fpga-machine-learning insert tb-output - for(int i = 0; i < N_LAYER_11; i++) { - fout << outputs[j][i] << " "; + for(auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; } diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 39a5f14c4..e9d45ff08 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -359,6 +359,7 @@ def write_test_bench(self, model): dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' copyfile(srcpath, dstpath) + project_name = model.config.get_project_name() model_inputs = model.get_input_variables() model_outputs = 
model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -366,6 +367,11 @@ def write_test_bench(self, model): if len(model_brams != 0): raise NotImplementedError("Weights on the interface is currently not supported") + if len(model_inputs) != 1 or len(model_outputs) != 1: + print("The testbench supports only single input arrays and single output arrays.") + print("Please modify it before using it.") + return + if not os.path.exists(f'{model.config.get_output_dir()}/tb_data/'): os.mkdir(f'{model.config.get_output_dir()}/tb_data/') @@ -387,13 +393,13 @@ def write_test_bench(self, model): ) with open(os.path.join(filedir, '../templates/oneapi/myproject_test_parallel.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp', 'w') as fout: + open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout: for line in f.readlines(): indent = ' ' * (len(line) - len(line.lstrip(' '))) if 'myproject' in line: - newline = line.replace('myproject', model.config.get_project_name()) + newline = line.replace('myproject', project_name) elif 'MyProject' in line: newline = line.replace('MyProject', convert_to_pascal_case(project_name)) @@ -403,100 +409,118 @@ def write_test_bench(self, model): newline += f'#include \"firmware/weights/{bram.name}.h\"\n' elif '// hls-fpga-machine-learning insert inputs': newline = line - for inp in model_inputs: - newline += indent + f'std::vector<{inp.array_type}> {inp.name};\n' - newline += indent + f'input_counts.push_back({inp.size_cpp()});\n' + # there should really be only one input + inp = model_inputs[0] + newline += indent + f'std::vector<{inp.array_type}> inputs;\n' + elif '// hls-fpga-machine-learning insert results': newline = line - for out in model_outputs: - newline += indent + f'std::vector<{out.array_type}> {out.name};\n' - newline += indent + f'output_counts.push_back({out.size_cpp()});\n' - elif '// hls-fpga-machine-learning insert data' in line: - newline = line - newline += ' std::vector::const_iterator in_begin = in.cbegin();\n' - newline += ' std::vector::const_iterator in_end;\n' - newline += ' inputs.emplace_back();\n' - for inp in model.get_input_variables(): - newline += f' in_end = in_begin + ({inp.size_cpp()});\n' - newline += f' std::copy(in_begin, in_end, inputs.back().{inp.member_name});\n' - newline += ' in_begin = in_end;\n' - newline += ' outputs.emplace_back();\n' - elif '// hls-fpga-machine-learning insert zero' in line: - newline = line - newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' - for inp in model.get_input_variables(): - newline += indent + ' inputs.emplace_back();\n' - newline += indent + ' outputs.emplace_back();\n' - newline += indent + f' std::fill_n(inputs[i].{inp.member_name}, {inp.size_cpp()}, 0.0);\n' - newline += indent + '}\n' - - elif '// hls-fpga-machine-learning insert top-level-function' in line: + # there should really be only one out + out = model_outputs[0] + newline += indent + f'std::vector<{out.array_type}> predictions;\n' + elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line - newline += indent + 'for(int i = 0; i < num_iterations; i++) {\n' - newline += indent + f' ihc_hls_enqueue(&outputs[i], {model.config.get_project_name()}, inputs[i]' - if model_brams: - bram_vars = ','.join([b.name for b in model_brams]) - newline += f', {bram_vars});\n' - else: - newline += ');\n' - newline += indent + '}\n' - elif 'hls-fpga-machine-learning insert 
run' in line: - newline = line - newline += ' ' + f'ihc_hls_component_run_all({model.config.get_project_name()});\n' - elif '// hls-fpga-machine-learning insert predictions' in line: - newline = line - newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' - newline += indent + ' std::cout << predictions[j][i] << " ";\n' - newline += indent + '}\n' - newline += indent + 'std::cout << std::endl;\n' + inp = model_inputs[0] + newline += indent + f'{inp.pipe_name}::write(q, inputs[i]);\n' elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line - newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' - newline += indent + f' fout << outputs[j].{outvar.member_name}[i] << " ";\n' - newline += indent + '}\n' - newline += indent + 'fout << std::endl;\n' - elif ( - '// hls-fpga-machine-learning insert output' in line - or '// hls-fpga-machine-learning insert quantized' in line - ): - newline = line - newline += indent + f'for(int i = 0; i < {outvar.size_cpp()}; i++) {{\n' - newline += indent + f' std::cout << outputs[j].{outvar.member_name}[i] << " ";\n' - newline += indent + '}\n' - newline += indent + 'std::cout << std::endl;\n' + out = model_outputs[0] + newline += indent + f'outputs[i] = {out.pipe_name}::read(q);\n' else: newline = line fout.write(newline) - def write_test_bench(self, model): - """Write the testbench + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) Args: model (ModelGraph): the hls4ml model. """ - # TODO - This function only works with one model input - # (NOT one data point - it works as expected with multiple data points) + project_name = model.config.get_project_name() + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + # model brambs aren't actually supported yet + + io_type = model.config.get_config_value('IOType') + indent = ' ' - # copy the exception handler filedir = os.path.dirname(os.path.abspath(__file__)) - srcpath = os.path.join(filedir, '../templates/oneapi/exception_handler.hpp') - dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' - copyfile(srcpath, dstpath) + with open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, \ + open(f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w') as fout: - io_type = model.config.get_config_value('IOType') - if io_type == 'io_parallel': - self.write_testbench_parallel(model) - elif io_type == 'io_stream': - self.write_testbench_stream(model) + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(project_name.upper())) - def write_bridge(self, model): - """Write the Python-C++ bridge (myproject_bridge.cpp) + elif 'myproject' in line: + newline = line.replace('myproject', format(project_name)) + + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + 
newline += indent + outputs_str + '\n' + + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' + newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input);\n' + newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' + + newline += '\n' + + for o in model_outputs: + newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) + + newline += '\n' + + # input_vars = ','.join([i.name + '_input' for i in model_inputs]) + # bram_vars = ','.join([b.name for b in model_brams]) + # output_vars = ','.join([o.name + '_output' for o in model_outputs]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' + newline += top_level + + newline += '\n' + + for o in model_outputs: + newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' + newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp}>({o.name}_output, {o.name});\n' + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp') + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + else: + newline = line + fout.write(newline) - Args: - model (ModelGraph): the hls4ml model. 
- """ - pass def write_build_script(self, model): """Write the build scripts (Makefile, build_lib.sh) From 8f6ef788d9afd5507ecaeb122c3c4f280fe1c865 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 27 Dec 2023 19:06:18 -0600 Subject: [PATCH 005/100] fix issues discovered when compiling --- .../backends/oneapi/passes/core_templates.py | 8 +- .../firmware/nnet_utils/nnet_activation.h | 79 ++++++++++--------- .../oneapi/firmware/nnet_utils/nnet_dense.h | 21 ++--- hls4ml/writer/oneapi_writer.py | 26 +++--- 4 files changed, 68 insertions(+), 66 deletions(-) diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index aece9fc22..608e6b7ff 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -35,8 +35,8 @@ dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' -dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] - +# dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] +dense_include_list = ['nnet_utils/nnet_dense.h'] class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): @@ -147,8 +147,8 @@ def format(self, node): activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' -activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] - +# activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] +activ_include_list = ['nnet_utils/nnet_activation.h'] class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index d874741ec..191bf5613 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -2,6 +2,7 @@ #define NNET_ACTIVATION_H_ #include "nnet_common.h" +#include namespace nnet { @@ -23,7 +24,7 @@ struct activ_config { // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void linear(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -34,7 +35,7 @@ template void linear(data_T data[ // ************************************************* // RELU Activation // ************************************************* -template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void relu(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -46,7 +47,7 @@ template void relu(data_T data[CO } template -void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void relu_max(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -59,11 +60,11 @@ void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } } -template void relu6(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_in]) { +template void relu6(const std::array data, std::array res) { relu_max(data, res); } -template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void relu1(const std::array data, std::array res) { relu_max(data, res); } @@ -71,13 +72,13 @@ template void relu1(data_T data[C // Sigmoid Activation // ************************************************* template -void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void sigmoid(const std::array data, std::array res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T absoluteValue hls_register; - res_T temp2 hls_register; + [[intel::fpga_register]] data_T absoluteValue; + [[intel::fpga_register]] res_T temp2; if (data[ii] < 0) { absoluteValue = -data[ii]; } else { @@ -106,7 +107,7 @@ template inline unsigned softmax_stable_idx_fr static constexpr int N = ceillog2::val; // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width - N - 1); + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N - 1); // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness if (x != 0 && y == 0) y[0] = 1; @@ -118,29 +119,29 @@ template inline unsigned softmax_latency_idx_f static constexpr int N = ceillog2::val; // Slice the top N bits of the input - hls_register ac_int y = x.template slc(x.width - N); + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N); return y.to_uint(); } template -void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_stable(const std::array data, std::array res) { // Look-up tables #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" // Find maximum Op_max op_max; - hls_register data_T x_max = reduce>(data, op_max); + [[intel::fpga_register]] data_T x_max = reduce>(data.data(), op_max); // For the diffs, use the same type as the input but force rounding and saturation - hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; + [[intel::fpga_register]] ac_fixed d_xi_xmax[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { d_xi_xmax[i] = data[i] - x_max; } // Calculate all the e^x's - hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; @@ -148,11 +149,11 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Explicitly sum previously calculated exponentials with an adder tree Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { @@ -162,12 +163,12 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // TODO - Improve accuracy template -void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_latency(const std::array data, std::array 
res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" // Calculate all the e^x's - hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; @@ -175,11 +176,11 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Explicitly sum the results with an adder tree. Op_add op_add; - hls_register typename CONFIG_T::exp_table_t exp_sum = + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { @@ -188,11 +189,11 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_legacy(const std::array data, std::array res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" - hls_register int data_round[CONFIG_T::n_in]; + [[intel::fpga_register]] int data_round[CONFIG_T::n_in]; New_loop: #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { @@ -230,14 +231,14 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softmax_argmax(const std::array data, std::array res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { res[i] = (res_T)0; } - hls_register data_T maximum = data[0]; - hls_register int idx = 0; + [[intel::fpga_register]] data_T maximum = data[0]; + [[intel::fpga_register]] int idx = 0; #pragma ii 1 for (int i = 1; i < CONFIG_T::n_in; i++) { @@ -251,7 +252,7 @@ void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +inline void softmax(const std::array data, std::array res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -275,15 +276,15 @@ inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // TanH Activation // ************************************************* template -void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void dense_tanh(const std::array data, std::array res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T temp hls_register; - res_T temp2 hls_register; + [[intel::fpga_register]] data_T temp; + [[intel::fpga_register]] res_T temp2; if (data[ii] < 0) { temp = -data[ii]; } else { @@ -305,7 +306,7 @@ void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Hard sigmoid Activation // ************************************************* template -void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void hard_sigmoid(const std::array data, std::array res) { #pragma 
unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -318,7 +319,7 @@ void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { } template -void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void hard_tanh(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -364,7 +365,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // Softplus Activation // ************************************************* template -void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softplus(const std::array data, std::array res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -384,7 +385,7 @@ void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Softsign Activation // ************************************************* template -void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void softsign(const std::array data, std::array res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -392,8 +393,8 @@ void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T temp hls_register; - res_T temp2 hls_register; + [[intel::fpga_register]] data_T temp; + [[intel::fpga_register]] res_T temp2; if (data[ii] < 0) { temp = -data[ii]; } else { @@ -433,14 +434,14 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_i } } -template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void elu(const std::array data, std::array res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void selu(const std::array data, std::array res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data @@ -477,7 +478,7 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // Binary TanH Activation // ************************************************* template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void binary_tanh(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -495,7 +496,7 @@ void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Ternary TanH Activation // ************************************************* template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +void ternary_tanh(const std::array data, std::array res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = 2 * data[ii]; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index c1786ef78..5071a7d6a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -5,6 +5,7 @@ #include "nnet_helpers.h" #include "nnet_mult.h" #include 
+#include namespace nnet { @@ -37,21 +38,21 @@ struct dense_config { }; template -void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], +void dense_rf_gt(const std::array data, std::array res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); //#pragma ii CONFIG_T::reuse_factor - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; Load: #pragma unroll for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; } - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + [[intel::fpga_register]] int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + [[intel::fpga_register]] int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; #pragma unroll for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { @@ -66,7 +67,7 @@ void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma nofusion #pragma speculated_iterations 0 for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; Product2: #pragma unroll for (int im = 0; im < CONFIG_T::block_factor; im++) { @@ -78,7 +79,7 @@ void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], tmp_acc[im] = CONFIG_T::template product::product(data[data_index], weights[w_index]); } - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; ResetMult: #pragma unroll for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { @@ -105,14 +106,14 @@ void dense_rf_gt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], } } template -void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], +void dense_rf_lt(const std::array data, std::array res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && "The current Reuse Factor is not allowed"); assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; InitAccum: #pragma unroll for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { @@ -122,7 +123,7 @@ void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma nofusion #pragma speculated_iterations 0 for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; MultLoop: #pragma unroll for (int im = 0, in_index = ir; im < 
CONFIG_T::block_factor; im++) { @@ -157,7 +158,7 @@ void dense_rf_lt(const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], } template void dense_resource( - const data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const std::array data, std::array res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index e9d45ff08..bc0b13c46 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -62,7 +62,7 @@ def print_array_to_cpp(self, var, layer, odir): layer (Layer): Instance of the layer to which the weights belong odir (str): Output directory """ - with open(f"{odir}/firmware/weights/{var.name}.h", "w") as h_file: + with open(f"{odir}/src/firmware/weights/{var.name}.h", "w") as h_file: # meta data h_file.write(f"//Numpy array shape {var.shape}\n") @@ -142,7 +142,7 @@ def write_project_cpp(self, model): model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] - if len(model_brams != 0): + if len(model_brams) != 0: raise NotImplementedError("Weights on the interface is currently not supported") io_type = model.config.get_config_value('IOType') @@ -182,7 +182,7 @@ def write_project_cpp(self, model): if io_type != 'io_stream': vars = layer.get_variables() for var in vars: - if var not in model_inputs and var not in model_outputs: + if var not in model_inputs: def_cpp = var.definition_cpp() if def_cpp is not None: newline += ' ' + def_cpp + ';\n' @@ -364,7 +364,7 @@ def write_test_bench(self, model): model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] - if len(model_brams != 0): + if len(model_brams) != 0: raise NotImplementedError("Weights on the interface is currently not supported") if len(model_inputs) != 1 or len(model_outputs) != 1: @@ -392,11 +392,11 @@ def write_test_bench(self, model): output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' ) - with open(os.path.join(filedir, '../templates/oneapi/myproject_test_parallel.cpp')) as f, \ + with open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, \ open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout: for line in f.readlines(): - indent = ' ' * (len(line) - len(line.lstrip(' '))) + indent = ' ' * (len(line) - len(line.lstrip(' '))) if 'myproject' in line: newline = line.replace('myproject', project_name) @@ -407,17 +407,17 @@ def write_test_bench(self, model): newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' - elif '// hls-fpga-machine-learning insert inputs': + elif '// hls-fpga-machine-learning insert inputs' in line: newline = line # there should really be only one input inp = model_inputs[0] newline += indent + f'std::vector<{inp.array_type}> inputs;\n' - elif '// hls-fpga-machine-learning insert results': + elif '// hls-fpga-machine-learning insert results' in line: newline = line # there should really be only one out out = model_outputs[0] - newline += indent + f'std::vector<{out.array_type}> predictions;\n' + newline += indent + f'std::vector<{out.array_type}> outputs;\n' elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line inp = 
model_inputs[0] @@ -425,7 +425,7 @@ def write_test_bench(self, model): elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line out = model_outputs[0] - newline += indent + f'outputs[i] = {out.pipe_name}::read(q);\n' + newline += indent + f'outputs[j] = {out.pipe_name}::read(q);\n' else: newline = line @@ -494,7 +494,7 @@ def write_bridge(self, model): # output_vars = ','.join([o.name + '_output' for o in model_outputs]) # Concatenate the input, output, and bram variables. Filter out empty/null values - all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + # all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) top_level = indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' newline += top_level @@ -539,7 +539,7 @@ def write_build_script(self, model): line = line.replace('myproject', model.config.get_project_name()) if 'set(FPGA_DEVICE' in line: - line = f' set(FPGA_DEVICE "{device}")' + line = f' set(FPGA_DEVICE "{device}")\n' fout.write(line) @@ -583,7 +583,7 @@ def __get_table_size(self, model, activation): return 1024 def __get_table_header(self, table_name, table_size): - table_header += f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{' + table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{' return table_header def __write_elu_table(self, model, path): From 2e56be430b8427dd27f45c5ce84fd1b5a4298d92 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 8 Jan 2024 15:18:24 -0600 Subject: [PATCH 006/100] update bridge writing files --- hls4ml/templates/oneapi/CMakeLists.txt | 14 ++++++++++++++ hls4ml/templates/oneapi/myproject_bridge.cpp | 2 +- hls4ml/writer/oneapi_writer.py | 5 +++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index 66c505450..5e5490b51 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -20,7 +20,10 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) ### Customize these build variables ############################################################################### set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp) +set(LIBRARY_FILES src/firmware/myproject.cpp src/myproject_bridge.cpp) +set(LIB_STAMP mystamp) set(TARGET_NAME myproject) +set(LIBRARY_NAME myproject-${LIB_STAMP}) # Use cmake -DFPGA_DEVICE=: to choose a # different device. 
Here are a few device examples (this list is not @@ -62,6 +65,7 @@ set(SIMULATOR_TARGET fpga_sim) set(REPORT_TARGET report) set(FPGA_TARGET fpga) set(IP_EXPORT_TARGET fpga_ip_export) +set(LIBRARY_TARGET lib) # Set the names of the generated files per makefile target set(EMULATOR_OUTPUT_NAME ${TARGET_NAME}.${EMULATOR_TARGET}) @@ -112,6 +116,16 @@ set(FPGA_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reu set(IP_EXPORT_COMPILE_FLAGS -DFPGA_HARDWARE) set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early -fsycl-device-code-split=per_kernel) +############################################################################### +### FPGA Emulator library +############################################################################### +add_library(${LIBRARY_TARGET} SHARED ${LIBRARY_FILES}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${EMULATOR_LINK_FLAGS}) +set_target_properties(${LIBRARY_TARGET} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) + ############################################################################### ### FPGA Emulator ############################################################################### diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp index 4b7a6b170..f4974ad8b 100644 --- a/hls4ml/templates/oneapi/myproject_bridge.cpp +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -7,7 +7,7 @@ #include #include "exception_handler.hpp" -/ + // hls-fpga-machine-learning insert bram namespace nnet { diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index bc0b13c46..6ccac4459 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -479,7 +479,7 @@ def write_bridge(self, model): newline = '' for i in model_inputs: newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' - newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input);\n' + newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' newline += '\n' @@ -503,7 +503,7 @@ def write_bridge(self, model): for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' - newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp}>({o.name}_output, {o.name});\n' + newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): @@ -537,6 +537,7 @@ def write_build_script(self, model): for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) if 'set(FPGA_DEVICE' in line: line = f' set(FPGA_DEVICE "{device}")\n' From b90021fb94195ddf97e791368d3a880020bda94a Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 8 Jan 2024 18:37:07 -0600 Subject: [PATCH 007/100] build library (but not tested) --- hls4ml/backends/oneapi/oneapi_backend.py | 40 +++++++++++++++++------- hls4ml/templates/oneapi/CMakeLists.txt | 12 ++++--- 2 files 
changed, 35 insertions(+), 17 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index 799c28963..53f8d83b3 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -1,5 +1,5 @@ -import os -from contextlib import contextmanager +import subprocess +from pathlib import Path import numpy as np @@ -9,17 +9,8 @@ from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType -#from hls4ml.report import parse_oneapi_report - -@contextmanager -def chdir(newdir): - prevdir = os.getcwd() - os.chdir(os.path.expanduser(newdir)) - try: - yield - finally: - os.chdir(prevdir) +# from hls4ml.report import parse_oneapi_report class OneAPIBackend(FPGABackend): @@ -134,6 +125,31 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para return config + def compile(self, model): + """Compile the generated project that can be linked into Python runtime. + + Args: + model (ModelGraph): Model to compile. + + Raises: + Exception: If the project failed to compile + + Returns: + string: Returns the name of the compiled library. + """ + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run('make lib', shell=True, cwd=builddir, check=True) + + lib_name = builddir / f'lib{model.config.get_project_name()}-{model.config.get_config_value("Stamp")}.so' + return lib_name + def build(self, model, synth=True, fpgasynth=False, log_level=1, cont_if_large_area=False): """ Builds the project using Intel DPC++ (oneAPI) compiler. diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index 5e5490b51..abadd5395 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -31,8 +31,8 @@ set(LIBRARY_NAME myproject-${LIB_STAMP}) # intel_s10sx_pac:pac_s10 # intel_s10sx_pac:pac_s10_usm # intel_a10gx_pac:pac_a10 -# Note that depending on your installation, you may need to specify the full -# path to the board support package (BSP), this usually is in your install +# Note that depending on your installation, you may need to specify the full +# path to the board support package (BSP), this usually is in your install # folder. # # You can also specify a device family (E.g. "Arria10" or "Stratix10") or a @@ -42,7 +42,7 @@ if(NOT DEFINED FPGA_DEVICE) endif() # Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend -# compilation. +# compilation. set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) # Use cmake -DUSER_FLAGS= to set extra flags for general compilation. @@ -97,6 +97,8 @@ else() endif() set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) +# for debugging need to do this. 
Not sure why +# set(COMMON_LINK_FLAGS -v -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) # A SYCL ahead-of-time (AoT) compile processes the device code in two stages. @@ -210,7 +212,7 @@ function(getCompileCommands common_compile_flags special_compile_flags common_li # Get the relative path to the source and object files file(RELATIVE_PATH CURRENT_SOURCE_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${source}) file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) - + # Creating a string that contains the compile command # Start by the compiler invocation set(COMPILE_COMMAND "${COMPILE_COMMAND}${CMAKE_CXX_COMPILER}") @@ -254,7 +256,7 @@ function(getCompileCommands common_compile_flags special_compile_flags common_li # Add all the specific link flags foreach(FLAG ${special_link_flags}) set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") - endforeach() + endforeach() # Add the output file set(LINK_COMMAND "${LINK_COMMAND} -o ${output_name}") From f086aa2c6c142c79bf51845574e74040bf548dd7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 10 Jan 2024 17:50:19 -0600 Subject: [PATCH 008/100] fix a bug in testbench --- hls4ml/templates/oneapi/myproject_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 39b2e17c1..fce1c19db 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -152,7 +152,6 @@ int main(int argc, char **argv) { } std::cout << std::endl; - // hls-fpga-machine-learning insert tb-output for(auto outval : outputs[j]) { fout << outval << " "; } From 1f28cbf934af8bfe54010b5f96addcfd4816c0a6 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 10 Jan 2024 20:13:47 -0600 Subject: [PATCH 009/100] snapshot after some debugging --- hls4ml/model/graph.py | 4 +- hls4ml/templates/oneapi/CMakeLists.txt | 6 ++- .../firmware/nnet_utils/nnet_activation.h | 42 +++++++++---------- .../oneapi/firmware/nnet_utils/nnet_dense.h | 6 +-- .../oneapi/firmware/nnet_utils/nnet_printf.h | 18 ++++++++ hls4ml/writer/oneapi_writer.py | 23 +++------- 6 files changed, 55 insertions(+), 44 deletions(-) create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index a6b5c29e8..ba10e0285 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -740,7 +740,9 @@ def predict(self, x): n_outputs = len(self.get_output_variables()) curr_dir = os.getcwd() - os.chdir(self.config.get_output_dir() + '/firmware') + newdir = self.config.get_output_dir() + '/firmware' if os.path.exists(self.config.get_output_dir() + '/firmware') \ + else self.config.get_output_dir() + '/src/firmware' + os.chdir(newdir) output = [] if n_samples == 1 and n_inputs == 1: diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index abadd5395..d6b2a4745 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -107,7 +107,9 @@ set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) # 2. The "link" stage invokes the compiler's FPGA backend before linking. For # this reason, FPGA backend flags must be passed as link flags in CMake. 
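The emulator/hardware split described in the comment above carries through to the generated host code: the same FPGA_* definitions that these per-target compile flags set are what pick the SYCL device at run time, and the new 'lib' target is simply the emulator flavour packaged as a shared library for the Python bridge. A minimal host-side sketch, assembled from calls that appear elsewhere in this patch; it is not verbatim writer output, and the pipe and array names (Fc1InputPipe, Layer2OutPipe, fc1_input_array_t, layer2_out_array_t) are illustrative placeholders for whatever a real model generates:

    #include <sycl/ext/intel/fpga_extensions.hpp>
    #include "exception_handler.hpp"
    #include "firmware/myproject.h" // declares the kernel functor and the I/O pipes

    void run_once(const fc1_input_array_t &in, layer2_out_array_t &out) {
    #if FPGA_SIMULATOR
        auto selector = sycl::ext::intel::fpga_simulator_selector_v;
    #elif FPGA_HARDWARE
        auto selector = sycl::ext::intel::fpga_selector_v;
    #else // FPGA_EMULATOR, also what the 'lib' target is compiled with
        auto selector = sycl::ext::intel::fpga_emulator_selector_v;
    #endif
        sycl::queue q(selector, fpga_tools::exception_handler, sycl::property::queue::enable_profiling{});
        Fc1InputPipe::write(q, in);   // host-to-device pipe carries one input sample
        q.single_task(MyProject{});   // enqueue the kernel functor once per sample
        q.wait();
        out = Layer2OutPipe::read(q); // device-to-host pipe returns the result
    }

The same body, compiled with -DFPGA_EMULATOR into the shared library, is what the generated bridge exposes to Python, while the fpga and fpga_sim targets reuse it against the hardware and simulator selectors.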
set(EMULATOR_COMPILE_FLAGS -DFPGA_EMULATOR) +set(LIBRARY_COMPILE_FLAGS -DFPGA_EMULATOR) set(EMULATOR_LINK_FLAGS ) +set(LIBRARY_LINK_FLAGS -L$ENV{FPGA_VARS_DIR}/host/linux64/lib) set(REPORT_COMPILE_FLAGS -DFPGA_HARDWARE) set(REPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early) set(SIMULATOR_COMPILE_FLAGS -Xssimulation -DFPGA_SIMULATOR) @@ -123,9 +125,9 @@ set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} ############################################################################### add_library(${LIBRARY_TARGET} SHARED ${LIBRARY_FILES}) target_compile_options(${LIBRARY_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) -target_compile_options(${LIBRARY_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${LIBRARY_COMPILE_FLAGS}) target_link_libraries(${LIBRARY_TARGET} ${COMMON_LINK_FLAGS}) -target_link_libraries(${LIBRARY_TARGET} ${EMULATOR_LINK_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${LIBRARY_LINK_FLAGS}) set_target_properties(${LIBRARY_TARGET} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) ############################################################################### diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 191bf5613..19fbdb3b5 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -24,7 +24,7 @@ struct activ_config { // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template void linear(const std::array data, std::array res) { +template void linear(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -35,7 +35,7 @@ template void linear(const std::a // ************************************************* // RELU Activation // ************************************************* -template void relu(const std::array data, std::array res) { +template void relu(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -47,7 +47,7 @@ template void relu(const std::arr } template -void relu_max(const std::array data, std::array res) { +void relu_max(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -60,11 +60,11 @@ void relu_max(const std::array data, std::array void relu6(const std::array data, std::array res) { +template void relu6(const std::array& data, std::array& res) { relu_max(data, res); } -template void relu1(const std::array data, std::array res) { +template void relu1(const std::array& data, std::array& res) { relu_max(data, res); } @@ -72,7 +72,7 @@ template void relu1(const std::ar // Sigmoid Activation // ************************************************* template -void sigmoid(const std::array data, std::array res) { +void sigmoid(const std::array& data, std::array& res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll @@ -124,7 +124,7 @@ template inline unsigned softmax_latency_idx_f } template -void softmax_stable(const std::array data, std::array res) { +void softmax_stable(const std::array& data, std::array& res) { // Look-up tables #include "activation_tables/exp_table.tb" #include 
"activation_tables/invert_table.tb" @@ -163,7 +163,7 @@ void softmax_stable(const std::array data, std::array -void softmax_latency(const std::array data, std::array res) { +void softmax_latency(const std::array& data, std::array& res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" @@ -189,7 +189,7 @@ void softmax_latency(const std::array data, std::array -void softmax_legacy(const std::array data, std::array res) { +void softmax_legacy(const std::array& data, std::array& res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -231,7 +231,7 @@ void softmax_legacy(const std::array data, std::array -void softmax_argmax(const std::array data, std::array res) { +void softmax_argmax(const std::array& data, std::array& res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { res[i] = (res_T)0; @@ -252,7 +252,7 @@ void softmax_argmax(const std::array data, std::array -inline void softmax(const std::array data, std::array res) { +inline void softmax(const std::array& data, std::array& res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -276,7 +276,7 @@ inline void softmax(const std::array data, std::array -void dense_tanh(const std::array data, std::array res) { +void dense_tanh(const std::array& data, std::array& res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" @@ -306,7 +306,7 @@ void dense_tanh(const std::array data, std::array -void hard_sigmoid(const std::array data, std::array res) { +void hard_sigmoid(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -319,7 +319,7 @@ void hard_sigmoid(const std::array data, std::array -void hard_tanh(const std::array data, std::array res) { +void hard_tanh(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -365,7 +365,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // Softplus Activation // ************************************************* template -void softplus(const std::array data, std::array res) { +void softplus(const std::array& data, std::array& res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -385,7 +385,7 @@ void softplus(const std::array data, std::array -void softsign(const std::array data, std::array res) { +void softsign(const std::array& data, std::array& res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -416,7 +416,7 @@ void softsign(const std::array data, std::array -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { +void elu(const std::array& data, const res_T alpha, std::array& res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data @@ -434,14 +434,14 @@ void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_i } } -template void elu(const std::array data, std::array res) { +template void elu(const std::array& data, std::array& res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // 
************************************************* -template void selu(const std::array data, std::array res) { +template void selu(const std::array& data, std::array& res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data @@ -478,7 +478,7 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // Binary TanH Activation // ************************************************* template -void binary_tanh(const std::array data, std::array res) { +void binary_tanh(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -496,7 +496,7 @@ void binary_tanh(const std::array data, std::array -void ternary_tanh(const std::array data, std::array res) { +void ternary_tanh(const std::array& data, std::array& res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = 2 * data[ii]; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index 5071a7d6a..f6bbfc04a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -38,7 +38,7 @@ struct dense_config { }; template -void dense_rf_gt(const std::array data, std::array res, +void dense_rf_gt(const std::array& data, std::array& res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -106,7 +106,7 @@ void dense_rf_gt(const std::array data, std::array -void dense_rf_lt(const std::array data, std::array res, +void dense_rf_lt(const std::array& data, std::array& res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -158,7 +158,7 @@ void dense_rf_lt(const std::array data, std::array void dense_resource( - const std::array data, std::array res, + const std::array& data, std::array& res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h new file mode 100644 index 000000000..830a322de --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h @@ -0,0 +1,18 @@ +#ifndef NNET_PRINTF_H_ +#define NNET_PRINTF_H_ + +#ifdef __SYCL_DEVICE_ONLY__ +#define CL_CONSTANT __attribute__((opencl_constant)) +#else +#define CL_CONSTANT +#endif + +using namespace sycl; + +#define PRINTF(format, ...) 
\ + { \ + static const CL_CONSTANT char _format[] = format; \ + ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \ + } + +#endif diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 6ccac4459..75555475c 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -160,7 +160,7 @@ def write_project_cpp(self, model): newline = line if io_type == 'io_parallel': for inp in model_inputs: - newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' + newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' else: raise NotImplementedError("Only io_parallel is currently supported with oneAPI") @@ -203,7 +203,7 @@ def write_project_cpp(self, model): newline = line if io_type == 'io_parallel': for out in model_outputs: - newline += indent + f'{out.pipe_name}::write({out.name});\n' + newline += indent + f'{out.pipe_name}::write({out.name});\n' else: raise NotImplementedError("Only io_parallel is currently supported with oneAPI") @@ -425,7 +425,7 @@ def write_test_bench(self, model): elif '// hls-fpga-machine-learning insert tb-output' in line: newline = line out = model_outputs[0] - newline += indent + f'outputs[j] = {out.pipe_name}::read(q);\n' + newline += indent + f'outputs[j] = {out.pipe_name}::read(q);\n' else: newline = line @@ -484,25 +484,14 @@ def write_bridge(self, model): newline += '\n' - for o in model_outputs: - newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) - - newline += '\n' - - # input_vars = ','.join([i.name + '_input' for i in model_inputs]) - # bram_vars = ','.join([b.name for b in model_brams]) - # output_vars = ','.join([o.name + '_output' for o in model_outputs]) - - # Concatenate the input, output, and bram variables. Filter out empty/null values - # all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) - - top_level = indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' - newline += top_level + newline += indent + f'q.single_task({convert_to_pascal_case(project_name)}{{}});\n' + newline += indent + 'q.wait();\n' newline += '\n' for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' + newline += indent + f'for (auto val : {o.name}_output) std::cout << val.to_double() << std::endl;\n' newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' From 3e69b9a7f2d25aaca4c513b4c3fa62a0c24990ca Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 10 Jan 2024 20:35:57 -0600 Subject: [PATCH 010/100] remove forgotten debug printing --- hls4ml/writer/oneapi_writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 75555475c..deff589ed 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -491,7 +491,6 @@ def write_bridge(self, model): for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' - newline += indent + f'for (auto val : {o.name}_output) std::cout << val.to_double() << std::endl;\n' newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' From 17e6856e58351863d04aad1b0d2e2e934146964c Mon Sep 17 00:00:00 2001 From: Jovan 
Mitrevski Date: Wed, 10 Jan 2024 20:58:20 -0600 Subject: [PATCH 011/100] add build --- hls4ml/backends/oneapi/oneapi_backend.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index 53f8d83b3..0f7ca7a77 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -150,22 +150,31 @@ def compile(self, model): lib_name = builddir / f'lib{model.config.get_project_name()}-{model.config.get_config_value("Stamp")}.so' return lib_name - def build(self, model, synth=True, fpgasynth=False, log_level=1, cont_if_large_area=False): + def build(self, model, build_type='fpga_emu', run=False): """ Builds the project using Intel DPC++ (oneAPI) compiler. Args: model (ModelGraph): The model to build - synth, optional: Whether to run HLS synthesis - fpgasynth, optional: Whether to run FPGA synthesis (oneAPI Compile) - log_level, optional: Logging level to be displayed during HLS synthesis (0, 1, 2) - cont_if_large_area: Instruct the HLS compiler to continue synthesis if the estimated resource usage exceeds - device resources + build_type, optional: What to build (e.g. fpga_emu, fpga_sim, fpga, report) + run, optional: Whether to run the testbench Errors raise exceptions """ # Check software needed is present - pass + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run(f'make {build_type}', shell=True, cwd=builddir, check=True) + + if run and build_type in ('fpga_emu', 'fpga_sim', 'fpga'): + executable = builddir / f'{model.config.get_project_name()}.{build_type}' + subprocess.run(f'{str(executable)}', shell=True, cwd=builddir, check=True) @layer_optimizer(Layer) def init_base_layer(self, layer): From 2766a6e5717a8f11d323345e0ffe157c078e431d Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 12 Jan 2024 14:50:40 -0600 Subject: [PATCH 012/100] pre-commit fixes --- hls4ml/backends/__init__.py | 2 +- hls4ml/backends/oneapi/oneapi_types.py | 23 ++++--- .../backends/oneapi/passes/core_templates.py | 2 + .../backends/oneapi/passes/transform_types.py | 31 +++++---- hls4ml/model/graph.py | 5 +- hls4ml/templates/oneapi/exception_handler.hpp | 15 ++--- hls4ml/templates/oneapi/firmware/defines.h | 6 +- .../templates/oneapi/firmware/myproject.cpp | 9 +-- hls4ml/templates/oneapi/firmware/myproject.h | 9 +-- .../firmware/nnet_utils/nnet_activation.h | 48 +++++++------ .../oneapi/firmware/nnet_utils/nnet_common.h | 10 +-- .../oneapi/firmware/nnet_utils/nnet_dense.h | 8 +-- .../oneapi/firmware/nnet_utils/nnet_helpers.h | 32 +++------ .../oneapi/firmware/nnet_utils/nnet_printf.h | 10 +-- hls4ml/templates/oneapi/myproject_bridge.cpp | 6 +- hls4ml/templates/oneapi/myproject_test.cpp | 50 ++++++-------- hls4ml/writer/__init__.py | 2 +- hls4ml/writer/oneapi_writer.py | 67 +++++++++---------- 18 files changed, 161 insertions(+), 174 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index cbd39813f..ac6dd73fe 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,9 +1,9 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, 
register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 +from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend -from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index d76449f1e..f28d697a4 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -1,27 +1,30 @@ ''' This package includes oneAPI-specific customizations to the variable types ''' -from hls4ml.backends.fpga.fpga_types import VariableDefinition, ArrayVariableConverter +from hls4ml.backends.fpga.fpga_types import ArrayVariableConverter, VariableDefinition # region ArrayVarable + class OneAPIArrayVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): return f'[[{self.pragma}]] std::array<{self.type.name}, {self.size_cpp()}> {self.name}{name_suffix}' + class OneAPIInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' + class OneAPIArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) + class OneAPIInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition - ) + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition) + # endregion @@ -31,12 +34,14 @@ def __init__(self, type_converter): class OneAPIInterfaceVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): return f'[[{self.pragma}]] {self.array_type} {self.name}{name_suffix}' - + def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' - lines += indent + (f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n') + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n' + ) return lines @@ -63,9 +68,7 @@ def convert(self, tensor_var, pipe_name, pipe_id, array_type, pragma='partition' class OneAPIInterfaceVariableConverter(InterfaceVariableConverter): def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition - ) + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition) # endregion diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index 608e6b7ff..929b5a8be 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ 
b/hls4ml/backends/oneapi/passes/core_templates.py @@ -38,6 +38,7 @@ # dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] dense_include_list = ['nnet_utils/nnet_dense.h'] + class DenseConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(Dense) @@ -150,6 +151,7 @@ def format(self, node): # activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] activ_include_list = ['nnet_utils/nnet_activation.h'] + class ActivationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__((Activation, ParametrizedActivation, PReLU)) diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 121392eda..2cfcd02c7 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -1,17 +1,14 @@ -from hls4ml.backends.fpga.fpga_types import ( - ACTypeConverter, - HLSTypeConverter, - StaticWeightVariableConverter, -) +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter from hls4ml.backends.oneapi.oneapi_types import ( OneAPIArrayVariableConverter, OneAPIInplaceArrayVariableConverter, - OneAPIInterfaceVariableConverter + OneAPIInterfaceVariableConverter, ) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable from hls4ml.utils.string_utils import convert_to_pascal_case + class TransformTypes(GlobalOptimizerPass): def __init__(self): self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter()) @@ -28,15 +25,21 @@ def transform(self, model, node): raise NotImplementedError("io_stream is not yet implemented for oneAPI") elif io_type == 'io_parallel': if out_name in node.model.inputs: - new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t') + new_var = self.interface_var_converter.convert( + var, + pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t', + ) elif out_name in node.model.outputs: - new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t') + new_var = self.interface_var_converter.convert( + var, + pragma='intel::fpga_register', + pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', + pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', + array_type=f'{var.name}_array_t', + ) elif isinstance(var, InplaceTensorVariable): new_var = self.inplace_array_var_converter.convert(var, pragma='') else: diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index ba10e0285..eb7c6f36e 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -740,8 +740,11 @@ def predict(self, x): n_outputs = len(self.get_output_variables()) curr_dir = os.getcwd() - newdir = self.config.get_output_dir() + '/firmware' if os.path.exists(self.config.get_output_dir() + '/firmware') \ + newdir = ( + self.config.get_output_dir() + '/firmware' + if os.path.exists(self.config.get_output_dir() + '/firmware') else self.config.get_output_dir() + '/src/firmware' + ) os.chdir(newdir) output = [] diff 
--git a/hls4ml/templates/oneapi/exception_handler.hpp b/hls4ml/templates/oneapi/exception_handler.hpp index f5b9c8433..bb7976f61 100644 --- a/hls4ml/templates/oneapi/exception_handler.hpp +++ b/hls4ml/templates/oneapi/exception_handler.hpp @@ -1,20 +1,19 @@ #ifndef __EXCEPTIONHANDLER_HPP__ #define __EXCEPTIONHANDLER_HPP__ -#include #include #include +#include namespace fpga_tools { void exception_handler(sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const &e) { - std::cout << "Caught asynchronous SYCL exception:\n" - << e.what() << std::endl; + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl; + } } - } } } // namespace fpga_tools diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 04dc640a1..7e6bb6b6e 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -1,12 +1,12 @@ #ifndef DEFINES_H_ #define DEFINES_H_ -#include -#include +#include #include #include +#include #include -#include +#include // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp index 0dc79a21c..38e18e6ac 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.cpp +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -8,12 +8,9 @@ void MyProject::operator()() const { // NETWORK INSTANTIATION // **************************************** -// hls-fpga-machine-learning read in + // hls-fpga-machine-learning read in -// hls-fpga-machine-learning insert layers - -// hls-fpga-machine-learning return + // hls-fpga-machine-learning insert layers + // hls-fpga-machine-learning return } - - diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index 52f457344..082ae5dc8 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -6,8 +6,7 @@ // This file defines the interface to the kernel // currently this is fixed -using PipeProps = decltype(sycl::ext::oneapi::experimental::properties( - sycl::ext::intel::experimental::ready_latency<0>)); +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); // Need to declare the input and output pipes @@ -20,13 +19,11 @@ struct MyProject { // kernel property method to config invocation interface auto get(sycl::ext::oneapi::experimental::properties_tag) { - return sycl::ext::oneapi::experimental::properties{ - sycl::ext::intel::experimental::streaming_interface<>, - sycl::ext::intel::experimental::pipelined<>}; + return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, + sycl::ext::intel::experimental::pipelined<>}; } SYCL_EXTERNAL void operator()() const; }; - #endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 19fbdb3b5..411d42b09 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -24,7 +24,8 @@ struct activ_config { // 
************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template void linear(const std::array& data, std::array& res) { +template +void linear(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -35,7 +36,8 @@ template void linear(const std::a // ************************************************* // RELU Activation // ************************************************* -template void relu(const std::array& data, std::array& res) { +template +void relu(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -47,7 +49,7 @@ template void relu(const std::arr } template -void relu_max(const std::array& data, std::array& res) { +void relu_max(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -60,11 +62,13 @@ void relu_max(const std::array& data, std::array void relu6(const std::array& data, std::array& res) { +template +void relu6(const std::array &data, std::array &res) { relu_max(data, res); } -template void relu1(const std::array& data, std::array& res) { +template +void relu1(const std::array &data, std::array &res) { relu_max(data, res); } @@ -72,7 +76,7 @@ template void relu1(const std::ar // Sigmoid Activation // ************************************************* template -void sigmoid(const std::array& data, std::array& res) { +void sigmoid(const std::array &data, std::array &res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll @@ -124,7 +128,7 @@ template inline unsigned softmax_latency_idx_f } template -void softmax_stable(const std::array& data, std::array& res) { +void softmax_stable(const std::array &data, std::array &res) { // Look-up tables #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" @@ -163,7 +167,7 @@ void softmax_stable(const std::array& data, std::array -void softmax_latency(const std::array& data, std::array& res) { +void softmax_latency(const std::array &data, std::array &res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" @@ -189,7 +193,7 @@ void softmax_latency(const std::array& data, std::array< } template -void softmax_legacy(const std::array& data, std::array& res) { +void softmax_legacy(const std::array &data, std::array &res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -231,7 +235,7 @@ void softmax_legacy(const std::array& data, std::array -void softmax_argmax(const std::array& data, std::array& res) { +void softmax_argmax(const std::array &data, std::array &res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { res[i] = (res_T)0; @@ -252,7 +256,7 @@ void softmax_argmax(const std::array& data, std::array -inline void softmax(const std::array& data, std::array& res) { +inline void softmax(const std::array &data, std::array &res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -276,7 +280,7 @@ inline void softmax(const std::array& data, std::array -void dense_tanh(const std::array& data, std::array& res) { +void dense_tanh(const std::array &data, std::array &res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" @@ -306,7 
+310,7 @@ void dense_tanh(const std::array& data, std::array -void hard_sigmoid(const std::array& data, std::array& res) { +void hard_sigmoid(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -319,7 +323,7 @@ void hard_sigmoid(const std::array& data, std::array -void hard_tanh(const std::array& data, std::array& res) { +void hard_tanh(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -365,7 +369,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // Softplus Activation // ************************************************* template -void softplus(const std::array& data, std::array& res) { +void softplus(const std::array &data, std::array &res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -385,7 +389,7 @@ void softplus(const std::array& data, std::array -void softsign(const std::array& data, std::array& res) { +void softsign(const std::array &data, std::array &res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -416,7 +420,7 @@ void softsign(const std::array& data, std::array -void elu(const std::array& data, const res_T alpha, std::array& res) { +void elu(const std::array &data, const res_T alpha, std::array &res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data @@ -434,14 +438,16 @@ void elu(const std::array& data, const res_T alpha, std: } } -template void elu(const std::array& data, std::array& res) { +template +void elu(const std::array &data, std::array &res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template void selu(const std::array& data, std::array& res) { +template +void selu(const std::array &data, std::array &res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data @@ -478,7 +484,7 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // Binary TanH Activation // ************************************************* template -void binary_tanh(const std::array& data, std::array& res) { +void binary_tanh(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = data[ii]; @@ -496,7 +502,7 @@ void binary_tanh(const std::array& data, std::array -void ternary_tanh(const std::array& data, std::array& res) { +void ternary_tanh(const std::array &data, std::array &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_T datareg = 2 * data[ii]; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h index abefd87b8..f37a61cb0 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -1,11 +1,10 @@ #ifndef NNET_COMMON_H_ #define NNET_COMMON_H_ - #include "nnet_helpers.h" -#include #include #include +#include typedef ac_fixed<16, 6> table_default_t; @@ -39,14 +38,11 @@ template void merge(data_T data1[NIN1], data_ * before applying and accumulate the result over the rolled 
dimension. * --- */ template T reduce(const T *x, Op op) { - static constexpr int leftN = pow2::val>::val > 0 ? - pow2::val>::val : - 0; + static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; if constexpr (N == 1) { return x[0]; - } - else if constexpr (N == 2) { + } else if constexpr (N == 2) { return op(x[0], x[1]); } else { return op(reduce(x, op), reduce(x + leftN, op)); diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index f6bbfc04a..d4a5ad895 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -4,8 +4,8 @@ #include "nnet_common.h" #include "nnet_helpers.h" #include "nnet_mult.h" -#include #include +#include namespace nnet { @@ -38,7 +38,7 @@ struct dense_config { }; template -void dense_rf_gt(const std::array& data, std::array& res, +void dense_rf_gt(const std::array &data, std::array &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -106,7 +106,7 @@ void dense_rf_gt(const std::array& data, std::array -void dense_rf_lt(const std::array& data, std::array& res, +void dense_rf_lt(const std::array &data, std::array &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -158,7 +158,7 @@ void dense_rf_lt(const std::array& data, std::array void dense_resource( - const std::array& data, std::array& res, + const std::array &data, std::array &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h index 888ea4a6f..284bbfd6f 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -2,13 +2,13 @@ #define NNET_HELPERS_H #include +#include +#include +#include #include #include #include -#include #include -#include -#include namespace nnet { @@ -30,48 +30,39 @@ extern size_t trace_type_size; // constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } // replace with template metaprogramming -template struct ceillog2 -{ +template struct ceillog2 { enum { val = 1 + ceillog2<((n + 1) / 2)>::val }; }; -template<> struct ceillog2<2> -{ +template <> struct ceillog2<2> { enum { val = 1 }; }; -template<> struct ceillog2<1> -{ +template <> struct ceillog2<1> { enum { val = 0 }; }; - // constexpr int floorlog2(int x) { return (x < 2) ? 
0 : 1 + floorlog2(x / 2); } // replace with template metaprogramming -template struct floorlog2 -{ +template struct floorlog2 { enum { val = 1 + floorlog2<(n / 2)>::val }; }; -template<> struct floorlog2<1> -{ +template <> struct floorlog2<1> { enum { val = 0 }; }; -template<> struct floorlog2<0> -{ +template <> struct floorlog2<0> { enum { val = 0 }; }; // constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } // replace with template metaprogramming -template struct pow2 -{ +template struct pow2 { enum { val = 2 * pow2<(n - 1)>::val }; }; -template<> struct pow2<0> -{ +template <> struct pow2<0> { enum { val = 1 }; }; @@ -81,7 +72,6 @@ template void save_output_array(data_T *data, save_ } } - // We don't want to include save_T in this function because it will be inserted into myproject.cpp // so a workaround with element size is used template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h index 830a322de..5fec90d1a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h @@ -9,10 +9,10 @@ using namespace sycl; -#define PRINTF(format, ...) \ - { \ - static const CL_CONSTANT char _format[] = format; \ - ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \ - } +#define PRINTF(format, ...) \ + { \ + static const CL_CONSTANT char _format[] = format; \ + ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \ + } #endif diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp index f4974ad8b..3beb224ea 100644 --- a/hls4ml/templates/oneapi/myproject_bridge.cpp +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -55,8 +55,7 @@ void myproject_float( // hls-fpga-machine-learning insert header #float ) { auto selector = sycl::ext::intel::fpga_emulator_selector_v; - sycl::queue q(selector, fpga_tools::exception_handler, - sycl::property::queue::enable_profiling{}); + sycl::queue q(selector, fpga_tools::exception_handler, sycl::property::queue::enable_profiling{}); // hls-fpga-machine-learning insert wrapper #float } @@ -65,8 +64,7 @@ void myproject_double( // hls-fpga-machine-learning insert header #double ) { auto selector = sycl::ext::intel::fpga_emulator_selector_v; - sycl::queue q(selector, fpga_tools::exception_handler, - sycl::property::queue::enable_profiling{}); + sycl::queue q(selector, fpga_tools::exception_handler, sycl::property::queue::enable_profiling{}); // hls-fpga-machine-learning insert wrapper #double } } diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index fce1c19db..c64fb6549 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -1,10 +1,10 @@ #include #include +#include #include #include #include #include -#include #include "firmware/myproject.h" #include "firmware/parameters.h" @@ -17,34 +17,29 @@ #define CHECKPOINT 5000 - int main(int argc, char **argv) { #if FPGA_SIMULATOR auto selector = sycl::ext::intel::fpga_simulator_selector_v; #elif FPGA_HARDWARE auto selector = sycl::ext::intel::fpga_selector_v; -#else // #if FPGA_EMULATOR +#else // #if FPGA_EMULATOR auto selector = sycl::ext::intel::fpga_emulator_selector_v; #endif - sycl::queue q(selector, fpga_tools::exception_handler, - sycl::property::queue::enable_profiling{}); + sycl::queue q(selector, 
fpga_tools::exception_handler, sycl::property::queue::enable_profiling{}); auto device = q.get_device(); // make sure the device supports USM host allocations if (!device.has(sycl::aspect::usm_host_allocations)) { - std::cerr << "This design must either target a board that supports USM " - "Host/Shared allocations, or IP Component Authoring. " - << std::endl; - std::terminate(); + std::cerr << "This design must either target a board that supports USM " + "Host/Shared allocations, or IP Component Authoring. " + << std::endl; + std::terminate(); } - std::cout << "Running on device: " - << device.get_info().c_str() - << std::endl; - + std::cout << "Running on device: " << device.get_info().c_str() << std::endl; // load input data from text file std::ifstream fin("tb_data/tb_input_features.dat"); @@ -95,32 +90,31 @@ int main(int argc, char **argv) { throw std::runtime_error("The output size does not match"); } std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); - } // Do this separately to avoid vector reallocation - for(int i = 0; i < num_iterations; i++) { + for (int i = 0; i < num_iterations; i++) { // hls-fpga-machine-learning insert tb-input - q.single_task(MyProject{}); // once or once for each + q.single_task(MyProject{}); // once or once for each } q.wait(); for (int j = 0; j < num_iterations; j++) { // hls-fpga-machine-learning insert tb-output - for(auto outval : outputs[j]) { - fout << outval << " "; + for (auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; if (j % CHECKPOINT == 0) { std::cout << "Predictions" << std::endl; // hls-fpga-machine-learning insert predictions - for(auto predval : predictions[j]) { - std::cout << predval << " "; + for (auto predval : predictions[j]) { + std::cout << predval << " "; } std::cout << std::endl; std::cout << "Quantized predictions" << std::endl; // hls-fpga-machine-learning insert quantized - for(auto outval : outputs[j]) { - std::cout << outval << " "; + for (auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; } @@ -132,14 +126,14 @@ int main(int argc, char **argv) { std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations << " invocations." 
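The test bench above follows the host-pipe pattern the oneAPI backend relies on: the host pushes an input array into an input pipe, launches the generated kernel functor with q.single_task(...), and pulls the result back from an output pipe. Below is a rough, untested sketch of that interaction with a toy kernel; every name, size, element type, and include line is an assumption for illustration (the generated project also passes fpga_tools::exception_handler to the queue and attaches pipe properties, both omitted here):

    #include <sycl/sycl.hpp>
    #include <sycl/ext/intel/fpga_extensions.hpp>
    #include <array>
    #include <iostream>

    // Illustrative stand-ins for the generated types in defines.h / myproject.h
    using input_t = std::array<float, 4>;
    using result_t = std::array<float, 2>;

    class InputPipeID;
    class OutputPipeID;
    using InputPipe = sycl::ext::intel::experimental::pipe<InputPipeID, input_t, 1>;
    using OutputPipe = sycl::ext::intel::experimental::pipe<OutputPipeID, result_t, 1>;

    // Toy kernel functor standing in for the generated MyProject kernel
    struct MyProject {
        void operator()() const {
            input_t in = InputPipe::read();
            result_t out{in[0] + in[1], in[2] + in[3]};
            OutputPipe::write(out);
        }
    };

    int main() {
        auto selector = sycl::ext::intel::fpga_emulator_selector_v;
        sycl::queue q(selector, sycl::property::queue::enable_profiling{});

        // Host side: push one input vector into the pipe, launch the kernel once,
        // then pull the result back, mirroring the generated test-bench loop.
        InputPipe::write(q, input_t{1, 2, 3, 4});
        q.single_task(MyProject{});
        q.wait();
        result_t res = OutputPipe::read(q);
        std::cout << res[0] << " " << res[1] << std::endl;
    }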
<< std::endl; // hls-fpga-machine-learning insert zero - for(int i = 0; i < num_iterations; i++) { + for (int i = 0; i < num_iterations; i++) { inputs.emplace_back(); outputs.emplace_back(); inputs.back().fill(0.0); } // hls-fpga-machine-learning insert top-level-function - for(int i = 0; i < num_iterations; i++) { + for (int i = 0; i < num_iterations; i++) { // hls-fpga-machine-learning insert tb-input q.single_task(MyProject{}); } @@ -147,13 +141,13 @@ int main(int argc, char **argv) { for (int j = 0; j < num_iterations; j++) { // hls-fpga-machine-learning insert tb-output - for(auto outval : outputs[j]) { - std::cout << outval << " "; + for (auto outval : outputs[j]) { + std::cout << outval << " "; } std::cout << std::endl; - for(auto outval : outputs[j]) { - fout << outval << " "; + for (auto outval : outputs[j]) { + fout << outval << " "; } fout << std::endl; } diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 942964fc8..c53163a4b 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,5 +1,5 @@ -from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.oneapi_writer import OneAPIWriter +from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index deff589ed..31ebaeaf9 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -63,7 +63,6 @@ def print_array_to_cpp(self, var, layer, odir): odir (str): Output directory """ with open(f"{odir}/src/firmware/weights/{var.name}.h", "w") as h_file: - # meta data h_file.write(f"//Numpy array shape {var.shape}\n") h_file.write(f"//Min {np.min(var.min):.12f}\n") @@ -98,9 +97,7 @@ def print_array_to_cpp(self, var, layer, odir): nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2) var_width = int(np.ceil(var.type.precision.width / 8)) bwidth = self.next_pow2(var_width) - weight_header += ( - f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' - ) + weight_header += f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' if var.storage.lower() == 'bram': weight_header += 'static ' else: @@ -116,7 +113,6 @@ def print_array_to_cpp(self, var, layer, odir): h_file.write("};\n") h_file.write("\n#endif\n") - def write_project_dir(self, model): """Write the base project directory @@ -135,9 +131,9 @@ def write_project_cpp(self, model): project_name = model.config.get_project_name() filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w' + ) as fout: model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -172,7 +168,6 @@ def write_project_cpp(self, model): if w not in model_brams: newline += f'#include "weights/{w.name}.h"\n' - # Neural net instantiation elif '// hls-fpga-machine-learning insert layers' in line: 
newline = line + '\n' @@ -213,7 +208,6 @@ def write_project_cpp(self, model): fout.write(newline) - def write_project_header(self, model): """Write the main architecture header file (myproject.h) @@ -224,9 +218,9 @@ def write_project_header(self, model): project_name = model.config.get_project_name() filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.h')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w' + ) as fout: model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] @@ -271,9 +265,9 @@ def write_defines(self, model): model (ModelGraph): the hls4ml model. """ filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/defines.h')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/defines.h', 'w' + ) as fout: for line in f.readlines(): # Insert numbers if '// hls-fpga-machine-learning insert numbers' in line: @@ -315,13 +309,15 @@ def write_parameters(self, model): model (ModelGraph): the hls4ml model. """ filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, \ - open(f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/firmware/parameters.h')) as f, open( + f'{model.config.get_output_dir()}/src/firmware/parameters.h', 'w' + ) as fout: for line in f.readlines(): if '// hls-fpga-machine-learning insert includes' in line: newline = line - for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + for include in sorted( + set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), [])) + ): newline += '#include "%s"\n' % include elif "// hls-fpga-machine-learning insert layer-config" in line: @@ -392,9 +388,9 @@ def write_test_bench(self, model): output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' ) - with open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/myproject_test.cpp')) as f, open( + f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w' + ) as fout: for line in f.readlines(): indent = ' ' * (len(line) - len(line.lstrip(' '))) @@ -447,9 +443,9 @@ def write_bridge(self, model): indent = ' ' filedir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, \ - open(f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, open( + f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w' + ) as fout: for line in f.readlines(): if 'MYPROJECT' in line: newline = line.replace('MYPROJECT', 
format(project_name.upper())) @@ -479,7 +475,10 @@ def write_bridge(self, model): newline = '' for i in model_inputs: newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' - newline += indent + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' + newline += ( + indent + + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' + ) newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' newline += '\n' @@ -491,7 +490,10 @@ def write_bridge(self, model): for o in model_outputs: newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' - newline += indent + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' + newline += ( + indent + + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' + ) elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): @@ -509,7 +511,6 @@ def write_bridge(self, model): newline = line fout.write(newline) - def write_build_script(self, model): """Write the build scripts (Makefile, build_lib.sh) @@ -520,9 +521,9 @@ def write_build_script(self, model): # Makefile filedir = os.path.dirname(os.path.abspath(__file__)) device = model.config.get_config_value('Part') - with open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, \ - open(f'{model.config.get_output_dir()}/CMakeLists.txt', 'w') as fout: - + with open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, open( + f'{model.config.get_output_dir()}/CMakeLists.txt', 'w' + ) as fout: for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) line = line.replace('mystamp', model.config.get_config_value('Stamp')) @@ -532,7 +533,6 @@ def write_build_script(self, model): fout.write(line) - def write_nnet_utils(self, model): """Copy the nnet_utils, AP types headers and any custom source to the project output directory @@ -554,7 +554,6 @@ def write_nnet_utils(self, model): for h in headers: copyfile(srcpath + h, dstpath + h) - # custom source filedir = os.path.dirname(os.path.abspath(__file__)) From c4ce138799970981cb39987ef0884118d8e26f1a Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 12 Jan 2024 15:01:49 -0600 Subject: [PATCH 013/100] fix more pre-commit --- hls4ml/writer/oneapi_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 31ebaeaf9..3d826e1f0 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -2,7 +2,7 @@ import os import tarfile from collections import OrderedDict -from shutil import copyfile, copytree, rmtree +from shutil import copyfile import numpy as np import yaml From 354d70857ac91eab083e0bdc1b045ed9b15f1147 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 12 Jan 2024 15:33:47 -0600 Subject: [PATCH 014/100] fix more pre-commit errors --- hls4ml/writer/oneapi_writer.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 3d826e1f0..7ff0ccf08 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -97,7 +97,10 @@ def print_array_to_cpp(self, var, layer, odir): nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2) var_width = int(np.ceil(var.type.precision.width / 
8)) bwidth = self.next_pow2(var_width) - weight_header += f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + weight_header += ( + f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), ' + 'intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + ) if var.storage.lower() == 'bram': weight_header += 'static ' else: @@ -223,12 +226,12 @@ def write_project_header(self, model): ) as fout: model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() - model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + # model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] - # io_parallel and io_stream instantiate the top-level function differently - io_type = model.config.get_config_value('IOType') - indent = ' ' - brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + # io_parallel and io_stream instantiate the top-level function differently (io_stream not yet supported) + # io_type = model.config.get_config_value('IOType') + # indent = ' ' + # brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) for line in f.readlines(): if 'MYPROJECT' in line: @@ -392,7 +395,7 @@ def write_test_bench(self, model): f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w' ) as fout: for line in f.readlines(): - indent = ' ' * (len(line) - len(line.lstrip(' '))) + indent = ' ' * (len(line) - len(line.lstrip(' '))) if 'myproject' in line: newline = line.replace('myproject', project_name) @@ -439,7 +442,7 @@ def write_bridge(self, model): model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] # model brambs aren't actually supported yet - io_type = model.config.get_config_value('IOType') + # io_type = model.config.get_config_value('IOType') indent = ' ' filedir = os.path.dirname(os.path.abspath(__file__)) @@ -477,7 +480,8 @@ def write_bridge(self, model): newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' newline += ( indent - + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());\n' + + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());' + + '\n' ) newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' @@ -492,7 +496,8 @@ def write_bridge(self, model): newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' newline += ( indent - + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>({o.name}_output.data(), {o.name});\n' + + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>' + + f'({o.name}_output.data(), {o.name});\n' ) elif '// hls-fpga-machine-learning insert trace_outputs' in line: newline = '' From 8119029d178e373ca35c9bc0f723cac051e3ad1d Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 21 Jan 2024 12:13:18 -0600 Subject: [PATCH 015/100] snapshot of work before reworking types --- hls4ml/backends/oneapi/oneapi_types.py | 51 +++++++++++++++++-- .../backends/oneapi/passes/transform_types.py | 28 +++++----- 2 files changed, 58 insertions(+), 21 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index f28d697a4..c679f14ae 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -1,7 +1,13 @@ ''' This package includes 
oneAPI-specific customizations to the variable types ''' -from hls4ml.backends.fpga.fpga_types import ArrayVariableConverter, VariableDefinition +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + VariableDefinition, +) +from hls4ml.utils.string_utils import convert_to_pascal_case # region ArrayVarable @@ -51,16 +57,16 @@ def __init__(self, type_converter, prefix, definition_cls): self.prefix = prefix self.definition_cls = definition_cls - def convert(self, tensor_var, pipe_name, pipe_id, array_type, pragma='partition'): + def convert(self, tensor_var, pragma='partition'): if isinstance(tensor_var, self.definition_cls): # Already converted return tensor_var tensor_var.pragma = pragma tensor_var.type = self.type_converter.convert(tensor_var.type) - tensor_var.pipe_name = pipe_name - tensor_var.pipe_id = pipe_id - tensor_var.array_type = array_type + tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' + tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' + tensor_var.array_type = f'{tensor_var.name}_array_t' tensor_var.__class__ = type(self.prefix + 'InterfaceMemberVariable', (type(tensor_var), self.definition_cls), {}) return tensor_var @@ -72,3 +78,38 @@ def __init__(self, type_converter): # endregion + + +# region StreamVariable +class OneAPIStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=True): + return f'{self.name}{name_suffix}' + + def declare_cpp(self, pipe_min_size=0, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.array_type}, {pipe_min_size}>;\n' + ) + return lines + + +class OneAPIInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'using {self.name} = {self.input_var.name}' + + +class OneAPIStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIStreamVariableDefinition) + + +class OneAPIInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceStreamVariableDefinition + ) + + +# endregion diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 2cfcd02c7..71a63585b 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -2,11 +2,14 @@ from hls4ml.backends.oneapi.oneapi_types import ( OneAPIArrayVariableConverter, OneAPIInplaceArrayVariableConverter, + OneAPIInplaceStreamVariableConverter, OneAPIInterfaceVariableConverter, + OneAPIStreamVariableConverter, ) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable -from hls4ml.utils.string_utils import convert_to_pascal_case + +# from hls4ml.utils.string_utils import convert_to_pascal_case class TransformTypes(GlobalOptimizerPass): @@ -15,6 +18,8 @@ def __init__(self): self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) self.inplace_array_var_converter = 
OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) self.interface_var_converter = OneAPIInterfaceVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = OneAPIStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = OneAPIInplaceStreamVariableConverter(type_converter=self.type_converter) self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) def transform(self, model, node): @@ -22,24 +27,15 @@ def transform(self, model, node): for out_name, var in node.variables.items(): if io_type == 'io_stream': - raise NotImplementedError("io_stream is not yet implemented for oneAPI") + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var) + else: + new_var = self.stream_var_converter.convert(var) elif io_type == 'io_parallel': if out_name in node.model.inputs: - new_var = self.interface_var_converter.convert( - var, - pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t', - ) + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') elif out_name in node.model.outputs: - new_var = self.interface_var_converter.convert( - var, - pragma='intel::fpga_register', - pipe_name=f'{convert_to_pascal_case(var.name)}Pipe', - pipe_id=f'{convert_to_pascal_case(var.name)}PipeID', - array_type=f'{var.name}_array_t', - ) + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') elif isinstance(var, InplaceTensorVariable): new_var = self.inplace_array_var_converter.convert(var, pragma='') else: From cae1a8a001a66435812d1560ad4754fcced60a17 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 12 Feb 2024 17:31:51 -0600 Subject: [PATCH 016/100] Use using to decide array type, some preliminary updates --- hls4ml/backends/fpga/fpga_types.py | 2 +- hls4ml/backends/oneapi/oneapi_types.py | 2 +- hls4ml/templates/oneapi/CMakeLists.txt | 4 ++-- hls4ml/templates/oneapi/firmware/defines.h | 3 ++- hls4ml/templates/quartus/firmware/defines.h | 2 ++ hls4ml/templates/vivado/firmware/defines.h | 2 ++ 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index ceac0b5e4..41f3cd12e 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -172,7 +172,7 @@ def convert_precision(self, precision_converter): class PackedTypeConverter(TypeDefinition, TypePrecisionConverter): def definition_cpp(self): n_elem_expr = '/' if self.unpack else '*' - return 'typedef nnet::array<{precision}, {n_elem}> {name};\n'.format( + return 'typedef array<{precision}, {n_elem}> {name};\n'.format( name=self.name, precision=self.precision.definition_cpp(), n_elem=str(self.n_elem) + n_elem_expr + str(self.n_pack), diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index c679f14ae..bf445c8f2 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -43,7 +43,7 @@ def definition_cpp(self, name_suffix='', as_reference=False): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.array_type} = array<{self.type.name}, {self.size_cpp()}>;\n' lines += 
indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n' diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index d6b2a4745..1ab0d3748 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -98,8 +98,8 @@ endif() set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) # for debugging need to do this. Not sure why -# set(COMMON_LINK_FLAGS -v -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) -set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) +set(COMMON_LINK_FLAGS -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) +# set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) # A SYCL ahead-of-time (AoT) compile processes the device code in two stages. # 1. The "compile" stage compiles the device code to an intermediate diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 7e6bb6b6e..b88fca49b 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include @@ -11,6 +10,8 @@ // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" +using std::array; + // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h index c3fe4ec40..a465f2716 100644 --- a/hls4ml/templates/quartus/firmware/defines.h +++ b/hls4ml/templates/quartus/firmware/defines.h @@ -36,6 +36,8 @@ template using stream_out = ihc::stream_out; // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" +using nnet::array; + // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/vivado/firmware/defines.h b/hls4ml/templates/vivado/firmware/defines.h index 1f11b0209..e0a75ec64 100644 --- a/hls4ml/templates/vivado/firmware/defines.h +++ b/hls4ml/templates/vivado/firmware/defines.h @@ -7,6 +7,8 @@ #include #include +using nnet::array; + // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision From 06a8c277c11622c8ddc08e2f80af52986ff6a69f Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 13 Feb 2024 19:00:15 -0600 Subject: [PATCH 017/100] snapshot unifying types --- hls4ml/backends/oneapi/oneapi_types.py | 78 ++++++---- .../firmware/nnet_utils/nnet_activation.h | 139 ++++++++---------- .../oneapi/firmware/nnet_utils/nnet_dense.h | 19 ++- hls4ml/writer/oneapi_writer.py | 4 +- 4 files changed, 117 insertions(+), 123 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index bf445c8f2..4559c1f9e 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -1,9 +1,11 @@ ''' This package includes oneAPI-specific customizations to the variable types ''' +import numpy as np + from hls4ml.backends.fpga.fpga_types import ( - ArrayVariableConverter, InplaceStreamVariableConverter, + PackedType, StreamVariableConverter, VariableDefinition, ) @@ -14,7 +16,7 @@ class 
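For each interface tensor the converters attach a pipe_name/pipe_id derived from the variable name via convert_to_pascal_case, and declare_cpp then emits three declarations: a forward-declared pipe ID class, a using alias for the array type, and a using alias for the inter-kernel pipe. A hypothetical example of what that expands to for an input named "dense_input" with 16 elements (the float element type, the capacity of 16, and the include lines are placeholders; real projects use fixed-point element types, and the interface variant additionally appends a PipeProps properties parameter):

    #include <array>
    #include <sycl/sycl.hpp>
    #include <sycl/ext/intel/fpga_extensions.hpp>

    using std::array;

    // Hypothetical output of declare_cpp for a tensor variable "dense_input"
    class DenseInputPipeID;
    using dense_input_t = array<float, 16>;
    using DenseInputPipe = sycl::ext::intel::experimental::pipe<DenseInputPipeID, dense_input_t, 16>;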
OneAPIArrayVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] std::array<{self.type.name}, {self.size_cpp()}> {self.name}{name_suffix}' + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' class OneAPIInplaceArrayVariableDefinition(VariableDefinition): @@ -22,12 +24,47 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class OneAPIArrayVariableConverter(ArrayVariableConverter): +class AggregratedArrayVariableConverter: + """This is a bit of an extension of the standard ArrayVariableConverter""" + + def __init__(self, type_converter, prefix, definition_cls): + self.type_converter = type_converter + self.prefix = prefix + self.definition_cls = definition_cls + + def convert(self, tensor_var, pragma='', depth=0, n_pack=1): + if isinstance(tensor_var, self.definition_cls): # Already converted + return tensor_var + + tensor_var.pragma = pragma + if pragma == 'stream': + if depth == 0: + depth = np.prod(tensor_var.shape) // tensor_var.shape[-1] + self.pragma = ('stream', depth) + n_elem = tensor_var.shape[-1] + else: + self.pragma = pragma + n_elem = tensor_var.size() + n_pack = 1 # ignore any passed value + + tensor_var.type = self.type_converter.convert( + PackedType(tensor_var.type.name, tensor_var.type.precision, n_elem, n_pack) + ) + + # pipe_name and pipe_id are only used for io_stream and interface variables in io_parallel + tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' + tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' + + tensor_var.__class__ = type(self.prefix + 'AggregateArrayVariable', (type(tensor_var), self.definition_cls), {}) + return tensor_var + + +class OneAPIArrayVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) -class OneAPIInplaceArrayVariableConverter(ArrayVariableConverter): +class OneAPIInplaceArrayVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition) @@ -39,40 +76,19 @@ def __init__(self, type_converter): class OneAPIInterfaceVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] {self.array_type} {self.name}{name_suffix}' + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.array_type} = array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.type.name} = array<{self.type.precision.definition_cpp()}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.array_type}, {pipe_min_size}, PipeProps>;\n' + + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' ) return lines -class InterfaceVariableConverter: - def __init__(self, type_converter, prefix, definition_cls): - self.type_converter = type_converter - self.prefix = prefix - self.definition_cls = definition_cls - - def convert(self, tensor_var, pragma='partition'): - if isinstance(tensor_var, self.definition_cls): # Already converted - return tensor_var - - tensor_var.pragma = pragma - tensor_var.type = 
self.type_converter.convert(tensor_var.type) - - tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' - tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' - tensor_var.array_type = f'{tensor_var.name}_array_t' - - tensor_var.__class__ = type(self.prefix + 'InterfaceMemberVariable', (type(tensor_var), self.definition_cls), {}) - return tensor_var - - -class OneAPIInterfaceVariableConverter(InterfaceVariableConverter): +class OneAPIInterfaceVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition) @@ -87,10 +103,10 @@ def definition_cpp(self, name_suffix='', as_reference=True): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.array_type} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.name} = std::array<{self.type.name}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.array_type}, {pipe_min_size}>;\n' + + f'{self.type}, {pipe_min_size}>;\n' ) return lines diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 411d42b09..ef22a6b20 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -2,7 +2,6 @@ #define NNET_ACTIVATION_H_ #include "nnet_common.h" -#include namespace nnet { @@ -24,11 +23,10 @@ struct activ_config { // ************************************************* // LINEAR Activation -- See Issue 53 // ************************************************* -template -void linear(const std::array &data, std::array &res) { +template void linear(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; res[ii] = datareg; } } @@ -36,11 +34,10 @@ void linear(const std::array &data, std::array -void relu(const std::array &data, std::array &res) { +template void relu(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > 0) res[ii] = datareg; else @@ -48,11 +45,10 @@ void relu(const std::array &data, std::array -void relu_max(const std::array &data, std::array &res) { +template void relu_max(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg < 0) res[ii] = 0; else if (datareg > MAX_INT) @@ -62,27 +58,24 @@ void relu_max(const std::array &data, std::array -void relu6(const std::array &data, std::array &res) { +template void relu6(const data_T &data, res_T &res) { relu_max(data, res); } -template -void relu1(const std::array &data, std::array &res) { +template void relu1(const data_T &data, res_T &res) { relu_max(data, res); } // ************************************************* // Sigmoid Activation // ************************************************* -template -void sigmoid(const std::array &data, std::array &res) { +template void sigmoid(const data_T &data, res_T &res) { static const int MAX_VALUE = 8; #include "activation_tables/sigmoid_table.tb" #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - 
[[intel::fpga_register]] data_T absoluteValue; - [[intel::fpga_register]] res_T temp2; + [[intel::fpga_register]] typename data_T::value_type absoluteValue; + [[intel::fpga_register]] typename res_T::value_type temp2; if (data[ii] < 0) { absoluteValue = -data[ii]; } else { @@ -91,7 +84,7 @@ void sigmoid(const std::array &data, std::array MAX_VALUE) index = CONFIG_T::table_size - 1; - temp2 = (res_T)sigmoid_table[index]; + temp2 = static_cast(sigmoid_table[index]); if (data[ii] < 0) { res[ii] = 1 - temp2; } else { @@ -127,18 +120,19 @@ template inline unsigned softmax_latency_idx_f return y.to_uint(); } -template -void softmax_stable(const std::array &data, std::array &res) { +template void softmax_stable(const data_T &data, res_T &res) { // Look-up tables #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" // Find maximum - Op_max op_max; - [[intel::fpga_register]] data_T x_max = reduce>(data.data(), op_max); + Op_max op_max; + [[intel::fpga_register]] auto x_max = + reduce>(data.data(), op_max); // For the diffs, use the same type as the input but force rounding and saturation - [[intel::fpga_register]] ac_fixed d_xi_xmax[CONFIG_T::n_in]; + [[intel::fpga_register]] ac_fixed + d_xi_xmax[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { d_xi_xmax[i] = data[i] - x_max; @@ -148,7 +142,7 @@ void softmax_stable(const std::array &data, std::array(d_xi_xmax[i])]; + exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; } // Explicitly sum previously calculated exponentials with an adder tree @@ -166,8 +160,7 @@ void softmax_stable(const std::array &data, std::array -void softmax_latency(const std::array &data, std::array &res) { +template void softmax_latency(const data_T &data, res_T &res) { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" @@ -175,7 +168,7 @@ void softmax_latency(const std::array &data, std::array< [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for (unsigned i = 0; i < CONFIG_T::n_in; i++) { - exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; + exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; } // Explicitly sum the results with an adder tree. 
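The stable and latency softmax implementations above share one structure: compute a table index from the (fixed-point) input, fetch exp() from a precomputed table, accumulate the exponentials with the adder-tree reduce, and scale by a reciprocal fetched from a second table. A plain-C++ behavioral model of that flow, where float arithmetic, a runtime-built table, and a division stand in for the generated .tb tables, the fixed-point index extraction, and the invert_table lookup:

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <iostream>

    constexpr int TABLE_SIZE = 1024;
    constexpr float MAX_DIFF = 8.0f; // differences x - x_max are clamped to [-8, 0]

    int main() {
        // exp table over [-8, 0): exp_table[i] ~= exp(-8 + 8 * i / TABLE_SIZE)
        std::array<float, TABLE_SIZE> exp_table;
        for (int i = 0; i < TABLE_SIZE; i++)
            exp_table[i] = std::exp(-MAX_DIFF + MAX_DIFF * i / TABLE_SIZE);

        std::array<float, 4> data{0.5f, 1.0f, -0.5f, 2.0f};
        std::array<float, 4> exp_res, res;

        // 1) find the maximum (the kernel uses reduce<> with Op_max)
        float x_max = *std::max_element(data.begin(), data.end());

        // 2) exponentials of (x - x_max) via table lookup, then accumulate
        float exp_sum = 0.0f;
        for (int i = 0; i < 4; i++) {
            float d = data[i] - x_max; // always <= 0, so exp(d) <= 1
            int idx = static_cast<int>((d + MAX_DIFF) / MAX_DIFF * TABLE_SIZE);
            idx = std::clamp(idx, 0, TABLE_SIZE - 1);
            exp_res[i] = exp_table[idx];
            exp_sum += exp_res[i]; // kernel: adder-tree reduce with Op_add
        }

        // 3) multiply by 1/sum (the kernel looks this up in invert_table)
        for (int i = 0; i < 4; i++)
            res[i] = exp_res[i] / exp_sum;

        for (float r : res)
            std::cout << r << " ";
        std::cout << "\n";
    }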
@@ -192,8 +185,7 @@ void softmax_latency(const std::array &data, std::array< } } -template -void softmax_legacy(const std::array &data, std::array &res) { +template void softmax_legacy(const data_T &data, res_T &res) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -234,14 +226,13 @@ void softmax_legacy(const std::array &data, std::array -void softmax_argmax(const std::array &data, std::array &res) { +template void softmax_argmax(const data_T &data, res_T &res) { #pragma unroll for (int i = 0; i < CONFIG_T::n_in; i++) { - res[i] = (res_T)0; + res[i] = static_cast(0); } - [[intel::fpga_register]] data_T maximum = data[0]; + [[intel::fpga_register]] auto maximum = data[0]; [[intel::fpga_register]] int idx = 0; #pragma ii 1 @@ -252,11 +243,10 @@ void softmax_argmax(const std::array &data, std::array(1); } -template -inline void softmax(const std::array &data, std::array &res) { +template inline void softmax(const data_T &data, res_T &res) { switch (CONFIG_T::implementation) { case softmax_implementation::stable: softmax_stable(data, res); @@ -279,16 +269,15 @@ inline void softmax(const std::array &data, std::array -void dense_tanh(const std::array &data, std::array &res) { +template void dense_tanh(const data_T &data, res_T &res) { static const int MAX_VALUE = 4; // Initialize the lookup table #include "activation_tables/tanh_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - [[intel::fpga_register]] data_T temp; - [[intel::fpga_register]] res_T temp2; + [[intel::fpga_register]] typename data_T::value_type temp; + [[intel::fpga_register]] typename res_T::value_type temp2; if (data[ii] < 0) { temp = -data[ii]; } else { @@ -297,7 +286,7 @@ void dense_tanh(const std::array &data, std::array index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int(); if (temp > MAX_VALUE) index = CONFIG_T::table_size - 1; - temp2 = (res_T)tanh_table[index]; + temp2 = static_cast(tanh_table[index]); if (data[ii] < 0) { res[ii] = -temp2; } else { @@ -309,8 +298,7 @@ void dense_tanh(const std::array &data, std::array -void hard_sigmoid(const std::array &data, std::array &res) { +template void hard_sigmoid(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -322,8 +310,7 @@ void hard_sigmoid(const std::array &data, std::array -void hard_tanh(const std::array &data, std::array &res) { +template void hard_tanh(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; @@ -339,10 +326,10 @@ void hard_tanh(const std::array &data, std::array -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { +void leaky_relu(const data_T &data, typename data_T::value_type alpha, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > 0) res[ii] = datareg; else @@ -354,10 +341,10 @@ void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n // Thresholded RELU Activation // ************************************************* template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { +void thresholded_relu(const data_T &data, typename data_T::value_type theta, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; 
ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > theta) res[ii] = datareg; else @@ -368,8 +355,7 @@ void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFI // ************************************************* // Softplus Activation // ************************************************* -template -void softplus(const std::array &data, std::array &res) { +template void softplus(const data_T &data, res_T &res) { // Initialize the lookup table #include "activation_tables/softplus_table.tb" // Index into the lookup table based on data @@ -381,15 +367,14 @@ void softplus(const std::array &data, std::array CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; - res[ii] = (res_T)softplus_table[index]; + res[ii] = static_cast(softplus_table[index]); } } // ************************************************* // Softsign Activation // ************************************************* -template -void softsign(const std::array &data, std::array &res) { +template void softsign(const data_T &data, res_T &res) { static const int MAX_VALUE = 8; // Initialize the lookup table #include "activation_tables/softsign_table.tb" @@ -397,8 +382,8 @@ void softsign(const std::array &data, std::array &data, std::array index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); if (temp > MAX_VALUE) index = CONFIG_T::table_size - 1; - temp2 = (res_T)softsign_table[index]; + temp2 = static_cast(softsign_table[index]); if (data[ii] < 0) { res[ii] = -temp2; } else { @@ -419,14 +404,13 @@ void softsign(const std::array &data, std::array -void elu(const std::array &data, const res_T alpha, std::array &res) { +template void elu(const data_T &data, const res_T alpha, res_T &res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg >= 0) { res[ii] = datareg; } else { @@ -438,24 +422,22 @@ void elu(const std::array &data, const res_T alpha, std: } } -template -void elu(const std::array &data, std::array &res) { +template void elu(const data_T &data, res_T &res) { elu(data, 1.0, res); } // ************************************************* // SELU Activation // ************************************************* -template -void selu(const std::array &data, std::array &res) { +template void selu(const data_T &data, res_T &res) { // Initialize the lookup table #include "activation_tables/selu_table.tb" // Index into the lookup table based on data #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg >= 0) { - res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + res[ii] = static_cast(1.0507009873554804934193349852946) * datareg; } else { ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); if (index > CONFIG_T::table_size - 1) @@ -468,11 +450,10 @@ void selu(const std::array &data, std::array -void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +template void prelu(const data_T &data, const data_T &alpha, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; + auto datareg = data[ii]; if (datareg > 0) res[ii] = datareg; else @@ -483,30 +464,28 @@ void prelu(data_T data[CONFIG_T::n_in], const data_T alpha[CONFIG_T::n_in], res_ // ************************************************* // Binary 
TanH Activation // ************************************************* -template -void binary_tanh(const std::array &data, std::array &res) { +template void binary_tanh(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = data[ii]; - res_T cache; + auto datareg = data[ii]; + typename res_T::value_type cache; if (datareg > 0) cache = 1; else cache = -1; - res[ii] = (res_T)cache; + res[ii] = cache; } } // ************************************************* // Ternary TanH Activation // ************************************************* -template -void ternary_tanh(const std::array &data, std::array &res) { +template void ternary_tanh(const data_T &data, res_T &res) { #pragma unroll for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_T datareg = 2 * data[ii]; - res_T cache; + auto datareg = 2 * data[ii]; + typename res_T::value_type cache; if (datareg > 1) cache = 1; else if (datareg > -1 && datareg <= 1) @@ -514,7 +493,7 @@ void ternary_tanh(const std::array &data, std::array #include namespace nnet { @@ -38,7 +37,7 @@ struct dense_config { }; template -void dense_rf_gt(const std::array &data, std::array &res, +void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -76,8 +75,8 @@ void dense_rf_gt(const std::array &data, std::array::product(data[data_index], weights[w_index]); + tmp_acc[im] = CONFIG_T::template product::product( + data[data_index], weights[w_index]); } [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; ResetMult: @@ -102,11 +101,11 @@ void dense_rf_gt(const std::array &data, std::array(acc[ires]); // acc[jj]; + res[ires] = cast(acc[ires]); // acc[jj]; } } template -void dense_rf_lt(const std::array &data, std::array &res, +void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && @@ -131,8 +130,8 @@ void dense_rf_lt(const std::array &data, std::array= CONFIG_T::n_in * CONFIG_T::n_out) continue; // Modified this - mult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); + mult[im] = CONFIG_T::template product::product( + data[in_index], weights[w_index]); in_index += CONFIG_T::reuse_factor; if (in_index >= CONFIG_T::n_in) in_index = ir; @@ -153,12 +152,12 @@ void dense_rf_lt(const std::array &data, std::array(acc[ires]); + res[ires] = cast(acc[ires]); } } template void dense_resource( - const std::array &data, std::array &res, + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 7ff0ccf08..7c58a9af4 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -410,13 +410,13 @@ def write_test_bench(self, model): newline = line # there should really be only one input inp = model_inputs[0] - newline += indent + f'std::vector<{inp.array_type}> 
inputs;\n' + newline += indent + f'std::vector<{inp.type}> inputs;\n' elif '// hls-fpga-machine-learning insert results' in line: newline = line # there should really be only one out out = model_outputs[0] - newline += indent + f'std::vector<{out.array_type}> outputs;\n' + newline += indent + f'std::vector<{out.type}> outputs;\n' elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line inp = model_inputs[0] From 8f5877896d95111dc58927181a5fa3616c401ac7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 13 Feb 2024 22:24:56 -0600 Subject: [PATCH 018/100] fix the testbench and bridge --- hls4ml/writer/oneapi_writer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 7c58a9af4..a31d80b5e 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -410,13 +410,13 @@ def write_test_bench(self, model): newline = line # there should really be only one input inp = model_inputs[0] - newline += indent + f'std::vector<{inp.type}> inputs;\n' + newline += indent + f'std::vector<{inp.type.name}> inputs;\n' elif '// hls-fpga-machine-learning insert results' in line: newline = line # there should really be only one out out = model_outputs[0] - newline += indent + f'std::vector<{out.type}> outputs;\n' + newline += indent + f'std::vector<{out.type.name}> outputs;\n' elif '// hls-fpga-machine-learning insert tb-input' in line: newline = line inp = model_inputs[0] @@ -480,8 +480,8 @@ def write_bridge(self, model): newline += indent + f'{i.definition_cpp(name_suffix="_input")};\n' newline += ( indent - + f'nnet::convert_data<{dtype}, {i.type.name}, {i.size_cpp()}>({i.name}, {i.name}_input.data());' - + '\n' + + f'nnet::convert_data<{dtype}, typename {i.type.name}::value_type, {i.size_cpp()}>' + + f'({i.name}, {i.name}_input.data());\n' ) newline += indent + f'{i.pipe_name}::write(q, {i.name}_input);\n' @@ -496,7 +496,7 @@ def write_bridge(self, model): newline += indent + f'{o.definition_cpp(name_suffix="_output")} = {o.pipe_name}::read(q);\n' newline += ( indent - + f'nnet::convert_data_back<{o.type.name}, {dtype}, {o.size_cpp()}>' + + f'nnet::convert_data_back' + f'({o.name}_output.data(), {o.name});\n' ) elif '// hls-fpga-machine-learning insert trace_outputs' in line: From 86b0f4bb4c3c5436777942573d6d8a594c8e4d5f Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 11:17:00 -0600 Subject: [PATCH 019/100] snapshot updating nnet_utils (not finished) --- .../oneapi/firmware/nnet_utils/nnet_conv1d.h | 5 +- .../nnet_utils/nnet_conv1d_resource.h | 69 +++++++------- .../nnet_utils/nnet_conv2d_resource.h | 31 +++--- .../nnet_utils/nnet_dense_compressed.h | 10 +- .../oneapi/firmware/nnet_utils/nnet_pooling.h | 20 ++-- .../firmware/nnet_utils/nnet_recurrent.h | 94 +++++++++---------- 6 files changed, 117 insertions(+), 112 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h index 8897e1315..549cb2c19 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h @@ -44,15 +44,14 @@ struct conv1d_config { }; template -void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const 
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { conv_1d_resource_cl(data, res, weights, biases); } template -void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h index a110d6d42..9690f56e2 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h @@ -12,13 +12,12 @@ enum class conv1d_implementation { combination, im2col, winograd }; // im2col - General-purpose 1D Convolution algorithm // **************************************************************** -template -void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int col) { +template +void im2col_1d_cl(const data_T &data, data_col_T &data_col, const int col) { // im2col can be unrolled fully, since number of parallel executions = filt_w x n_chann ~ O(100) and very little DSP // usage - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; KernelLoop: #pragma unroll @@ -26,7 +25,7 @@ void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], ChannelLoop: #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { - hls_register int index_data = + [[intel::fpga_register]] int index_data = (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { data_col[index++] = data[index_data]; @@ -39,7 +38,7 @@ void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], template void conv_1d_im2col_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { // im2col performs no filter transformations; therefore, filter size remains constant @@ -48,6 +47,9 @@ void conv_1d_im2col_cl( // Unroll factor for loop traversing input image, derived from parallelisation_factor static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + using data_col_T = array; + using res_col_T = array; + ColLoop: #pragma unroll pf #pragma ii CONFIG_T::reuse_factor @@ -56,11 +58,11 @@ void conv_1d_im2col_cl( // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; - im2col_1d_cl(data, data_col, i); + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_cl(data, data_col, i); - hls_register res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); // Unroll fully, since // (1) n_filt is usually low in io_parallel (< 32) @@ -88,7 +90,7 @@ inline void 
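conv_1d_im2col_cl above turns the convolution into one dense matrix-vector product per output column: im2col_1d_cl gathers the receptive field of that column into a contiguous buffer, writing zeros for samples that fall in the padding, and dense_resource is then applied to the buffer. A plain-C++ model of the same data movement with toy sizes (all constants, the all-ones kernel, and the weight layout are illustrative; the real code uses fixed-point types, unroll pragmas, and the shared dense engine):

    #include <array>
    #include <iostream>

    constexpr int IN_WIDTH = 8, N_CHAN = 1, FILT_WIDTH = 3, N_FILT = 2;
    constexpr int PAD_LEFT = 1, STRIDE = 1, OUT_WIDTH = 8;

    int main() {
        std::array<float, IN_WIDTH * N_CHAN> data{1, 2, 3, 4, 5, 6, 7, 8};
        std::array<float, FILT_WIDTH * N_CHAN * N_FILT> weights{};
        std::array<float, N_FILT> biases{};
        weights.fill(1.0f); // toy kernel: each filter computes a moving sum

        std::array<float, OUT_WIDTH * N_FILT> res{};
        for (int col = 0; col < OUT_WIDTH; col++) {
            // im2col: gather the receptive field of this output column,
            // writing 0 for samples that fall into the padding region
            std::array<float, FILT_WIDTH * N_CHAN> data_col{};
            int index = 0;
            for (int k = 0; k < FILT_WIDTH; k++) {
                for (int c = 0; c < N_CHAN; c++) {
                    int idx = (col * STRIDE + k - PAD_LEFT) * N_CHAN + c;
                    data_col[index++] = (idx >= 0 && idx < IN_WIDTH * N_CHAN) ? data[idx] : 0.0f;
                }
            }
            // dense step: one matrix-vector product per output column
            for (int f = 0; f < N_FILT; f++) {
                float acc = biases[f];
                for (int i = 0; i < FILT_WIDTH * N_CHAN; i++)
                    acc += data_col[i] * weights[f * FILT_WIDTH * N_CHAN + i];
                res[col * N_FILT + f] = acc;
            }
        }
        for (int col = 0; col < OUT_WIDTH; col++)
            std::cout << res[col * N_FILT] << " "; // filter 0: 3 6 9 12 15 18 21 15
        std::cout << "\n";
    }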
winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[ template void winograd_conv1d_3x1_kernel_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { // Ensure Winograd conditions are met @@ -106,7 +108,7 @@ void winograd_conv1d_3x1_kernel_cl( int offset = CONFIG_T::n_filt * i; #pragma unroll for (int f = 0; f < CONFIG_T::n_filt; f++) { - res[offset + f] = static_cast(biases[f]); + res[offset + f] = static_cast(biases[f]); } } @@ -117,8 +119,8 @@ void winograd_conv1d_3x1_kernel_cl( #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { // Get current 4x1 tile - hls_register data_T T[16]; - hls_register uint8_t p = 0; + [[intel::fpga_register]] typename data_T::value_type T[16]; + [[intel::fpga_register]] uint8_t p = 0; #pragma unroll for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { @@ -130,24 +132,25 @@ void winograd_conv1d_3x1_kernel_cl( } // Transform input tile - hls_register typename CONFIG_T::accum_t D[4]; - winograd_transform_input_tile_3x1_kernel(T, D); + [[intel::fpga_register]] typename CONFIG_T::accum_t D[4]; + winograd_transform_input_tile_3x1_kernel(T, D); #pragma unroll for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { - hls_register int filter_offset = 4 * (CONFIG_T::n_chan * filter + channel); + [[intel::fpga_register]] int filter_offset = 4 * (CONFIG_T::n_chan * filter + channel); // Hadamard product between transformed input tile and kernel - hls_register typename CONFIG_T::accum_t Y[4]; + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[4]; #pragma unroll for (int i = 0; i < 4; i++) { Y[i] = static_cast(D[i] * weights[filter_offset + i]); } // Explicitly transform intermediate result Z = A'YA and save to output - res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); + res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); if ((col + 1) < CONFIG_T::out_width) - res[CONFIG_T::n_filt * (col + 1) + filter] += static_cast(Y[1] - Y[2] - Y[3]); + res[CONFIG_T::n_filt * (col + 1) + filter] += + static_cast(Y[1] - Y[2] - Y[3]); } } } @@ -157,17 +160,17 @@ void winograd_conv1d_3x1_kernel_cl( // 1D Convolution for 1x1 kernels using optimized im2col // **************************************************************** -template -void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan], - const int col) { +template +void im2col_1d_pointwise_cl(const data_T &data, data_col_T &data_col, const int col) { // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; ChannelLoop: #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { - hls_register int index_data = (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + [[intel::fpga_register]] int index_data = + (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { data_col[index++] = data[index_data]; } else { @@ -177,8 +180,7 @@ void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } template -void pointwise_conv_1d_resource_cl(data_T 
data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_width == 1); @@ -186,6 +188,9 @@ void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_ // Unroll factor for loop traversing input image, derived from parallelisation_factor static constexpr int pf = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); + using data_col_T = array; + using res_col_T = array; + ColLoop: #pragma unroll pf #pragma ii CONFIG_T::reuse_factor @@ -194,11 +199,11 @@ void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_ // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::n_chan]; - im2col_1d_pointwise_cl(data, data_col, col); + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_pointwise_cl(data, data_col, col); - hls_register res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_T res_col; + dense_resource(data_col, res_col, weights, biases); // Unroll fully, since // (1) n_filt is usually low in io_parallel (< 32) @@ -216,7 +221,7 @@ void pointwise_conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_ // **************************************************************** template void conv_1d_resource_cl( - data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { static constexpr bool winograd_conditions = diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h index 73ad45592..85c4c78d9 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -20,18 +20,18 @@ void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ // im2col can be unrolled fully, since number of parallel executions = filt_h x filt_w x n_chann ~ O(100) and very little // DSP usage - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; FiltHeightLoop: #pragma unroll for (int kernel_row = 0; kernel_row < CONFIG_T::impl_filt_height; kernel_row++) { - hls_register int input_row = + [[intel::fpga_register]] int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; FiltWidthLoop: #pragma unroll for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) { - hls_register int input_col = + [[intel::fpga_register]] int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width; ChannelLoop: @@ -73,10 +73,11 @@ void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CO // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::impl_filt_height * 
CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; + [[intel::fpga_register]] data_T + data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; im2col_2d_cl(data, data_col, i, j); - hls_register res_T res_col[CONFIG_T::n_filt]; + [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; dense_resource(data_col, res_col, weights, biases); // Unroll fully, since @@ -158,9 +159,9 @@ void winograd_conv2d_3x3_kernel_cl( #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { // Get current 4x4 tile - hls_register data_T T[16]; - hls_register typename CONFIG_T::accum_t D[16]; - hls_register uint8_t p = 0; + [[intel::fpga_register]] data_T T[16]; + [[intel::fpga_register]] typename CONFIG_T::accum_t D[16]; + [[intel::fpga_register]] uint8_t p = 0; #pragma unroll for (int r = row - (int)CONFIG_T::pad_top; r < row + 4 - (int)CONFIG_T::pad_top; r++) { @@ -179,10 +180,10 @@ void winograd_conv2d_3x3_kernel_cl( #pragma unroll for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { - hls_register int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel); + [[intel::fpga_register]] int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel); // Hadamard product between transformed input tile and kernel - hls_register typename CONFIG_T::accum_t Y[16]; + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[16]; #pragma unroll for (int i = 0; i < 16; i++) { Y[i] = static_cast(D[i] * weights[filter_offset + i]); @@ -215,14 +216,14 @@ void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width data_T data_col[CONFIG_T::n_chan], const int row, const int col) { // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations - hls_register int index = 0; + [[intel::fpga_register]] int index = 0; ChannelLoop: #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { - hls_register int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; - hls_register int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; + [[intel::fpga_register]] int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; + [[intel::fpga_register]] int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) { data_col[index++] = @@ -256,10 +257,10 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - hls_register data_T data_col[CONFIG_T::n_chan]; + [[intel::fpga_register]] data_T data_col[CONFIG_T::n_chan]; im2col_2d_pointwise_cl(data, data_col, row, col); - hls_register res_T res_col[CONFIG_T::n_filt]; + [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; dense_resource(data_col, res_col, weights, biases); FiltLoop: diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h index ba50a631b..a66423cef 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h @@ -12,7 +12,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - hls_register typename 
CONFIG_T::accum_t acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; InitAccum: #pragma unroll @@ -20,8 +20,8 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], acc[i] = (typename CONFIG_T::accum_t)(biases[i]); } - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + [[intel::fpga_register]] int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + [[intel::fpga_register]] data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; #pragma unroll for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { @@ -36,7 +36,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma nofusion #pragma speculated_iterations 0 for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; CompressedMultLoop: #pragma unroll for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { @@ -49,7 +49,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], inputs[is][im] = inputs[is + 1][im]; } } - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; + [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; ResetMult: #pragma unroll for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h index bbfc0908e..c50c34601 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h @@ -7,7 +7,7 @@ namespace nnet { // Returns the maximum value from an array of size N template T max(T x[N]) { - hls_register T y = x[0]; + [[intel::fpga_register]] T y = x[0]; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -22,7 +22,7 @@ template T max(T x[N]) { // Returns the mean value of an array of size N template T avg(T (&x)[N]) { - hls_register T y = 0; + [[intel::fpga_register]] T y = 0; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -38,7 +38,7 @@ template T avg(T (&x)[N]) { // Returns the mean value of an array of size N // Overload of the above function; using a wider accumulator than the input to avoid overflow template ac_int avg(ac_int (&x)[N]) { - hls_register ac_int tmp = 0; + [[intel::fpga_register]] ac_int tmp = 0; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -57,7 +57,7 @@ template ac_int avg(ac_int (&x)[N]) { // Returns the mean value of an array of size N // Overload of the above function; using a wider accumulator than the input to avoid overflow template ac_fixed avg(ac_fixed (&x)[N]) { - hls_register ac_fixed tmp = 0; + [[intel::fpga_register]] ac_fixed tmp = 0; // Due to loop dependencies, pipelining & unrolling is not possible // Explictily disabling pipeline significantly reduces resource usage @@ -136,10 +136,10 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF #pragma unroll #pragma 
disable_loop_pipelining for (int inp_col = 0; inp_col < padded_width; inp_col += CONFIG_T::stride_width) { - hls_register data_T pool[CONFIG_T::pool_width]; + [[intel::fpga_register]] data_T pool[CONFIG_T::pool_width]; // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling - hls_register unsigned img_overlap = 0; + [[intel::fpga_register]] unsigned img_overlap = 0; PoolWidthLoop: #pragma unroll @@ -178,7 +178,7 @@ void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T r #pragma unroll #pragma disable_loop_pipelining for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { - hls_register data_T pool[CONFIG_T::n_in]; + [[intel::fpga_register]] data_T pool[CONFIG_T::n_in]; InputWidthLoop: #pragma unroll @@ -241,10 +241,10 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ #pragma unroll #pragma disable_loop_pipelining for (int inp_width = 0; inp_width < padded_width; inp_width += CONFIG_T::stride_width) { - hls_register data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + [[intel::fpga_register]] data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling - hls_register unsigned img_overlap = 0; + [[intel::fpga_register]] unsigned img_overlap = 0; PoolHeightLoop: #pragma unroll @@ -301,7 +301,7 @@ void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * #pragma unroll #pragma disable_loop_pipelining for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { - hls_register data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + [[intel::fpga_register]] data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; InputLoop: #pragma unroll diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h index 464c6d415..340a8eda1 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h @@ -104,18 +104,18 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor; // A matrix containing the values of matrix product between input (x) and weights (weights), for update, reset and // candidate state gates, for each of the units - hls_register typename CONFIG_T::accum_t mat_mul_x_w[3 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mat_mul_x_w[3 * CONFIG_T::n_units]; nnet::dense_resource(x, mat_mul_x_w, weights, bias); // A matrix containing the values of matrix product between previou state (h) and recurrent weights (recurrent_weights), // for update, reset and candidate state gates, for each of the units - hls_register typename CONFIG_T::accum_t mat_mul_h_wr[3 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t mat_mul_h_wr[3 * CONFIG_T::n_units]; nnet::dense_resource( h, mat_mul_h_wr, recurrent_weights, recurrent_bias); // A vector containing both the values of z(t) and r(t) for every state - hls_register typename CONFIG_T::accum_t z_r[2 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t z_r[2 * CONFIG_T::n_units]; // Add the individual vectors from the multiplication of mat_mul_x_w = Wx*x(t) and mat_mul_h_wr = Wh*h(t-1) // Unrolled fully, no DSPs used @@ -125,12 +125,12 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], } // Activation 
on z(t) and r(t) - hls_register typename CONFIG_T::accum_t z_r_act[2 * CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t z_r_act[2 * CONFIG_T::n_units]; CONFIG_T::template activation_recr::activation(z_r, z_r_act); // A matrix containing the values of Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h - hls_register typename CONFIG_T::accum_t hadamard_r_h[CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t hadamard_r_h[CONFIG_T::n_units]; #pragma unroll recurrent_unroll_factor for (int i = 0; i < (CONFIG_T::n_units); i++) { hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units]; @@ -145,7 +145,7 @@ void gru_cell(data_T x[CONFIG_T::n_in], res_T h[CONFIG_T::n_units], } // Activation on candidate state - hls_register typename CONFIG_T::accum_t h_cand_act[CONFIG_T::n_units]; + [[intel::fpga_register]] typename CONFIG_T::accum_t h_cand_act[CONFIG_T::n_units]; CONFIG_T::template activation::activation(h_cand, h_cand_act); @@ -163,8 +163,8 @@ void gru(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_outputs * CONFIG_T:: const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { - hls_register data_T x[CONFIG_T::n_in]; - hls_register res_T h[CONFIG_T::n_units]; + [[intel::fpga_register]] data_T x[CONFIG_T::n_in]; + [[intel::fpga_register]] res_T h[CONFIG_T::n_units]; #pragma unroll for (int i = 0; i < CONFIG_T::n_units; i++) { @@ -233,17 +233,17 @@ void simple_rnn_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T: const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { // Weight multiplication - typename CONFIG_T::accum_t afterW[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t afterW[CONFIG_T::n_out] [[intel::fpga_register]]; multiply_W( inputs, afterW, kernel); // Bias addition - typename CONFIG_T::accum_t afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; add_bias( afterW, afterBias, bias); // Hidden state - typename CONFIG_T::accum_t hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; multiply_U(hidden_state, hiddenCand, rec_kernel); @@ -261,10 +261,10 @@ void simple_rnn(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[C const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out], const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out], const typename CONFIG_T::bias_t bias[CONFIG_T::n_out]) { - res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; - res_T hidden_state_temp[CONFIG_T::n_out] hls_register; - res_T h[CONFIG_T::n_out] hls_register; - data_T in[CONFIG_T::n_in] hls_register; + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] [[intel::fpga_register]]; + res_T hidden_state_temp[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T h[CONFIG_T::n_out] [[intel::fpga_register]]; + data_T in[CONFIG_T::n_in] [[intel::fpga_register]]; // Set initially hidden state (output) to zero INIT_LOOP: @@ -360,39 +360,39 @@ void lstm_cell(data_T inputs[CONFIG_T::n_in], res_T hidden_state[CONFIG_T::n_out const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { // Internals definitions - typename CONFIG_T::accum_t i_afterW[CONFIG_T::n_out] 
hls_register; - typename CONFIG_T::accum_t i_afterBias[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_afterW[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_afterBias[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_afterW[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_afterBias[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_afterW[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_afterBias[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t i_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_afterW[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_afterBias[CONFIG_T::n_out] [[intel::fpga_register]]; // Hidden state Gate candidates, intermediate variables - typename CONFIG_T::accum_t i_hiddenCand[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_hiddenCand[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_hiddenCand[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_hiddenCand[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_hiddenCand[CONFIG_T::n_out] [[intel::fpga_register]]; // After addition, intermediate variables - typename CONFIG_T::accum_t i_afterAdd[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t f_afterAdd[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t c_afterAdd[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t o_afterAdd[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t i_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t f_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t c_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t o_afterAdd[CONFIG_T::n_out] [[intel::fpga_register]]; // Gate outputs - typename CONFIG_T::accum_t gate_i[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_f[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_c[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_o[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_ic[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t gate_forget[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t h[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t gate_i[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_f[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_c[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_o[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_ic[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t gate_forget[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t h[CONFIG_T::n_out] [[intel::fpga_register]]; 
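// For illustration only -- a minimal sketch, not part of this patch, of the register-attribute
// migration these hunks perform: the Intel HLS macro hls_register becomes the oneAPI attribute
// [[intel::fpga_register]], which asks the compiler to carry the variable in registers rather
// than in on-chip RAM. The attribute is accepted either before the declaration or after the
// declarator, matching the two placements used in this file. All names below are hypothetical.
template <typename accum_t, int N>
void copy_through_registers(const accum_t (&in)[N], accum_t (&out)[N]) {
    [[intel::fpga_register]] accum_t tmp[N]; // attribute written before the declaration
    accum_t acc[N] [[intel::fpga_register]]; // attribute written after the declarator
    #pragma unroll
    for (int i = 0; i < N; i++) {
        tmp[i] = in[i];  // stage the input in registers
        acc[i] = tmp[i]; // copy through the second register array
        out[i] = acc[i];
    }
}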
// Intermediate variable cell calculation - typename CONFIG_T::accum_t cell_act_multp[CONFIG_T::n_out] hls_register; - typename CONFIG_T::accum_t cell_act_add[CONFIG_T::n_out] hls_register; + typename CONFIG_T::accum_t cell_act_multp[CONFIG_T::n_out] [[intel::fpga_register]]; + typename CONFIG_T::accum_t cell_act_add[CONFIG_T::n_out] [[intel::fpga_register]]; //-----------Gate I Calculations // Weight multiplication @@ -518,13 +518,13 @@ void lstm(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], res_T res[CONFIG_ const typename CONFIG_T::weight_t RWO[CONFIG_T::n_out * CONFIG_T::n_out], const typename CONFIG_T::bias_t BI[CONFIG_T::n_out], const typename CONFIG_T::bias_t BF[CONFIG_T::n_out], const typename CONFIG_T::bias_t BC[CONFIG_T::n_out], const typename CONFIG_T::bias_t BO[CONFIG_T::n_out]) { - res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; - res_T hidden_state_temp[CONFIG_T::n_out] hls_register; - res_T cell_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register; - res_T cell_state_temp[CONFIG_T::n_out] hls_register; - res_T h[CONFIG_T::n_out] hls_register; - res_T c[CONFIG_T::n_out] hls_register; - data_T in[CONFIG_T::n_in] hls_register; + res_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] [[intel::fpga_register]]; + res_T hidden_state_temp[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T cell_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] [[intel::fpga_register]]; + res_T cell_state_temp[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T h[CONFIG_T::n_out] [[intel::fpga_register]]; + res_T c[CONFIG_T::n_out] [[intel::fpga_register]]; + data_T in[CONFIG_T::n_in] [[intel::fpga_register]]; // Set initially hidden state (output) to zero INIT_LOOP: From 62c5ecb3cedeb30b93ae133948aea48aa9c93b33 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 11:39:08 -0600 Subject: [PATCH 020/100] define array in nnet_types for oneAPI --- hls4ml/backends/fpga/fpga_types.py | 2 +- hls4ml/backends/oneapi/oneapi_types.py | 6 ++++-- hls4ml/templates/oneapi/firmware/defines.h | 2 -- hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h | 3 +++ hls4ml/templates/quartus/firmware/defines.h | 2 -- hls4ml/templates/vivado/firmware/defines.h | 2 -- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 41f3cd12e..ceac0b5e4 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -172,7 +172,7 @@ def convert_precision(self, precision_converter): class PackedTypeConverter(TypeDefinition, TypePrecisionConverter): def definition_cpp(self): n_elem_expr = '/' if self.unpack else '*' - return 'typedef array<{precision}, {n_elem}> {name};\n'.format( + return 'typedef nnet::array<{precision}, {n_elem}> {name};\n'.format( name=self.name, precision=self.precision.definition_cpp(), n_elem=str(self.n_elem) + n_elem_expr + str(self.n_pack), diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 4559c1f9e..103f015c4 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -80,7 +80,9 @@ def definition_cpp(self, name_suffix='', as_reference=False): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.type.name} = array<{self.type.precision.definition_cpp()}, {self.size_cpp()}>;\n' + lines += ( + indent + f'using {self.type.name} = 
nnet::array<{self.type.precision.definition_cpp()}, {self.size_cpp()}>;\n' + ) lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' @@ -103,7 +105,7 @@ def definition_cpp(self, name_suffix='', as_reference=True): def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.name} = std::array<{self.type.name}, {self.size_cpp()}>;\n' + lines += indent + f'using {self.name} = nnet::array<{self.type.name}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + f'{self.type}, {pipe_min_size}>;\n' diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index b88fca49b..05b98cda2 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -10,8 +10,6 @@ // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" -using std::array; - // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h index 221055938..cd572f0c7 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -1,12 +1,15 @@ #ifndef NNET_TYPES_H_ #define NNET_TYPES_H_ +#include #include #include #include namespace nnet { +template using array = std::array; + /* * HLS Shift Register Implementation * To verify a shift register is used in hardware, go to report.html > Area Analysis of System diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h index a465f2716..c3fe4ec40 100644 --- a/hls4ml/templates/quartus/firmware/defines.h +++ b/hls4ml/templates/quartus/firmware/defines.h @@ -36,8 +36,6 @@ template using stream_out = ihc::stream_out; // Include nnet::array - a custom array-like struct, mainly used with io_stream #include "nnet_utils/nnet_types.h" -using nnet::array; - // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision diff --git a/hls4ml/templates/vivado/firmware/defines.h b/hls4ml/templates/vivado/firmware/defines.h index e0a75ec64..1f11b0209 100644 --- a/hls4ml/templates/vivado/firmware/defines.h +++ b/hls4ml/templates/vivado/firmware/defines.h @@ -7,8 +7,6 @@ #include #include -using nnet::array; - // hls-fpga-machine-learning insert numbers // hls-fpga-machine-learning insert layer-precision From d203b42af702fc9b14ad47354c742f796306b088 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 13:45:35 -0600 Subject: [PATCH 021/100] fix parallel conv2d --- .../firmware/nnet_utils/nnet_activation.h | 3 +- .../oneapi/firmware/nnet_utils/nnet_conv2d.h | 6 +- .../nnet_utils/nnet_conv2d_resource.h | 65 ++++++++++--------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index ef22a6b20..3fbeeaa66 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -404,7 +404,8 @@ template void softsign(const data // ************************************************* // ELU Activation // 
************************************************* -template void elu(const data_T &data, const res_T alpha, res_T &res) { +template +void elu(const data_T &data, const typename res_T::value_type alpha, res_T &res) { // Initialize the lookup table #include "activation_tables/elu_table.tb" // Index into the lookup table based on data diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h index 3aa71a74b..0038ce7d1 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -50,8 +50,7 @@ struct conv2d_config { }; template -void conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { @@ -59,8 +58,7 @@ void conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T: } template -void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h index 85c4c78d9..8c7fdcad2 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h @@ -13,10 +13,8 @@ enum class conv2d_implementation { combination, im2col, winograd }; // im2col - General-purpose 2D Convolution algorithm // **************************************************************** -template -void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan], const int row, - const int col) { +template +void im2col_2d_cl(const data_T &data, data_col_T &data_col, const int row, const int col) { // im2col can be unrolled fully, since number of parallel executions = filt_h x filt_w x n_chann ~ O(100) and very little // DSP usage @@ -49,8 +47,7 @@ void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_ } template -void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_2d_im2col_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { @@ -62,6 +59,10 @@ void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CO static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), CONFIG_T::out_height); + using data_col_T = + array; + using res_col_T = array; + HeightLoop: #pragma 
unroll pfr for (int i = 0; i < CONFIG_T::out_height; i++) { @@ -73,12 +74,11 @@ void conv_2d_im2col_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CO // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - [[intel::fpga_register]] data_T - data_col[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan]; - im2col_2d_cl(data, data_col, i, j); + [[intel::fpga_register]] data_col_T data_col; + im2col_2d_cl(data, data_col, i, j); - [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); // Unroll fully, since // (1) n_filt is usually low in io_parallel (< 32) @@ -122,8 +122,7 @@ inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D template void winograd_conv2d_3x3_kernel_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { @@ -145,7 +144,7 @@ void winograd_conv2d_3x3_kernel_cl( int offset = CONFIG_T::n_filt * i; #pragma unroll for (int f = 0; f < CONFIG_T::n_filt; f++) { - res[offset + f] = static_cast(biases[f]); + res[offset + f] = static_cast(biases[f]); } } @@ -159,7 +158,7 @@ void winograd_conv2d_3x3_kernel_cl( #pragma unroll for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { // Get current 4x4 tile - [[intel::fpga_register]] data_T T[16]; + [[intel::fpga_register]] typename data_T::value_type T[16]; [[intel::fpga_register]] typename CONFIG_T::accum_t D[16]; [[intel::fpga_register]] uint8_t p = 0; @@ -176,7 +175,7 @@ void winograd_conv2d_3x3_kernel_cl( } // Transform input tile - winograd_transform_input_tile_3x3_kernel(T, D); + winograd_transform_input_tile_3x3_kernel(T, D); #pragma unroll for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { @@ -191,16 +190,20 @@ void winograd_conv2d_3x3_kernel_cl( // Explicitly transform intermediate result Z = A'YA and save to output res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] += - static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10]); + static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + + Y[10]); if ((col + 1) < CONFIG_T::out_height) res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] += - static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11]); + static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - + Y[11]); if ((row + 1) < CONFIG_T::out_width) res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] += - static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14]); + static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - + Y[13] - Y[14]); if ((row + 1) < (CONFIG_T::out_width) && (col + 1) < CONFIG_T::out_height) res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] += - static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - Y[13] + Y[14]); + static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - + Y[13] + Y[14]); } } } @@ -211,9 +214,8 @@ void winograd_conv2d_3x3_kernel_cl( // 2D Convolution 
for 1x1 kernels using optimized im2col // **************************************************************** -template -void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T data_col[CONFIG_T::n_chan], const int row, const int col) { +template +void im2col_2d_pointwise_cl(const data_T &data, data_col_T &data_col, const int row, const int col) { // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations [[intel::fpga_register]] int index = 0; @@ -235,8 +237,7 @@ void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width } template -void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void pointwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); @@ -246,6 +247,9 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i static constexpr int pfc = MIN(CONFIG_T::parallelisation_factor, CONFIG_T::out_width); static constexpr int pfr = MIN((CONFIG_T::parallelisation_factor / pfc), CONFIG_T::out_height); + using data_col_T = array; + using res_col_T = array; + HeightLoop: #pragma unroll pfr for (int row = 0; row < CONFIG_T::out_height; row++) { @@ -257,11 +261,11 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i // See Intel's HLS - Loop Best Practices // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html - [[intel::fpga_register]] data_T data_col[CONFIG_T::n_chan]; - im2col_2d_pointwise_cl(data, data_col, row, col); + [[intel::fpga_register]] data_col_T data_col; + im2col_2d_pointwise_cl(data, data_col, row, col); - [[intel::fpga_register]] res_T res_col[CONFIG_T::n_filt]; - dense_resource(data_col, res_col, weights, biases); + [[intel::fpga_register]] res_T res_col; + dense_resource(data_col, res_col, weights, biases); FiltLoop: #pragma unroll @@ -276,8 +280,7 @@ void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::i // Top-level function - handles different implementations // **************************************************************** template -void conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], +void conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t weights[CONFIG_T::impl_filt_height * CONFIG_T::impl_filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { From f983ecea46f760a992c182da83a15b34e49338a7 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 14 Feb 2024 17:41:17 -0600 Subject: [PATCH 022/100] add back the streaming versions of algs, most unconverted --- .../firmware/nnet_utils/nnet_activation.h | 3 +- .../nnet_utils/nnet_activation_stream.h | 661 ++++++++++++++++++ .../nnet_utils/nnet_batchnorm_stream.h | 106 +++ .../firmware/nnet_utils/nnet_conv1d_stream.h | 172 +++++ .../firmware/nnet_utils/nnet_conv2d_stream.h | 238 +++++++ .../firmware/nnet_utils/nnet_dense_stream.h | 23 + .../firmware/nnet_utils/nnet_embed_stream.h | 29 + .../firmware/nnet_utils/nnet_merge_stream.h | 341 
+++++++++ .../firmware/nnet_utils/nnet_padding_stream.h | 83 +++ .../firmware/nnet_utils/nnet_pooling_stream.h | 317 +++++++++ .../nnet_utils/nnet_recurrent_stream.h | 65 ++ .../firmware/nnet_utils/nnet_resize_stream.h | 56 ++ .../oneapi/firmware/nnet_utils/nnet_stream.h | 116 +++ .../nnet_utils/nnet_transpose_stream.h | 32 + 14 files changed, 2240 insertions(+), 2 deletions(-) create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h index 3fbeeaa66..1aceaeb26 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -235,8 +235,7 @@ template void softmax_argmax(cons [[intel::fpga_register]] auto maximum = data[0]; [[intel::fpga_register]] int idx = 0; - #pragma ii 1 - for (int i = 1; i < CONFIG_T::n_in; i++) { + [[intel::initiation_interval(1)]] for (int i = 1; i < CONFIG_T::n_in; i++) { if (data[i] > maximum) { maximum = data[i]; idx = i; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h new file mode 100644 index 000000000..8cb1349fd --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,661 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" + +namespace nnet { + +// ************************************************* +// Linear Activation +// ************************************************* +template void linear() { +LinearActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + out_data[j] = in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ReLU Activation +// ************************************************* +template void relu() { +ReLUActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + ReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if 
(in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(const typename data_pipe::value_type::value_type alpha) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +LeakyReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + LeakyReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(const typename data_pipe::value_type::value_type theta) { +ThresholdedReLUActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + ThresholdedReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(const typename data_pipe::value_type::value_type alpha) { +#include "activation_tables/elu_table.tb" + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +EluActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + EluPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +template void elu() { + elu(data, 1.0, res); +} + +// ************************************************* +// SeLU Activation +// ************************************************* +template void selu() { +#include "activation_tables/selu_table.tb" + +SeluActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SeluPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = typename data_pipe::value_type::value_type(1.0507009873554804934193349852946) * datareg; + } else { + int index = (datareg * 
CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in]) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +PReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + PReLUPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_pipe::value_type::size + j] * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template void softplus() { +#include "activation_tables/softplus_table.tb" + +SoftplusActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SoftplusPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); + [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template void softsign() { +#include "activation_tables/softsign_table.tb" + + static const int MAX_VALUE = 8; + +SoftsignActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SoftsignPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type absValue; + ; + if (in_data[j] < 0) { + absValue = -in_data[j]; + } else { + absValue = in_data[j]; + } + ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data[j] < 0) { + out_data[j] = static_cast(-softsign_table[index]); + } else { + out_data[j] = static_cast(softsign_table[index]); + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +template void softmax_stable() { +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + + [[intel::fpga_register]] typename data_pipe::value_type::value_type 
data_array[data_pipe::value_type::size]; + +SoftmaxArrayLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + auto in_pack = data_pipe::read(); + + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + [[intel::fpga_register]] typename data_pipe::value_type::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed + d_xi_xmax[data_pipe::value_type::size]; + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + exp_res[j] = exp_table[softmax_stable_idx_from_real_val( + d_xi_xmax[j])]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>( + exp_res, op_add); + + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + typename res_pipe::value_type out_pack; + + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + + // TODO - Find Quartus-equivalent pragma + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_latency() { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + +SoftmaxExpLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + auto in_pack = data_pipe::read(); + + SoftmaxExpPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + exp_res[j] = + exp_table_latency[softmax_latency_idx_from_real_val( + in_pack[j])]; + } + + // Explicitly sum the results with an adder tree. 
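// For illustration only -- a sketch, not the reduce<> from nnet_common.h invoked just below, of
// the kind of balanced reduction that Op_add is combined with here: the exponentials are summed
// pairwise so the adder depth grows as O(log N) instead of forming a serial accumulation chain.
// tree_sum and its arguments are hypothetical names.
template <typename T, int N> T tree_sum(const T (&x)[N], int lo = 0, int len = N) {
    if (len == 1)
        return x[lo];
    const int half = len / 2;
    // Sum each half independently, then combine the two partial sums with one adder.
    return tree_sum(x, lo, half) + tree_sum(x, lo + half, len - half);
}
// Under these assumptions, exp_sum would be obtained as tree_sum(exp_res).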
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + + typename res_pipe::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_legacy() { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + + // Index into the lookup table based on data for exponentials + [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; + [[intel::fpga_register]] typename data_pipe::value_type::value_type data_cache[data_pipe::value_type::size]; + +SoftmaxInitLoop: + [[intel::initiation_interval(1)]] for (unsigned s = 0; s < CONFIG_T::n_in / data_pipe::value_type::size; s++) { + auto in_pack = data_pipe::read(); + + SoftmaxInitPackLoop: + #pragma unroll + for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma unroll + for (int i = 0; i < data_pipe::value_type::size; i++) { + SoftmaxExpInner: + #pragma unroll + for (int j = 0; j < data_pipe::value_type::size; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = ((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table_legacy[index]; + } + exp_res[i] += exp_diff_res; + } + } + + typename res_pipe::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_pack[j] = static_cast(invert_table_legacy[exp_res_index]); + } + + res_pipe::write(out_pack); + } +} + +template void softmax_argmax() { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + #pragma unroll + for (int i = 0; i < res_pipe::value_type::size; i++) { + out_data[i] = static_cast(0); + } + + [[intel::fpga_register]] typename data_pipe::value_type::value_type maximum = in_data[0]; + [[intel::fpga_register]] int idx = 0; + + [[intel::initiation_interval(1)]] for (int i = 1; i < res_pipe::value_type::size; i++) { + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = static_cast(1); + res_pipe::write(out_data); + } +} + +template void softmax() { + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + 
softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + default: + softmax_stable(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void dense_tanh() { +#include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +TanHActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + TanHPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = tanh_table[index]; + else + out_data[j] = -tanh_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template void sigmoid() { +#include "activation_tables/sigmoid_table.tb" + static const int MAX_VALUE = 8; + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +SigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + SigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = sigmoid_table[index]; + else + out_data[j] = 1 - sigmoid_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +// Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations +template void hard_sigmoid() { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + 
CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res_pipe::write(out_data); + } +} + +template void hard_tanh() { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + auto in_data = data_pipe::read(); + typename res_pipe::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh() { +BinaryTanHActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename res_pipe::value_type out_data; + + BinaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 0) + out_data[j] = static_cast(1); + else + out_data[j] = static_cast(-1); + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh() { +TernaryTanHActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename res_pipe::value_type out_data; + + TernaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < res_pipe::value_type::size; j++) { + if (in_data[j] > 1) + out_data[j] = static_cast(1); + else if (in_data[j] <= -1) + out_data[j] = static_cast(-1); + else + out_data[j] = static_cast(0); + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 000000000..0f5970bfe --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,106 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** +template +void normalize(stream &data, stream &res, const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; + CONFIG_T::template product::limit(multiplier_limit); + +BatchNormLoop: + #pragma ii pipeline + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + + BatchNormpack: + #pragma 
unroll + for (int j = 0; j < data_T::size; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * data_T::size + j; + else + norm_index = j % CONFIG_T::n_filt; + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(stream &data, stream, CONFIG_T::n_scale_bias>> &res, + const typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { + +BinaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * data_T::size + j; + else + norm_index = j % CONFIG_T::n_filt; + + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(stream &data, stream, CONFIG_T::n_scale_bias>> &res, + const typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], + const typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { + +TernaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * data_T::size + j; + else + norm_index = j % CONFIG_T::n_filt; + + if (in_data[j] > threshold_hi[norm_index]) + out_data[j] = 1; + else if (in_data[j] <= threshold_lo[norm_index]) + out_data[j] = -1; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 000000000..28e9f6b87 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,172 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line the buffer during the shift line buffer operation + * kernel_window - array of values from the input curently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_1d(typename data_T::value_type shift_buffer[CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]) { +/* + * Manually shift kernel_window by one step to the left + * Not possible to use nnet::shift_reg as the kernel window is convolved with the kernel weights using dense matrix + * multiplication Dense matrix multiplication is only implemented for arrays However, provided certain timing constrains are + * met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register To verify, see + * synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for 
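    // Illustrative window layout (filt_width = 3, n_chan = 2, values hypothetical): the window
    // is stored channel-interleaved as {x0c0, x0c1, x1c0, x1c1, x2c0, x2c1}; this loop copies
    // columns 1..filt_width-1 one step to the left, and the loop after it writes the freshly
    // popped channel values into the right-most column, so the window always holds the most
    // recent filt_width pixels of every channel.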
(int col = 0; col < CONFIG_T::filt_width - 1; col++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[col * CONFIG_T::n_chan + channel] = kernel_window[(col + 1) * CONFIG_T::n_chan + channel]; + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + channel] = shift_buffer[channel]; + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels line_buffer - chained array of shift registers, one for each row of the kernel and channel shift_buffer - + * array elements popped from the line the buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[channel] = in_elem[channel]; + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the kernel and channel kernel_window - array of values from the input curently convolved + * with the kernel weights - Conv1D layer weights biases - Conv1D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Matrix mulitplication - performs dense matrix multiplication between the current input window and + * kernel weights (4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_1d( + const data_T &in_elem, stream &res_stream, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan], + const typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::filt_width - 1; + + // X position pixel + static int pX = 0; + + // X strides + static int sX = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_chan]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > 
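    // Worked example of this window-ready test (numbers are illustrative): with filt_width = 3,
    // lShiftX = 2, so the first dense multiplication fires on the third pixel (pX = 2); because
    // sX is rewound by stride_width - 1 below, a stride of 2 makes subsequent outputs fire at
    // pX = 4, 6, ... -- i.e. one output every stride_width input pixels.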
(lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] typename res_T::value_type res_out[CONFIG_T::n_filt]; + dense_resource( + kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_1d_cl(stream &data, stream &res, + const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[CONFIG_T::n_chan]; + [[intel::fpga_register]] static typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + static const data_T padds(0); + +// Input image left-side padding +PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_1d(padds, res, line_buffer, kernel_window, weights, biases); + } + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_1d(data.read(), res, line_buffer, kernel_window, weights, biases); + } + +// Input image right-side padding +PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_1d(padds, res, line_buffer, kernel_window, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 000000000..1090f9bda --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,238 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line the buffer during the shift line buffer operation + * kernel_window - array of values from the input curently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_2d( + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan]) { +/* + * Manually shift kernel_window by one step to the left + * Not possible to use nnet::shift_reg as the kernel window is convolved with the kernel weights using dense matrix + * multiplication Dense matrix multiplication is only implemented for arrays However, provided certain timing constrains are + * met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register To verify, see + * synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_width - 1; col++) { + 
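        // Indexing reminder (illustrative): kernel_window is laid out as
        // [row * filt_width * n_chan + col * n_chan + channel], so this loop nest shifts every
        // column of every row one step to the left before the push loop below writes the newly
        // popped pixels into the right-most column of each row.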
KernelShiftHeight: + #pragma unroll + for (int row = 0; row < CONFIG_T::filt_height; row++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + col * CONFIG_T::n_chan + channel] = + kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + (col + 1) * CONFIG_T::n_chan + channel]; + } + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushHeight: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_height; col++) { + KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + col * CONFIG_T::filt_width * CONFIG_T::n_chan + + channel] = shift_buffer[col][channel]; + } + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels line_buffer - chained array of shift registers, one for each row of the kernel and channel shift_buffer - + * array elements popped from the line the buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_2d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[CONFIG_T::filt_height - 1][channel] = in_elem[channel]; + } + +// Shift line buffer and save popped values to shift buffer +LineBufferDataIn: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + LineBufferShift: + #pragma unroll + for (unsigned col = 1; col < CONFIG_T::filt_height; col++) { + // Shift the line buffer, return the popped pixel + typename data_T::value_type pop = + line_buffer[col - 1][channel].shift(shift_buffer[CONFIG_T::filt_height - col][channel]); + + // Place popped pixed into the shift buffer, one row above + shift_buffer[CONFIG_T::filt_height - col - 1][channel] = pop; + } + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the kernel and channel kernel_window - array of values from the input curently convolved + * with the kernel weights - Conv1D/Conv2D layer weights biases - Conv1D/Conv2D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Matrix mulitplication - performs dense matrix multiplication between the current input window and + * kernel weights 
(4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_2d( + const data_T &in_elem, stream &res_stream, + nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::filt_width - 1; + static constexpr int lShiftY = CONFIG_T::filt_height - 1; + + // X, Y position pixels + static int pX = 0; + static int pY = 0; + + // X, Y strides + static int sX = 0; + static int sY = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]; + nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_2d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] typename res_T::value_type res_out[CONFIG_T::n_filt]; + dense_resource( + kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) && + (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) { + pX = 0; + sX = 0; + pY = 0; + sY = 0; + // Reached end of row + } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + pY++; + sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1); + // Same row, same colum, therefore, move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? 
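        // Position bookkeeping (illustrative summary): pX/pY walk over the padded image of size
        // (in_width + pad_left + pad_right) x (in_height + pad_top + pad_bottom); pX and sX are
        // cleared at the end of every row, everything is cleared at the end of the image, and
        // whenever a stride counter reaches filt_{width,height} - 1 it is pulled back by
        // stride - 1 rather than cleared, which spaces the emitted windows by the stride.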
(sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_2d_cl(stream &data, stream &res, + const typename CONFIG_T::weight_t + weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan]; + [[intel::fpga_register]] static + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + static const data_T padds(0); + +// Padding above input image +PaddingTopHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::pad_top; row++) { + PaddingTopWidth: + for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + } + +ReadInputHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::in_height; row++) { + // Input image left-side padding + PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + + // Read input image + ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_2d(data.read(), res, line_buffer, kernel_window, weights, biases); + } + + // Input image right-side padding + PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + } + +// Padding below input image +PaddingBottomHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::pad_bottom; row++) { + PaddingBottomWidth: + for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, res, line_buffer, kernel_window, weights, biases); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h new file mode 100644 index 000000000..85b734624 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,23 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +// Note: DataPack logic removed, at least in the initial version +template +void dense_resource(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + [[intel::fpga_register]] typename res_pipe::value_type res; + [[intel::fpga_register]] auto data = data_pipe::read() + dense_resource(data, res, weights, biases); + res_pipe::write(res); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h new file mode 100644 index 000000000..51e54e991 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h @@ -0,0 +1,29 @@ +#ifndef NNET_EMBED_STREAM_H_ +#define NNET_EMBED_STREAM_H_ + +namespace nnet { + +template +void embedding(stream &data, stream &res, + const typename CONFIG_T::embeddings_t 
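              // (embeddings is the flattened [vocab_size x n_out] lookup table; illustrative
              //  example, values hypothetical: with n_out = 4, an input token index of 7 reads
              //  entries 28..31, i.e. row 7 of the table)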
embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + data_T in_data = data.read(); + +InputSequence: + #pragma ii CONFIG_T::reuse_factor + for (int j = 0; j < data_T::size; j++) { + + res_T res_pack; + + DenseEmbedding: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i]; + } + + res.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h new file mode 100644 index 000000000..aeafc00ca --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h @@ -0,0 +1,341 @@ +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +namespace nnet { + +template +void add(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + AddPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(in_data1[j] + in_data2[j]); + } + + res.write(out_data); + } +} + +template +void subtract(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + SubtractPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(in_data1[j] - in_data2[j]); + } + + res.write(out_data); + } +} + +template +void multiply(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MultLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + MultPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(in_data1[j] * in_data2[j]); + } + + res.write(out_data); + } +} + +template +void average(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AvgLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + AvgPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = + static_cast((in_data1[j] + in_data2[j]) / (typename res_T::value_type)2); + } + + res.write(out_data); + } +} + +template +void maximum(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MaxLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T 
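        // Note on the elementwise merges (illustrative): every iteration consumes exactly one
        // pack from each input stream and writes one output pack, so for n_elem = 16 and a pack
        // size of 4 the loop body runs 4 times; the assert above enforces that both inputs and
        // the output use the same pack size.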
in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + MaxPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(out_data[j] = (in_data1[j] > in_data2[j]) ? in_data1[j] + : in_data2[j]); + } + + res.write(out_data); + } +} + +template +void minimum(stream &data1, stream &data2, stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MinLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + + [[intel::fpga_register]] res_T out_data; + + MinPack: + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = static_cast(out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] + : in_data2[j]); + } + + res.write(out_data); + } +} + +template +void concatenate1d(stream &data1, stream &data2, stream &res) { + [[intel::fpga_register]] res_T out_data; + +ConcatLoop1: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + ConcatPack1: + #pragma unroll + for (int j = 0; j < input1_T::size; j++) { + out_data[j + (i * input1_T::size)] = static_cast(in_data1[j]); + } + } + +ConcatLoop2: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + ConcatPack2: + #pragma unroll + for (int j = 0; j < input2_T::size; j++) { + out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = + static_cast(in_data2[j]); + } + } + res.write(out_data); +} + +template +void concatenate2d_0(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight1: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + res.write(out_data); + } + +ConcatLoopHeight2: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[input1_T::size + k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } +} + +template +void concatenate2d(stream &data1, stream &data2, stream &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight1: + for (int i 
= 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] res_T out_data; + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + res.write(out_data); + } + } + +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + res.write(out_data); + } + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(stream &data1, stream &data2, stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] input1_T in_data1 = data1.read(); + [[intel::fpga_register]] input2_T in_data2 = data2.read(); + [[intel::fpga_register]] res_T out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1_T::size; k++) { + out_data[k] = static_cast(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2_T::size; k++) { + out_data[input1_T::size + k] = static_cast(in_data2[k]); + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(stream &data1, stream &data2, stream &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h new file mode 100644 index 000000000..8990a3339 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h @@ -0,0 +1,83 @@ +#ifndef NNET_PADDING_STREAM_H_ +#define NNET_PADDING_STREAM_H_ + +namespace nnet { + +template inline void fill_zero(stream &res) { + [[intel::fpga_register]] res_T res_part; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_chan; i++) { + res_part[i] = 0; + } + res.write(res_part); +} + +template inline void fill_data(stream &data, stream &res) { + [[intel::fpga_register]] data_T data_part = data.read(); + 
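    // Illustrative padding trace (numbers hypothetical): with pad_left = pad_right = 1 and
    // in_width = 4, zeropad1d_cl below writes six packs to the output stream -- one all-zero
    // pack from fill_zero, the four input packs copied by fill_data, then one more all-zero
    // pack.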
[[intel::fpga_register]] res_T res_part; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_chan; i++) { + res_part[i] = data_part[i]; + } + res.write(res_part); +} + +template void zeropad1d_cl(stream &data, stream &res) { +PadLeft: + for (int i = 0; i < CONFIG_T::pad_left; i++) { + fill_zero(res); + } + +CopyMain: + for (int i = 0; i < CONFIG_T::in_width; i++) { + fill_data(data, res); + } + +PadRight: + for (int i = 0; i < CONFIG_T::pad_right; i++) { + fill_zero(res); + } +} + +template void zeropad2d_cl(stream &data, stream &res) { +PadTop: + #pragma loop_coalesce 2 + for (int i = 0; i < CONFIG_T::pad_top; i++) { + PadTopWidth: + for (int j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } + +PadMain: + #pragma loop_coalesce 2 + for (int i = 0; i < CONFIG_T::in_height; i++) { + + PadLeft: + for (int j = 0; j < CONFIG_T::pad_left; j++) { + fill_zero(res); + } + + CopyMain: + for (int j = 0; j < CONFIG_T::in_width; j++) { + fill_data(data, res); + } + + PadRight: + for (int j = 0; j < CONFIG_T::pad_right; j++) { + fill_zero(res); + } + } + +PadBottom: + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + PadBottomWidth: + for (int j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 000000000..ffaf74b2f --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,317 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +#include "nnet_conv1d_stream.h" +#include "nnet_conv2d_stream.h" +#include "nnet_pooling.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void compute_pool_buffer_1d(in_element, res_stream, line_buffer, kernel_window) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the pool and channel kernel_window - array of values from the input curently being pooled + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Pooling - performs dense matrix multiplication between the current input window and kernel weights (4) + * Counter housekeeping - performs the required pooling operation + * + */ +template +void compute_pool_buffer_1d(const data_T &in_elem, stream &res_stream, + nnet::shift_reg line_buffer[CONFIG_T::n_filt], + typename data_T::value_type kernel_window[CONFIG_T::pool_width * CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::pool_width - 1; + + // X position pixels + static int pX = 0; + + // X strides + static int sX = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_filt]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full pool window + if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) { + [[intel::fpga_register]] res_T res_pack; + + FiltLoop: + #pragma 
unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_width]; + + // Retrieve data for current channel + PoolLoop: + #pragma unroll + for (int i = 0; i < CONFIG_T::pool_width; i++) { + pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter]; + } + + // Step 3 - Pooling + res_pack[filter] = static_cast( + pool_op(pool_window)); + } + + // Write result to output stream + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template void pooling1d_cl(stream &data, stream &res) { + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[CONFIG_T::n_filt]; + [[intel::fpga_register]] static typename data_T::value_type kernel_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_pool_buffer_1d(data.read(), res, line_buffer, kernel_window); + } +} + +/* + * void compute_pool_buffer_2d(in_element, res_stream, line_buffer, kernel_window) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the pool and channel kernel_window - array of values from the input curently being pooled + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Pooling - performs dense matrix multiplication between the current input window and kernel weights (4) + * Counter housekeeping - performs the required pooling operation + * + */ +template +void compute_pool_buffer_2d( + const data_T &in_elem, stream &res_stream, + nnet::shift_reg line_buffer[CONFIG_T::pool_height - 1] + [CONFIG_T::n_filt], + typename data_T::value_type kernel_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]) { + // Thresholds + static constexpr int lShiftX = CONFIG_T::pool_width - 1; + static constexpr int lShiftY = CONFIG_T::pool_height - 1; + + // X, Y position pixels + static int pX = 0; + static int pY = 0; + + // X, Y strides + static int sX = 0; + static int sY = 0; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::pool_height][CONFIG_T::n_filt]; + nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_2d(shift_buffer, kernel_window); + + // Check to see if we have a full pool window + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) { + [[intel::fpga_register]] res_T res_pack; + + FiltLoop: + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_height * 
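            // (pool_window gathers the pool_height x pool_width samples of one channel out of
            //  the interleaved kernel_window; illustrative example: a 2x2 pool with stride 2 on
            //  an 8x8 input -- which satisfies the asserts in pooling2d_cl, pool size equal to
            //  stride and no padding -- produces a 4x4 output per filter)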
CONFIG_T::pool_width]; + + // Retrieve data for current channel + PoolLoop: + #pragma unroll + for (int i = 0; i < CONFIG_T::pool_height * CONFIG_T::pool_width; i++) { + pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter]; + } + + // Step 3 - Pooling + res_pack[filter] = static_cast( + pool_op( + pool_window)); + } + + // Write result to output stream + res_stream.write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) && + (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) { + pX = 0; + sX = 0; + pY = 0; + sY = 0; + // Reached end of row + } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + pY++; + sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1); + // Same row, same colum, therefore, move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template void pooling2d_cl(stream &data, stream &res) { + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + + // Line buffer and kernel window + [[intel::fpga_register]] static nnet::shift_reg + line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt]; + [[intel::fpga_register]] static + typename data_T::value_type kernel_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + +ReadInputHeight: + #pragma loop_coalesce 2 + for (int row = 0; row < CONFIG_T::in_height; row++) { + // Read input image + ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_pool_buffer_2d(data.read(), res, line_buffer, kernel_window); + } + } +} + +/* + * A function used with Global Pooling + * Returns the value before pooling + * Max : Return the minimal possible value + * Avg : Return 0 + */ +template inline T init_pool_value() { + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + } + case Average: + return 0; + } +} + +/* + * A function used with Global Pooling + * Updates the output pooling value + * Max : Return the maximum between the previous maximum and current input + * Avg : Returns the cumulative sum + */ +template inline T_y reduce_global_pool(T_y y, T_x x) { + if (op == Max) { + return (x > y) ? 
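    // Note on init_pool_value above (illustrative): for Max it builds the most negative
    // representable value by setting only the sign bit (e.g. -128 for an 8-bit signed type),
    // so the first real sample always wins this comparison; for Average the accumulator simply
    // starts at 0 and reduce_global_pool keeps a running sum.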
(T_y)x : y; + } else { + return (T_y)(x + y); + } +} + +/* + * A function used with Global Pooling + * For every filter, it updates the value by summing the current input (Average) or updating the maximum value (Max) + */ +template +void compute_global_pool(const data_T &in_elem, typename CONFIG_T::accum_t data_input[CONFIG_T::n_filt]) { + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = reduce_global_pool( + data_input[i], in_elem[i]); + } +} + +template void global_pooling1d_cl(stream &data, stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + [[intel::fpga_register]] typename CONFIG_T::accum_t data_input[CONFIG_T::n_filt]; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = init_pool_value(); + } + + for (int i = 0; i < CONFIG_T::n_in; i++) { + compute_global_pool(data.read(), data_input); + } + + [[intel::fpga_register]] res_T res_pack; + if (CONFIG_T::pool_op == Average) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i] / CONFIG_T::n_in); + } + } else { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i]); + } + } + + res.write(res_pack); +} + +template void global_pooling2d_cl(stream &data, stream &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + + [[intel::fpga_register]] typename CONFIG_T::accum_t data_input[CONFIG_T::n_filt]; + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + data_input[i] = init_pool_value(); + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::in_width; j++) { + compute_global_pool(data.read(), data_input); + } + } + + [[intel::fpga_register]] res_T res_pack; + if (CONFIG_T::pool_op == Average) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = + static_cast(data_input[i] / (CONFIG_T::in_width * CONFIG_T::in_height)); + } + } else { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_filt; i++) { + res_pack[i] = static_cast(data_input[i]); + } + } + + res.write(res_pack); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h new file mode 100644 index 000000000..9e51d35a0 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h @@ -0,0 +1,65 @@ +#ifndef NNET_RECURRENT_STREAM_H_ +#define NNET_RECURRENT_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recurrent_activation.h" + +namespace nnet { +template +void gru(stream &data_stream, stream &res_stream, + const typename CONFIG_T::weight_t weights[3 * CONFIG_T::n_units * CONFIG_T::n_in], + const typename CONFIG_T::weight_t recurrent_weights[3 * CONFIG_T::n_units * CONFIG_T::n_units], + const typename CONFIG_T::bias_t bias[3 * CONFIG_T::n_units], + const typename CONFIG_T::bias_t recurrent_bias[3 * CONFIG_T::n_units]) { + + [[intel::fpga_register]] typename res_T::value_type h[CONFIG_T::n_units]; + #pragma unroll + for (int i = 0; i < CONFIG_T::n_units; i++) { + h[i] = 0; + } + + [[intel::fpga_register]] typename data_T::value_type x[CONFIG_T::n_in]; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_timesteps * CONFIG_T::n_in / data_T::size; i_in++) { + data_T data_pack = data_stream.read(); + + DataPack: + #pragma unroll + for (int i_pack = 0; 
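        // GRU streaming summary (illustrative): the outer loop consumes
        // n_timesteps * n_in / data_T::size packs, unpacks each one into x and calls gru_cell to
        // update the hidden state h (n_units wide); with return_sequences set, h is written out
        // after every timestep, otherwise only the final h is written once the loop finishes.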
i_pack < data_T::size; i_pack++) { + x[i_pack] = data_pack[i_pack]; + } + + nnet::gru_cell(x, h, weights, recurrent_weights, + bias, recurrent_bias); + + if (CONFIG_T::return_sequences) { + res_T res_pack; + + ResPackRetSeq: + #pragma unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = h[i_pack]; + } + + res_stream.write(res_pack); + } + } + + if (!CONFIG_T::return_sequences) { + res_T res_pack; + + ResPackNoRetSeq: + #pragma unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = h[i_pack]; + } + + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h new file mode 100644 index 000000000..c619edb7c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h @@ -0,0 +1,56 @@ +#ifndef NNET_IMAGE_STREAM_H_ +#define NNET_IMAGE_STREAM_H_ + +#include "nnet_common.h" + +namespace nnet { + +template void resize_nearest(stream &image, stream &resized) { + assert(CONFIG_T::new_height % CONFIG_T::height == 0); + assert(CONFIG_T::new_width % CONFIG_T::width == 0); + + constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height; + constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width; + +ImageHeight: + for (unsigned h = 0; h < CONFIG_T::height; h++) { + [[intel::fpga_register]] data_T data_in_row[CONFIG_T::width]; + + ImageWidth: + for (unsigned i = 0; i < CONFIG_T::width; i++) { + [[intel::fpga_register]] data_T in_data = image.read(); + + ImageChan: + #pragma unroll + for (unsigned j = 0; j < CONFIG_T::n_chan; j++) { + data_in_row[i][j] = in_data[j]; + } + } + + ResizeHeight: + for (unsigned i = 0; i < ratio_height; i++) { + + ImageWidth2: + for (unsigned l = 0; l < CONFIG_T::width; l++) { + + ResizeWidth: + for (unsigned j = 0; j < ratio_width; j++) { + + [[intel::fpga_register]] data_T out_data; + + ResizeChan: + #pragma unroll + for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { + out_data[k] = data_in_row[l][k]; + } + + resized.write(out_data); + } + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h new file mode 100644 index 000000000..2bee64476 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h @@ -0,0 +1,116 @@ +#ifndef NNET_CLONE_H +#define NNET_CLONE_H + +#include "nnet_common.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned n_dupl = 2; +}; + +template +void clone_stream(stream &data, stream &res1, stream &res2) { +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + + ClonePack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + } +} + +template +void clone_stream(stream &data, stream &res1, stream &res2, stream &res3) { +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + res_T out_data3; + + ClonePack: + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = 
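            // (clone_stream fans one tensor out to several consumers: every pack read from the
            //  input is copied element by element into each output stream, so downstream layers
            //  can read the same data independently)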
in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + res3.write(out_data3); + } +} + +template void repack_stream(stream &data, stream &res) { + if (data_T::size == res_T::size) { + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + + [[intel::initiation_interval(1)]] for (int j = 0; j < pack_diff; j++) { + + res_T out_data; + + #pragma unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + [[intel::initiation_interval(1)]] for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h new file mode 100644 index 000000000..5fa126890 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h @@ -0,0 +1,32 @@ +#ifndef NNET_TRANSPOSE_STREAM_H_ +#define NNET_TRANSPOSE_STREAM_H_ + +namespace nnet { + +template void transpose_2d(stream &data, stream &res) { + [[intel::fpga_register]] typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { + [[intel::fpga_register]] data_T in_data = data.read(); + + #pragma unroll + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { + [[intel::fpga_register]] res_T out_data; + + #pragma unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif From 5dd9282427e9b45375eb683a4caced9d8d0f59bc Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 15 Feb 2024 01:45:07 -0600 Subject: [PATCH 023/100] tentatively complete streaming for dense but not functional --- hls4ml/backends/oneapi/oneapi_template.py | 43 +++++++++ hls4ml/backends/oneapi/oneapi_types.py | 21 ++--- .../backends/oneapi/passes/core_templates.py | 89 +++++++++++++++++-- .../backends/oneapi/passes/transform_types.py | 8 +- hls4ml/templates/oneapi/firmware/defines.h | 1 - .../templates/oneapi/firmware/myproject.cpp | 8 ++ .../nnet_utils/nnet_activation_stream.h | 14 ++- .../oneapi/firmware/nnet_utils/nnet_dense.h | 8 +- .../nnet_utils/nnet_dense_compressed.h | 4 +- .../firmware/nnet_utils/nnet_dense_stream.h | 4 +- hls4ml/writer/oneapi_writer.py | 33 +++++-- 11 files changed, 184 insertions(+), 49 deletions(-) create mode 100644 
hls4ml/backends/oneapi/oneapi_template.py diff --git a/hls4ml/backends/oneapi/oneapi_template.py b/hls4ml/backends/oneapi/oneapi_template.py new file mode 100644 index 000000000..184e319f3 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_template.py @@ -0,0 +1,43 @@ +''' +This package includes oneAPI-specific templates +''' + +from hls4ml.backends.template import Template + + +class StreamFunctionCallTemplate(Template): + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_stream_function_template' + super().__init__(name, layer_class, 'stream_function_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + return params + + def transform(self, model, node): + return super().transform(model, node) + + +class TaskSequenceTemplate(Template): + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_task_sequence_template' + super().__init__(name, layer_class, 'tast_sequence_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['config'] = f'config{layer.index}' + params['input_pipe'] = layer.get_input_variable().pipe_name + params['output_pipe'] = layer.get_output_variable().pipe_name + + return params + + def transform(self, model, node): + return super().transform(model, node) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 103f015c4..640ff3c6b 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -3,12 +3,7 @@ ''' import numpy as np -from hls4ml.backends.fpga.fpga_types import ( - InplaceStreamVariableConverter, - PackedType, - StreamVariableConverter, - VariableDefinition, -) +from hls4ml.backends.fpga.fpga_types import PackedType, VariableDefinition from hls4ml.utils.string_utils import convert_to_pascal_case # region ArrayVarable @@ -40,10 +35,10 @@ def convert(self, tensor_var, pragma='', depth=0, n_pack=1): if pragma == 'stream': if depth == 0: depth = np.prod(tensor_var.shape) // tensor_var.shape[-1] - self.pragma = ('stream', depth) + tensor_var.pragma = ('stream', depth) n_elem = tensor_var.shape[-1] else: - self.pragma = pragma + tensor_var.pragma = pragma n_elem = tensor_var.size() n_pack = 1 # ignore any passed value @@ -103,12 +98,12 @@ class OneAPIStreamVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=True): return f'{self.name}{name_suffix}' - def declare_cpp(self, pipe_min_size=0, indent=''): + def declare_cpp(self, indent=''): lines = indent + f'class {self.pipe_id};\n' - lines += indent + f'using {self.name} = nnet::array<{self.type.name}, {self.size_cpp()}>;\n' + # lines += indent + f'using {self.name} = nnet::array<{self.type.name}, {self.size_cpp()}>;\n' lines += indent + ( f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type}, {pipe_min_size}>;\n' + + f'{self.type.name}, {self.pragma[-1]}>;\n' ) return lines @@ -118,12 +113,12 @@ def definition_cpp(self): return f'using {self.name} = {self.input_var.name}' -class OneAPIStreamVariableConverter(StreamVariableConverter): +class OneAPIStreamVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): 
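        # Illustrative output of declare_cpp above (all names here are hypothetical): for a
        # stream variable "layer2_out" of element type layer2_t with a pipe depth of 4 it emits
        # roughly
        #   class Layer2OutPipeID;
        #   using Layer2OutPipe = sycl::ext::intel::experimental::pipe<Layer2OutPipeID, layer2_t, 4>;
        # where the depth is taken from the ('stream', depth) pragma attached in convert().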
super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIStreamVariableDefinition) -class OneAPIInplaceStreamVariableConverter(InplaceStreamVariableConverter): +class OneAPIInplaceStreamVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__( type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceStreamVariableDefinition diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index 929b5a8be..ac6e66c48 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -1,4 +1,5 @@ from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax @@ -35,8 +36,11 @@ dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' -# dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] -dense_include_list = ['nnet_utils/nnet_dense.h'] +dense_task_sequence_template = 'task_sequence> {name};' + +dense_stream_function_template = '{name}.async({w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] class DenseConfigTemplate(LayerConfigTemplate): @@ -68,6 +72,30 @@ def format(self, node): return self.template.format(**params) +class DenseTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class DenseStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + # BatchNormalization templates batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ @@ -148,8 +176,11 @@ def format(self, node): activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' -# activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] -activ_include_list = ['nnet_utils/nnet_activation.h'] +activ_task_sequence_template = 'task_sequence> {name};' +activ_stream_function_template = '{name}.async();' +param_activ_stream_function_template = '{name}.async({param});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] class ActivationConfigTemplate(LayerConfigTemplate): @@ -190,7 +221,7 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() - params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" return 
self.template.format(**params) @@ -204,7 +235,7 @@ def format(self, node): params = self._default_function_params(node) params['activation'] = node._get_act_function_name() params['param'] = node.get_attr('activ_param', 1.0) - params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" return self.template.format(**params) @@ -218,6 +249,50 @@ def format(self, node): params = self._default_function_params(node) params['activation'] = node.get_attr('activation').lower() params['param'] = node.get_weights('alpha').name - params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Activation, ParametrizedActivation, PReLU, HardActivation, Softmax, ParametrizedActivation, PReLU)) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + return self.template.format(**params) + + +class ActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax)) + self.template = activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + +class ParametrizedActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation) + self.template = param_activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['param'] = node.get_attr('activ_param', 1.0) + return self.template.format(**params) + + +class PReLUActivationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(PReLU) + self.template = param_activ_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['param'] = node.get_weights('alpha').name return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py index 71a63585b..665857445 100644 --- a/hls4ml/backends/oneapi/passes/transform_types.py +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -27,10 +27,14 @@ def transform(self, model, node): for out_name, var in node.variables.items(): if io_type == 'io_stream': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') if isinstance(var, InplaceTensorVariable): - new_var = self.inplace_stream_var_converter.convert(var) + new_var = self.inplace_stream_var_converter.convert(var, pragma='stream') else: - new_var = self.stream_var_converter.convert(var) + new_var = self.stream_var_converter.convert(var, pragma='stream') elif io_type == 'io_parallel': if out_name in node.model.inputs: new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index 05b98cda2..05de507dc 100644 --- 
a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -1,7 +1,6 @@ #ifndef DEFINES_H_ #define DEFINES_H_ -#include #include #include #include diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp index 38e18e6ac..06e7d3fe3 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.cpp +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -1,8 +1,14 @@ #include "myproject.h" #include "parameters.h" +#include // hls-fpga-machine-learning insert weights +// The inter-task pipes need to be declared in the global scope +// hls-fpga-machine-learning insert inter-task pipes + +using sycl::ext::intel::experimental::task_sequence; + void MyProject::operator()() const { // **************************************** // NETWORK INSTANTIATION @@ -10,6 +16,8 @@ void MyProject::operator()() const { // hls-fpga-machine-learning read in + // hls-fpga-machine-learning declare task sequences + // hls-fpga-machine-learning insert layers // hls-fpga-machine-learning return diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 8cb1349fd..9989036cb 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -129,9 +129,7 @@ void elu(const typename data_pipe::value_type::value_type alpha) { } } -template void elu() { - elu(data, 1.0, res); -} +template void elu() { elu(1.0); } // ************************************************* // SeLU Activation @@ -452,19 +450,19 @@ template void softmax_argma template void softmax() { switch (CONFIG_T::implementation) { case softmax_implementation::latency: - softmax_latency(data, res); + softmax_latency(); break; case softmax_implementation::stable: - softmax_stable(data, res); + softmax_stable(); break; case softmax_implementation::legacy: - softmax_legacy(data, res); + softmax_legacy(); break; case softmax_implementation::argmax: - softmax_argmax(data, res); + softmax_argmax(); break; default: - softmax_stable(data, res); + softmax_stable(); break; } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h index bb5dac59b..2bedac676 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -63,9 +63,7 @@ void dense_rf_gt(const data_T &data, res_T &res, } } Product1: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; Product2: #pragma unroll @@ -119,9 +117,7 @@ void dense_rf_lt(const data_T &data, res_T &res, acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; } ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; MultLoop: #pragma unroll diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h index a66423cef..cb50e4e4b 100644 --- 
a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_compressed.h @@ -33,9 +33,7 @@ void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], } } ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; CompressedMultLoop: #pragma unroll diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 85b734624..3b7249038 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -13,8 +13,8 @@ void dense_resource(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * C const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { [[intel::fpga_register]] typename res_pipe::value_type res; - [[intel::fpga_register]] auto data = data_pipe::read() - dense_resource(data, res, weights, biases); + [[intel::fpga_register]] auto data = data_pipe::read(); + dense_resource(data, res, weights, biases); res_pipe::write(res); } diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index a31d80b5e..889363e45 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -154,14 +154,23 @@ def write_project_cpp(self, model): elif 'MyProject' in line: newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + # oneAPI pipes need to be declared and passed as template parameters + elif '// hls-fpga-machine-learning insert inter-task pipes' in line: + newline = line + if io_type == 'io_stream': + for layer in model.get_layers(): + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + newline += var.declare_cpp() + # Read in inputs elif '// hls-fpga-machine-learning read in' in line: newline = line if io_type == 'io_parallel': for inp in model_inputs: newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' - else: - raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + # for streaming we don't need to read it in # Insert weights elif '// hls-fpga-machine-learning insert weights' in line: @@ -171,11 +180,18 @@ def write_project_cpp(self, model): if w not in model_brams: newline += f'#include "weights/{w.name}.h"\n' + # Insert task sequences + elif '// hls-fpga-machine-learning declare task sequences' in line: + newline = line + if io_type == 'io_stream': # only need this for io_stream + for layer in model.get_layers(): + ts = layer.get_attr('tast_sequence_cpp') + if ts: + newline += ' ' + ts + '\n' + # Neural net instantiation elif '// hls-fpga-machine-learning insert layers' in line: newline = line + '\n' - model_inputs = model.get_input_variables() - model_outputs = model.get_output_variables() for layer in model.get_layers(): if io_type != 'io_stream': vars = layer.get_variables() @@ -184,7 +200,11 @@ def write_project_cpp(self, model): def_cpp = var.definition_cpp() if def_cpp is not None: newline += ' ' + def_cpp + ';\n' - func = layer.get_attr('function_cpp', None) + func = ( + layer.get_attr('function_cpp') + if io_type == 'io_parallel' + else layer.get_attr('stream_function_cpp') + ) if func: newline += ' ' + func + 
'\n' if model.config.trace_output and layer.get_attr('trace', False): @@ -202,8 +222,7 @@ def write_project_cpp(self, model): if io_type == 'io_parallel': for out in model_outputs: newline += indent + f'{out.pipe_name}::write({out.name});\n' - else: - raise NotImplementedError("Only io_parallel is currently supported with oneAPI") + # don't need to add anything in io_stream # Just copy line else: From 09b95136246fe64d446a4843d3b5f6a311eaac30 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 15 Feb 2024 12:41:41 -0600 Subject: [PATCH 024/100] first version that compiles streaming --- hls4ml/backends/oneapi/oneapi_types.py | 10 +- .../backends/oneapi/passes/core_templates.py | 4 +- .../nnet_utils/nnet_activation_stream.h | 233 ++++++++++-------- .../firmware/nnet_utils/nnet_dense_stream.h | 4 +- hls4ml/writer/oneapi_writer.py | 1 - 5 files changed, 149 insertions(+), 103 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 640ff3c6b..f74e69114 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -11,7 +11,10 @@ class OneAPIArrayVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' class OneAPIInplaceArrayVariableDefinition(VariableDefinition): @@ -71,7 +74,10 @@ def __init__(self, type_converter): class OneAPIInterfaceVariableDefinition(VariableDefinition): def definition_cpp(self, name_suffix='', as_reference=False): - return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' def declare_cpp(self, pipe_min_size=0, indent=''): lines = indent + f'class {self.pipe_id};\n' diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index ac6e66c48..a68600350 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -36,7 +36,7 @@ dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' -dense_task_sequence_template = 'task_sequence> {name};' +dense_task_sequence_template = 'task_sequence> {name};' dense_stream_function_template = '{name}.async({w}, {b});' @@ -176,7 +176,7 @@ def format(self, node): activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' -activ_task_sequence_template = 'task_sequence> {name};' +activ_task_sequence_template = 'task_sequence> {name};' activ_stream_function_template = '{name}.async();' param_activ_stream_function_template = '{name}.async({param});' diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 9989036cb..f9ad60031 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -9,15 +9,16 @@ namespace nnet { // 
************************************************* // Linear Activation // ************************************************* -template void linear() { +template void linear_stream() { LinearActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; LinearPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { out_data[j] = in_data[j]; } @@ -28,15 +29,16 @@ template void linear() { // ************************************************* // ReLU Activation // ************************************************* -template void relu() { +template void relu_stream() { ReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; ReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -52,17 +54,20 @@ template void relu() { // ************************************************* template void leaky_relu(const typename data_pipe::value_type::value_type alpha) { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; LeakyReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; LeakyReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -79,13 +84,14 @@ void leaky_relu(const typename data_pipe::value_type::value_type alpha) { template void thresholded_relu(const typename data_pipe::value_type::value_type theta) { ThresholdedReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; ThresholdedReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > theta) out_data[j] = in_data[j]; else @@ -103,17 +109,20 @@ template void elu(const typename data_pipe::value_type::value_type alpha) { #include "activation_tables/elu_table.tb" - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} 
/ multiplier_limit; EluActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; EluPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; @@ -129,22 +138,25 @@ void elu(const typename data_pipe::value_type::value_type alpha) { } } -template void elu() { elu(1.0); } +template void elu_stream() { + elu_stream(1.0); +} // ************************************************* // SeLU Activation // ************************************************* -template void selu() { +template void selu_stream() { #include "activation_tables/selu_table.tb" SeluActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SeluPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = typename data_pipe::value_type::value_type(1.0507009873554804934193349852946) * datareg; @@ -165,21 +177,24 @@ template void selu() { // ************************************************* template void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in]) { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; PReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; PReLUPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else - out_data[j] = alpha[i * res_pipe::value_type::size + j] * in_data[j]; + out_data[j] = alpha[i * std::tuple_size{} + j] * in_data[j]; } res_pipe::write(out_data); @@ -189,17 +204,18 @@ void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in // ************************************************* // Softplus Activation // ************************************************* -template void softplus() { +template void softplus_stream() { #include "activation_tables/softplus_table.tb" SoftplusActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SoftplusPackLoop: #pragma unroll - for (int j = 0; j < 
res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; if (index < 0) @@ -216,19 +232,20 @@ template void softplus() { // ************************************************* // Softsign Activation // ************************************************* -template void softsign() { +template void softsign_stream() { #include "activation_tables/softsign_table.tb" static const int MAX_VALUE = 8; SoftsignActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SoftsignPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type absValue; ; if (in_data[j] < 0) { @@ -254,44 +271,48 @@ template void softsign() { // Softmax Activation // ************************************************* -template void softmax_stable() { +template void softmax_stable_stream() { #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; - [[intel::fpga_register]] typename data_pipe::value_type::value_type data_array[data_pipe::value_type::size]; + [[intel::fpga_register]] + typename data_pipe::value_type::value_type data_array[std::tuple_size{}]; SoftmaxArrayLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_pack = data_pipe::read(); SoftmaxArrayPackLoop: #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) Op_max op_max; [[intel::fpga_register]] typename data_pipe::value_type::value_type x_max = - reduce{}, Op_max>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation [[intel::fpga_register]] ac_fixed - d_xi_xmax[data_pipe::value_type::size]; + d_xi_xmax[std::tuple_size{}]; #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { d_xi_xmax[j] = data_array[j] - x_max; } // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { exp_res[j] = exp_table[softmax_stable_idx_from_real_val( d_xi_xmax[j])]; } @@ -300,8 +321,8 @@ template void softmax_stabl // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression 
balancing Op_add op_add; [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce>( - exp_res, op_add); + reduce{}, + Op_add>(exp_res, op_add); [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; @@ -309,7 +330,7 @@ template void softmax_stabl SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { // TODO - Find Quartus-equivalent pragma // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -321,23 +342,26 @@ template void softmax_stabl } } -template void softmax_latency() { +template void softmax_latency_stream() { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; SoftmaxExpLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; i < CONFIG_T::n_in / data_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_pack = data_pipe::read(); SoftmaxExpPackLoop: #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val( in_pack[j])]; @@ -356,7 +380,7 @@ template void softmax_laten typename res_pipe::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation out_pack[j] = exp_res[j] * inv_exp_sum; } @@ -365,32 +389,34 @@ template void softmax_laten } } -template void softmax_legacy() { +template void softmax_legacy_stream() { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" // Index into the lookup table based on data for exponentials - [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[data_pipe::value_type::size]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[std::tuple_size{}]; [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; - [[intel::fpga_register]] typename data_pipe::value_type::value_type data_cache[data_pipe::value_type::size]; + [[intel::fpga_register]] + typename data_pipe::value_type::value_type data_cache[std::tuple_size{}]; SoftmaxInitLoop: - [[intel::initiation_interval(1)]] for (unsigned s = 0; s < CONFIG_T::n_in / data_pipe::value_type::size; s++) { + [[intel::initiation_interval(1)]] for (unsigned s = 0; + s < CONFIG_T::n_in / std::tuple_size{}; s++) { auto in_pack = data_pipe::read(); SoftmaxInitPackLoop: #pragma unroll - for (unsigned j = 0; j < data_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { data_cache[j] = in_pack[j]; exp_res[j] = 0; } SoftmaxExpLoop: #pragma unroll - for (int i = 0; i < 
data_pipe::value_type::size; i++) { + for (int i = 0; i < std::tuple_size{}; i++) { SoftmaxExpInner: #pragma unroll - for (int j = 0; j < data_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (i == j) { exp_diff_res = 1; } else { @@ -409,7 +435,7 @@ template void softmax_legac typename res_pipe::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < res_pipe::value_type::size; j++) { + for (unsigned j = 0; j < std::tuple_size{}; j++) { int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); if (exp_res_index < 0) exp_res_index = 0; @@ -422,20 +448,21 @@ template void softmax_legac } } -template void softmax_argmax() { - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { +template void softmax_argmax_stream() { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; #pragma unroll - for (int i = 0; i < res_pipe::value_type::size; i++) { + for (int i = 0; i < std::tuple_size{}; i++) { out_data[i] = static_cast(0); } [[intel::fpga_register]] typename data_pipe::value_type::value_type maximum = in_data[0]; [[intel::fpga_register]] int idx = 0; - [[intel::initiation_interval(1)]] for (int i = 1; i < res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 1; i < std::tuple_size{}; i++) { if (in_data[i] > maximum) { maximum = in_data[i]; idx = i; @@ -447,22 +474,22 @@ template void softmax_argma } } -template void softmax() { +template void softmax_stream() { switch (CONFIG_T::implementation) { case softmax_implementation::latency: - softmax_latency(); + softmax_latency_stream(); break; case softmax_implementation::stable: - softmax_stable(); + softmax_stable_stream(); break; case softmax_implementation::legacy: - softmax_legacy(); + softmax_legacy_stream(); break; case softmax_implementation::argmax: - softmax_argmax(); + softmax_argmax_stream(); break; default: - softmax_stable(); + softmax_stable_stream(); break; } } @@ -470,22 +497,25 @@ template void softmax() { // ************************************************* // TanH Activation // ************************************************* -template void dense_tanh() { +template void dense_tanh_stream() { #include "activation_tables/tanh_table.tb" static const int MAX_VALUE = 4; - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; TanHActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; TanHPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; if (in_data[j] < 0) @@ -512,21 +542,24 @@ template void dense_tanh() // ************************************************* // Sigmoid Activation // ************************************************* -template void sigmoid() { +template 
void sigmoid_stream() { #include "activation_tables/sigmoid_table.tb" static const int MAX_VALUE = 8; - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; SigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; SigmoidPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue; if (in_data[j] < 0) @@ -554,20 +587,23 @@ template void sigmoid() { // Hard sigmoid Activation // ************************************************* // Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations -template void hard_sigmoid() { +template void hard_sigmoid_stream() { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; HardSigmoidPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; if (datareg > 1) datareg = 1; @@ -580,20 +616,23 @@ template void hard_sigmoid( } } -template void hard_tanh() { +template void hard_tanh_stream() { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_pipe::value_type::size, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = data_pipe::value_type::size / multiplier_limit; + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / std::tuple_size{}; + i++) { auto in_data = data_pipe::read(); typename res_pipe::value_type out_data; HardSigmoidPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; if (sigmoid > 1) sigmoid = 1; @@ -609,16 +648,17 @@ template void hard_tanh() { // ************************************************* // Binary TanH Activation // ************************************************* -template void binary_tanh() { +template void binary_tanh_stream() { BinaryTanHActLoop: 
- [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { [[intel::fpga_register]] auto in_data = data_pipe::read(); [[intel::fpga_register]] typename res_pipe::value_type out_data; BinaryTanHPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 0) out_data[j] = static_cast(1); else @@ -632,16 +672,17 @@ template void binary_tanh() // ************************************************* // Ternary TanH Activation // ************************************************* -template void ternary_tanh() { +template void ternary_tanh_stream() { TernaryTanHActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / res_pipe::value_type::size; i++) { + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; + i++) { [[intel::fpga_register]] auto in_data = data_pipe::read(); [[intel::fpga_register]] typename res_pipe::value_type out_data; TernaryTanHPackLoop: #pragma unroll - for (int j = 0; j < res_pipe::value_type::size; j++) { + for (int j = 0; j < std::tuple_size{}; j++) { if (in_data[j] > 1) out_data[j] = static_cast(1); else if (in_data[j] <= -1) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 3b7249038..0572e1810 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -9,8 +9,8 @@ namespace nnet { // Note: DataPack logic removed, at least in the initial version template -void dense_resource(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { +void dense_resource_stream(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { [[intel::fpga_register]] typename res_pipe::value_type res; [[intel::fpga_register]] auto data = data_pipe::read(); diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index 889363e45..91dbaef76 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -214,7 +214,6 @@ def write_project_cpp(self, model): var.type.name, var.name, layer.name, var.size_cpp() ) newline += '#endif\n' - newline += '\n' # Write the output elif '// hls-fpga-machine-learning return' in line: From 0e3f9ba655c80a624c5222372bdc74099f7f5ed6 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 16 Feb 2024 10:51:04 -0600 Subject: [PATCH 025/100] change how the pipe value type is extracted --- .../nnet_utils/nnet_activation_stream.h | 287 ++++++++++-------- .../firmware/nnet_utils/nnet_dense_stream.h | 5 +- .../oneapi/firmware/nnet_utils/nnet_types.h | 12 + 3 files changed, 168 insertions(+), 136 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index f9ad60031..a4f3c6072 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -11,14 +11,14 @@ namespace nnet { // ************************************************* template void linear_stream() { LinearActLoop: - 
[[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; LinearPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { out_data[j] = in_data[j]; } @@ -31,14 +31,14 @@ template void linear_stream // ************************************************* template void relu_stream() { ReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; ReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -53,21 +53,22 @@ template void relu_stream() // Leaky RELU Activation // ************************************************* template -void leaky_relu(const typename data_pipe::value_type::value_type alpha) { +void leaky_relu(const typename ExtractPipeType::value_type::value_type alpha) { constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; LeakyReLUActLoop: [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; LeakyReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else @@ -82,16 +83,16 @@ void leaky_relu(const typename data_pipe::value_type::value_type alpha) { // Thresholded RELU Activation // ************************************************* template -void thresholded_relu(const typename data_pipe::value_type::value_type theta) { +void thresholded_relu(const typename ExtractPipeType::value_type::value_type theta) { ThresholdedReLUActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; ThresholdedReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > theta) out_data[j] = in_data[j]; else @@ -106,24 +107,25 @@ void thresholded_relu(const typename data_pipe::value_type::value_type theta) { // ELU Activation // ************************************************* template -void elu(const typename data_pipe::value_type::value_type alpha) { +void elu(const typename ExtractPipeType::value_type::value_type alpha) { #include 
"activation_tables/elu_table.tb" constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; EluActLoop: [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; EluPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { - [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { out_data[j] = datareg; } else { @@ -149,17 +151,18 @@ template void selu_stream() #include "activation_tables/selu_table.tb" SeluActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; SeluPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { - [[intel::fpga_register]] typename data_pipe::value_type::value_type datareg = in_data[j]; + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; if (datareg >= 0) { - out_data[j] = typename data_pipe::value_type::value_type(1.0507009873554804934193349852946) * datareg; + out_data[j] = + typename ExtractPipeType::value_type::value_type(1.0507009873554804934193349852946) * datareg; } else { int index = (datareg * CONFIG_T::table_size / -8).to_int(); if (index > CONFIG_T::table_size - 1) @@ -176,25 +179,26 @@ template void selu_stream() // PReLU Activation // ************************************************* template -void prelu(const typename data_pipe::value_type::value_type alpha[CONFIG_T::n_in]) { +void prelu(const typename ExtractPipeType::value_type::value_type alpha[CONFIG_T::n_in]) { constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; PReLUActLoop: [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; PReLUPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (in_data[j] > 0) out_data[j] = in_data[j]; else - out_data[j] = alpha[i * std::tuple_size{} + j] * in_data[j]; + out_data[j] = alpha[i * std::tuple_size::value_type>{} + j] * in_data[j]; } res_pipe::write(out_data); @@ -208,14 +212,14 @@ template void softplus_stre #include "activation_tables/softplus_table.tb" 
SoftplusActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; SoftplusPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; if (index < 0) @@ -238,15 +242,15 @@ template void softsign_stre static const int MAX_VALUE = 8; SoftsignActLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; SoftsignPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { - [[intel::fpga_register]] typename data_pipe::value_type::value_type absValue; + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absValue; ; if (in_data[j] < 0) { absValue = -in_data[j]; @@ -257,9 +261,10 @@ template void softsign_stre if (absValue > MAX_VALUE) index = CONFIG_T::table_size - 1; if (in_data[j] < 0) { - out_data[j] = static_cast(-softsign_table[index]); + out_data[j] = + static_cast::value_type::value_type>(-softsign_table[index]); } else { - out_data[j] = static_cast(softsign_table[index]); + out_data[j] = static_cast::value_type::value_type>(softsign_table[index]); } } @@ -276,61 +281,65 @@ template void softmax_stabl #include "activation_tables/invert_table.tb" constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - [[intel::fpga_register]] - typename data_pipe::value_type::value_type data_array[std::tuple_size{}]; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_array[std::tuple_size::value_type>{}]; SoftmaxArrayLoop: [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_pack = data_pipe::read(); SoftmaxArrayPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { data_array[j] = in_pack[j]; } // Find the max and compute all delta(x_i, x_max) - Op_max op_max; - [[intel::fpga_register]] typename data_pipe::value_type::value_type x_max = - reduce{}, - Op_max>(data_array, op_max); + Op_max::value_type::value_type> op_max; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = + reduce::value_type::value_type, + std::tuple_size::value_type>{}, + Op_max::value_type::value_type>>(data_array, op_max); // For the diffs, use the same type as the input but force rounding and saturation - [[intel::fpga_register]] ac_fixed - d_xi_xmax[std::tuple_size{}]; + [[intel::fpga_register]] 
ac_fixed::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> + d_xi_xmax[std::tuple_size::value_type>{}]; #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { d_xi_xmax[j] = data_array[j] - x_max; } // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { - exp_res[j] = exp_table[softmax_stable_idx_from_real_val( - d_xi_xmax[j])]; + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = + exp_table[softmax_stable_idx_from_real_val::value_type::value_type, + CONFIG_T>(d_xi_xmax[j])]; } // Explicitly sum the results with an adder tree. // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing Op_add op_add; [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce{}, + reduce::value_type>{}, Op_add>(exp_res, op_add); [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; - typename res_pipe::value_type out_pack; + typename ExtractPipeType::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { // TODO - Find Quartus-equivalent pragma // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation @@ -347,24 +356,25 @@ template void softmax_laten #include "activation_tables/invert_table_latency.tb" constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; // Calculate all the e^x's - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; SoftmaxExpLoop: [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / std::tuple_size{}; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; i++) { auto in_pack = data_pipe::read(); SoftmaxExpPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { - exp_res[j] = - exp_table_latency[softmax_latency_idx_from_real_val( - in_pack[j])]; + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val< + typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])]; } // Explicitly sum the results with an adder tree. 
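The std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} expressions used throughout this hunk depend on the small ExtractPipeType trait that this same patch adds to nnet_utils/nnet_types.h (the 12-line change in the diffstat): it recovers the packed array type carried by a SYCL pipe, so the pack width no longer has to come from data_pipe::value_type::size. A minimal sketch of such a trait, assuming the sycl::ext::intel::experimental::pipe<Id, DataT, MinCapacity, Properties> template signature (the exact signature and the real definition belong to nnet_types.h, not to this file):

#include <cstdint>
#include <sycl/sycl.hpp> // assumed entry point for the experimental pipe extension

namespace nnet {

// Fallback: a non-pipe type is treated as its own value_type.
template <class T> struct ExtractPipeType {
    typedef T value_type;
};

// Partial specialization that peels the carried data type out of a pipe.
// The four-parameter pipe signature here is an assumption for illustration only.
template <class Id, class DataT, int32_t MinCapacity, class Properties>
struct ExtractPipeType<sycl::ext::intel::experimental::pipe<Id, DataT, MinCapacity, Properties>> {
    typedef DataT value_type;
};

} // namespace nnet

With such a trait in scope, std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{} evaluates to the number of elements packed per pipe transaction, which is what the loop bounds in this file iterate over.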
@@ -377,10 +387,10 @@ template void softmax_laten [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; - typename res_pipe::value_type out_pack; + typename ExtractPipeType::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation out_pack[j] = exp_res[j] * inv_exp_sum; } @@ -394,29 +404,32 @@ template void softmax_legac #include "activation_tables/invert_table_legacy.tb" // Index into the lookup table based on data for exponentials - [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[std::tuple_size{}]; - [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; [[intel::fpga_register]] - typename data_pipe::value_type::value_type data_cache[std::tuple_size{}]; + typename CONFIG_T::table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_cache[std::tuple_size::value_type>{}]; SoftmaxInitLoop: [[intel::initiation_interval(1)]] for (unsigned s = 0; - s < CONFIG_T::n_in / std::tuple_size{}; s++) { + s < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + s++) { auto in_pack = data_pipe::read(); SoftmaxInitPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { data_cache[j] = in_pack[j]; exp_res[j] = 0; } SoftmaxExpLoop: #pragma unroll - for (int i = 0; i < std::tuple_size{}; i++) { + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { SoftmaxExpInner: #pragma unroll - for (int j = 0; j < std::tuple_size{}; j++) { + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { if (i == j) { exp_diff_res = 1; } else { @@ -432,16 +445,17 @@ template void softmax_legac } } - typename res_pipe::value_type out_pack; + typename ExtractPipeType::value_type out_pack; SoftmaxInvPackLoop: #pragma unroll - for (unsigned j = 0; j < std::tuple_size{}; j++) { + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); if (exp_res_index < 0) exp_res_index = 0; if (exp_res_index > CONFIG_T::table_size - 1) exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = static_cast(invert_table_legacy[exp_res_index]); + out_pack[j] = + static_cast::value_type::value_type>(invert_table_legacy[exp_res_index]); } res_pipe::write(out_pack); @@ -449,27 +463,28 @@ template void softmax_legac } template void softmax_argmax_stream() { - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; - i++) { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { auto in_data = data_pipe::read(); - typename res_pipe::value_type out_data; + typename ExtractPipeType::value_type out_data; #pragma unroll - for (int i = 0; i < std::tuple_size{}; i++) { - out_data[i] = static_cast(0); + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { + out_data[i] = static_cast::value_type::value_type>(0); } - [[intel::fpga_register]] typename data_pipe::value_type::value_type maximum = in_data[0]; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type maximum = in_data[0]; [[intel::fpga_register]] int idx = 0; - 
-        [[intel::initiation_interval(1)]] for (int i = 1; i < std::tuple_size<typename data_pipe::value_type>{}; i++) {
+        [[intel::initiation_interval(1)]] for (int i = 1;
+                                               i < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
             if (in_data[i] > maximum) {
                 maximum = in_data[i];
                 idx = i;
             }
         }
 
-        out_data[idx] = static_cast<typename res_pipe::value_type::value_type>(1);
+        out_data[idx] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
 
         res_pipe::write(out_data);
     }
 }
@@ -502,21 +517,22 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void dense_tanh_st
     static const int MAX_VALUE = 4;
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 TanHActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 TanHPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
-            [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue;
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
+            [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type absoluteValue;
 
             if (in_data[j] < 0)
                 absoluteValue = (-1) * in_data[j];
@@ -547,20 +563,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void sigmoid_strea
     static const int MAX_VALUE = 8;
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 SigmoidActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 SigmoidPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
-            [[intel::fpga_register]] typename data_pipe::value_type::value_type absoluteValue;
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
+            [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type absoluteValue;
 
             if (in_data[j] < 0)
                 absoluteValue = (-1) * in_data[j];
@@ -590,20 +607,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void sigmoid_strea
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_sigmoid_stream() {
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 HardSigmoidActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 HardSigmoidPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift;
             if (datareg > 1)
                 datareg = 1;
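The tanh, sigmoid, and hard-sigmoid streams above all derive their initiation interval from the pack width and CONFIG_T::reuse_factor. A minimal sketch of that arithmetic, assuming DIV_ROUNDUP rounds up as (n + d - 1) / d and using an illustrative pack width of 16 with reuse_factor 4:

#include <cstdio>

// Stand-in for the DIV_ROUNDUP macro; numbers below are illustrative only.
constexpr unsigned div_roundup(unsigned n, unsigned d) { return (n + d - 1) / d; }

int main() {
    constexpr unsigned pack_width = 16;   // std::tuple_size of the pipe's array payload
    constexpr unsigned reuse_factor = 4;  // CONFIG_T::reuse_factor
    constexpr unsigned multiplier_limit = div_roundup(pack_width, reuse_factor); // 4 multipliers shared per pack
    constexpr unsigned pipeline = pack_width / multiplier_limit;                 // 4, i.e. initiation_interval(4)
    std::printf("multiplier_limit=%u pipeline=%u\n", multiplier_limit, pipeline);
    return 0;
}

A larger reuse_factor lowers multiplier_limit and raises the initiation interval, trading throughput for fewer multipliers.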
@@ -619,20 +637,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_sigmoid_
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_tanh_stream() {
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename data_pipe::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename data_pipe::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
 
 HardSigmoidActLoop:
     [[intel::initiation_interval(pipeline)]] for (int i = 0;
-                                                  i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
+                                                  i < CONFIG_T::n_in /
+                                                      std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
                                                   i++) {
         auto in_data = data_pipe::read();
-        typename res_pipe::value_type out_data;
+        typename ExtractPipeType<res_pipe>::value_type out_data;
 
 HardSigmoidPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift;
             if (sigmoid > 1)
                 sigmoid = 1;
@@ -650,19 +669,19 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void hard_tanh_str
 // *************************************************
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void binary_tanh_stream() {
 BinaryTanHActLoop:
-    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
-                                           i++) {
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
 
         [[intel::fpga_register]] auto in_data = data_pipe::read();
-        [[intel::fpga_register]] typename res_pipe::value_type out_data;
+        [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;
 
 BinaryTanHPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             if (in_data[j] > 0)
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
             else
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(-1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
         }
 
         res_pipe::write(out_data);
@@ -674,21 +693,21 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void binary_tanh_s
 // *************************************************
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void ternary_tanh_stream() {
 TernaryTanHActLoop:
-    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename data_pipe::value_type>{};
-                                           i++) {
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
 
         [[intel::fpga_register]] auto in_data = data_pipe::read();
-        [[intel::fpga_register]] typename res_pipe::value_type out_data;
+        [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;
 
 TernaryTanHPackLoop:
 #pragma unroll
-        for (int j = 0; j < std::tuple_size<typename res_pipe::value_type>{}; j++) {
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
             if (in_data[j] > 1)
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
             else if (in_data[j] <= -1)
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(-1);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
             else
-                out_data[j] = static_cast<typename res_pipe::value_type::value_type>(0);
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(0);
         }
 
         res_pipe::write(out_data);
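The binary and ternary tanh streams above reduce each pack element to {-1, 1} or {-1, 0, 1} before casting to the output pipe's element type. A scalar sketch of the same thresholding on plain floats (illustrative only; the streamed versions operate on ac_fixed pack elements):

#include <cstdio>

// Scalar stand-ins for the per-element decisions made inside the packed loops.
int binary_tanh(float x) { return x > 0 ? 1 : -1; }

int ternary_tanh(float x) {
    if (x > 1)
        return 1;
    if (x <= -1)
        return -1;
    return 0;
}

int main() {
    std::printf("%d %d %d\n", ternary_tanh(1.5f), ternary_tanh(-0.25f), ternary_tanh(-3.0f)); // prints: 1 0 -1
    return 0;
}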
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
index 0572e1810..53987a02d 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
@@ -12,9 +12,10 @@ template <class data_pipe, class res_pipe, typename CONFIG_T>
 void dense_resource_stream(const typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
                            const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
 
-    [[intel::fpga_register]] typename res_pipe::value_type res;
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res;
     [[intel::fpga_register]] auto data = data_pipe::read();
-    dense_resource<typename data_pipe::value_type, typename res_pipe::value_type, CONFIG_T>(data, res, weights, biases);
+    dense_resource<typename ExtractPipeType<data_pipe>::value_type, typename ExtractPipeType<res_pipe>::value_type,
+                   CONFIG_T>(data, res, weights, biases);
     res_pipe::write(res);
 }
 
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
index cd572f0c7..8e48121c1 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
@@ -8,8 +8,20 @@
 
 namespace nnet {
 
+// Define the pipe type that we use
 template <class T, std::size_t N> using array = std::array<T, N>;
 
+// This is a helper to extract the value_type of a pipe
+template <class T> struct ExtractPipeType { typedef T value_type; };
+
+template