From 45185379af2a2aa80883ec99375b6760f7813478 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 25 Oct 2024 02:43:54 -0400 Subject: [PATCH] Beginnings of the oneAPI backend (#955) * snapshot adding oneapi * fix reduce constexpr * further updates * update the bridge and testbench * fix issues discovered when compiling * update bridge writing files * build library (but not tested) * fix a bug in testbench * snapshot after some debugging * remove forgotten debug printing * add build * pre-commit fixes * fix more pre-commit * fix more pre-commit errors * snapshot of work before reworking types * Use using to decide array type, some preliminary updates * snapshot unifying types * fix the testbench and bridge * snapshot updating nnet_utils (not finished) * define array in nnet_types for oneAPI * fix parallel conv2d * add back the streaming versions of algs, most unconverted * tentatively complete streaming for dense but not functional * first version that compiles streaming * change how the pipe value type is extracted * fix pre-commit error * always treat elu as ELU class * fix batchnorm * snapshot towards fixing conv * snapshot fixing test for streaming * fix conv1d * fix conv2d * fix reshape and flatten for oneAPI * initial oneAPI tests * remove nnet_dense_compressed from oneAPI * add merge functionality (untested) * fix merge for oneAPI * fix merge for oneAPI (missing commit) * add zeropadding * standardize parallelization spelling * fix pointwise for oneAPI * remove references to quartus * more replace quartus with oneapi * snapshot on the way towards implementing pooling * fix io_stream pooling for oneAPI * add fix for Conv2DBatchnorm * accidentally committed CMakeLists.txt in my debug setup * reshaping, not fully tested * fix cloning of streams * fix pytest library loading * remove unused template * fix some activation bugs * fix the overwriting of directories in the pytest * update version of test repository * try to fix docker issue * bump hls4ml-testing tag to 0.5.2 * try not restricting tensorflow-model-optimization * Update to 0.5.3 for testing * bump to docker image 0.5.4, suggested by Ben * fix pre-commit warning * dial down N_TESTS_PER_YAML to 4 * revert tensorflow-model-optimization change * fix issue of saving in "obsolete" h5 format * fix embedding for oneAPI * First attempt at adding RNNs to oneAPI * fix bug in array size * fix order of indices * make queues static in bridge * fix logic error in repack stream * changing the style, but functionally identical * update pointwise optimizer for oneAPI * add oneAPI to test_multi_dense.py * fix updating weight types * initial changes of templates, for testing * fix weight naming, product selection * make im2col the default; fix winograd size * fix up streaming dense and convolution * fix prelu, some batchnorm * fix weight array of exponential types * move ACExponentialPrecisionDefinition to oneapi_types * attempt to fix batchnorm and recurrent * fixed BatchNormalizationQuantizedTanhConfigTemplate template selection * fix embedding_stream * fix lstm and simple rnn * fix GRU * fix winograd, and also disable it by default * fix threshold name * split bn_quant to be backend-specific * add type inference to oneAPI * add oneAPI to pytorch tests * fix pooling with padding for oneAPI and Quartus * Compilation for larger models enabled by increasing -fconstexpr-steps * add oneapi clone tests; remove redundant multi_clone test * remove some attributes to avoid overwrite warnings * make extra handling for oneAPI like others (as in PR
#1067) * remove warnings for extra optimizers that are not scheduled on purpose * update parametrized activations * fix reference to alpha that had not been switched to param * add oneapi documentation * add parallelization factor to the attributes for oneAPI --------- Co-authored-by: Lauri Laatu Co-authored-by: Jan-Frederik Schulte --- docs/advanced/oneapi.rst | 35 + docs/index.rst | 1 + hls4ml/backends/__init__.py | 2 + .../{fpga => catapult}/passes/bn_quant.py | 0 hls4ml/backends/oneapi/__init__.py | 0 hls4ml/backends/oneapi/oneapi_backend.py | 376 +++++++ hls4ml/backends/oneapi/oneapi_template.py | 61 ++ hls4ml/backends/oneapi/oneapi_types.py | 267 +++++ hls4ml/backends/oneapi/passes/__init__.py | 0 hls4ml/backends/oneapi/passes/bn_quant.py | 222 ++++ .../backends/oneapi/passes/clone_templates.py | 32 + .../oneapi/passes/convolution_templates.py | 235 +++++ .../oneapi/passes/convolution_winograd.py | 179 ++++ .../backends/oneapi/passes/core_templates.py | 351 +++++++ .../oneapi/passes/embedding_templates.py | 32 + .../backends/oneapi/passes/merge_templates.py | 137 +++ hls4ml/backends/oneapi/passes/pointwise.py | 156 +++ .../oneapi/passes/pooling_templates.py | 153 +++ .../oneapi/passes/quantization_templates.py | 63 ++ .../oneapi/passes/recurrent_templates.py | 369 +++++++ .../oneapi/passes/reshaping_templates.py | 244 +++++ .../oneapi/passes/resource_strategy.py | 77 ++ .../backends/oneapi/passes/transform_types.py | 60 ++ hls4ml/backends/quartus/passes/bn_quant.py | 169 +++ .../quartus/passes/convolution_templates.py | 4 +- hls4ml/backends/template.py | 21 + hls4ml/backends/vivado/passes/bn_quant.py | 169 +++ hls4ml/converters/keras/core.py | 4 + hls4ml/model/layers.py | 2 +- hls4ml/model/optimizer/passes/stamp.py | 8 +- .../objectives/vivado_objectives.py | 4 +- hls4ml/templates/oneapi/CMakeLists.txt | 338 ++++++ hls4ml/templates/oneapi/exception_handler.hpp | 21 + hls4ml/templates/oneapi/firmware/defines.h | 20 + .../templates/oneapi/firmware/myproject.cpp | 24 + hls4ml/templates/oneapi/firmware/myproject.h | 29 + .../firmware/nnet_utils/nnet_activation.h | 499 +++++++++ .../nnet_utils/nnet_activation_stream.h | 712 +++++++++++++ .../firmware/nnet_utils/nnet_batchnorm.h | 104 ++ .../nnet_utils/nnet_batchnorm_stream.h | 107 ++ .../oneapi/firmware/nnet_utils/nnet_common.h | 76 ++ .../oneapi/firmware/nnet_utils/nnet_conv1d.h | 61 ++ .../nnet_utils/nnet_conv1d_resource.h | 237 +++++ .../firmware/nnet_utils/nnet_conv1d_stream.h | 177 ++++ .../oneapi/firmware/nnet_utils/nnet_conv2d.h | 67 ++ .../nnet_utils/nnet_conv2d_resource.h | 297 ++++++ .../firmware/nnet_utils/nnet_conv2d_stream.h | 241 +++++ .../oneapi/firmware/nnet_utils/nnet_dense.h | 164 +++ .../firmware/nnet_utils/nnet_dense_stream.h | 23 + .../oneapi/firmware/nnet_utils/nnet_embed.h | 43 + .../firmware/nnet_utils/nnet_embed_stream.h | 31 + .../oneapi/firmware/nnet_utils/nnet_helpers.h | 118 +++ .../oneapi/firmware/nnet_utils/nnet_merge.h | 232 +++++ .../firmware/nnet_utils/nnet_merge_stream.h | 359 +++++++ .../oneapi/firmware/nnet_utils/nnet_mult.h | 113 ++ .../oneapi/firmware/nnet_utils/nnet_padding.h | 104 ++ .../firmware/nnet_utils/nnet_padding_stream.h | 81 ++ .../oneapi/firmware/nnet_utils/nnet_pooling.h | 257 +++++ .../firmware/nnet_utils/nnet_pooling_stream.h | 322 ++++++ .../oneapi/firmware/nnet_utils/nnet_printf.h | 18 + .../firmware/nnet_utils/nnet_recurrent.h | 566 ++++++++++ .../nnet_utils/nnet_recurrent_activation.h | 47 + .../nnet_utils/nnet_recurrent_stream.h | 68 ++ .../oneapi/firmware/nnet_utils/nnet_resize.h 
| 36 + .../firmware/nnet_utils/nnet_resize_stream.h | 58 ++ .../oneapi/firmware/nnet_utils/nnet_stream.h | 126 +++ .../firmware/nnet_utils/nnet_transpose.h | 48 + .../nnet_utils/nnet_transpose_stream.h | 39 + .../oneapi/firmware/nnet_utils/nnet_types.h | 71 ++ hls4ml/templates/oneapi/firmware/parameters.h | 11 + hls4ml/templates/oneapi/myproject_bridge.cpp | 77 ++ hls4ml/templates/oneapi/myproject_test.cpp | 133 +++ .../quartus/firmware/nnet_utils/nnet_conv1d.h | 2 +- .../nnet_utils/nnet_conv1d_resource.h | 12 +- .../quartus/firmware/nnet_utils/nnet_conv2d.h | 2 +- .../nnet_utils/nnet_conv2d_resource.h | 18 +- .../firmware/nnet_utils/nnet_pooling.h | 34 +- .../quartus/firmware/nnet_utils/nnet_stream.h | 1 + hls4ml/utils/fixed_point_utils.py | 11 +- hls4ml/writer/__init__.py | 2 + hls4ml/writer/oneapi_writer.py | 969 ++++++++++++++++++ test/pytest/test_activations.py | 2 +- test/pytest/test_batchnorm.py | 4 +- test/pytest/test_conv1d.py | 4 + test/pytest/test_embed.py | 4 +- test/pytest/test_globalpooling.py | 4 +- test/pytest/test_keras_api.py | 12 +- test/pytest/test_merge.py | 10 +- test/pytest/test_multi_dense.py | 1 + test/pytest/test_pointwiseconv.py | 8 +- test/pytest/test_pooling.py | 70 +- test/pytest/test_pytorch_api.py | 27 +- test/pytest/test_qkeras.py | 22 +- test/pytest/test_repack_stream.py | 23 +- test/pytest/test_reshape.py | 4 +- test/pytest/test_rnn.py | 39 +- test/pytest/test_stream_clone.py | 4 +- test/pytest/test_stream_multi_clone.py | 48 - test/pytest/test_transpose_concat.py | 4 +- test/pytest/test_upsampling.py | 2 +- test/pytest/test_zeropadding.py | 2 +- 101 files changed, 10764 insertions(+), 169 deletions(-) create mode 100644 docs/advanced/oneapi.rst rename hls4ml/backends/{fpga => catapult}/passes/bn_quant.py (100%) create mode 100644 hls4ml/backends/oneapi/__init__.py create mode 100644 hls4ml/backends/oneapi/oneapi_backend.py create mode 100644 hls4ml/backends/oneapi/oneapi_template.py create mode 100644 hls4ml/backends/oneapi/oneapi_types.py create mode 100644 hls4ml/backends/oneapi/passes/__init__.py create mode 100644 hls4ml/backends/oneapi/passes/bn_quant.py create mode 100644 hls4ml/backends/oneapi/passes/clone_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_templates.py create mode 100644 hls4ml/backends/oneapi/passes/convolution_winograd.py create mode 100644 hls4ml/backends/oneapi/passes/core_templates.py create mode 100644 hls4ml/backends/oneapi/passes/embedding_templates.py create mode 100644 hls4ml/backends/oneapi/passes/merge_templates.py create mode 100644 hls4ml/backends/oneapi/passes/pointwise.py create mode 100644 hls4ml/backends/oneapi/passes/pooling_templates.py create mode 100644 hls4ml/backends/oneapi/passes/quantization_templates.py create mode 100644 hls4ml/backends/oneapi/passes/recurrent_templates.py create mode 100644 hls4ml/backends/oneapi/passes/reshaping_templates.py create mode 100644 hls4ml/backends/oneapi/passes/resource_strategy.py create mode 100644 hls4ml/backends/oneapi/passes/transform_types.py create mode 100644 hls4ml/backends/quartus/passes/bn_quant.py create mode 100644 hls4ml/backends/vivado/passes/bn_quant.py create mode 100644 hls4ml/templates/oneapi/CMakeLists.txt create mode 100644 hls4ml/templates/oneapi/exception_handler.hpp create mode 100644 hls4ml/templates/oneapi/firmware/defines.h create mode 100644 hls4ml/templates/oneapi/firmware/myproject.cpp create mode 100644 hls4ml/templates/oneapi/firmware/myproject.h create mode 100644 
hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h create mode 100644 hls4ml/templates/oneapi/firmware/parameters.h create mode 100644 hls4ml/templates/oneapi/myproject_bridge.cpp create mode 100644 hls4ml/templates/oneapi/myproject_test.cpp create mode 100644 hls4ml/writer/oneapi_writer.py delete mode 100644 test/pytest/test_stream_multi_clone.py diff --git a/docs/advanced/oneapi.rst b/docs/advanced/oneapi.rst new file mode 100644 index 000000000..ae0e0bc56 --- /dev/null +++ b/docs/advanced/oneapi.rst @@ -0,0 +1,35 @@ +============== +oneAPI Backend +============== + +The ``oneAPI`` backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually +replace the ``Quartus`` backend, which should really have been called the Intel HLS backend. (The actual Quartus +program continues to be used with IP produced by the ``oneAPI`` backend.) +This section discusses details of the ``oneAPI`` backend. 
+ +The ``oneAPI`` code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the +accelerator style of programming. In the IP Component flow, which is currently the only flow supported, the +kernel becomes the IP, and the "host code" becomes the testbench. An accelerator flow, with easier deployment on +PCIe accelerator boards, is planned to be added in the future. + +The produced work areas use cmake to build the projects in a style based on the +`oneAPI-samples `_. +The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` make targets are supported. Additionally, ``make lib`` +produces the library used for calling the ``predict`` function from hls4ml. The ``compile`` and ``build`` commands +in hls4ml interact with the cmake system, so one does not need to use the build system manually, but it is there +if desired. + +The ``oneAPI`` backend, like the ``Quartus`` backend, only implements the ``Resource`` strategy for the layers. There +is no ``Latency`` implementation of any of the layers. + +Note: currently tracing and external weights (i.e. setting BramFactor) are not supported. + +io_parallel and io_stream +========================= + +As mentioned in the :ref:`I/O Types` section, ``io_parallel`` is for small models, while ``io_stream`` is for +larger models. In ``oneAPI``, there is an additional difference: ``io_stream`` implements each layer in its +own ``task_sequence``. Thus, the layers run in parallel, with pipes connecting the inputs and outputs. This +is similar in style to the ``dataflow`` implementation in Vitis, but more explicit. On the other hand, ``io_parallel`` +always uses a single task, relying on pipelining within the task for good performance. In contrast, the Vitis +backend sometimes uses dataflow with ``io_parallel``.
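For orientation, here is a minimal sketch of driving the new backend from Python, assuming the standard hls4ml conversion API; the toy Keras model, input shape, and output directory are placeholders rather than part of this patch::

    import numpy as np
    import hls4ml
    from tensorflow import keras

    # Placeholder model; any supported architecture follows the same flow
    model = keras.Sequential([keras.layers.Dense(8, activation='relu', input_shape=(16,))])

    config = hls4ml.utils.config_from_keras_model(model, granularity='model')
    hls_model = hls4ml.converters.convert_from_keras_model(
        model,
        hls_config=config,
        backend='oneAPI',       # select the backend added by this patch
        io_type='io_parallel',  # or 'io_stream' for larger models
        part='Arria10',         # the default part in create_initial_config
        output_dir='my-oneapi-prj',
    )

    # compile() runs `cmake ..` and `make lib` in the work area and loads the
    # resulting shared library so that predict() can call the compiled model
    hls_model.compile()
    y = hls_model.predict(np.random.rand(100, 16).astype(np.float32))

    # build() drives the other cmake targets; run=True also executes the
    # produced binary for the emulation and simulation flows
    hls_model.build(build_type='fpga_emu', run=True)

The ``fpga_emu`` target is the quickest functional check; ``report`` and ``fpga`` invoke the much longer hardware flows.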
diff --git a/docs/index.rst b/docs/index.rst index c21b90aeb..07fcd217d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ advanced/fifo_depth advanced/extension + advanced/oneapi advanced/accelerator advanced/model_optimization diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 8b3117af7..4a48f072c 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,5 +1,6 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 +from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend @@ -16,3 +17,4 @@ register_backend('Quartus', QuartusBackend) register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) +register_backend('oneAPI', OneAPIBackend) diff --git a/hls4ml/backends/fpga/passes/bn_quant.py b/hls4ml/backends/catapult/passes/bn_quant.py similarity index 100% rename from hls4ml/backends/fpga/passes/bn_quant.py rename to hls4ml/backends/catapult/passes/bn_quant.py diff --git a/hls4ml/backends/oneapi/__init__.py b/hls4ml/backends/oneapi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py new file mode 100644 index 000000000..c85a8c0e9 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -0,0 +1,376 @@ +import subprocess +from pathlib import Path +from warnings import warn + +import numpy as np + +from hls4ml.backends import FPGABackend +from hls4ml.model.attributes import ConfigurableAttribute, TypeAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax +from hls4ml.model.optimizer import get_backend_passes, layer_optimizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType + +# from hls4ml.report import parse_oneapi_report + + +class OneAPIBackend(FPGABackend): + def __init__(self): + super().__init__('oneAPI') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + # Add RNN-specific recurrent_reuse_factor attribute + rnn_layers = [ + SimpleRNN, + LSTM, + GRU, + ] + + for layer in rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('table_size', default=1024)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + # Add ParallelizationFactor to Conv1D/2D + pf_layers = [ + Conv1D, + Conv2D, + ] + + for layer in pf_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + self.attribute_map[layer] = attrs + + def _register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + streaming_passes = ['oneapi:clone_output'] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) + + oneapi_types = [ + 'oneapi:transform_types', + 
'oneapi:register_bram_weights', + 'oneapi:apply_resource_strategy', + 'oneapi:apply_winograd_kernel_transformation', + ] + oneapi_types_flow = register_flow('specific_types', oneapi_types, requires=[init_flow], backend=self.name) + + quantization_passes = [ + 'oneapi:merge_batch_norm_quantized_tanh', + 'oneapi:quantize_dense_output', + 'fuse_consecutive_batch_normalization', + 'oneapi:xnor_pooling', + 'oneapi:generate_conv_im2col', + ] + quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) + + optimization_passes = [ + 'oneapi:remove_final_reshape', + 'oneapi:optimize_pointwise_conv', + 'oneapi:inplace_parallel_reshape', + 'oneapi:skip_softmax', + 'oneapi:fix_softmax_table_size', + 'infer_precision_types', + ] + optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) + + templates = self._get_layer_templates() + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'oneapi:write_hls'] + + self._writer_flow = register_flow('write', writer_passes, requires=['oneapi:ip'], backend=self.name) + + all_passes = get_backend_passes(self.name) + + extras = [ + # Ideally this should be empty + opt_pass + for opt_pass in all_passes + if opt_pass + not in initializers + + streaming_passes + + oneapi_types + + quantization_passes + + templates + + optimization_passes + + writer_passes + + ['oneapi:inplace_stream_flatten', 'oneapi:reshape_stream'] # not needed + + ['oneapi:process_fixed_point_quantizer_layer'] # not yet supported + ] + + if len(extras) > 0: + for opt in extras: + warn(f'WARNING: Optimizer "{opt}" is not part of any flow and will not be executed.') + + ip_flow_requirements = [ + 'optimize', + init_flow, + streaming_flow, + quantization_flow, + optimization_flow, + oneapi_types_flow, + template_flow, + ] + ip_flow_requirements = list(filter(None, ip_flow_requirements)) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel'): + config = {} + + config['Part'] = part if part is not None else 'Arria10' + config['ClockPeriod'] = clock_period + config['IOType'] = io_type + config['HLSConfig'] = {} + + return config + + def compile(self, model): + """Compile the generated project that can be linked into Python runtime. + + Args: + model (ModelGraph): Model to compile. + + Raises: + Exception: If the project failed to compile + + Returns: + string: Returns the name of the compiled library. + """ + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run('make lib', shell=True, cwd=builddir, check=True) + + lib_name = builddir / f'lib{model.config.get_project_name()}-{model.config.get_config_value("Stamp")}.so' + return lib_name + + def build(self, model, build_type='fpga_emu', run=False): + """ + Builds the project using Intel DPC++ (oneAPI) compiler. 
+ + Args: + model (ModelGraph): The model to build + build_type, optional: What to build (e.g. fpga_emu, fpga_sim, fpga, report) + run, optional: Whether to run the testbench + Errors raise exceptions + """ + + # Check software needed is present + outdir = Path(Path.cwd(), model.config.get_output_dir()) + builddir = outdir / 'build' + builddir.mkdir(exist_ok=True) + try: + subprocess.run('which icpx', shell=True, cwd=builddir, check=True) + except subprocess.CalledProcessError: + raise RuntimeError('Could not find icpx. Please configure oneAPI appropriately') + subprocess.run('cmake ..', shell=True, cwd=builddir, check=True) + subprocess.run(f'make {build_type}', shell=True, cwd=builddir, check=True) + + if run and build_type in ('fpga_emu', 'fpga_sim', 'fpga'): + executable = builddir / f'{model.config.get_project_name()}.{build_type}' + subprocess.run(f'{str(executable)}', shell=True, cwd=builddir, check=True) + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + target_cycles = layer.model.config.get_target_cycles(layer) + layer.set_attr('target_cycles', target_cycles) + + @layer_optimizer(Dense) + def init_dense(self, layer): + index_t = IntegerPrecisionType(width=1, signed=False) + + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + if layer.model.config.get_compression(layer): + layer.set_attr('strategy', 'compressed') + else: + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + + if layer.model.config.is_resource_strategy(layer): + if layer.model.config.get_compression(layer): + index_t = layer.get_weights('weight').type.index_precision + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + + @layer_optimizer(Activation) + def init_activation(self, layer): + if layer.get_attr('activation') == 'tanh': + layer.set_attr('activation', 'dense_tanh') + if layer.get_attr('recurrent_activation') == 'tanh': + layer.set_attr('recurrent_activation', 'dense_tanh') + + @layer_optimizer(Softmax) + def init_softmax(self, layer): + if layer.model.config.get_config_value('IOType') == 'io_parallel': + assert ( + len(layer.get_input_variable().shape) == 1 + ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + + @layer_optimizer(Embedding) + def init_embed(self, layer): + if layer.attributes['n_in'] is None: + raise Exception('Input length of Embedding layer must be specified.') + + @layer_optimizer(GRU) + def init_gru(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # Dense multiplication properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + index_t = IntegerPrecisionType(width=1, signed=False) + layer.set_attr('index_t', index_t) + + if 'table_t' not in layer.attributes: + layer.set_attr( + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=18, integer=8)) + ) + if 'table_size' not in layer.attributes: + layer.set_attr('table_size', 1024) + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + layer.set_attr('index_t', index_t) + + @layer_optimizer(Conv1D) + def init_conv1d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv1D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width determines the filter size post-Winograd transformation + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'im2col')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(Conv2D) + def init_conv2d(self, layer): + # This can happen if we assign weights of Dense layer to 1x1 Conv2D + if len(layer.weights['weight'].data.shape) == 2: + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + # Dense matrix multiply properties + layer.set_attr('rfpad', 0) + layer.set_attr('bfpad', 0) + + # Reuse and parallelization factors + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('parallelization', layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1)) + + # impl_filt_width & impl_filt_height determine the filter size post-Winograd transformation + layer.set_attr('impl_filt_height', layer.get_attr('filt_height')) + layer.set_attr('impl_filt_width', layer.get_attr('filt_width')) + + # Implementation: + # - combination - at compile-time, the decision between Winograd and im2col is made + # - im2col - specifically use im2col + # - Winograd - use Winograd, if possible + layer.set_attr('implementation', layer.model.config.get_layer_config_value(layer, 'Implementation', 'im2col')) + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Not used yet as there is no codegen implementation of CNNs for oneAPI backend + + @layer_optimizer(LSTM) + def init_lstm(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # We don't use RF yet + if True: # layer.model.config.is_resource_strategy(layer): ... 
oneAPI only supports Dense resource multiplication + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + + # Split weights for easier storage in on-chip memory and implementation in HLS + weights_data = layer.weights['weight'].data + rec_weights_data = layer.weights['recurrent_weight'].data + bias_data = layer.weights['bias'].data + + weight_types = ['i', 'f', 'c', 'o'] + for i in range(0, 4): + layer.add_weights_variable( + name=f'weight_{weight_types[i]}', + var_name=f'kernel_{weight_types[i]}_{{index}}', + data=weights_data[ + 0 : layer.get_attr('n_in'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'recurrent_weight_{weight_types[i]}', + var_name=f'recurrent_kernel_{weight_types[i]}_{{index}}', + data=rec_weights_data[ + 0 : layer.get_attr('n_out'), i * layer.get_attr('n_out') : (i + 1) * layer.get_attr('n_out') + ], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + layer.add_weights_variable( + name=f'bias_{weight_types[i]}', + var_name=f'bias_{weight_types[i]}_{{index}}', + data=bias_data[i * layer.get_attr('n_out') : (i + 1) * (layer.get_attr('n_out'))], + quantizer=layer.get_attr('weight_quantizer'), + compression=None, + ) + + @layer_optimizer(SimpleRNN) + def init_simple_rnn(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + # TODO - Consider setting and using RF diff --git a/hls4ml/backends/oneapi/oneapi_template.py b/hls4ml/backends/oneapi/oneapi_template.py new file mode 100644 index 000000000..c86b8f7ea --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_template.py @@ -0,0 +1,61 @@ +''' +This package includes oneAPI-specific templates +''' + +from hls4ml.backends.template import Template + + +class StreamFunctionCallTemplate(Template): + """Base class for the streaming function call templates in oneAPI: provides the 'stream_function_cpp' attribute. + This generally provides the async call to the task sequence that executes the streaming function. + + Note: the include header files are specified in the regular FunctionCallTemplate, not here. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_stream_function_template' + super().__init__(name, layer_class, 'stream_function_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['name'] = layer.name + return params + + def transform(self, model, node): + return super().transform(model, node) + + +class TaskSequenceTemplate(Template): + """Base class for the task sequence definition in oneAPI: provides the 'task_sequence_cpp' attribute. + This defines the task sequence that is then called by the StreamFunctionCallTemplate. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. 
+ """ + + def __init__(self, layer_class): + if isinstance(layer_class, (list, tuple, set)): + name = '_'.join([cls.__name__.lower() for cls in layer_class]) + else: + name = layer_class.__name__.lower() + name += '_task_sequence_template' + super().__init__(name, layer_class, 'tast_sequence_cpp') + + def _default_function_params(self, layer): + params = self._default_params(layer) + params['name'] = layer.name + params['config'] = f'config{layer.index}' + params['input_pipe'] = layer.get_input_variable().pipe_name + params['output_pipe'] = layer.get_output_variable().pipe_name + + return params + + def transform(self, model, node): + return super().transform(model, node) diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py new file mode 100644 index 000000000..3106e1e10 --- /dev/null +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -0,0 +1,267 @@ +''' +This package includes oneAPI-specific customizations to the variable types +''' + +import numpy as np + +from hls4ml.backends.fpga.fpga_types import ( + ACFixedPrecisionDefinition, + ACIntegerPrecisionDefinition, + FixedPrecisionConverter, + HLSTypeConverter, + NamedTypeConverter, + PrecisionDefinition, + TypeDefinition, + TypePrecisionConverter, + VariableDefinition, +) +from hls4ml.model.types import ( + CompressedType, + ExponentPrecisionType, + ExponentType, + FixedPrecisionType, + IntegerPrecisionType, + NamedType, + PackedType, + XnorPrecisionType, +) +from hls4ml.utils.fixed_point_utils import next_pow2 +from hls4ml.utils.string_utils import convert_to_pascal_case + + +class ACExponentPrecisionDefinition(PrecisionDefinition): + def definition_cpp(self): + typestring = f'std::pair, ac_int<{self.width}, true>>' + return typestring + + +class OneAPIACTypeConverter(FixedPrecisionConverter): + def __init__(self): + super().__init__( + type_map={ + FixedPrecisionType: ACFixedPrecisionDefinition, + IntegerPrecisionType: ACIntegerPrecisionDefinition, + ExponentPrecisionType: ACExponentPrecisionDefinition, + XnorPrecisionType: ACIntegerPrecisionDefinition, + }, + prefix='AC', + ) + + +class OneAPICompressedTypeConverter(TypeDefinition, TypePrecisionConverter): + """Use a tuple for storing a compressed type for oneAPI since it's better supported. 
(Currently unused)""" + + def definition_cpp(self): + """tuple format is row_index, col_index, weight""" + cpp_fmt = 'typedef std::tuple<{index}, {index}, {precision}> {name};\n' + return cpp_fmt.format(name=self.name, index=self.index_precision, precision=self.precision.definition_cpp()) + + def convert_precision(self, precision_converter): + super().convert_precision(precision_converter) + self.index_precision = precision_converter.convert(self.index_precision) + + +class OneAPIExponentTypeConverter(TypeDefinition, TypePrecisionConverter): + """Use a pair for storing a exponent type for oneAPI since it's better supported""" + + def definition_cpp(self): + cpp_fmt = 'typedef std::pair<{sign}, {precision}> {name};\n' + return cpp_fmt.format(name=self.name, precision=self.precision.definition_cpp(), sign=self.sign.definition_cpp()) + + def convert_precision(self, precision_converter): + super().convert_precision(precision_converter) + self.sign = precision_converter.convert(self.sign) + + +class OneAPIPackedTypeConverter(TypeDefinition, TypePrecisionConverter): + def definition_cpp(self): + n_elem_expr = '/' if self.unpack else '*' + return 'typedef nnet::array<{precision}, {n_elem}> {name};\n'.format( + name=self.name, + precision=self.precision.definition_cpp(), + n_elem=str(self.n_elem) + n_elem_expr + str(self.n_pack), + ) + + def convert_precision(self, precision_converter): + self.precision = precision_converter.convert(self.precision) + + +class OneAPIHLSTypeConverter(HLSTypeConverter): + def __init__(self, precision_converter): + self.precision_converter = precision_converter + self.type_map = { + NamedType: NamedTypeConverter, + CompressedType: OneAPICompressedTypeConverter, + ExponentType: OneAPIExponentTypeConverter, + PackedType: OneAPIPackedTypeConverter, + } + + +# region ArrayVarable + + +class OneAPIArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' + + +class OneAPIInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class AggregratedArrayVariableConverter: + """This is a bit of an extension of the standard ArrayVariableConverter""" + + def __init__(self, type_converter, prefix, definition_cls): + self.type_converter = type_converter + self.prefix = prefix + self.definition_cls = definition_cls + + def convert(self, tensor_var, pragma='', depth=0, n_pack=1): + if isinstance(tensor_var, self.definition_cls): # Already converted + return tensor_var + + tensor_var.pragma = pragma + if pragma == 'stream': + if depth == 0: + depth = np.prod(tensor_var.shape) // tensor_var.shape[-1] + tensor_var.pragma = ('stream', depth) + n_elem = tensor_var.shape[-1] + else: + tensor_var.pragma = pragma + n_elem = tensor_var.size() + n_pack = 1 # ignore any passed value + + tensor_var.type = self.type_converter.convert( + PackedType(tensor_var.type.name, tensor_var.type.precision, n_elem, n_pack) + ) + + # pipe_name and pipe_id are only used for io_stream and interface variables in io_parallel + tensor_var.pipe_name = f'{convert_to_pascal_case(tensor_var.name)}Pipe' + tensor_var.pipe_id = f'{convert_to_pascal_case(tensor_var.name)}PipeID' + + tensor_var.__class__ = type(self.prefix + 'AggregateArrayVariable', (type(tensor_var), self.definition_cls), {}) + return 
tensor_var + + +class OneAPIArrayVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIArrayVariableDefinition) + + +class OneAPIInplaceArrayVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceArrayVariableDefinition) + + +# endregion + +# region InterfaceMemberVariable + + +class OneAPIInterfaceVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' + + def declare_cpp(self, pipe_min_size=0, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' + ) + return lines + + +class OneAPIInterfaceVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInterfaceVariableDefinition) + + +# endregion + + +# region StreamVariable +class OneAPIStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=True): + return f'{self.name}{name_suffix}' + + def declare_cpp(self, indent=''): + lines = indent + f'class {self.pipe_id};\n' + lines += indent + ( + f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' + + f'{self.type.name}, {self.pragma[-1]}>;\n' + ) + return lines + + +class OneAPIInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'using {self.name} = {self.input_var.name}' + + +class OneAPIStreamVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIStreamVariableDefinition) + + +class OneAPIInplaceStreamVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIInplaceStreamVariableDefinition + ) + + +# region WeightsVariable + + +class OneAPIStaticWeightVariableDefinition(VariableDefinition): + def definition_cpp(self, reuse_factor): + """Write the appropriate weight definiiton""" + # first determine whether to store in register or bram (heuristic) + if reuse_factor == 1 or self.data_length < 2048 or self.type.precision.width < 3: + attribute = '[[intel::fpga_register]]' + else: + # revisit this heuristic + nbanks = int(2 ** np.ceil(np.log2(self.data_length)) / 2) + var_width = int(np.ceil(self.type.precision.width / 8)) + bwidth = next_pow2(var_width) + attribute = ( + f'[[intel::bankwidth({bwidth}), intel::numbanks({nbanks}), ' + 'intel::max_replicates(1), intel::fpga_memory("BLOCK_RAM")]]' + ) + if self.storage == 'register': + return f'{attribute} static constexpr {self.type.name} {self.name}' + else: + return f'{attribute} {self.type.name} {self.name}' + + +class OneAPIStaticWeightVariableConverter: + def __init__(self, type_converter): + self.type_converter = type_converter + + def convert(self, weight_var): + if isinstance(weight_var, 
OneAPIStaticWeightVariableDefinition): # Already converted + return weight_var + + weight_var.weight_class = weight_var.__class__.__name__ + weight_var.storage = 'register' + weight_var.type = self.type_converter.convert( + PackedType(weight_var.name + '_t', weight_var.type.precision, weight_var.data_length, 1) + ) + + weight_var.__class__ = type( + 'OneAPIStaticWeightVariable', (type(weight_var), OneAPIStaticWeightVariableDefinition), {} + ) + return weight_var + + +# endregion + +# endregion diff --git a/hls4ml/backends/oneapi/passes/__init__.py b/hls4ml/backends/oneapi/passes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/hls4ml/backends/oneapi/passes/bn_quant.py b/hls4ml/backends/oneapi/passes/bn_quant.py new file mode 100644 index 000000000..8425d5da1 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/bn_quant.py @@ -0,0 +1,222 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_binary_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {threshold_t.name} threshold_t; +}};\n""" + +batchnorm_quantized_tanh_ternary_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {threshold_hi_t.name} threshold_hi_t; + typedef {threshold_lo_t.name} threshold_lo_t; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {output_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + +batchnorm_quantized_tanh_task_sequence_template = ( + 'task_sequence<nnet::normalize_{quantize}_tanh_stream<{input_pipe}, {output_pipe}, {config}>> {name};' +) + +batchnorm_quantized_tanh_stream_function_template = '{name}.async({threshold});' + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = (batchnorm_quantized_tanh_binary_config_template, batchnorm_quantized_tanh_ternary_config_template) + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + if node.get_attr('quantize') == 2: + return self.template[0].format(**params) + else: + return self.template[1].format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + backend.register_template(BatchNormalizationQuantizedTanhTaskSequenceTemplate)
+ backend.register_template(BatchNormalizationQuantizedTanhStreamFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. 
Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/backends/oneapi/passes/clone_templates.py b/hls4ml/backends/oneapi/passes/clone_templates.py new file mode 100644 index 000000000..447ae126e --- /dev/null +++ b/hls4ml/backends/oneapi/passes/clone_templates.py @@ -0,0 +1,32 @@ +""" The clone templates in the fpga backend are not enough for oneAPI, so this adds the missing parts +""" + +from hls4ml.backends.fpga.passes.clone import Clone +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate + +clone_stream_function_template = '{name}.async();' + + +class CloneTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Clone) + + def format(self, node): + params = self._default_function_params(node) + for i in range(len(node.outputs)): + params[f'output{i + 1}_pipe'] = node.variables[node.outputs[i]].pipe_name + + output_pipes = ', '.join([f'{{output{i + 1}_pipe}}' for i in range(len(node.outputs))]) + + template = f'task_sequence<nnet::clone_stream<{{input_pipe}}, {output_pipes}, {{config}}>> {{name}};' + return template.format(**params) + + +class CloneStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(Clone) + self.template = clone_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py new file mode 100644 index 000000000..17154559d --- /dev/null +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -0,0 +1,235 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm + +# TODO - Dilation rate ?
+ +''' Shared mutliplication config ''' +conv_mult_config_template = """struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + +''' 1D Conv ''' +conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned filt_width = {filt_width}; + static const unsigned impl_filt_width = {impl_filt_width}; + static const unsigned kernel_size = filt_width; + + static const unsigned n_filt = {n_filt}; + static const unsigned out_width = {out_width}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + + static const unsigned reuse_factor = {reuse}; + static const unsigned parallelization_factor = {parallelization}; + static const bool store_weights_in_bram = false; + + static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation}; + + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; +}}; +""" + +conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +conv1d_task_sequence_template = ( + 'task_sequence> {name};' +) + +conv_stream_function_template = '{name}.async({w}, {b});' + +conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] + + +class Conv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Conv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + conv_params = self._default_config_params(node) + conv_params['dilation'] = node.get_attr('dilation', 1) + if conv_params['dilation'] != 1: + raise RuntimeError('dilation != 1 not supported yet') + conv_params['config_t'] = f'config{node.index}_mult' + conv_config = self.template.format(**conv_params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Conv1D, include_header=conv1d_include_list) + self.template = conv1d_function_template + + def format(self, node): + params = 
self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class Conv1DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(Conv1D)
+        self.template = conv1d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
+
+
+class ConvStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__((Conv1D, Conv2D, Conv2DBatchnorm))
+        self.template = conv_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+''' 2D Conv '''
+conv2d_config_template = """struct config{index} : nnet::conv2d_config {{
+    static const unsigned in_height = {in_height};
+    static const unsigned in_width = {in_width};
+    static const unsigned n_chan = {n_chan};
+
+    static const unsigned out_height = {out_height};
+    static const unsigned out_width = {out_width};
+
+    static const unsigned n_filt = {n_filt};
+    static const unsigned filt_height = {filt_height};
+    static const unsigned filt_width = {filt_width};
+    static const unsigned impl_filt_height = {impl_filt_height};
+    static const unsigned impl_filt_width = {impl_filt_width};
+    static const unsigned kernel_size = filt_height * filt_width;
+
+    static const unsigned pad_top = {pad_top};
+    static const unsigned pad_bottom = {pad_bottom};
+    static const unsigned pad_left = {pad_left};
+    static const unsigned pad_right = {pad_right};
+    static const unsigned stride_height = {stride_height};
+    static const unsigned stride_width = {stride_width};
+
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned parallelization_factor = {parallelization};
+    static const bool store_weights_in_bram = false;
+
+    static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation};
+
+    typedef {accum_t.name} accum_t;
+    typedef {bias_t.name} bias_t;
+    typedef {weight_t.name} weight_t;
+    typedef {config_t} mult_config;
+}};\n"""
+
+conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+
+conv2d_task_sequence_template = (
+    'task_sequence<nnet::conv_2d_{data_format}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+)
+
+conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h']
+
+
+class Conv2DConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm))
+        self.template = conv2d_config_template
+        self.mult_template = conv_mult_config_template
+
+    def format(self, node):
+        conv_params = self._default_config_params(node)
+        conv_params['dilation'] = node.get_attr('dilation', 1)
+        if conv_params['dilation'] != 1:
+            raise RuntimeError('dilation != 1 not supported yet')
+        conv_params['config_t'] = f'config{node.index}_mult'
+        conv_config = self.template.format(**conv_params)
+
+        mult_params = self._default_config_params(node)
+        mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width')
+        mult_params['n_out'] = node.get_attr('n_filt')
+        mult_params['product_type'] = get_backend('oneAPI').product_type(
+            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
+        )
+        mult_config = self.mult_template.format(**mult_params)
+
+        return mult_config + '\n' + conv_config
+
+
+class Conv2DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm), include_header=conv2d_include_list)
+        self.template = conv2d_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class Conv2DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__((Conv2D, Conv2DBatchnorm))
+        self.template = conv2d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/convolution_winograd.py b/hls4ml/backends/oneapi/passes/convolution_winograd.py
new file mode 100644
index 000000000..fdab408b3
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/convolution_winograd.py
@@ -0,0 +1,179 @@
+import math
+
+import numpy as np
+
+from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class ApplyWinogradKernelTransformation(OptimizerPass):
+    '''
+    Transforms the weights of a Conv2D kernel to a format suitable for Winograd convolution
+    For further information, refer to Lavin & Gray, 2015 - Fast Algorithms for Convolutional Neural Networks
+    '''
+
+    def match(self, node):
+        node_matches = isinstance(node, (Conv1D, Conv2D))
+
+        # This optimizer works only after the Resource Strategy Optimizer, since the order of transposition matters
+        weights_transformed = node.get_attr('_weights_transposed', False) is True
+
+        # User opted for Winograd
+        implementation_is_winograd = (
+            node.get_attr('implementation', 'combination') == 'combination'
+            or node.get_attr('implementation', 'combination') == 'winograd'
+        )
+
+        parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel'
+
+        # Winograd algorithm-specific conditions
+        if isinstance(node, Conv1D):
+            # Winograd only applies to specific kernel sizes
+            # Current implementation only supports fs = 3; easily extendable to other filter sizes
+            filter_size_matches = node.get_attr('filt_width', 3) == 3
+
+            # Winograd's minimal filtering algorithm doesn't work with stride != 1
+            stride_is_one = node.get_attr('stride_width', 1) == 1
+
+            # HLS Compiler fails to pipeline the entire component if the Winograd loop only executes once
+            loop_itr_gt_one = node.get_attr('out_width') > 2
+
+            winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type
+
+        elif isinstance(node, Conv2D):
+            # Winograd only applies to specific kernel sizes
+            # Current implementation only supports fs = 3; easily extendable to other filter sizes
+            filter_size_matches = node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3
+
+            # Winograd's minimal filtering algorithm doesn't work with stride != 1
+            stride_is_one = node.get_attr('stride_height', 1) == 1 and node.get_attr('stride_width', 1) == 1
+
+            # HLS Compiler fails to pipeline the entire component if the Winograd loop only executes once
+            loop_itr_gt_one = node.get_attr('out_height') > 2 and node.get_attr('out_width') > 2
+
+            padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr(
+                'pad_left', 0
+            ) == node.get_attr('pad_right', 0)
+
+            winograd_conditions = (
+                filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type
+            )
+
+        else:
+            winograd_conditions = False
+
+        # Check any previous transformations
+        already_transformed = node.get_attr('_winograd_transformation_applied', False) is True
+
+        if not winograd_conditions and node.get_attr('implementation', 'combination') == 'winograd':
+            raise RuntimeError(
+                'Not possible to use Winograd algorithm with current architecture. '
+                'Please set implementation to im2col or combination'
+            )
+
+        return (
+            node_matches
+            and weights_transformed
+            and winograd_conditions
+            and not already_transformed
+            and implementation_is_winograd
+        )
+
+    def transform(self, model, node):
+        if isinstance(node, Conv1D):
+            if node.get_attr('filt_width', 3) == 3:
+                # First, transpose to a format suitable for the Winograd algorithm (F, C, W)
+                # Note, this assumes a format post-resource strategy optimizer, that is (F, W, C)
+                # Therefore, (F, W, C) => (F, C, W)
+                node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 2, 1])
+
+                # Temporary copy of data
+                weights = node.weights['weight'].data
+
+                # Expand weight dimensionality (3) => (4)
+                node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4))
+
+                # Transformation matrices for 3x1 kernels
+                G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
+
+                # Transformation Gf
+                for filter in range(0, weights.shape[0]):
+                    for channel in range(0, weights.shape[1]):
+                        node.weights['weight'].data[filter][channel] = np.matmul(G, weights[filter][channel])
+                node.weights['weight'].data_length = node.weights['weight'].data.size
+                # need to always be consistent
+                node.weights['weight'].type.n_elem = node.weights['weight'].data_length
+
+                # Winograd's minimal filtering algorithm transforms the weight matrix
+                # This transformation consists of addition and division (by 2 and 4) of the weight matrix
+                # Therefore, increase precision (if needed), to accommodate the new weights
+                # This error is only noticeable for low precisions, such as those used with QKeras
+
+                # Integer precision is only updated if it exceeds the one defined in the hls4ml config
+                maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max()))
+                if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer:
+                    node.weights['weight'].type.precision.width += (
+                        maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer
+                    )
+                    node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1
+
+                # Fractional precision is increased by 2 bits (division by 4),
+                # for low-precision (less than 8) fractional weights
+                if node.weights['weight'].type.precision.fractional < 8:
+                    node.weights['weight'].type.precision.width += 2
+
+                # Modified kernel size
+                node.set_attr('impl_filt_width', 4)
+
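+        # For intuition (illustrative arithmetic, not executed): for a 3-tap kernel
+        # g = [g0, g1, g2], the product G @ g above gives the four transformed taps
+        #     [g0, (g0 + g1 + g2) / 2, (g0 - g1 + g2) / 2, g2]
+        # e.g. g = [1, 2, 3] maps to [1, 3, 1, 3]. The divisions by 2 (and by 4 in the
+        # 2D case below, where G @ g @ G.T is applied) motivate the precision widening.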
+        elif isinstance(node, Conv2D):
+            if node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3:
+                # First, transpose to a format suitable for the Winograd algorithm (F, C, H, W)
+                # Note, this assumes a format post-resource strategy optimizer, that is (F, H, W, C)
+                # Therefore, (F, H, W, C) => (F, C, H, W)
+                node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 3, 1, 2])
+
+                # Temporary copy of data
+                weights = node.weights['weight'].data
+
+                # Expand weight dimensionality (3x3) => (4x4)
+                node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4, 4))
+
+                # Transformation matrices for 3x3 kernels
+                G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
+                GT = np.array([[1, 0.5, 0.5, 0], [0, 0.5, -0.5, 0], [0, 0.5, 0.5, 1]])
+
+                # Transformation GfG'
+                for filter in range(0, weights.shape[0]):
+                    for channel in range(0, weights.shape[1]):
+                        node.weights['weight'].data[filter][channel] = np.matmul(np.matmul(G, weights[filter][channel]), GT)
+                node.weights['weight'].data_length = node.weights['weight'].data.size
+                # need to always be consistent
+                node.weights['weight'].type.n_elem = node.weights['weight'].data_length
+
+                # Winograd's minimal filtering algorithm transforms the weight matrix
+                # This transformation consists of addition and division (by 2 and 4) of the weight matrix
+                # Therefore, increase precision (if needed), to accommodate the new weights
+                # This error is only noticeable for low precisions, such as those used with QKeras
+
+                # Integer precision is only updated if it exceeds the one defined in the hls4ml config
+                maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max()))
+                if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer:
+                    node.weights['weight'].type.precision.width += (
+                        maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer
+                    )
+                    node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1
+
+                # Fractional precision is increased by 2 bits (division by 4),
+                # for low-precision (less than 8) fractional weights
+                if node.weights['weight'].type.precision.fractional < 8:
+                    node.weights['weight'].type.precision.width += 2
+
+                # Modified kernel size
+                node.set_attr('impl_filt_height', 4)
+                node.set_attr('impl_filt_width', 4)
+        else:
+            raise Exception(f'Unexpected layer {node.class_name} with Winograd kernel optimizer')
+
+        node.set_attr('_winograd_transformation_applied', True)
+
+        return False
diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
new file mode 100644
index 000000000..5ccf1a521
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -0,0 +1,351 @@
+from hls4ml.backends.backend import get_backend
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
+
+# Dense templates
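+# Worked example for the resource-sharing constants in the config below (illustrative
+# numbers): with n_in = 16, n_out = 8 and reuse_factor = 4,
+#     block_factor      = DIV_ROUNDUP(16 * 8, 4) = 32  (weight block handled per reuse step)
+#     multiplier_factor = MIN(16, 4)             = 4
+#     multiplier_limit  = DIV_ROUNDUP(16 * 8, 4) = 32  (physical multipliers instantiated)
+#     multiplier_scale  = 32 / 8                 = 4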
+dense_config_template = """struct config{index} : nnet::dense_config {{
+    static constexpr unsigned n_in = {n_in};
+    static constexpr unsigned n_out = {n_out};
+    static constexpr unsigned io_type = nnet::{iotype};
+    static constexpr unsigned n_zeros = {nzeros};
+    static constexpr unsigned n_nonzeros = {nonzeros};
+    static constexpr bool store_weights_in_bram = false;
+
+    static constexpr unsigned rf_pad = {rfpad};
+    static constexpr unsigned bf_pad = {bfpad};
+
+    static constexpr unsigned reuse_factor = {reuse};
+    static constexpr unsigned compressed_block_factor = DIV_ROUNDUP(n_nonzeros, reuse_factor);
+    static constexpr unsigned reuse_factor_rounded = reuse_factor + rf_pad;
+    static constexpr unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor);
+    static constexpr unsigned block_factor_rounded = block_factor + bf_pad;
+    static constexpr unsigned multiplier_factor = MIN(n_in, reuse_factor);
+    static constexpr unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor);
+    static constexpr unsigned multiplier_scale = multiplier_limit/n_out;
+
+    typedef {accum_t.name} accum_t;
+    typedef {bias_t.name} bias_t;
+    typedef {weight_t.name} weight_t;
+    typedef {index_t.name} index_t;
+
+    template <class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
+}};\n"""
+
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+dense_task_sequence_template = 'task_sequence<nnet::dense_resource_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+dense_stream_function_template = '{name}.async({w}, {b});'
+dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
+
+
+class DenseConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__(Dense)
+        self.template = dense_config_template
+
+    def format(self, node):
+        params = self._default_config_params(node)
+        params['nzeros'] = node.get_weights('weight').nzeros
+        params['nonzeros'] = node.get_weights('weight').nonzeros
+        params['product_type'] = get_backend('oneAPI').product_type(
+            node.get_input_variable().type.precision, node.get_weights('weight').type.precision
+        )
+
+        return self.template.format(**params)
+
+
+class DenseFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Dense, include_header=dense_include_list)
+        self.template = dense_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class DenseTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(Dense)
+        self.template = dense_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+
+        return self.template.format(**params)
+
+
+class DenseStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Dense)
+        self.template = dense_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+# BatchNormalization templates
+
+batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{
+    static constexpr unsigned n_in = {n_in};
+    static constexpr unsigned n_filt = {n_filt};
+    static constexpr unsigned io_type = nnet::{iotype};
+    static constexpr unsigned reuse_factor = {reuse};
+    static constexpr bool store_weights_in_bram = false;
+    typedef {bias_t.name} bias_t;
+    typedef {scale_t.name} scale_t;
+    template <class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
+}};\n"""
+
+batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'
+batchnorm_task_sequence_template = 'task_sequence<nnet::normalize_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+batchnorm_stream_function_template = '{name}.async({scale}, {bias});'
+batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h']
+
+
+class BatchNormalizationConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        
super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class BatchNormalizationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class BatchNormalizationStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +param_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; + typedef {param_t.name} param_t; +}};\n""" + +hard_activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr {slope_t.name} slope = {slope}; + static constexpr {shift_t.name} shift = {shift}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; +}};\n""" + +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static constexpr unsigned n_in = {n_in}; + static constexpr unsigned table_size = {table_size}; + static constexpr unsigned io_type = nnet::{iotype}; + static constexpr unsigned reuse_factor = {reuse}; + static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {exp_table_t.name} exp_table_t; + typedef {inv_table_t.name} inv_table_t; +}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' +param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' + +activ_task_sequence_template = 'task_sequence> {name};' +activ_stream_function_template = '{name}.async();' +param_activ_stream_function_template = '{name}.async({param});' + +activ_include_list = 
['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Activation) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class ParamActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ParametrizedActivation, PReLU)) + self.template = param_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class SoftmaxConfigTemplate(ActivationConfigTemplate): + def __init__(self): + super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + self.template = softmax_config_template + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['param'] = node.get_attr('activ_param', 1.0) + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class PReLUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PReLU, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['param'] = node.get_weights('param').name + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + + return self.template.format(**params) + + +class ActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax, PReLU)) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = f"{node.get_attr('activation')}_config{node.index}" + return self.template.format(**params) + + +class ParametrizedActivationTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(ParametrizedActivation) + self.template = activ_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['config'] = 
f"{node.get_attr('activation')}_config{node.index}"
+        return self.template.format(**params)
+
+
+class ActivationStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__((Activation, HardActivation, Softmax))
+        self.template = activ_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        return self.template.format(**params)
+
+
+class ParametrizedActivationStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(ParametrizedActivation)
+        self.template = param_activ_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['param'] = node.get_attr('activ_param', 1.0)
+        return self.template.format(**params)
+
+
+class PReLUActivationStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(PReLU)
+        self.template = param_activ_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['param'] = node.get_weights('param').name
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/embedding_templates.py b/hls4ml/backends/oneapi/passes/embedding_templates.py
new file mode 100644
index 000000000..6fda678f0
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/embedding_templates.py
@@ -0,0 +1,32 @@
+"""
+These are the oneAPI stream templates for embedding layers. The io_parallel ones are in backends/fpga/passes/embedding.py.
+"""
+
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.model.layers import Embedding
+
+embed_task_sequence_template = 'task_sequence<nnet::embedding_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+embed_stream_function_template = '{name}.async({e});'
+
+
+class EmbeddingTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(Embedding)
+        self.template = embed_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+
+        return self.template.format(**params)
+
+
+class EmbeddingStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Embedding)
+        self.template = embed_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['e'] = node.get_weights('embeddings').name
+
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/merge_templates.py b/hls4ml/backends/oneapi/passes/merge_templates.py
new file mode 100644
index 000000000..c38e1e055
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/merge_templates.py
@@ -0,0 +1,137 @@
+from hls4ml.backends.backend import get_backend
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import Concatenate, Dot, Merge
+
+# TODO - Very similar to vivado/merge_templates.py - only difference is on line 67:
+# TODO - get_backend('vivado').product_type(inp1.type.precision, inp2.type.precision)
+# TODO - Look into ways of sharing similar passes across many backends in a shared folder through inheritance and overriding.
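+# A possible shape for that sharing (hypothetical sketch, not part of this PR): keep the
+# format() logic in a common base class and let each backend override only the backend name
+# used for product selection, e.g.
+#
+#     class DotConfigTemplateBase(LayerConfigTemplate):
+#         backend_name = None  # e.g. 'oneAPI' or 'Vivado', set by the backend's subclass
+#
+#         def format(self, node):
+#             inp1 = node.get_input_variable(node.inputs[0])
+#             inp2 = node.get_input_variable(node.inputs[1])
+#             params = self._default_config_params(node)
+#             params['n_out'] = 1
+#             params['n_in'] = inp1.shape[0]
+#             params['product_type'] = get_backend(self.backend_name).product_type(
+#                 inp1.type.precision, inp2.type.precision
+#             )
+#             return self.template.format(**params)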
+ +# Merge templates +merge_config_template = """struct config{index} : nnet::merge_config {{ + static const unsigned n_elem = {n_elem}; +}};\n""" + +merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' + +merge_task_sequence_template = ( + 'task_sequence> {name};' +) + +merge_stream_function_template = '{name}.async();' + +merge_include_list = ['nnet_utils/nnet_merge.h', 'nnet_utils/nnet_merge_stream.h'] + + +class MergeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Merge) + self.template = merge_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_elem'] = node.get_input_variable(node.inputs[0]).size_cpp() + + return self.template.format(**params) + + +class MergeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot), include_header=merge_include_list) + self.template = merge_function_template + + def format(self, node): + params = self._default_function_params(node) + params['merge'] = node.get_attr('op').lower() + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + + return self.template.format(**params) + + +class MergeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot)) + self.template = merge_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['merge'] = node.get_attr('op').lower() + params['input1_pipe'] = node.get_input_variable(node.inputs[0]).pipe_name + params['input2_pipe'] = node.get_input_variable(node.inputs[1]).pipe_name + return self.template.format(**params) + + +class MergeStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot)) + self.template = merge_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +# Dot templates +dot_config_template = """struct config{index} : nnet::dot_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned reuse_factor = {reuse}; + + typedef {accum_t.name} accum_t; + + template + using product = nnet::product::{product_type}; +}};\n""" + + +class DotConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dot) + self.template = dot_config_template + + def format(self, node): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + params = self._default_config_params(node) + params['n_out'] = 1 + params['n_in'] = inp1.shape[0] + params['product_type'] = get_backend('oneAPI').product_type(inp1.type.precision, inp2.type.precision) + + return self.template.format(**params) + + +# Concatenate templates +concat_config_template = """struct config{index} : nnet::concat_config {{ + static const unsigned n_elem1_0 = {n_elem1_0}; + static const unsigned n_elem1_1 = {n_elem1_1}; + static const unsigned n_elem1_2 = {n_elem1_2}; + static const unsigned n_elem2_0 = {n_elem2_0}; + static const unsigned n_elem2_1 = {n_elem2_1}; + static const unsigned n_elem2_2 = {n_elem2_2}; + + static const int axis = {axis}; +}};\n""" + + +class 
ConcatenateConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__(Concatenate)
+        self.template = concat_config_template
+
+    def format(self, node):
+        params = self._default_config_params(node)
+        for i in range(3):
+            params.setdefault(f'n_elem1_{i}', 0)
+            params.setdefault(f'n_elem2_{i}', 0)
+        inp1 = node.get_input_variable(node.inputs[0])
+        inp2 = node.get_input_variable(node.inputs[1])
+        for i, (s1, s2) in enumerate(zip(inp1.shape, inp2.shape)):
+            params[f'n_elem1_{i}'] = s1
+            params[f'n_elem2_{i}'] = s2
+
+        return self.template.format(**params)
diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py
new file mode 100644
index 000000000..ccf410d1f
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/pointwise.py
@@ -0,0 +1,156 @@
+from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.oneapi.passes.convolution_templates import (
+    Conv1DConfigTemplate,
+    Conv2DConfigTemplate,
+    conv1d_config_template,
+    conv2d_config_template,
+    conv_mult_config_template,
+)
+from hls4ml.backends.template import FunctionCallTemplate
+from hls4ml.model.layers import register_layer
+from hls4ml.model.optimizer import OptimizerPass
+
+'''
+Custom hls4ml layer implementation for 1x1 Conv filters using im2col
+Allows lower latency and resource usage, due to fewer loop invocations
+'''
+
+pointwise_conv1d_function_template = (
+    'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+)
+pointwise_conv2d_function_template = (
+    'nnet::pointwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+)
+
+pointwise_conv1d_task_sequence_template = (
+    'task_sequence<nnet::pointwise_conv_1d_{data_format}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+)
+
+pointwise_conv2d_task_sequence_template = (
+    'task_sequence<nnet::pointwise_conv_2d_{data_format}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
+)
+
+pointwise_conv_stream_function_template = '{name}.async({w}, {b});'
+
+sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h']
+sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h']
+
+
+class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate):
+    def __init__(self):
+        super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D)
+        self.template = conv1d_config_template
+        self.mult_template = conv_mult_config_template
+
+
+class PointwiseConv1DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv1D, include_header=sepconv1d_include_list)
+        self.template = pointwise_conv1d_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class PointwiseConv1DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv1D)
+        self.template = pointwise_conv1d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
+
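+# Illustrative note: OptimizePointwiseConv (bottom of this file) re-creates an io_parallel
+# 1x1 Conv1D/Conv2D node as PointwiseConv1D/PointwiseConv2D, at which point the templates in
+# this file take over from the generic conv ones. A rendered call might look like (sketch
+# only; config4, w4 and b4 are placeholder names depending on the node):
+#
+#     nnet::pointwise_conv_1d_cl<input_t, result_t, config4>(input, output, w4, b4);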
+class PointwiseConv2DConfigTemplate(Conv2DConfigTemplate):
+    def __init__(self):
+        super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D)
+        self.template = conv2d_config_template
+        self.mult_template = conv_mult_config_template
+
+
+class PointwiseConv2DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv2D, include_header=sepconv2d_include_list)
+        self.template = pointwise_conv2d_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+class PointwiseConv2DTaskSequenceTemplate(TaskSequenceTemplate):
+    def __init__(self):
+        super().__init__(PointwiseConv2D)
+        self.template = pointwise_conv2d_task_sequence_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        if node.get_attr('data_format') == 'channels_first':
+            raise RuntimeError('channels_first not supported on oneAPI')
+        params['data_format'] = 'cl'
+        return self.template.format(**params)
+
+
+class PointwiseConvStreamFunctionTemplate(StreamFunctionCallTemplate):
+    def __init__(self):
+        super().__init__((PointwiseConv1D, PointwiseConv2D))
+        self.template = pointwise_conv_stream_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
+
+        return self.template.format(**params)
+
+
+def register_pointwise(backend):
+    # Register the layer types to the layer map
+    register_layer('PointwiseConv1D', PointwiseConv1D)
+    register_layer('PointwiseConv2D', PointwiseConv2D)
+
+    # Register the optimization passes
+    backend.register_pass('optimize_pointwise_conv', OptimizePointwiseConv)
+
+    # Register template passes
+    backend.register_template(PointwiseConv1DConfigTemplate)
+    backend.register_template(PointwiseConv1DFunctionTemplate)
+    backend.register_template(PointwiseConv2DConfigTemplate)
+    backend.register_template(PointwiseConv2DFunctionTemplate)
+
+
+class OptimizePointwiseConv(OptimizerPass):
+    def match(self, node):
+        return (
+            node.class_name in ('Conv1D', 'Conv2D')
+            and node.get_attr('filt_height', 1) == 1
+            and node.get_attr('filt_width') == 1
+            and node.model.config.get_config_value('IOType') == 'io_parallel'
+        )
+
+    def transform(self, model, node):
+        dim = node.__class__.__name__[-2:]  # '1D' or '2D'
+        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        pw_node = model.make_node(
+            'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
+        )
+        model.replace_node(node, pw_node)
+
+        return True
diff --git a/hls4ml/backends/oneapi/passes/pooling_templates.py b/hls4ml/backends/oneapi/passes/pooling_templates.py
new file mode 100644
index 000000000..97136ed84
--- /dev/null
+++ b/hls4ml/backends/oneapi/passes/pooling_templates.py
@@ -0,0 +1,153 @@
+from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import GlobalPooling1D, GlobalPooling2D, Pooling1D, Pooling2D
+
+pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{
+    static const unsigned stride_width = {stride_width};
+    static const unsigned pool_width = {pool_width};
+
+    static const unsigned n_in = {n_in};
+    static const unsigned n_out = {n_out};
+    static const unsigned filt_width = {pool_width};
+
+    static const 
unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned in_width = {n_in}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + + static const unsigned pool_height = {pool_height}; + static const unsigned pool_width = {pool_width}; + static const unsigned filt_height = {pool_height}; + static const unsigned filt_width = {pool_width}; + + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + + static const unsigned n_filt = {n_filt}; + static const unsigned n_chan = {n_filt}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +global_pooling1d_function_template = ( + 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) +global_pooling2d_function_template = ( + 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) + +pooling1d_task_sequence_template = ( + 'task_sequence>({name});' +) +pooling2d_task_sequence_template = ( + 'task_sequence>({name});' +) +global_pooling1d_task_sequence_template = ( + 'task_sequence>({name});' +) +global_pooling2d_task_sequence_template = ( + 'task_sequence>({name});' +) + +pooling_stream_function_template = '{name}.async();' + +pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h'] + + +class PoolingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_config_template, + 'Pooling2D': pooling2d_config_template, + 'GlobalPooling1D': global_pooling1d_config_template, + 'GlobalPooling2D': global_pooling2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class PoolingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + 
super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D), include_header=pooling_include_list) + self.templates = { + 'Pooling1D': pooling1d_function_template, + 'Pooling2D': pooling2d_function_template, + 'GlobalPooling1D': global_pooling1d_function_template, + 'GlobalPooling2D': global_pooling2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for oneAPI') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) + + +class PoolingTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_task_sequence_template, + 'Pooling2D': pooling2d_task_sequence_template, + 'GlobalPooling1D': global_pooling1d_task_sequence_template, + 'GlobalPooling2D': global_pooling2d_task_sequence_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('channels_first not supported for oneAPI') + params['data_format'] = 'cl' + return self.templates[node.class_name].format(**params) + + +class PoolingStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.template = pooling_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/quantization_templates.py b/hls4ml/backends/oneapi/passes/quantization_templates.py new file mode 100644 index 000000000..c46e17485 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/quantization_templates.py @@ -0,0 +1,63 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.oneapi.passes.core_templates import ( + batchnorm_config_template, + batchnorm_function_template, + batchnorm_include_list, + batchnorm_stream_function_template, + batchnorm_task_sequence_template, +) +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.optimizer.passes.qkeras import ApplyAlpha + + +class ApplyAlphaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class ApplyAlphaFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class ApplyAlphaTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_task_sequence_template + + def format(self, node): + params = 
self._default_function_params(node) + + return self.template.format(**params) + + +class ApplyAlphaStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/recurrent_templates.py b/hls4ml/backends/oneapi/passes/recurrent_templates.py new file mode 100644 index 000000000..00cd16879 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/recurrent_templates.py @@ -0,0 +1,369 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GRU, LSTM, SimpleRNN + +# Note: currently only GRU is supported for stream; lstm and simpleRNN are parallel-only + +recurrent_include_list = ['nnet_utils/nnet_recurrent.h', 'nnet_utils/nnet_recurrent_stream.h'] + +################################################ +# Shared Matrix Multiplication Template (Dense) +################################################ +recr_mult_x_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +recr_mult_h_config_template = '''struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + + static const unsigned rf_pad = {rfpad}; + static const unsigned bf_pad = {bfpad}; + static const unsigned reuse_factor = {reuse}; + static const unsigned reuse_factor_rounded = reuse_factor + rf_pad; + static const unsigned block_factor = DIV_ROUNDUP(n_in*n_out, reuse_factor); + static const unsigned block_factor_rounded = block_factor + bf_pad; + static const unsigned multiplier_factor = MIN(n_in, reuse_factor); + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in*n_out, multiplier_factor); + static const unsigned multiplier_scale = multiplier_limit/n_out; + typedef {accum_t.name} accum_t; + typedef {recurrent_bias_t.name} bias_t; + typedef {recurrent_weight_t.name} weight_t; + + template + using product = nnet::product::{product_type}; +}};\n''' + +################################################ +# Shared Activation Template +################################################ +activ_config_template = '''struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = 
nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n''' + +################################################ +# GRU Template +################################################ +gru_config_template = '''struct config{index} : nnet::gru_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_units = {n_units}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned n_outputs = {n_outputs}; + static const bool return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + typedef {recurrent_weight_t.name} recurrent_weight_t; + typedef {recurrent_bias_t.name} recurrent_bias_t; + + typedef {config_mult_x} mult_config_x; + typedef {config_mult_h} mult_config_h; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n''' + +gru_function_template = 'nnet::gru<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' +gru_task_sequence_template = 'task_sequence> {name};' +gru_stream_function_template = '{name}.async({w}, {wr}, {b}, {br});' + + +class GRUConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(GRU) + self.gru_template = gru_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + self.mult_x_template = recr_mult_x_config_template + self.mult_h_template = recr_mult_h_config_template + + def format(self, node): + # Input has shape (n_timesteps, inp_dimensionality) + # Output / hidden units has shape (1 if !return_sequences else n_timesteps , n_units) + params = self._default_config_params(node) + params['n_units'] = node.get_attr('n_out') + params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + params['return_sequences'] = 'true' if node.get_attr('return_sequences', False) else 'false' + params['config_mult_x'] = f'config{node.index}_x_mult' + params['config_mult_h'] = f'config{node.index}_h_mult' + params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + params['act_recurrent_t'] = '{}_config{}'.format(node.get_attr('recurrent_activation'), str(node.index) + '_rec_act') + gru_config = self.gru_template.format(**params) + + # Activation is on candidate hidden state, dimensionality (1, n_units) + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + # Recurrent activation is on reset and update gates (therefore x2), dimensionality (1, n_units) + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = str(node.get_attr('n_out')) + ' * 2' + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + # Multiplication config for matrix multiplications of type Wx (reset, update and candidate states) + mult_params_x = self._default_config_params(node) + mult_params_x['n_in'] = 
node.get_attr('n_in') + mult_params_x['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_x['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_params_x['index'] = str(node.index) + '_x' + mult_config_x = self.mult_x_template.format(**mult_params_x) + + # Multiplication config for matrix multiplications of type Wh (reset, update and candidate states) + mult_params_h = self._default_config_params(node) + mult_params_h['n_in'] = node.get_attr('n_out') + mult_params_h['n_out'] = str(node.get_attr('n_out')) + ' * 3' + mult_params_h['reuse_factor'] = params['recurrent_reuse_factor'] + mult_params_h['product_type'] = get_backend('oneAPI').product_type( + node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision + ) + mult_params_h['index'] = str(node.index) + '_h' + mult_config_h = self.mult_h_template.format(**mult_params_h) + + return mult_config_x + '\n' + mult_config_h + '\n' + recr_act_config + '\n' + act_config + '\n' + gru_config + + +class GRUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(GRU, include_header=recurrent_include_list) + self.template = gru_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + return self.template.format(**params) + + +class GRUTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(GRU) + self.template = gru_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +class GRUStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__(GRU) + self.template = gru_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + + return self.template.format(**params) + + +################################################ +# LSTM Template +################################################ +lstm_config_template = """struct config{index} : nnet::lstm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_i_t.name} weight_i_t; + typedef {bias_i_t.name} bias_i_t; + typedef {weight_f_t.name} weight_f_t; + typedef {bias_f_t.name} bias_f_t; + typedef {weight_c_t.name} weight_c_t; + typedef {bias_c_t.name} bias_c_t; + typedef {weight_o_t.name} weight_o_t; + typedef {bias_o_t.name} bias_o_t; + typedef {recurrent_weight_i_t.name} recurrent_weight_i_t; + typedef {recurrent_weight_f_t.name} recurrent_weight_f_t; + typedef {recurrent_weight_c_t.name} recurrent_weight_c_t; + typedef {recurrent_weight_o_t.name} recurrent_weight_o_t; + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static 
const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +lstm_function_template = 'nnet::lstm<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class LSTMConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(LSTM) + self.template = lstm_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + lstm_params = self._default_config_params(node) + lstm_params['n_in'] = node.get_attr('n_in') + lstm_params['n_out'] = node.get_attr('n_out') + lstm_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + + lstm_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + lstm_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + lstm_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + lstm_config = self.template.format(**lstm_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + lstm_config + + +class LSTMFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(LSTM, include_header=recurrent_include_list) + self.template = lstm_function_template + + def format(self, node): + params = self._default_function_params(node) + + types = ['i', 'f', 'c', 'o'] + params['weights'] = '' + for t in types: + params['weights'] += f'kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += f'recurrent_kernel_{t}_{str(node.index)},' + for t in types: + params['weights'] += 'bias_{}_{}{}'.format(t, str(node.index), ',' if t != 'o' else '') + + return self.template.format(**params) + + +################################################ +# SimpleRNN Template +################################################ +simple_rnn_config_template = """struct config{index} : nnet::simpleRNN_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_outputs = {n_outputs}; + static const unsigned n_timesteps = {n_timesteps}; + static const unsigned return_sequences = {return_sequences}; + + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; + typedef {bias_t.name} bias_t; + typedef {recurrent_weight_t.name} recurrent_weight_t; + + typedef {act_t} ACT_CONFIG_T; + template + using activation = nnet::activation::{activation}; + + typedef {act_recurrent_t} ACT_CONFIG_RECURRENT_T; + template + using activation_recr = nnet::activation::{recurrent_activation}; + + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + +simple_rnn_function_template = 'nnet::simple_rnn<{input_t}, {output_t}, {config}>({input}, {output}, {weights});' + + +class SimpleRNNConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SimpleRNN) + self.template = 
simple_rnn_config_template + self.act_template = activ_config_template + self.recr_act_template = activ_config_template + + def format(self, node): + simple_rnn_params = self._default_config_params(node) + simple_rnn_params['n_in'] = node.get_attr('n_in') + simple_rnn_params['n_out'] = node.get_attr('n_out') + simple_rnn_params['n_outputs'] = node.get_attr('n_timesteps') if node.get_attr('return_sequences', False) else '1' + simple_rnn_params['return_sequences'] = str(node.get_attr('return_sequences')).lower() + simple_rnn_params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), str(node.index) + '_act') + simple_rnn_params['act_recurrent_t'] = '{}_config{}'.format( + node.get_attr('recurrent_activation'), str(node.index) + '_rec_act' + ) + simple_rnn_params['recurrent_activation'] = 'relu' + + simple_rnn_config = self.template.format(**simple_rnn_params) + + act_params = self._default_config_params(node) + act_params['type'] = node.get_attr('activation') + act_params['n_in'] = node.get_attr('n_out') + act_params['index'] = str(node.index) + '_act' + act_config = self.act_template.format(**act_params) + + recr_act_params = self._default_config_params(node) + recr_act_params['type'] = node.get_attr('recurrent_activation') + recr_act_params['n_in'] = node.get_attr('n_out') + recr_act_params['index'] = str(node.index) + '_rec_act' + recr_act_config = self.recr_act_template.format(**recr_act_params) + + return act_config + '\n' + recr_act_config + '\n' + simple_rnn_config + + +class SimpleRNNFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SimpleRNN, include_header=recurrent_include_list) + self.template = simple_rnn_function_template + + def format(self, node): + params = self._default_function_params(node) + params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index)) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/reshaping_templates.py b/hls4ml/backends/oneapi/passes/reshaping_templates.py new file mode 100644 index 000000000..85357cdb2 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/reshaping_templates.py @@ -0,0 +1,244 @@ +import numpy as np + +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Reshape, Resize, Transpose, ZeroPadding1D, ZeroPadding2D + +# ZeroPadding templates + +zeropad1d_config_template = """struct config{index} : nnet::padding1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad2d_config_template = """struct config{index} : nnet::padding2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned n_chan = {n_chan}; + + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' + 
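+# Note: with io_stream each reshaping layer is emitted in two parts: a task_sequence +# declaration (instantiated from the *_task_sequence_template strings below) and an +# asynchronous launch generated from reshaping_stream_function_template ('{name}.async();').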
+zeropad1d_task_sequence_template = ( + 'task_sequence> {name};' +) +zeropad2d_task_sequence_template = ( + 'task_sequence> {name};' +) + +reshaping_stream_function_template = '{name}.async();' + +padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h'] + + +class ZeroPaddingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_config_template, + 'ZeroPadding2D': zeropad2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D), include_header=padding_include_list) + self.templates = { + 'ZeroPadding1D': zeropad1d_function_template, + 'ZeroPadding2D': zeropad2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise Exception('oneAPI only supports channels_last data format') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_task_sequence_template, + 'ZeroPadding2D': zeropad2d_task_sequence_template, + } + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('data_format') == 'channels_first': + raise RuntimeError('channels_first not supported on oneAPI') + params['data_format'] = 'cl' + + return self.templates[node.class_name].format(**params) + + +class ReshapingStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D, Resize, Reshape, Transpose)) + self.template = reshaping_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + + return self.template.format(**params) + + +# Resize templates + +resize_config_template = """struct config{index} : nnet::resize_config {{ + static const unsigned height = {in_height}; + static const unsigned width = {in_width}; + + static const unsigned new_height = {out_height}; + static const unsigned new_width = {out_width}; + + static const unsigned n_chan = {n_chan}; +}};\n""" + +resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {output_t}, {config}>({input}, {output});' +resize_task_sequence_template = ( + 'task_sequence> {name};' +) +resize_include_list = ['nnet_utils/nnet_resize.h', 'nnet_utils/nnet_resize_stream.h'] + + +class ResizeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Resize) + self.template = resize_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class ResizeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Resize, include_header=resize_include_list) + self.template = resize_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +class ResizeTaskSequenceTemplate(TaskSequenceTemplate): + def 
__init__(self): + super().__init__(Resize) + self.template = resize_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('algorithm') != 'nearest': + raise Exception('Currently only supporting resize_nearest') + params['algorithm'] = node.get_attr('algorithm') + + return self.template.format(**params) + + +# Transpose templates + +transpose_config_template = """struct config{index} : nnet::transpose_config {{ + static const unsigned depth = {depth}; + static const unsigned height = {height}; + static const unsigned width = {width}; + static constexpr unsigned perm[3] = {{{perm_str}}}; +}};\n""" + +transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' +transpose_task_sequence_template = ( + 'task_sequence> {name};' +) +transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h'] + + +class TransposeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class TransposeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Transpose, include_header=transpose_include_list) + self.template = transpose_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) + + +class TransposeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) + + +# Reshape template (only used in streaming) +reshape_task_sequence_template = 'task_sequence> {name};' +reshape_include_list = ['nnet_utils/nnet_stream.h'] + + +class ReshapeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Reshape) + + def format(self, node): + return '' + + +class ReshapeFunctionTemplate(FunctionCallTemplate): + """Only used to add the include list""" + + def __init__(self): + super().__init__(Reshape, include_header=reshape_include_list) + + def format(self, node): + return '' + + +class ReshapeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(Reshape) + self.template = reshape_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['size'] = np.prod(node.get_output_variable().shape) + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/resource_strategy.py b/hls4ml/backends/oneapi/passes/resource_strategy.py new file mode 100644 index 000000000..15af1d197 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/resource_strategy.py @@ -0,0 +1,77 @@ +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SimpleRNN +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyResourceStrategy(OptimizerPass): + '''Transposes the weights to use the dense_resource matrix multiply routine''' + + def match(self, node): + node_matches = isinstance(node, (Dense, Conv1D, Conv2D, GRU, LSTM, SimpleRNN)) + is_resource_strategy = ( + True # node.get_attr('strategy', '').lower() == 'resource' -> oneAPI only supports the Resource strategy + ) + already_transformed =
node.get_attr('_weights_transposed', False) is True + return node_matches and is_resource_strategy and not already_transformed + + def transform(self, model, node): + if isinstance(node, Dense) and not node.model.config.get_compression(node): + rf = node.get_attr('reuse_factor') + bf = int((node.attributes['n_in'] * node.attributes['n_out']) / rf) + bf_rounded = int(pow(2, np.ceil(np.log2(bf)))) + rf_rounded = int(pow(2, np.ceil(np.log2(rf)))) + + node.weights['weight'].data = np.transpose(node.weights['weight'].data).flatten() + + if node.attributes['n_in'] * node.attributes['n_out'] > 2048 and rf_rounded != rf: + node.set_attr('rfpad', rf_rounded - rf) + node.set_attr('bfpad', bf_rounded - bf) + + temp = np.empty([bf_rounded, rf_rounded]) + for i in range(rf_rounded): + for j in range(bf_rounded): + if i < rf and j < bf: + w_index = i + rf * j + temp[j][i] = node.weights['weight'].data[w_index] + else: + temp[j][i] = 0 + node.weights['weight'].data = temp.flatten() + node.weights['weight'].data_length = node.weights['weight'].data.size + + elif isinstance(node, Conv1D): + # (W,C,F) => (F,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[2, 0, 1]) + + elif isinstance(node, Conv2D): + # (H,W,C,F) => (F,H,W,C) + # IMPORTANT - This format only works with im2col convolution + # - Future commits add new optimizers that further transpose THIS format to a format + # useful for Winograd's minimal filtering algorithm + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[3, 0, 1, 2]) + + elif isinstance(node, GRU): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, SimpleRNN): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + elif isinstance(node, LSTM): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + + for weight_type in ['i', 'f', 'c', 'o']: + node.weights[f'weight_{weight_type}'].data = np.transpose(node.weights[f'weight_{weight_type}'].data) + node.weights[f'recurrent_weight_{weight_type}'].data = np.transpose( + node.weights[f'recurrent_weight_{weight_type}'].data + ) + + else: + raise Exception(f'Unexpected layer {node.class_name} with resource strategy') + node.set_attr('_weights_transposed', True) + return False diff --git a/hls4ml/backends/oneapi/passes/transform_types.py b/hls4ml/backends/oneapi/passes/transform_types.py new file mode 100644 index 000000000..8a90bad82 --- /dev/null +++ b/hls4ml/backends/oneapi/passes/transform_types.py @@ -0,0 +1,60 @@ +from hls4ml.backends.oneapi.oneapi_types import ( + OneAPIACTypeConverter, + OneAPIArrayVariableConverter, + OneAPIHLSTypeConverter, + OneAPIInplaceArrayVariableConverter, + OneAPIInplaceStreamVariableConverter, + OneAPIInterfaceVariableConverter, + OneAPIStaticWeightVariableConverter, + OneAPIStreamVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + +# from hls4ml.utils.string_utils import convert_to_pascal_case + + 
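+# TransformTypes converts each node's output variables, weights, and named types +# to their oneAPI representations: pipe-backed variables for io_stream interfaces, +# 'intel::fpga_register' arrays for io_parallel, and static weight variables.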
+class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = OneAPIHLSTypeConverter(precision_converter=OneAPIACTypeConverter()) + self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) + self.interface_var_converter = OneAPIInterfaceVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = OneAPIStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = OneAPIInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = OneAPIStaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var, pragma='stream') + else: + new_var = self.stream_var_converter.convert(var, pragma='stream') + elif io_type == 'io_parallel': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='intel::fpga_register') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) diff --git a/hls4ml/backends/quartus/passes/bn_quant.py b/hls4ml/backends/quartus/passes/bn_quant.py new file mode 100644 index 000000000..3224b0002 --- /dev/null +++ b/hls4ml/backends/quartus/passes/bn_quant.py @@ -0,0 +1,169 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and 
input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/backends/quartus/passes/convolution_templates.py b/hls4ml/backends/quartus/passes/convolution_templates.py index 75f8ca687..d1c36fe1b 100644 --- a/hls4ml/backends/quartus/passes/convolution_templates.py +++ b/hls4ml/backends/quartus/passes/convolution_templates.py @@ -46,7 +46,7 @@ static const unsigned dilation = {dilation}; static const unsigned reuse_factor = {reuse}; - static const unsigned parallelisation_factor = {parallelization}; + static const unsigned parallelization_factor = {parallelization}; static const bool store_weights_in_bram = false; static const nnet::conv1d_implementation implementation = nnet::conv1d_implementation::{implementation}; @@ -127,7 +127,7 @@ def format(self, node): static const unsigned stride_width = {stride_width}; static const unsigned reuse_factor = {reuse}; - static const unsigned parallelisation_factor = {parallelization}; + static const unsigned parallelization_factor = {parallelization}; static const bool store_weights_in_bram = false; static const nnet::conv2d_implementation implementation = nnet::conv2d_implementation::{implementation}; diff --git 
a/hls4ml/backends/template.py b/hls4ml/backends/template.py index 9638b53ad..f7f6fe313 100644 --- a/hls4ml/backends/template.py +++ b/hls4ml/backends/template.py @@ -2,6 +2,14 @@ class Template(OptimizerPass): + """The Template base class; it should not be instantiated directly. + + Args: + name (str): Name of the template. + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + attribute_name (str): The type of attribute provided. + """ + def __init__(self, name, layer_class, attribute_name): self.name = name self.layer_class = layer_class @@ -36,6 +44,12 @@ def _default_params(self, node): class LayerConfigTemplate(Template): + """Base class for layer config templates: provides the 'config_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + def __init__(self, layer_class): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) @@ -53,6 +67,13 @@ def _default_config_params(self, layer): class FunctionCallTemplate(Template): + """Base class for function call templates: provides the 'function_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + include_header (list, tuple, or set of str, or None): The list of needed include files + """ + def __init__(self, layer_class, include_header=None): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) diff --git a/hls4ml/backends/vivado/passes/bn_quant.py b/hls4ml/backends/vivado/passes/bn_quant.py new file mode 100644 index 000000000..3224b0002 --- /dev/null +++ b/hls4ml/backends/vivado/passes/bn_quant.py @@ -0,0 +1,169 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_layers import BatchNormalizationQuantizedTanh +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType + +batchnorm_quantized_tanh_config_template = """struct config{index} : nnet::batchnorm_quantized_tanh_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ?
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}};\n""" + +batchnorm_quantized_tanh_function_template = ( + 'nnet::normalize_{quantize}_tanh<{input_t}, {config}>({input}, {output}, {threshold});' +) + +bn_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationQuantizedTanhConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh) + self.template = batchnorm_quantized_tanh_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + + return self.template.format(**params) + + +class BatchNormalizationQuantizedTanhFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalizationQuantizedTanh, include_header=bn_include_list) + self.template = batchnorm_quantized_tanh_function_template + + def format(self, node): + params = self._default_function_params(node) + if node.get_attr('quantize') == 2: + params['quantize'] = 'binary' + params['threshold'] = node.get_weights('threshold').name + elif node.get_attr('quantize') == 3: + params['quantize'] = 'ternary' + params['threshold'] = node.get_weights('threshold_hi').name + ', ' + node.get_weights('threshold_lo').name + + return self.template.format(**params) + + +def register_bn_quant(backend): + # Register the layer types to the layer map + register_layer('BatchNormalizationQuantizedTanh', BatchNormalizationQuantizedTanh) + + # Register the optimization passes + backend.register_pass('merge_batch_norm_quantized_tanh', MergeBatchNormAndQuantizedTanh) + backend.register_pass('quantize_dense_output', QuantizeDenseOutput) + + # Register template passes + backend.register_template(BatchNormalizationQuantizedTanhConfigTemplate) + backend.register_template(BatchNormalizationQuantizedTanhFunctionTemplate) + + +class MergeBatchNormAndQuantizedTanh(OptimizerPass): + def match(self, node): + is_match = ( + node.class_name == 'Activation' + and node.get_attr('activation') in ['binary', 'binary_tanh', 'ternary', 'ternary_tanh'] + or node.class_name == 'TernaryTanh' + ) + is_match = is_match and isinstance(node.get_input_node(), BatchNormalization) + return is_match + + def transform(self, model, node): + bn_layer = node.get_input_node() + # Make a new layer with the new attributes + quantize = 0 + if 'binary' in node.get_attr('activation'): + quantize = 2 + if 'ternary' in node.get_attr('activation'): + quantize = 3 + attrs = { + 'name': bn_layer.get_attr('name'), + 'original_name': bn_layer.get_attr('name'), + 'class_name': 'BatchNormalizationQuantizedTanh', + 'n_in': bn_layer.get_attr('n_in'), + 'n_out': bn_layer.get_attr('n_in'), + 'n_filt': bn_layer.get_attr('n_filt'), + 'quantize': quantize, + 'trace': bn_layer.get_attr('trace'), + } + bnbt_layer = model.make_node(BatchNormalizationQuantizedTanh, 'bnbt_' + bn_layer.name, attrs, bn_layer.inputs) + bnbt_layer.set_thresholds( + bn_layer.get_weights('scale').data, bn_layer.get_weights('bias').data, node.get_attr('threshold', 0.5) + ) + # Remove the BatchNormalization layer + model.remove_node(bn_layer, rewire=True) + # Replace the old Activation layer with this one + model.replace_node(node, bnbt_layer) + + return True + + +class QuantizeDenseOutput(OptimizerPass): + def match(self, node): + is_dense = node.class_name == 'Dense' + input_node = node.get_input_node() + is_input_bnqt = input_node is not None and 
input_node.class_name == 'BatchNormalizationQuantizedTanh' + quantizer = node.get_attr('weight_quantizer') + is_binary_ternary = quantizer is not None and ( + quantizer.__class__.__name__ == 'BinaryQuantizer' or quantizer.__class__.__name__ == 'TernaryQuantizer' + ) + return is_dense and is_input_bnqt and is_binary_ternary + + def transform(self, model, node): + # Compute the required precision and update the variables + # Number of bits for output is log2 of number of input nodes + # Since this is the number of uint<1>'s which are summed + nbits = int(np.ceil(np.log2(node.attributes['n_in'])) + 2) + out_type = IntegerPrecisionType(width=nbits) + accum_t = NamedType(f'layer{node.index}_accum_t', out_type) + node.set_attr('accum_t', accum_t) + out_var = node.get_output_variable() + out_var.type.precision = out_type + + quantized_data = None + quantized_precision = None + quantizer = node.get_attr('weight_quantizer') + if quantizer.__class__.__name__ == 'BinaryQuantizer': + quantized_precision = XnorPrecisionType() + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + quantized_precision = IntegerPrecisionType(width=2) + else: + print(f'WARNING: Unknown quantizer - {quantizer.__class__.__name__}. Bailing out') + return False + quantizer.bits = quantized_precision.width + quantizer.hls_type = quantized_precision + quantized_data = quantizer(node.weights['weight'].data) + + weights = node.weights['weight'] + weights.data = quantized_data + weights.type.name = f'weight{node.index}_t' + weights.update_precision(quantized_precision) + + bias = node.weights['bias'] + bias.data = np.zeros(shape=(node.get_attr('n_out'))) + bias.type.name = f'bias{node.index}_t' + bias.nzeros = 0 + bias.update_precision(quantized_precision) + + # If followed by the BatchNormalizationBinaryTanh, update its input + # Also requantise the weights + bd_out_nodes = node.get_output_nodes() + for out_node in bd_out_nodes: + if isinstance(out_node, BatchNormalizationQuantizedTanh): + var_names = [] + if quantizer.__class__.__name__ == 'BinaryQuantizer': + var_names.append('threshold') + elif quantizer.__class__.__name__ == 'TernaryQuantizer': + var_names.append('threshold_hi') + var_names.append('threshold_lo') + for var_name in var_names: + threshold_var = out_node.weights[var_name] + threshold_var.update_precision(out_type) + threshold_var.data = np.floor(threshold_var.data) + + return False diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index aff15808a..67798ae7b 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -62,6 +62,10 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): if layer['class_name'] != 'Activation': layer['activation'] = layer['class_name'] + + if layer['activation'] == 'elu': + layer['class_name'] = 'ELU' # always use ELU type for elu, even if passed as activation + if layer['class_name'] == 'LeakyReLU': layer['activ_param'] = keras_layer['config'].get('alpha', 0.3) elif layer['class_name'] == 'ThresholdedReLU': diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 1ceb6456b..8054f41ee 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -884,7 +884,7 @@ class HardActivation(Activation): def initialize(self): super().initialize() slope_prec = self.get_attr('slope_prec', FixedPrecisionType(width=16, integer=0, signed=False)) - shift_prec = self.get_attr('shift_prec', FixedPrecisionType(width=1, integer=0, signed=False)) + shift_prec = self.get_attr('shift_prec', 
FixedPrecisionType(width=2, integer=0, signed=False)) index = self.get_attr('index') slope_t = NamedType(f'slope{index}_t', precision=slope_prec) shift_t = NamedType(f'shift{index}_t', precision=shift_prec) diff --git a/hls4ml/model/optimizer/passes/stamp.py b/hls4ml/model/optimizer/passes/stamp.py index f29ae2a18..84bb466aa 100644 --- a/hls4ml/model/optimizer/passes/stamp.py +++ b/hls4ml/model/optimizer/passes/stamp.py @@ -1,3 +1,5 @@ +import uuid + from hls4ml.model.optimizer import ModelOptimizerPass @@ -9,11 +11,11 @@ def transform(self, model): def _make_stamp(): """Create a unique identifier for the generated code. This identifier is used to compile a unique library and link it with python.""" - from random import choice - from string import hexdigits length = 8 - return ''.join(choice(hexdigits) for m in range(length)) + + stamp = uuid.uuid4() + return str(stamp)[-length:] model.config.config['Stamp'] = _make_stamp() diff --git a/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py index 798542cfc..9374f4aef 100644 --- a/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py @@ -32,7 +32,7 @@ def layer_resources(self, layer_attributes): if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9: return [0] else: - # TOOD - Extend for parallelisation factor + # TODO - Extend for parallelization factor return [np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor] @classmethod @@ -117,7 +117,7 @@ def layer_resources(self, layer_attributes): if not layer_attributes.weight_shape: return [0] - # TOOD - Extend for parallelisation factor + # TODO - Extend for parallelization factor if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency': return [ int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor), diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt new file mode 100644 index 000000000..e2b386d70 --- /dev/null +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -0,0 +1,338 @@ +# Direct CMake to use icpx rather than the default C++ compiler/linker on Linux +# and icx-cl on Windows +if(UNIX) + set(CMAKE_CXX_COMPILER icpx) +else() # Windows + include (CMakeForceCompiler) + CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP) + include (Platform/Windows-Clang) +endif() + +cmake_minimum_required (VERSION 3.7.2) + +project(myproject CXX) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +############################################################################### +### Customize these build variables +############################################################################### +set(SOURCE_FILES src/firmware/myproject.cpp src/myproject_test.cpp) +set(LIBRARY_FILES src/firmware/myproject.cpp src/myproject_bridge.cpp) +set(LIB_STAMP mystamp) +set(TARGET_NAME myproject) +set(LIBRARY_NAME myproject-${LIB_STAMP}) + +# Use cmake -DFPGA_DEVICE=<board-support-package>:<board-variant> to choose a +# different device.
Here are a few device examples (this list is not +# exhaustive): +# intel_s10sx_pac:pac_s10 +# intel_s10sx_pac:pac_s10_usm +# intel_a10gx_pac:pac_a10 +# Note that depending on your installation, you may need to specify the full +# path to the board support package (BSP); this is usually in your install +# folder. +# +# You can also specify a device family (e.g. "Arria10" or "Stratix10") or a +# specific part number (e.g. "10AS066N3F40E2SG") to generate a standalone IP. +if(NOT DEFINED FPGA_DEVICE) + set(FPGA_DEVICE "Arria10") +endif() + +# Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend +# compilation. +set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) + +# Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation. +set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS}) + +# Use cmake -DUSER_INCLUDE_PATHS=<paths> to set extra paths for general +# compilation. +set(USER_INCLUDE_PATHS src;src/firmware;${USER_INCLUDE_PATHS}) + +############################################################################### +### no changes after here +############################################################################### + +# Print the device being used for the compiles +message(STATUS "Configuring the design to run on FPGA board ${FPGA_DEVICE}") + +# Set the names of the makefile targets to be generated by cmake +set(EMULATOR_TARGET fpga_emu) +set(SIMULATOR_TARGET fpga_sim) +set(REPORT_TARGET report) +set(FPGA_TARGET fpga) +set(IP_EXPORT_TARGET fpga_ip_export) +set(LIBRARY_TARGET lib) + +# Set the names of the generated files per makefile target +set(EMULATOR_OUTPUT_NAME ${TARGET_NAME}.${EMULATOR_TARGET}) +set(SIMULATOR_OUTPUT_NAME ${TARGET_NAME}.${SIMULATOR_TARGET}) +set(REPORT_OUTPUT_NAME ${TARGET_NAME}.${REPORT_TARGET}) +set(FPGA_OUTPUT_NAME ${TARGET_NAME}.${FPGA_TARGET}) +set(IP_EXPORT_OUTPUT_NAME ${TARGET_NAME}.${IP_EXPORT_TARGET}) + +message(STATUS "Additional USER_FPGA_FLAGS=${USER_FPGA_FLAGS}") +message(STATUS "Additional USER_FLAGS=${USER_FLAGS}") + +include_directories(${USER_INCLUDE_PATHS}) +message(STATUS "Additional USER_INCLUDE_PATHS=${USER_INCLUDE_PATHS}") + +link_directories(${USER_LIB_PATHS}) +message(STATUS "Additional USER_LIB_PATHS=${USER_LIB_PATHS}") + +link_libraries(${USER_LIBS}) +message(STATUS "Additional USER_LIBS=${USER_LIBS}") + +if(WIN32) + # add qactypes for Windows + set(QACTYPES "-Qactypes") + # This is a Windows-specific flag that enables exception handling in host code + set(WIN_FLAG "/EHsc") +else() + # add qactypes for Linux + set(QACTYPES "-qactypes") +endif() + +set(COMMON_COMPILE_FLAGS -fsycl -fintelfpga -Wall ${WIN_FLAG} ${QACTYPES} ${USER_FLAGS}) +# for debugging need to do this. Not sure why +# set(COMMON_LINK_FLAGS -L/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/host/linux64/lib -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) +set(COMMON_LINK_FLAGS -fsycl -fintelfpga ${QACTYPES} ${USER_FLAGS}) + +# A SYCL ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate +# representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. For +# this reason, FPGA backend flags must be passed as link flags in CMake.
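+# For example (a hypothetical invocation), a backend flag supplied on the command line as +# cmake .. -DFPGA_DEVICE=Arria10 -DUSER_FPGA_FLAGS=-Xsclock=300MHz +# takes effect through the *_LINK_FLAGS variables defined below, not at the compile stage.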
+set(EMULATOR_COMPILE_FLAGS -DFPGA_EMULATOR) +set(LIBRARY_COMPILE_FLAGS -DFPGA_EMULATOR) +set(EMULATOR_LINK_FLAGS ) +set(LIBRARY_LINK_FLAGS -L$ENV{FPGA_VARS_DIR}/host/linux64/lib) +set(REPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(REPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early) +set(SIMULATOR_COMPILE_FLAGS -Xssimulation -DFPGA_SIMULATOR) +set(SIMULATOR_LINK_FLAGS -Xssimulation -Xsghdl -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${SIMULATOR_OUTPUT_NAME}) +set(FPGA_COMPILE_FLAGS -DFPGA_HARDWARE) +set(FPGA_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_OUTPUT_NAME}) +# get rid of this once host pipes work properly +set(IP_EXPORT_COMPILE_FLAGS -DFPGA_HARDWARE) +set(IP_EXPORT_LINK_FLAGS -Xshardware -Xstarget=${FPGA_DEVICE} ${USER_FPGA_FLAGS} -fsycl-link=early -fsycl-device-code-split=per_kernel) + +############################################################################### +### FPGA Emulator library +############################################################################### +add_library(${LIBRARY_TARGET} SHARED ${LIBRARY_FILES}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${LIBRARY_TARGET} PRIVATE ${LIBRARY_COMPILE_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${LIBRARY_TARGET} ${LIBRARY_LINK_FLAGS}) +set_target_properties(${LIBRARY_TARGET} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) + +############################################################################### +### FPGA Emulator +############################################################################### +add_executable(${EMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${EMULATOR_TARGET} PRIVATE ${EMULATOR_COMPILE_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${EMULATOR_TARGET} ${EMULATOR_LINK_FLAGS}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${EMULATOR_OUTPUT_NAME}) + +############################################################################### +### FPGA Simulator +############################################################################### +add_executable(${SIMULATOR_TARGET} ${SOURCE_FILES}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${SIMULATOR_TARGET} PRIVATE ${SIMULATOR_COMPILE_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${SIMULATOR_TARGET} ${SIMULATOR_LINK_FLAGS}) +set_target_properties(${SIMULATOR_TARGET} PROPERTIES OUTPUT_NAME ${SIMULATOR_OUTPUT_NAME}) + +############################################################################### +### Generate Report +############################################################################### +add_executable(${REPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${REPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${REPORT_TARGET} PRIVATE ${REPORT_COMPILE_FLAGS}) + +# The report target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_REPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_REPORT ${QACTYPES}) + +target_link_libraries(${REPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_REPORT}) +target_link_libraries(${REPORT_TARGET} ${REPORT_LINK_FLAGS}) +set_target_properties(${REPORT_TARGET} PROPERTIES OUTPUT_NAME 
${REPORT_OUTPUT_NAME}) + +############################################################################### +### FPGA Hardware +############################################################################### +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILES}) +target_compile_options(${FPGA_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${FPGA_TARGET} PRIVATE ${FPGA_COMPILE_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${COMMON_LINK_FLAGS}) +target_link_libraries(${FPGA_TARGET} ${FPGA_LINK_FLAGS}) +set_target_properties(${FPGA_TARGET} PROPERTIES OUTPUT_NAME ${FPGA_OUTPUT_NAME}) + +############################################################################### +### FPGA IP Export (only necessary until native host pipes) +############################################################################### +add_executable(${IP_EXPORT_TARGET} ${SOURCE_FILES}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${COMMON_COMPILE_FLAGS}) +target_compile_options(${IP_EXPORT_TARGET} PRIVATE ${IP_EXPORT_COMPILE_FLAGS}) + +# The ip export target does not need the QACTYPES flag at link stage +set(MODIFIED_COMMON_LINK_FLAGS_EXPORT ${COMMON_LINK_FLAGS}) +list(REMOVE_ITEM MODIFIED_COMMON_LINK_FLAGS_EXPORT ${QACTYPES}) + +target_link_libraries(${IP_EXPORT_TARGET} ${MODIFIED_COMMON_LINK_FLAGS_EXPORT}) +target_link_libraries(${IP_EXPORT_TARGET} ${IP_EXPORT_LINK_FLAGS}) +set_target_properties(${IP_EXPORT_TARGET} PROPERTIES OUTPUT_NAME ${IP_EXPORT_OUTPUT_NAME}) + +############################################################################### +### This part only manipulates cmake variables to print the commands to the user +############################################################################### + +# set the correct object file extension depending on the target platform +if(WIN32) + set(OBJ_EXTENSION "obj") +else() + set(OBJ_EXTENSION "o") +endif() + +# Set the source file names in a string +set(SOURCE_FILE_NAME "${SOURCE_FILES}") + +function(getCompileCommands common_compile_flags special_compile_flags common_link_flags special_link_flags target output_name) + + set(file_names ${SOURCE_FILE_NAME}) + set(COMPILE_COMMAND ) + set(LINK_COMMAND ) + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH CURRENT_SOURCE_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${source}) + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Creating a string that contains the compile command + # Start by the compiler invocation + set(COMPILE_COMMAND "${COMPILE_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the potential includes + foreach(INCLUDE ${USER_INCLUDE_PATHS}) + if(NOT IS_ABSOLUTE ${INCLUDE}) + file(RELATIVE_PATH INCLUDE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${INCLUDE}) + endif() + set(COMPILE_COMMAND "${COMPILE_COMMAND} -I${INCLUDE}") + endforeach() + + # Add all the common compile flags + foreach(FLAG ${common_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific compile flags + foreach(FLAG ${special_compile_flags}) + set(COMPILE_COMMAND "${COMPILE_COMMAND} ${FLAG}") + endforeach() + + # Get the location of the object file + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(COMPILE_COMMAND "${COMPILE_COMMAND} -c 
${CURRENT_SOURCE_FILE} -o ${OBJ_FILE}\n") + endforeach() + + set(COMPILE_COMMAND "${COMPILE_COMMAND}" PARENT_SCOPE) + + # Creating a string that contains the link command + # Start by the compiler invocation + set(LINK_COMMAND "${LINK_COMMAND}${CMAKE_CXX_COMPILER}") + + # Add all the common link flags + foreach(FLAG ${common_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add all the specific link flags + foreach(FLAG ${special_link_flags}) + set(LINK_COMMAND "${LINK_COMMAND} ${FLAG}") + endforeach() + + # Add the output file + set(LINK_COMMAND "${LINK_COMMAND} -o ${output_name}") + + foreach(source ${file_names}) + # Get the relative path to the source and object files + file(RELATIVE_PATH OBJ_FILE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${target}.dir/${source}.${OBJ_EXTENSION}) + + # Add the source file and the output file + set(LINK_COMMAND "${LINK_COMMAND} ${OBJ_FILE}") + endforeach() + + # Add all the potential library paths + foreach(LIB_PATH ${USER_LIB_PATHS}) + if(NOT IS_ABSOLUTE ${LIB_PATH}) + file(RELATIVE_PATH LIB_PATH ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/${LIB_PATH}) + endif() + if(NOT WIN32) + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH}") + else() + set(LINK_COMMAND "${LINK_COMMAND} -L${LIB_PATH} -Wl,-rpath,${LIB_PATH}") + endif() + endforeach() + + # Add all the potential includes + foreach(LIB ${USER_LIBS}) + set(LINK_COMMAND "${LINK_COMMAND} -l${LIB}") + endforeach() + + set(LINK_COMMAND "${LINK_COMMAND}" PARENT_SCOPE) + +endfunction() + +# Windows executable is going to have the .exe extension +if(WIN32) + set(EXECUTABLE_EXTENSION ".exe") +endif() + +# Display the compile instructions in the emulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${EMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${EMULATOR_LINK_FLAGS}" "${EMULATOR_TARGET}" "${EMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayEmulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${EMULATOR_TARGET} displayEmulationCompileCommands) + +# Display the compile instructions in the simulation flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${SIMULATOR_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${SIMULATOR_LINK_FLAGS}" "${SIMULATOR_TARGET}" "${SIMULATOR_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displaySimulationCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${SIMULATOR_TARGET} displaySimulationCompileCommands) + +# Display the compile instructions in the report flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${REPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_REPORT}" "${REPORT_LINK_FLAGS}" "${REPORT_TARGET}" "${REPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayReportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${REPORT_TARGET} displayReportCompileCommands) + +# Display the compile instructions in the IP export flow (Remove after native host pipes work properly) +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${IP_EXPORT_COMPILE_FLAGS}" "${MODIFIED_COMMON_LINK_FLAGS_EXPORT}" "${IP_EXPORT_LINK_FLAGS}" "${IP_EXPORT_TARGET}" 
"${IP_EXPORT_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayExportCompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${IP_EXPORT_TARGET} displayExportCompileCommands) + +# Display the compile instructions in the fpga flow +getCompileCommands("${COMMON_COMPILE_FLAGS}" "${FPGA_COMPILE_FLAGS}" "${COMMON_LINK_FLAGS}" "${FPGA_LINK_FLAGS}" "${FPGA_TARGET}" "${FPGA_OUTPUT_NAME}${EXECUTABLE_EXTENSION}") + +add_custom_target( displayFPGACompileCommands ALL + ${CMAKE_COMMAND} -E cmake_echo_color --cyan "" + COMMENT "To compile manually:\n${COMPILE_COMMAND}\nTo link manually:\n${LINK_COMMAND}") +add_dependencies(${FPGA_TARGET} displayFPGACompileCommands) diff --git a/hls4ml/templates/oneapi/exception_handler.hpp b/hls4ml/templates/oneapi/exception_handler.hpp new file mode 100644 index 000000000..bb7976f61 --- /dev/null +++ b/hls4ml/templates/oneapi/exception_handler.hpp @@ -0,0 +1,21 @@ +#ifndef __EXCEPTIONHANDLER_HPP__ +#define __EXCEPTIONHANDLER_HPP__ +#include +#include +#include + +namespace fpga_tools { + +void exception_handler(sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl; + } + } +} + +} // namespace fpga_tools + +#endif //__EXCEPTIONHANDLER_HPP__ diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h new file mode 100644 index 000000000..05de507dc --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -0,0 +1,20 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include +#include +#include +#include + +// Include nnet::array - a custom array-like struct, mainly used with io_stream +#include "nnet_utils/nnet_types.h" + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert layer-precision + +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n < d ? 
d : n) + +#endif diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp new file mode 100644 index 000000000..06e7d3fe3 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.cpp @@ -0,0 +1,24 @@ +#include "myproject.h" +#include "parameters.h" +#include + +// hls-fpga-machine-learning insert weights + +// The inter-task pipes need to be declared in the global scope +// hls-fpga-machine-learning insert inter-task pipes + +using sycl::ext::intel::experimental::task_sequence; + +void MyProject::operator()() const { + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning read in + + // hls-fpga-machine-learning declare task sequences + + // hls-fpga-machine-learning insert layers + + // hls-fpga-machine-learning return +} diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h new file mode 100644 index 000000000..082ae5dc8 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -0,0 +1,29 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "defines.h" + +// This file defines the interface to the kernel + +// currently this is fixed +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); + +// Need to declare the input and output pipes + +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs + +class MyProjectID; + +struct MyProject { + + // kernel property method to config invocation interface + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, + sycl::ext::intel::experimental::pipelined<>}; + } + + SYCL_EXTERNAL void operator()() const; +}; + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h new file mode 100644 index 000000000..ab1874ec1 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h @@ -0,0 +1,499 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "nnet_common.h" + +namespace nnet { + +struct activ_config { + // IO size + static constexpr unsigned n_in = 10; + + // Internal info + static constexpr unsigned table_size = 512; + + // Resource reuse info + static constexpr unsigned io_type = io_parallel; + static constexpr unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ac_fixed<16, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + res[ii] = datareg; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template void relu_max(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = 
MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(const data_T &data, res_T &res) { + relu_max(data, res); +} + +template void relu1(const data_T &data, res_T &res) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template void sigmoid(const data_T &data, res_T &res) { + static constexpr int MAX_VALUE = 8; +#include "activation_tables/sigmoid_table.tb" + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + [[intel::fpga_register]] typename data_T::value_type absoluteValue; + [[intel::fpga_register]] typename res_T::value_type temp2; + if (data[ii] < 0) { + absoluteValue = -data[ii]; + } else { + absoluteValue = data[ii]; + } + int index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + if (absoluteValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(sigmoid_table[index]); + if (data[ii] < 0) { + res[ii] = 1 - temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +template inline unsigned softmax_stable_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N - 1); + // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness + if (x != 0 && y == 0) + y[0] = 1; + return y.to_uint(); +} + +template inline unsigned softmax_latency_idx_from_real_val(const data_T x) { + // Number of address bits for table + static constexpr int N = ceillog2::val; + + // Slice the top N bits of the input + [[intel::fpga_register]] ac_int y = x.template slc(x.width - N); + return y.to_uint(); +} + +template void softmax_stable(const data_T &data, res_T &res) { +// Look-up tables +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + // Find maximum + Op_max op_max; + [[intel::fpga_register]] auto x_max = + reduce>(data.data(), op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed + d_xi_xmax[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])]; + } + + // Explicitly sum previously calculated exponentials with an adder tree + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +// TODO - Improve accuracy +template void softmax_latency(const data_T &data, res_T &res) { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + // Calculate all the e^x's 
+    // Calculate all the e^x's
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val<typename data_T::value_type, CONFIG_T>(data[i])];
+    }
+
+    // Explicitly sum the results with an adder tree.
+    Op_add<typename CONFIG_T::exp_table_t> op_add;
+    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
+        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+
+    // Multiply previously calculated exponentials with the reciprocal of the sum
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
+        invert_table_latency[softmax_latency_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        res[i] = exp_res[i] * inv_exp_sum;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void softmax_legacy(const data_T &data, res_T &res) {
+#include "activation_tables/exp_table_legacy.tb"
+#include "activation_tables/invert_table_legacy.tb"
+
+    [[intel::fpga_register]] int data_round[CONFIG_T::n_in];
+New_loop:
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        data_round[ii] = (data[ii] * CONFIG_T::table_size / 16).to_int();
+    }
+NN_Outer:
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        typename CONFIG_T::exp_table_t exp_res_temp = 0;
+    NN_Inner:
+        #pragma unroll
+        for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
+            if (ii == jj) {
+                exp_res_temp += 1;
+            } else {
+                int _data_cache = (data_round[jj] - data_round[ii]);
+                int index = _data_cache + 8 * CONFIG_T::table_size / 16;
+
+                if (index < 0)
+                    index = 0;
+                if (index > CONFIG_T::table_size - 1)
+                    index = CONFIG_T::table_size - 1;
+
+                typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index];
+                exp_res_temp += temp_exp;
+            }
+        }
+        int exp_res_index = (exp_res_temp * CONFIG_T::table_size / 64).to_int();
+        if (exp_res_index < 0)
+            exp_res_index = 0;
+        if (exp_res_index > CONFIG_T::table_size - 1)
+            exp_res_index = CONFIG_T::table_size - 1;
+        res[ii] = invert_table_legacy[exp_res_index];
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void softmax_argmax(const data_T &data, res_T &res) {
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_in; i++) {
+        res[i] = static_cast<typename res_T::value_type>(0);
+    }
+
+    [[intel::fpga_register]] auto maximum = data[0];
+    [[intel::fpga_register]] int idx = 0;
+
+    [[intel::initiation_interval(1)]] for (int i = 1; i < CONFIG_T::n_in; i++) {
+        if (data[i] > maximum) {
+            maximum = data[i];
+            idx = i;
+        }
+    }
+
+    res[idx] = static_cast<typename res_T::value_type>(1);
+}
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax(const data_T &data, res_T &res) {
+    switch (CONFIG_T::implementation) {
+    case softmax_implementation::stable:
+        softmax_stable<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::latency:
+        softmax_latency<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::legacy:
+        softmax_legacy<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::argmax:
+        softmax_argmax<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    default:
+        softmax_stable<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    }
+}
+
+// *************************************************
+// TanH Activation
+// *************************************************
+template <class data_T, class res_T, typename CONFIG_T> void dense_tanh(const data_T &data, res_T &res) {
+    static constexpr int MAX_VALUE = 4;
+// Initialize the lookup table
+#include "activation_tables/tanh_table.tb"
+    // Index into the lookup table based on data
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        [[intel::fpga_register]] typename data_T::value_type temp;
+        [[intel::fpga_register]] typename res_T::value_type temp2;
+        if (data[ii] < 0) {
+            temp = -data[ii];
+        } else {
+            temp = data[ii];
+        }
+        ac_int<16> index = (temp * (CONFIG_T::table_size / MAX_VALUE)).to_int();
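+        // Worked example of the indexing above, with illustrative values: for
+        // table_size = 1024 and MAX_VALUE = 4 the scale factor is 256, so an
+        // input magnitude of 1.5 reads tanh_table[384], while any magnitude of
+        // 4 or more saturates to the last table entry via the check below.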
if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(tanh_table[index]); + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template void hard_sigmoid(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template void hard_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(const data_T &data, const typename CONFIG_T::param_t alpha, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(const data_T &data, const typename CONFIG_T::param_t theta, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template void softplus(const data_T &data, res_T &res) { +// Initialize the lookup table +#include "activation_tables/softplus_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<16> data_round = (data[ii] * CONFIG_T::table_size / 16).to_int(); + ac_int<16> index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = static_cast(softplus_table[index]); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template void softsign(const data_T &data, res_T &res) { + static constexpr int MAX_VALUE = 8; +// Initialize the lookup table +#include "activation_tables/softsign_table.tb" + + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + [[intel::fpga_register]] typename data_T::value_type temp; + [[intel::fpga_register]] typename res_T::value_type temp2; + if (data[ii] < 0) { + temp = -data[ii]; + } else { + temp = data[ii]; + } + ac_int<16> index = (temp * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (temp > MAX_VALUE) + index = CONFIG_T::table_size - 1; + temp2 = static_cast(softsign_table[index]); + if (data[ii] < 0) { + res[ii] = -temp2; + } else { + res[ii] = temp2; + } + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(const data_T &data, const typename CONFIG_T::param_t alpha, res_T &res) { +// Initialize the lookup 
table +#include "activation_tables/elu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +// ************************************************* +// SELU Activation +// ************************************************* +template void selu(const data_T &data, res_T &res) { +// Initialize the lookup table +#include "activation_tables/selu_table.tb" + // Index into the lookup table based on data + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg >= 0) { + res[ii] = static_cast(1.0507009873554804934193349852946) * datareg; + } else { + ac_int<16> index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(const data_T &data, const typename CONFIG_T::param_t &alpha, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = data[ii]; + typename res_T::value_type cache; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh(const data_T &data, res_T &res) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = 2 * data[ii]; + typename res_T::value_type cache; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h new file mode 100644 index 000000000..13de5ab3b --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,712 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" + +namespace nnet { + +// ************************************************* +// Linear Activation +// ************************************************* +template void linear_stream() { +LinearActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + out_data[j] = in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ReLU Activation +// 
************************************************* +template void relu_stream() { +ReLUActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + ReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template void leaky_relu_stream(typename CONFIG_T::param_t alpha) { + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +LeakyReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + LeakyReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu_stream(typename CONFIG_T::param_t theta) { +ThresholdedReLUActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + ThresholdedReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template void elu_stream(typename CONFIG_T::param_t alpha) { +#include "activation_tables/elu_table.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +EluActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + EluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// SeLU Activation +// ************************************************* +template void selu_stream() { +#include "activation_tables/selu_table.tb" + +SeluActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + 
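+        // The pipes these *_stream() kernels read from and write to are assumed
+        // to be declared in the generated project roughly as sketched below
+        // (names and widths here are illustrative only):
+        //
+        //   class Layer1PipeID;
+        //   using layer1_arr_t = nnet::array<ac_fixed<16, 6>, 4>;
+        //   using layer1_pipe = sycl::ext::intel::experimental::pipe<Layer1PipeID, layer1_arr_t>;
+        //
+        // ExtractPipeType<layer1_pipe>::value_type then recovers layer1_arr_t,
+        // and std::tuple_size<layer1_arr_t>{} gives the pack size (4 here).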
typename ExtractPipeType::value_type out_data; + + SeluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = + typename ExtractPipeType::value_type::value_type(1.0507009873554804934193349852946) * datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template void prelu_stream(typename CONFIG_T::param_t alpha) { + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +PReLUActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + PReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * std::tuple_size::value_type>{} + j] * in_data[j]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +template void softplus_stream() { +#include "activation_tables/softplus_table.tb" + +SoftplusActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + SoftplusPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); + [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +template void softsign_stream() { +#include "activation_tables/softsign_table.tb" + + static const int MAX_VALUE = 8; + +SoftsignActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + SoftsignPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absValue; + ; + if (in_data[j] < 0) { + absValue = -in_data[j]; + } else { + absValue = in_data[j]; + } + ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data[j] < 0) { + out_data[j] = + static_cast::value_type::value_type>(-softsign_table[index]); + } else { + out_data[j] = static_cast::value_type::value_type>(softsign_table[index]); + } + } + + res_pipe::write(out_data); + } +} + +// 
************************************************* +// Softmax Activation +// ************************************************* + +template void softmax_stable_stream() { +#include "activation_tables/exp_table.tb" +#include "activation_tables/invert_table.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_array[std::tuple_size::value_type>{}]; + +SoftmaxArrayLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_pack = data_pipe::read(); + + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max::value_type::value_type> op_max; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = + reduce::value_type::value_type, + std::tuple_size::value_type>{}, + Op_max::value_type::value_type>>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> + d_xi_xmax[std::tuple_size::value_type>{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = + exp_table[softmax_stable_idx_from_real_val::value_type::value_type, + CONFIG_T>(d_xi_xmax[j])]; + } + + // Explicitly sum the results with an adder tree. 
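+        // As a sketch of what reduce() computes here (reduce itself is defined
+        // in nnet_common.h): for a pack size of 8 it evaluates
+        //
+        //   op(op(op(x0, x1), op(x2, x3)), op(op(x4, x5), op(x6, x7)))
+        //
+        // i.e. a balanced binary tree of depth 3 rather than a serial chain,
+        // keeping the adder latency logarithmic in the pack size.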
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce::value_type>{}, + Op_add>(exp_res, op_add); + + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; + typename ExtractPipeType::value_type out_pack; + + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + + // TODO - Find Quartus-equivalent pragma + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_latency_stream() { +#include "activation_tables/exp_table_latency.tb" +#include "activation_tables/invert_table_latency.tb" + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + + // Calculate all the e^x's + [[intel::fpga_register]] + typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; + +SoftmaxExpLoop: + [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_pack = data_pipe::read(); + + SoftmaxExpPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val< + typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; + + typename ExtractPipeType::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res_pipe::write(out_pack); + } +} + +template void softmax_legacy_stream() { +#include "activation_tables/exp_table_legacy.tb" +#include "activation_tables/invert_table_legacy.tb" + + // Index into the lookup table based on data for exponentials + [[intel::fpga_register]] + typename CONFIG_T::table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type + data_cache[std::tuple_size::value_type>{}]; + +SoftmaxInitLoop: + [[intel::initiation_interval(1)]] for (unsigned s = 0; + s < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + s++) { + auto in_pack = data_pipe::read(); + + SoftmaxInitPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma unroll + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { + SoftmaxExpInner: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = 
((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table_legacy[index]; + } + exp_res[i] += exp_diff_res; + } + } + + typename ExtractPipeType::value_type out_pack; + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_pack[j] = + static_cast::value_type::value_type>(invert_table_legacy[exp_res_index]); + } + + res_pipe::write(out_pack); + } +} + +template void softmax_argmax_stream() { + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + #pragma unroll + for (int i = 0; i < std::tuple_size::value_type>{}; i++) { + out_data[i] = static_cast::value_type::value_type>(0); + } + + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type maximum = in_data[0]; + [[intel::fpga_register]] int idx = 0; + + [[intel::initiation_interval(1)]] for (int i = 1; + i < std::tuple_size::value_type>{}; i++) { + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = static_cast::value_type::value_type>(1); + res_pipe::write(out_data); + } +} + +template void softmax_stream() { + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency_stream(); + break; + case softmax_implementation::stable: + softmax_stable_stream(); + break; + case softmax_implementation::legacy: + softmax_legacy_stream(); + break; + case softmax_implementation::argmax: + softmax_argmax_stream(); + break; + default: + softmax_stable_stream(); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void dense_tanh_stream() { +#include "activation_tables/tanh_table.tb" + static const int MAX_VALUE = 4; + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +TanHActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + TanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = tanh_table[index]; + else + out_data[j] = -tanh_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +template void sigmoid_stream() { +#include "activation_tables/sigmoid_table.tb" + static 
const int MAX_VALUE = 8; + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +SigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + SigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; + + if (in_data[j] < 0) + absoluteValue = (-1) * in_data[j]; + else + absoluteValue = in_data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data[j] > 0) + out_data[j] = sigmoid_table[index]; + else + out_data[j] = 1 - sigmoid_table[index]; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +// Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations +template void hard_sigmoid_stream() { + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res_pipe::write(out_data); + } +} + +template void hard_tanh_stream() { + + constexpr unsigned multiplier_limit = + DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + +HardSigmoidActLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; + i < CONFIG_T::n_in / + std::tuple_size::value_type>{}; + i++) { + + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res_pipe::write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh_stream() { +BinaryTanHActLoop: + [[intel::initiation_interval( + 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + BinaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + if (in_data[j] > 0) + out_data[j] = static_cast::value_type::value_type>(1); + 
else
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+// *************************************************
+// Ternary TanH Activation
+// *************************************************
+template <class data_pipe, class res_pipe, typename CONFIG_T> void ternary_tanh_stream() {
+TernaryTanHActLoop:
+    [[intel::initiation_interval(
+        1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; i++) {
+
+        [[intel::fpga_register]] auto in_data = data_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;
+
+    TernaryTanHPackLoop:
+        #pragma unroll
+        for (int j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+            if (in_data[j] > 1)
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(1);
+            else if (in_data[j] <= -1)
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(-1);
+            else
+                out_data[j] = static_cast<typename ExtractPipeType<res_pipe>::value_type::value_type>(0);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h
new file mode 100644
index 000000000..f8e5bcb79
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm.h
@@ -0,0 +1,104 @@
+#ifndef NNET_BATCHNORM_H_
+#define NNET_BATCHNORM_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct batchnorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+    // partitioning arrays cyclically to go with roll factors?
+
+    // Default multiplication
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize(const data_T &data, res_T &res, const typename CONFIG_T::scale_t &scale,
+               const typename CONFIG_T::bias_t &bias) {
+// Calculate result
+Result:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
+        if (CONFIG_T::n_filt == -1) {
+            res[ires] =
+                CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t::value_type>::product(
+                    data[ires], scale[ires]) +
+                bias[ires];
+        } else {
+            int norm_index = ires % CONFIG_T::n_filt;
+            res[ires] =
+                CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t::value_type>::product(
+                    data[ires], scale[norm_index]) +
+                bias[norm_index];
+        }
+    }
+}
+
+// ****************************************************
+// Merged Batch Normalization and Quantized Tanh
+// ****************************************************
+struct batchnorm_quantized_tanh_config {
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize_binary_tanh(const data_T &data, res_T &res, const typename CONFIG_T::threshold_t &threshold) {
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        ac_int<1, false> cache;
+        auto datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ?
ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(const data_T &data, res_T &res, const typename CONFIG_T::threshold_hi_t &threshold_hi, + const typename CONFIG_T::threshold_lo_t &threshold_lo) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<2, true> cache; + auto datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 000000000..128b3ac1a --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,107 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** +template +void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_t bias) { + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; + constexpr auto datasize = std::tuple_size::value_type>{}; + CONFIG_T::template product::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::limit(multiplier_limit); + +BatchNormLoop: + [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + typename ExtractPipeType::value_type out_data; + + BatchNormpack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + out_data[j] = + CONFIG_T::template product::value_type::value_type, + typename CONFIG_T::scale_t::value_type>::product(in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res_pipe::write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh_stream(typename CONFIG_T::threshold_t threshold) { + constexpr auto datasize = std::tuple_size::value_type>{}; + +BinaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res_pipe::write(out_data); + } +} + +template +void normalize_ternary_tanh_stream(typename CONFIG_T::threshold_hi_t threshold_hi, + typename CONFIG_T::threshold_lo_t threshold_lo) { + constexpr auto datasize = std::tuple_size::value_type>{}; + +TernaryNormLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + if (in_data[j] > threshold_hi[norm_index]) + out_data[j] = 1; + else if (in_data[j] <= threshold_lo[norm_index]) + out_data[j] = -1; + else + out_data[j] = 0; + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h new file mode 100644 index 000000000..f37a61cb0 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_common.h @@ -0,0 +1,76 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "nnet_helpers.h" +#include +#include +#include + +typedef ac_fixed<16, 6> table_default_t; + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; + +// Default data types (??) TODO: Deprecate +typedef ac_fixed<16, 4> weight_t_def; +typedef ac_fixed<16, 4> bias_t_def; +typedef ac_fixed<32, 10> accum_t_def; + +template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { + #pragma unroll + for (int ii = 0; ii < NIN1; ii++) { + res[ii] = data1[ii]; + } + #pragma unroll + for (int ii = 0; ii < NIN2; ii++) { + res[NIN1 + ii] = data2[ii]; + } +} + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2::val>::val > 0 ? pow2::val>::val : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if constexpr (N == 1) { + return x[0]; + } else if constexpr (N == 2) { + return op(x[0], x[1]); + } else { + return op(reduce(x, op), reduce(x + leftN, op)); + } +} + +// alternate reduce - basic +// template T reduce(const T *x, Op op) { +// if (N == 1) { +// return x[0]; +// } +// auto val = op(x[0], x[1]); +// for (int i = 2; i < N; i++) { +// val = op(val, x[i]); +// } +// return val; +// } + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? 
a : b; }
+};
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h
new file mode 100644
index 000000000..38560f120
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d.h
@@ -0,0 +1,61 @@
+#ifndef NNET_CONV1D_H_
+#define NNET_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d_resource.h"
+
+namespace nnet {
+
+struct conv1d_config {
+    // I/O sizes
+    static const unsigned in_width = 10;
+    static const unsigned out_width = 10;
+
+    // Number of channels, filters
+    static const unsigned n_chan = 1;
+    static const unsigned n_filt = 1;
+
+    // Original filter size
+    static const unsigned filt_width = 1;
+    static const unsigned kernel_size = filt_width;
+
+    // Modified filter size (post-Winograd transformation, if applied)
+    static const unsigned impl_filt_height = 1;
+    static const unsigned impl_filt_width = 1;
+
+    // Padding, stride, dilation
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const unsigned stride_width = 1;
+    static const unsigned dilation = 1;
+
+    // Run-time Configuration
+    static const unsigned n_zeros = 0;
+    static const unsigned reuse_factor = 1;
+    static const unsigned parallelization_factor = 1;
+
+    // TODO: BRAM Storage on Quartus
+    static const bool store_weights_in_bram = false;
+
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float weight_t;
+    typedef float accum_t;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                const typename CONFIG_T::bias_t &biases) {
+    conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                          const typename CONFIG_T::bias_t &biases) {
+    assert(CONFIG_T::filt_width == 1);
+    pointwise_conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h
new file mode 100644
index 000000000..85009d4a3
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_resource.h
@@ -0,0 +1,237 @@
+#ifndef NNET_CONV1D_RESOURCE_H_
+#define NNET_CONV1D_RESOURCE_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+
+namespace nnet {
+
+enum class conv1d_implementation { combination, im2col, winograd };
+
+// ****************************************************************
+// im2col - General-purpose 1D Convolution algorithm
+// ****************************************************************
+
+template <class data_T, class data_col_T, typename CONFIG_T>
+void im2col_1d_cl(const data_T &data, data_col_T &data_col, const int col) {
+    // im2col can be unrolled fully, since number of parallel executions = filt_w x n_chan ~ O(100) and very little DSP
+    // usage
+
+    [[intel::fpga_register]] int index = 0;
+
+KernelLoop:
+    #pragma unroll
+    for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) {
+    ChannelLoop:
+        #pragma unroll
+        for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+            [[intel::fpga_register]] int index_data =
+                (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel;
+            if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) {
+                data_col[index++] = data[index_data];
+            } else {
+                data_col[index++] = 0;
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_im2col_cl(const data_T
&data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // im2col performs no filter transformations; therefore, filter size remains constant + assert(CONFIG_T::filt_width == CONFIG_T::impl_filt_width); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + using data_col_T = array; + using res_col_T = array; + +ColLoop: + #pragma unroll pf + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int i = 0; i < CONFIG_T::out_width; i++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_cl(data, data_col, i); + + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int j = 0; j < CONFIG_T::n_filt; j++) { + res[i * CONFIG_T::n_filt + j] = res_col[j]; + } + } +} + +// **************************************************************** +// 1D Convolution for 3x1 kernels from Winograd's algoirithm +// **************************************************************** + +// Explicity transofrmed input (B'dB) needed for Winograd convolution, as explained by Lavin & Gray (2015) +template +inline void winograd_transform_input_tile_3x1_kernel(const data_T I[4], res_T D[4]) { + D[0] = I[0] - I[2]; + D[1] = I[1] + I[2]; + D[2] = -I[1] + I[2]; + D[3] = I[1] - I[3]; +} + +template +void winograd_conv1d_3x1_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + // Ensure Winograd conditions are met + assert(CONFIG_T::filt_width == 3); + assert(CONFIG_T::stride_width == 1); + assert(CONFIG_T::out_width > 2); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + // Initialise result to bias + // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value + #pragma unroll + for (int i = 0; i < CONFIG_T::out_width; i++) { + int offset = CONFIG_T::n_filt * i; + #pragma unroll + for (int f = 0; f < CONFIG_T::n_filt; f++) { + res[offset + f] = static_cast(biases[f]); + } + } + +WidthLoop: + #pragma unroll pf + for (int col = 0; col < CONFIG_T::out_width; col += 2) { + ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + // Get current 4x1 tile + [[intel::fpga_register]] typename data_T::value_type T[16]; + [[intel::fpga_register]] uint8_t p = 0; + + #pragma unroll + for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) { + if (c < CONFIG_T::in_width && c >= 0) { + T[p++] = data[c * CONFIG_T::n_chan + channel]; + } else { + T[p++] = 0; + } + } + + // Transform input tile + [[intel::fpga_register]] typename CONFIG_T::accum_t D[4]; + winograd_transform_input_tile_3x1_kernel(T, D); + + #pragma unroll + for (int filter = 0; filter < CONFIG_T::n_filt; filter++) { + [[intel::fpga_register]] int filter_offset = 4 * 
(CONFIG_T::n_chan * filter + channel); + + // Hadamard product between transformed input tile and kernel + [[intel::fpga_register]] typename CONFIG_T::accum_t Y[4]; + #pragma unroll + for (int i = 0; i < 4; i++) { + Y[i] = static_cast(D[i] * weights[filter_offset + i]); + } + + // Explicitly transform intermediate result Z = A'YA and save to output + res[CONFIG_T::n_filt * col + filter] += static_cast(Y[0] + Y[1] + Y[2]); + if ((col + 1) < CONFIG_T::out_width) + res[CONFIG_T::n_filt * (col + 1) + filter] += + static_cast(Y[1] - Y[2] - Y[3]); + } + } + } +} + +// **************************************************************** +// 1D Convolution for 1x1 kernels using optimized im2col +// **************************************************************** + +template +void im2col_1d_pointwise_cl(const data_T &data, data_col_T &data_col, const int col) { + // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations + + [[intel::fpga_register]] int index = 0; + +ChannelLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + [[intel::fpga_register]] int index_data = + (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel; + if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) { + data_col[index++] = data[index_data]; + } else { + data_col[index++] = 0; + } + } +} + +template +void pointwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_width == 1); + + // Unroll factor for loop traversing input image, derived from parallelization_factor + static constexpr int pf = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width); + + using data_col_T = array; + using res_col_T = array; + +ColLoop: + #pragma unroll pf + [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int col = 0; col < CONFIG_T::out_width; col++) { + // Loop variables should always be declared in the deepest scope available + // See Intel's HLS - Loop Best Practices + // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html + + [[intel::fpga_register]] data_col_T data_col; + im2col_1d_pointwise_cl(data, data_col, col); + + [[intel::fpga_register]] res_col_T res_col; + dense_resource(data_col, res_col, weights, biases); + + // Unroll fully, since + // (1) n_filt is usually low in io_parallel (< 32) + // (2) no complex operations handled in loop, this loop performs a simple register writing operation + FiltLoop: + #pragma unroll + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[col * CONFIG_T::n_filt + k] = res_col[k]; + } + } +} + +// **************************************************************** +// Top-level function - handles different implementations +// **************************************************************** +template +void conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + static constexpr bool winograd_conditions = + // Winograd's minimal filtering algorithm not applicable to stride != 1 + CONFIG_T::stride_width == 1 && + + // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once + CONFIG_T::out_width > 2 && + + // Verify user opted for Winograd + (CONFIG_T::implementation == nnet::conv1d_implementation::combination || + CONFIG_T::implementation == nnet::conv1d_implementation::winograd); + + if 
(CONFIG_T::filt_width == 3 && winograd_conditions) { + winograd_conv1d_3x1_kernel_cl(data, res, weights, biases); + } else { + conv_1d_im2col_cl(data, res, weights, biases); + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 000000000..1ffd11774 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,177 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "nnet_dense.h" +#include "nnet_types.h" + +namespace nnet { + +/* + * void kernel_shift(shift_buffer, kernel_window) + * + * Args: + * shift_buffer - array elements popped from the line the buffer during the shift line buffer operation + * kernel_window - array of values from the input curently being convolved with the kernel + * + * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved + */ +template +void kernel_shift_1d(typename data_T::value_type shift_buffer[CONFIG_T::n_chan], data_window_T &kernel_window) { +/* + * Manually shift kernel_window by one step to the left + * Not possible to use nnet::shift_reg as the kernel window is convolved with the kernel weights using dense matrix + * multiplication Dense matrix multiplication is only implemented for arrays However, provided certain timing constrains are + * met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register To verify, see + * synthesis report in report.html > Area Analysis of System + */ +KernelShiftWidth: + #pragma unroll + for (int col = 0; col < CONFIG_T::filt_width - 1; col++) { + KernelShiftChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[col * CONFIG_T::n_chan + channel] = kernel_window[(col + 1) * CONFIG_T::n_chan + channel]; + } + } + +// Insert shift_buffer values into the last column of the kernel window +KernelPushChannel: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + channel] = shift_buffer[channel]; + } +} + +/* + * void shift_line_buffer(in_element, line_buffer, shift_buffer) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels line_buffer - chained array of shift registers, one for each row of the kernel and channel shift_buffer - + * array elements popped from the line the buffer during the shift operation + * + * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one + * Popped elements are later used to update the kernel window, during the kernel_shift operation + */ +template +void shift_line_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + typename data_T::value_type shift_buffer[CONFIG_T::n_chan]) { +// For every channel, insert the incoming pixel at end of the shift buffer +UpdateBuffer: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + shift_buffer[channel] = in_elem[channel]; + } +} + +/* + * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases) + * + * Args: + * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number + * of channels res_stream - output stream, 
passed by reference to allow direct writing line_buffer - chained array of shift + * registers, one for each row of the kernel and channel kernel_window - array of values from the input curently convolved + * with the kernel weights - Conv1D layer weights biases - Conv1D layer biases + * + * Function executes 4 steps: + * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last + * elements (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from + * the line buffer (3) Matrix mulitplication - performs dense matrix multiplication between the current input window and + * kernel weights (4) Counter housekeeping - keeps track of current pixel and stride + */ +template +void compute_output_buffer_1d( + const data_T &in_elem, + nnet::shift_reg + line_buffer[CONFIG_T::n_chan], + data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases, + int &pX, int &sX) { + + using res_T = typename ExtractPipeType::value_type; + + // Thresholds + constexpr int lShiftX = CONFIG_T::filt_width - 1; + + // Step 1 - Shift line buffer + [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_chan]; + nnet::shift_line_buffer_1d(in_elem, line_buffer, shift_buffer); + + // Step 2 - Kernel shift + nnet::kernel_shift_1d(shift_buffer, kernel_window); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) { + // Step 3 - Dense matrix multiplication + [[intel::fpga_register]] res_T res_out; + dense_resource(kernel_window, res_out, weights, biases); + + // Write result to output stream + [[intel::fpga_register]] res_T res_pack; + CastLoop: + #pragma unroll + for (int channel = 0; channel < CONFIG_T::n_filt; channel++) { + res_pack[channel] = res_out[channel]; + } + res_pipe::write(res_pack); + } + + // Reached end of image + if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) { + pX = 0; + sX = 0; + // Move to the right + } else { + pX++; + sX = ((sX - lShiftX) == 0) ? 
(sX - CONFIG_T::stride_width + 1) : (sX + 1); + } +} + +template +void conv_1d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { + + using data_arr_T = typename ExtractPipeType::value_type; + using data_element_T = typename data_arr_T::value_type; + using data_window_T = array; + + // Line buffer and kernel window + [[intel::fpga_register]] nnet::shift_reg + line_buffer[CONFIG_T::n_chan]; + [[intel::fpga_register]] data_window_T kernel_window; + + // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel) + constexpr auto padds = zero_array(); + + // move former static variables outside the function calls + // X position pixel + int pX = 0; + // X strides + int sX = 0; + +// Input image left-side padding +PaddingLeftWidth: + for (int col = 0; col < CONFIG_T::pad_left; col++) { + compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, + biases, pX, sX); + } + +// Read input image +ReadInputWidth: + for (int col = 0; col < CONFIG_T::in_width; col++) { + compute_output_buffer_1d(data_pipe::read(), line_buffer, + kernel_window, weights, biases, pX, sX); + } + +// Input image right-side padding +PaddingRightWidth: + for (int col = 0; col < CONFIG_T::pad_right; col++) { + compute_output_buffer_1d(padds, line_buffer, kernel_window, weights, + biases, pX, sX); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h new file mode 100644 index 000000000..79b1508c5 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d.h @@ -0,0 +1,67 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_conv2d_resource.h" + +namespace nnet { + +struct conv2d_config { + // I/O sizes + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + + // Modified filter size (post-Wionograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + // Run-time configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelization_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + conv_2d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert(CONFIG_T::filt_height == 1 && 
+
+template
+void pointwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                          const typename CONFIG_T::bias_t &biases) {
+    assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1);
+    pointwise_conv_2d_resource_cl(data, res, weights, biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h
new file mode 100644
index 000000000..7265d90e1
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_resource.h
@@ -0,0 +1,297 @@
+#ifndef NNET_CONV2D_RESOURCE_H_
+#define NNET_CONV2D_RESOURCE_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_helpers.h"
+
+namespace nnet {
+
+enum class conv2d_implementation { combination, im2col, winograd };
+
+// ****************************************************************
+// im2col - General-purpose 2D Convolution algorithm
+// ****************************************************************
+
+template
+void im2col_2d_cl(const data_T &data, data_col_T &data_col, const int row, const int col) {
+    // im2col can be unrolled fully, since the number of parallel executions = filt_h x filt_w x n_chan ~ O(100),
+    // with very little DSP usage
+
+    [[intel::fpga_register]] int index = 0;
+
+FiltHeightLoop:
+    #pragma unroll
+    for (int kernel_row = 0; kernel_row < CONFIG_T::impl_filt_height; kernel_row++) {
+        [[intel::fpga_register]] int input_row =
+            -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height;
+
+    FiltWidthLoop:
+        #pragma unroll
+        for (int kernel_col = 0; kernel_col < CONFIG_T::impl_filt_width; kernel_col++) {
+            [[intel::fpga_register]] int input_col =
+                -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width;
+
+        ChannelLoop:
+            #pragma unroll
+            for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+                if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) {
+                    data_col[index++] =
+                        data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel];
+                } else {
+                    data_col[index++] = 0;
+                }
+            }
+        }
+    }
+}
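+/*
+ * Worked example of the window size (values assumed): for a 3x3 kernel over 4 input channels,
+ * data_col holds impl_filt_height * impl_filt_width * n_chan = 3 * 3 * 4 = 36 elements; the
+ * convolution at each output position then reduces to a single dense_resource call with
+ * n_in = 36 and n_out = n_filt.
+ */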
+
+template
+void conv_2d_im2col_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                       const typename CONFIG_T::bias_t &biases) {
+    // im2col performs no filter transformations; therefore, filter size remains constant
+    assert(CONFIG_T::filt_height == CONFIG_T::impl_filt_height && CONFIG_T::filt_width == CONFIG_T::impl_filt_width);
+
+    // Unroll factors for loop traversing input image, derived from parallelization_factor
+    // Outer loop only gets unrolled after inner loop is fully unrolled
+    static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width);
+    static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), CONFIG_T::out_height);
+
+    using data_col_T = array;
+    using res_col_T = array;
+
+HeightLoop:
+    #pragma unroll pfr
+    for (int i = 0; i < CONFIG_T::out_height; i++) {
+    WidthLoop:
+        #pragma unroll pfc
+        [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < CONFIG_T::out_width; j++) {
+            // Loop variables should always be declared in the deepest scope available
+            // See Intel's HLS - Loop Best Practices
+            // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html
+
+            [[intel::fpga_register]] data_col_T data_col;
+            im2col_2d_cl(data, data_col, i, j);
+
+            [[intel::fpga_register]] res_col_T res_col;
+            dense_resource(data_col, res_col, weights, biases);
+
+            // Unroll fully, since
+            // (1) n_filt is usually low in io_parallel (< 32)
+            // (2) no complex operations are handled in this loop; it performs a simple register write
+        FiltLoop:
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_filt; k++) {
+                res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k];
+            }
+        }
+    }
+}
+
+// ****************************************************************
+// 2D Convolution for 3x3 kernels using Winograd's algorithm
+// ****************************************************************
+
+// Explicitly transformed input (B'dB) needed for Winograd calculation, as explained by Lavin & Gray, 2015
+template
+inline void winograd_transform_input_tile_3x3_kernel(const data_T I[16], res_T D[16]) {
+    D[0] = I[0] - I[2] - I[8] + I[10];
+    D[1] = I[1] + I[2] - I[9] - I[10];
+    D[2] = -I[1] + I[2] + I[9] - I[10];
+    D[3] = I[1] - I[3] - I[9] + I[11];
+
+    D[4] = I[4] - I[6] + I[8] - I[10];
+    D[5] = I[5] + I[6] + I[9] + I[10];
+    D[6] = -I[5] + I[6] - I[9] + I[10];
+    D[7] = I[5] - I[7] + I[9] - I[11];
+
+    D[8] = -I[4] + I[6] + I[8] - I[10];
+    D[9] = -I[5] - I[6] + I[9] + I[10];
+    D[10] = I[5] - I[6] - I[9] + I[10];
+    D[11] = -I[5] + I[7] + I[9] - I[11];
+
+    D[12] = I[4] - I[6] - I[12] + I[14];
+    D[13] = I[5] + I[6] - I[13] - I[14];
+    D[14] = I[6] - I[5] + I[13] - I[14];
+    D[15] = I[5] - I[7] - I[13] + I[15];
+}
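+/*
+ * For reference (Lavin & Gray, 2015, F(2x2, 3x3)): the sums above are the expansion of D = B'dB,
+ * where d is the 4x4 input tile (I[0..15], row-major) and
+ *
+ *        [ 1   0  -1   0 ]
+ *   B' = [ 0   1   1   0 ]
+ *        [ 0  -1   1   0 ]
+ *        [ 0   1   0  -1 ]
+ *
+ * e.g. D[0] = (I[0] - I[8]) - (I[2] - I[10]), matching the first line of the function.
+ */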
+
+template
+void winograd_conv2d_3x3_kernel_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                                   const typename CONFIG_T::bias_t &biases) {
+    // Ensure Winograd conditions are met
+    assert(CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3);
+    assert(CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1);
+    assert(CONFIG_T::pad_left == CONFIG_T::pad_right && CONFIG_T::pad_top == CONFIG_T::pad_bottom);
+    assert(CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2);
+
+    // Unroll factor for loop traversing input image, derived from parallelization_factor
+    // Outer loop only gets unrolled after inner loop is fully unrolled
+    static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, DIV_ROUNDUP(CONFIG_T::out_width, 2));
+    static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), DIV_ROUNDUP(CONFIG_T::out_height, 2));
+
+    // Initialise result to bias
+    // Unroll fully, as loop performs a simple operation - assigning the outputs to a constant value
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::out_height * CONFIG_T::out_width; i++) {
+        int offset = CONFIG_T::n_filt * i;
+        #pragma unroll
+        for (int f = 0; f < CONFIG_T::n_filt; f++) {
+            res[offset + f] = static_cast(biases[f]);
+        }
+    }
+
+HeightLoop:
+    #pragma unroll pfr
+    for (int row = 0; row < CONFIG_T::out_height; row += 2) {
+    WidthLoop:
+        #pragma unroll pfc
+        for (int col = 0; col < CONFIG_T::out_width; col += 2) {
+        ChannelLoop:
+            #pragma unroll
+            for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+                // Get current 4x4 tile
+                [[intel::fpga_register]] typename data_T::value_type T[16];
+                [[intel::fpga_register]] typename CONFIG_T::accum_t D[16];
+                [[intel::fpga_register]] uint8_t p = 0;
+
+                #pragma unroll
+                for (int r = row - (int)CONFIG_T::pad_top; r < row + 4 - (int)CONFIG_T::pad_top; r++) {
+                    #pragma unroll
+                    for (int c = col - (int)CONFIG_T::pad_left; c < col + 4 - (int)CONFIG_T::pad_left; c++) {
+                        if (r < CONFIG_T::in_height && r >= 0 && c < CONFIG_T::in_width && c >= 0) {
+                            T[p++] = data[r * CONFIG_T::in_width * CONFIG_T::n_chan + c * CONFIG_T::n_chan + channel];
+                        } else {
+                            T[p++] = 0;
+                        }
+                    }
+                }
+
+                // Transform input tile
+                winograd_transform_input_tile_3x3_kernel(T, D);
+
+                #pragma unroll
+                for (int filter = 0; filter < CONFIG_T::n_filt; filter++) {
+                    [[intel::fpga_register]] int filter_offset = 16 * (CONFIG_T::n_chan * filter + channel);
+
+                    // Hadamard product between transformed input tile and kernel
+                    [[intel::fpga_register]] typename CONFIG_T::accum_t Y[16];
+                    #pragma unroll
+                    for (int i = 0; i < 16; i++) {
+                        Y[i] = static_cast(D[i] * weights[filter_offset + i]);
+                    }
+
+                    // Explicitly transform intermediate result Z = A'YA and save to output
+                    res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + col) + filter] +=
+                        static_cast(Y[0] + Y[1] + Y[2] + Y[4] + Y[5] + Y[6] + Y[8] + Y[9] + Y[10]);
+                    if ((col + 1) < CONFIG_T::out_width)
+                        res[CONFIG_T::n_filt * (row * CONFIG_T::out_width + (col + 1)) + filter] +=
+                            static_cast(Y[1] - Y[2] - Y[3] + Y[5] - Y[6] - Y[7] + Y[9] - Y[10] - Y[11]);
+                    if ((row + 1) < CONFIG_T::out_height)
+                        res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + col) + filter] +=
+                            static_cast(Y[4] + Y[5] + Y[6] - Y[8] - Y[9] - Y[10] - Y[12] - Y[13] - Y[14]);
+                    if ((row + 1) < CONFIG_T::out_height && (col + 1) < CONFIG_T::out_width)
+                        res[CONFIG_T::n_filt * ((row + 1) * CONFIG_T::out_width + (col + 1)) + filter] +=
+                            static_cast(Y[5] - Y[6] - Y[7] - Y[9] + Y[10] + Y[11] + Y[15] - Y[13] + Y[14]);
+                }
+            }
+        }
+    }
+}
+
+// ****************************************************************
+// 2D Convolution for 1x1 kernels using optimized im2col
+// ****************************************************************
+
+template
+void im2col_2d_pointwise_cl(const data_T &data, data_col_T &data_col, const int row, const int col) {
+    // pointwise_im2col can be unrolled fully, only one loop with n_chan iterations
+
+    [[intel::fpga_register]] int index = 0;
+
+ChannelLoop:
+    #pragma unroll
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+
+        [[intel::fpga_register]] int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height;
+        [[intel::fpga_register]] int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width;
+
+        if (input_row >= 0 && input_row < CONFIG_T::in_height && input_col >= 0 && input_col < CONFIG_T::in_width) {
+            data_col[index++] =
+                data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel];
+        } else {
+            data_col[index++] = 0;
+        }
+    }
+}
+
+template
+void pointwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                                   const typename CONFIG_T::bias_t &biases) {
+    assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1);
+
+    // Unroll factors for loop traversing input image, derived from parallelization_factor
+    // Outer loop only gets unrolled after inner loop is fully unrolled
+    static constexpr int pfc = MIN(CONFIG_T::parallelization_factor, CONFIG_T::out_width);
+    static constexpr int pfr = MIN((CONFIG_T::parallelization_factor / pfc), CONFIG_T::out_height);
+
+    using data_col_T = array;
+    using res_col_T = array;
+
+HeightLoop:
+    #pragma unroll pfr
+    for (int row = 0; row < CONFIG_T::out_height; row++) {
+    WidthLoop:
+        #pragma unroll pfc
+        [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int col = 0; col < CONFIG_T::out_width; col++) {
+            // Loop variables should always be declared in the deepest scope available
+            // See Intel's HLS - Loop Best Practices
+            // https://www.intel.com/content/www/us/en/docs/programmable/683152/22-2/declare-variables-in-the-deepest-scope.html
+
+            [[intel::fpga_register]] data_col_T data_col;
+            im2col_2d_pointwise_cl(data, data_col, row, col);
+
+            [[intel::fpga_register]] res_col_T res_col;
+            dense_resource(data_col, res_col, weights, biases);
+
+        FiltLoop:
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_filt; k++) {
+                res[row * CONFIG_T::out_width * CONFIG_T::n_filt + col * CONFIG_T::n_filt + k] = res_col[k];
+            }
+        }
+    }
+}
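+/*
+ * Worked example of the unroll factors (values assumed): with parallelization_factor = 8 over a
+ * 10 x 10 output, pfc = MIN(8, 10) = 8 and pfr = MIN(8 / 8, 10) = 1, i.e. the width loop is
+ * unrolled 8-way while the height loop stays rolled until the width loop is fully unrolled.
+ */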
+
+// ****************************************************************
+// Top-level function - handles different implementations
+// ****************************************************************
+template
+void conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                         const typename CONFIG_T::bias_t &biases) {
+    static constexpr bool winograd_conditions =
+        // Winograd's minimal filtering algorithm not applicable to stride != 1
+        CONFIG_T::stride_height == 1 && CONFIG_T::stride_width == 1 &&
+
+        // Intel HLS will fail to pipeline the entire component if the Winograd loop only runs once
+        CONFIG_T::out_height > 2 && CONFIG_T::out_width > 2 &&
+
+        // Verify user opted for Winograd
+        (CONFIG_T::implementation == nnet::conv2d_implementation::combination ||
+         CONFIG_T::implementation == nnet::conv2d_implementation::winograd);
+
+    if (CONFIG_T::filt_height == 3 && CONFIG_T::filt_width == 3 && winograd_conditions) {
+        winograd_conv2d_3x3_kernel_cl(data, res, weights, biases);
+    } else {
+        conv_2d_im2col_cl(data, res, weights, biases);
+    }
+}
+
+} // namespace nnet
+
+#endif
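+/*
+ * Dispatch sketch (hypothetical config; field values are placeholders): a layer config selects the
+ * implementation through the enum defined above, e.g.
+ *
+ *   struct conv2_config : nnet::conv2d_config {
+ *       static constexpr auto implementation = nnet::conv2d_implementation::winograd;
+ *       static const unsigned filt_height = 3;
+ *       static const unsigned filt_width = 3;
+ *   };
+ *
+ * With a 3x3 filter, unit strides and an output larger than 2x2, conv_2d_resource_cl then picks the
+ * Winograd kernel; in all other cases it falls back to im2col.
+ */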
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h
new file mode 100644
index 000000000..08f0eaa87
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_conv2d_stream.h
@@ -0,0 +1,241 @@
+#ifndef NNET_CONV2D_STREAM_H_
+#define NNET_CONV2D_STREAM_H_
+
+#include "nnet_dense.h"
+#include "nnet_types.h"
+
+namespace nnet {
+
+/*
+ * void kernel_shift(shift_buffer, kernel_window)
+ *
+ * Args:
+ *   shift_buffer - array elements popped from the line buffer during the shift line buffer operation
+ *   kernel_window - array of values from the input currently being convolved with the kernel
+ *
+ * Values from shift_buffer are inserted into kernel_window, updating the values to be convolved
+ */
+template
+void kernel_shift_2d(typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan],
+                     data_window_T &kernel_window) {
+/*
+ * Manually shift kernel_window by one step to the left
+ * Not possible to use nnet::shift_reg, as the kernel window is convolved with the kernel weights using dense matrix
+ * multiplication, which is only implemented for arrays. However, provided certain timing constraints are met,
+ * Intel HLS automatically infers a shift operation and implements kernel_window as a shift register.
+ * To verify, see the synthesis report in report.html > Area Analysis of System
+ */
+KernelShiftWidth:
+    #pragma unroll
+    for (int col = 0; col < CONFIG_T::filt_width - 1; col++) {
+    KernelShiftHeight:
+        #pragma unroll
+        for (int row = 0; row < CONFIG_T::filt_height; row++) {
+        KernelShiftChannel:
+            #pragma unroll
+            for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+                kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + col * CONFIG_T::n_chan + channel] =
+                    kernel_window[row * CONFIG_T::filt_width * CONFIG_T::n_chan + (col + 1) * CONFIG_T::n_chan + channel];
+            }
+        }
+    }
+
+// Insert shift_buffer values into the last column of the kernel window
+KernelPushHeight:
+    #pragma unroll
+    for (int col = 0; col < CONFIG_T::filt_height; col++) {
+    KernelPushChannel:
+        #pragma unroll
+        for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+            kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + col * CONFIG_T::filt_width * CONFIG_T::n_chan +
+                          channel] = shift_buffer[col][channel];
+        }
+    }
+}
+
+/*
+ * void shift_line_buffer(in_element, line_buffer, shift_buffer)
+ *
+ * Args:
+ *   in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to the number of channels
+ *   line_buffer - chained array of shift registers, one for each row of the kernel and channel
+ *   shift_buffer - array elements popped from the line buffer during the shift operation
+ *
+ * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one
+ * Popped elements are later used to update the kernel window, during the kernel_shift operation
+ */
+template
+void shift_line_buffer_2d(
+    const data_T &in_elem,
+    nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan],
+    typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]) {
+// For every channel, insert the incoming pixel at end of the shift buffer
+UpdateBuffer:
+    #pragma unroll
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+        shift_buffer[CONFIG_T::filt_height - 1][channel] = in_elem[channel];
+    }
+
+// Shift line buffer and save popped values to shift buffer
+LineBufferDataIn:
+    #pragma unroll
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+    LineBufferShift:
+        #pragma unroll
+        for (unsigned col = 1; col < CONFIG_T::filt_height; col++) {
+            // Shift the line buffer, return the popped pixel
+            typename data_T::value_type pop =
+                line_buffer[col - 1][channel].shift(shift_buffer[CONFIG_T::filt_height - col][channel]);
+
+            // Place popped pixel into the shift buffer, one row above
+            shift_buffer[CONFIG_T::filt_height - col - 1][channel] = pop;
+        }
+    }
+}
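+/*
+ * Data-movement sketch (illustrative, filt_height = 3): the incoming pixel enters shift_buffer row 2
+ * and is pushed into line_buffer[0]; the pixel popped from line_buffer[0] becomes row 1 and is pushed
+ * into line_buffer[1], whose popped pixel becomes row 0. For each channel, shift_buffer therefore
+ * holds one vertically aligned column of the 3-row kernel window.
+ */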
+
+/*
+ * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases)
+ *
+ * Args:
+ *   in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to the number of channels
+ *   res_stream - output stream, passed by reference to allow direct writing
+ *   line_buffer - chained array of shift registers, one for each row of the kernel and channel
+ *   kernel_window - array of values from the input currently convolved with the kernel
+ *   weights - Conv1D/Conv2D layer weights
+ *   biases - Conv1D/Conv2D layer biases
+ *
+ * Function executes 4 steps:
+ *   (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements
+ *   (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer
+ *   (3) Matrix multiplication - performs dense matrix multiplication between the current input window and kernel weights
+ *   (4) Counter housekeeping - keeps track of current pixel and stride
+ */
+template
+void compute_output_buffer_2d(
+    const data_T &in_elem,
+    nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan],
+    data_window_T &kernel_window, const typename CONFIG_T::weight_t &weights, const typename CONFIG_T::bias_t &biases,
+    int &pX, int &pY, int &sX, int &sY) {
+
+    using res_T = typename ExtractPipeType::value_type;
+
+    // Thresholds
+    constexpr int lShiftX = CONFIG_T::filt_width - 1;
+    constexpr int lShiftY = CONFIG_T::filt_height - 1;
+
+    // Step 1 - Shift line buffer
+    [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan];
+    nnet::shift_line_buffer_2d(in_elem, line_buffer, shift_buffer);
+
+    // Step 2 - Kernel shift
+    nnet::kernel_shift_2d(shift_buffer, kernel_window);
+
+    // Check to see if we have a full kernel
+    if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) {
+        // Step 3 - Dense matrix multiplication
+        [[intel::fpga_register]] res_T res_out;
+        dense_resource(kernel_window, res_out, weights, biases);
+
+        // Write result to output stream
+        [[intel::fpga_register]] res_T res_pack;
+    CastLoop:
+        #pragma unroll
+        for (int channel = 0; channel < CONFIG_T::n_filt; channel++) {
+            res_pack[channel] = res_out[channel];
+        }
+        res_pipe::write(res_pack);
+    }
+
+    // Reached end of image
+    if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) &&
+        (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) {
+        pX = 0;
+        sX = 0;
+        pY = 0;
+        sY = 0;
+        // Reached end of row
+    } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
+        pX = 0;
+        sX = 0;
+        pY++;
+        sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1);
+        // Same row, same column; therefore, move to the right
+    } else {
+        pX++;
+        sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
+    }
+}
+
+template
+void conv_2d_cl_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
+
+    using data_arr_T = typename ExtractPipeType::value_type;
+    using data_element_T = typename data_arr_T::value_type;
+    using data_window_T = array;
+
+    // Line buffer and kernel window
+    [[intel::fpga_register]] nnet::shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan];
+    [[intel::fpga_register]] data_window_T kernel_window;
+
+    // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel)
+    constexpr auto padds = zero_array();
+
+    // move former static variables outside the function calls
+    // X position pixel
+    int pX = 0;
+    // Y position pixel
+    int pY = 0;
+    // X strides
+    int sX = 0;
+    // Y strides
+    int sY = 0;
+
+// Padding above input image
+PaddingTopHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_top; row++) {
+    PaddingTopWidth:
+        for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right; col++) {
+            compute_output_buffer_2d(padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+    }
+
+ReadInputHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::in_height; row++) {
+        // Input image left-side padding
+    PaddingLeftWidth:
+        for (int col = 0; col < CONFIG_T::pad_left; col++) {
+            compute_output_buffer_2d(padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+
+        // Read input image
+    ReadInputWidth:
+        for (int col = 0; col < CONFIG_T::in_width; col++) {
+            compute_output_buffer_2d(data_pipe::read(), line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+
+        // Input image right-side padding
+    PaddingRightWidth:
+        for (int col = 0; col < CONFIG_T::pad_right; col++) {
+            compute_output_buffer_2d(padds, line_buffer, kernel_window, weights, biases, pX, pY, sX, sY);
+        }
+    }
+
+// Padding below input image
+PaddingBottomHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::pad_bottom; row++) {
+    PaddingBottomWidth:
+        for (int col = 0; col < CONFIG_T::pad_left + CONFIG_T::in_width +
CONFIG_T::pad_right; col++) { + compute_output_buffer_2d(padds, line_buffer, kernel_window, + weights, biases, pX, pY, sX, sY); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h new file mode 100644 index 000000000..dc7618908 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h @@ -0,0 +1,164 @@ +#ifndef NNET_DENSE_LARGE_H_ +#define NNET_DENSE_LARGE_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + static const unsigned reuse_factor = 1; + static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) + static const unsigned multiplier_factor = 1; // min n_in, rf + static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; + static const unsigned reciprocal = 1; // 2^35 / 25 + static const unsigned rf_pad = 0; + static const unsigned bf_pad = 0; + // Resource reuse info + static const unsigned io_type = io_parallel; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + //#pragma ii CONFIG_T::reuse_factor + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + [[intel::fpga_register]] int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + [[intel::fpga_register]] int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + [[intel::fpga_register]] typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product( + data[data_index], weights[w_index]); + } + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + #pragma unroll + for (int 
imult = 0; imult < CONFIG_T::multiplier_limit; imult++) {
+            mult[imult] = 0;
+        }
+    AccumLoop1:
+        #pragma unroll
+        for (int im = 0; im < CONFIG_T::block_factor; im++) {
+            int o_index = out_index[ir][im];
+            if (o_index >= CONFIG_T::n_out)
+                continue; // check out of bounds
+            mult[o_index] += tmp_acc[im];
+        }
+    AccumLoop2:
+        #pragma unroll
+        for (int im = 0; im < CONFIG_T::multiplier_limit; im++) {
+            acc[im] += mult[im];
+        }
+    }
+Store:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        res[ires] = cast(acc[ires]);
+    }
+}
+template
+void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                 const typename CONFIG_T::bias_t &biases) {
+    assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) &&
+           "The current Reuse Factor is not allowed");
+    assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN");
+
+    [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+InitAccum:
+    #pragma unroll
+    for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) {
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+ReuseLoop:
+    [[intel::nofusion, intel::speculated_iterations(0)]] for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) {
+        [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::block_factor];
+    MultLoop:
+        #pragma unroll
+        for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) {
+            uint32_t w_index = ir + (CONFIG_T::reuse_factor_rounded)*im;
+            if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out)
+                continue;
+            // Modified this
+            mult[im] =
+                CONFIG_T::template product::product(data[in_index], weights[w_index]);
+            in_index += CONFIG_T::reuse_factor;
+            if (in_index >= CONFIG_T::n_in)
+                in_index = ir;
+        }
+    AccumLoop:
+        #pragma unroll
+        for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) {
+            acc[out_index] += mult[im];
+            if (acc_step + 1 >= CONFIG_T::multiplier_scale) {
+                acc_step = 0;
+                out_index++;
+            } else {
+                acc_step++;
+            }
+        }
+    }
+// Cast to "res_T" type
+Result:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        res[ires] = cast(acc[ires]);
+    }
+}
+template
+void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                    const typename CONFIG_T::bias_t &biases) {
+    if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
+        dense_rf_lt(data, res, weights, biases);
+    } else {
+        dense_rf_gt(data, res, weights, biases);
+    }
+}
+} // namespace nnet
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
new file mode 100644
index 000000000..92c9adc3b
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
@@ -0,0 +1,23 @@
+#ifndef NNET_DENSE_STREAM_H_
+#define NNET_DENSE_STREAM_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_types.h"
+
+namespace nnet {
+
+// Note: DataPack logic removed, at least in the initial version
+template
+void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
+
+    [[intel::fpga_register]] typename ExtractPipeType::value_type res;
+    [[intel::fpga_register]] auto data = data_pipe::read();
+    dense_resource::value_type, typename ExtractPipeType::value_type, CONFIG_T>(data, res, weights, biases);
+    res_pipe::write(res);
+}
+
+} // namespace nnet
+
+#endif
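+/*
+ * Worked example of the reuse-factor bookkeeping (numbers assumed): for n_in = 16, n_out = 4 and
+ * reuse_factor = 8, block_factor = DIV_ROUNDUP(16 * 4, 8) = 8 multiplications per cycle,
+ * multiplier_factor = MIN(16, 8) = 8, multiplier_limit = DIV_ROUNDUP(16 * 4, 8) = 8 and
+ * multiplier_scale = 8 / 4 = 2. Since reuse_factor <= n_in, dense_resource dispatches to
+ * dense_rf_lt, which iterates 8 times and performs 8 parallel products per iteration.
+ */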
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h
new file mode 100644
index 000000000..1188fe3ec
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed.h
@@ -0,0 +1,43 @@
+#ifndef NNET_EMBED_H_
+#define NNET_EMBED_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+
+namespace nnet {
+
+struct embed_config {
+    // Internal data type definitions
+    typedef float embeddings_t;
+
+    // Default layer sizes, overwritten from the backend
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 16;
+    static const unsigned vocab_size = 50;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+};
+
+template
+void embedding(const data_T &data, res_T &res, const typename CONFIG_T::embeddings_t &embeddings) {
+
+    /*
+     * Can store embeddings[] in a register, but a large multiplexer
+     * is created due to a non-constant access pattern
+     */
+
+InputSequence:
+    #pragma unroll
+    [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < CONFIG_T::n_in; j++) {
+    DenseEmbedding:
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_out; i++) {
+            res[j * CONFIG_T::n_out + i] = embeddings[data[j].to_uint() * CONFIG_T::n_out + i];
+        }
+    }
+}
+
+} // namespace nnet
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h
new file mode 100644
index 000000000..0f2acb098
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_embed_stream.h
@@ -0,0 +1,31 @@
+#ifndef NNET_EMBED_STREAM_H_
+#define NNET_EMBED_STREAM_H_
+
+namespace nnet {
+
+template
+void embedding_stream(typename CONFIG_T::embeddings_t embeddings) {
+
+    using res_T = typename ExtractPipeType::value_type;
+    constexpr auto datasize = std::tuple_size::value_type>{};
+
+    auto in_data = data_pipe::read();
+
+InputSequence:
+    [[intel::initiation_interval(CONFIG_T::reuse_factor)]] for (int j = 0; j < datasize; j++) {
+
+        res_T res_pack;
+
+    DenseEmbedding:
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_out; i++) {
+            res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i];
+        }
+
+        res_pipe::write(res_pack);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h
new file mode 100644
index 000000000..c7af2e7a6
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h
@@ -0,0 +1,118 @@
+#ifndef NNET_HELPERS_H
+#define NNET_HELPERS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace nnet {
+
+template void convert_data(sycl::queue &q, srcType *src) {
+    constexpr auto dstTypeSize = std::tuple_size::value_type>{};
+    for (size_t i = 0; i < SIZE / dstTypeSize; i++) {
+        typename ExtractPipeType::value_type ctype;
+        for (size_t j = 0; j < dstTypeSize; j++) {
+            ctype[j] = src[i * dstTypeSize + j];
+        }
+        dest_pipe::write(q, ctype);
+    }
+}
+
+template void convert_data_back(sycl::queue &q, dstType *dst) {
+    constexpr auto srcTypeSize = std::tuple_size::value_type>{};
+    for (size_t i = 0; i < SIZE / srcTypeSize; i++) {
+        auto ctype = src_pipe::read(q);
+        for (size_t j = 0; j < srcTypeSize; j++) {
+            dst[i * srcTypeSize + j] = ctype[j].to_double();
+        }
+    }
+}
+
+extern bool trace_enabled;
+extern std::map *trace_outputs;
+extern size_t trace_type_size;
+
+// constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); }
+// replace with template metaprogramming
+template struct ceillog2 {
+    enum { val = 1 + ceillog2<((n + 1) / 2)>::val };
+};
+
+template <> struct ceillog2<2> {
+    enum { val = 1 };
+};
+
+template <> struct ceillog2<1> {
+    enum { val = 0 };
+};
+
+// constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); }
+// replace with template metaprogramming
+template struct floorlog2 {
+    enum { val = 1 + floorlog2<(n / 2)>::val };
+};
+
+template <> struct floorlog2<1> {
+    enum { val = 0 };
+};
+
+template <> struct floorlog2<0> {
+    enum { val = 0 };
+};
+
+// constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); }
+// replace with template metaprogramming
+template struct pow2 {
+    enum { val = 2 * pow2<(n - 1)>::val };
+};
+
+template <> struct pow2<0> {
+    enum { val = 1 };
+};
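+// Usage sketch (illustrative): these helpers evaluate at compile time, e.g.
+//   static_assert(nnet::ceillog2<10>::val == 4, "ceil(log2(10)) == 4");
+//   static_assert(nnet::floorlog2<10>::val == 3, "floor(log2(10)) == 3");
+//   static_assert(nnet::pow2<4>::val == 16, "2^4 == 16");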
+
+template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) {
+    for (int i = 0; i < layer_size; i++) {
+        ptr[i] = static_cast(data[i].to_double());
+    }
+}
+
+// We don't want to include save_T in this function because it will be inserted into myproject.cpp
+// so a workaround with element size is used
+template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) {
+    if (!trace_enabled)
+        return;
+
+    if (trace_outputs) {
+        if (trace_outputs->count(layer_name) > 0) {
+            if (trace_type_size == 4) {
+                save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size);
+            } else if (trace_type_size == 8) {
+                save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size);
+            } else {
+                std::cout << "Unknown trace type!" << std::endl;
+            }
+        } else {
+            std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl;
+        }
+    } else {
+        std::ostringstream filename;
+        filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data
+        std::fstream out;
+        out.open(filename.str(), std::ios::app);
+        assert(out.is_open());
+        for (int i = 0; i < layer_size; i++) {
+            out << data[i] << " "; // We don't care about precision in text files
+        }
+        out << std::endl;
+        out.close();
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h
new file mode 100644
index 000000000..550663b88
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h
@@ -0,0 +1,232 @@
+#ifndef NNET_MERGE_H_
+#define NNET_MERGE_H_
+
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct merge_config {
+    static const unsigned n_elem = 10;
+};
+
+struct dot_config {
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 1;
+
+    static const unsigned reuse_factor = 1;
+
+    typedef float accum_t;
+
+    template using product = nnet::product::mult;
+};
+
+struct concat_config {
+    static const unsigned n_elem1_0 = 10;
+    static const unsigned n_elem1_1 = 10;
+    static const unsigned n_elem1_2 = 10;
+    static const unsigned n_elem2_0 = 10;
+    static const unsigned n_elem2_1 = 10;
+    static const unsigned n_elem2_2 = 10;
+
+    static const unsigned axis = -1;
+};
+
+template
+void add(const input1_T &data1, const input2_T &data2, res_T &res) {
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast(data1[i] + data2[i]);
+    }
+}
+
+template
+void subtract(const input1_T &data1, const input2_T &data2, res_T &res) {
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast(data1[i] - data2[i]);
+ } +} + +template +void multiply(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] * data2[i]); + } +} + +template +void average(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] + data2[i]) / 2); + } +} + +template +void maximum(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] > data2[i]) ? data1[i] : data2[i]); + } +} + +template +void minimum(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] < data2[i]) ? data1[i] : data2[i]); + } +} + +template +void dot1d(const input1_T &data1, const input2_T &data2, res_T &res) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + + [[intel::fpga_register]] typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; +Product: + #pragma unroll multiplier_limit + for (int i = 0; i < CONFIG_T::n_in; i++) { + mult[i] = CONFIG_T::template product::product( + data1[i], data2[i]); + } + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc = 0; +Accum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + acc += mult[i]; + } + + res[0] = static_cast(acc); +} + +template +void concatenate1d(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_0(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_1(const input1_T &data1, const input2_T &data2, res_T &res) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = + static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); + } + + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = + static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); + } + } +} + +template +void concatenate2d(const input1_T &data1, const input2_T &data2, res_T &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(const input1_T &data1, const input2_T &data2, res_T &res) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = + static_cast(data2[i]); + } +} + +template +void concatenate3d_1(const input1_T &data1, const input2_T &data2, res_T &res) { + for 
(int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem1_2; k++) {
+                int res_idx =
+                    i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                res[res_idx] = static_cast(data1[data_idx]);
+            }
+        }
+
+        for (int j = 0; j < CONFIG_T::n_elem2_1; j++) {
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem2_2; k++) {
+                int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 +
+                              (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k;
+                int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k;
+                res[res_idx] = static_cast(data2[data_idx]);
+            }
+        }
+    }
+}
+
+template
+void concatenate3d_2(const input1_T &data1, const input2_T &data2, res_T &res) {
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem1_2; k++) {
+                int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) +
+                              j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k;
+                int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                res[res_idx] = static_cast(data1[data_idx]);
+            }
+
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem2_2; k++) {
+                int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) +
+                              j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2;
+                int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k;
+                res[res_idx] = static_cast(data2[data_idx]);
+            }
+        }
+    }
+}
+
+template
+void concatenate3d(const input1_T &data1, const input2_T &data2, res_T &res) {
+    if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) {
+        concatenate3d_2(data1, data2, res);
+    } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) {
+        concatenate3d_1(data1, data2, res);
+    } else {
+        concatenate3d_0(data1, data2, res);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h
new file mode 100644
index 000000000..60028ea52
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge_stream.h
@@ -0,0 +1,359 @@
+#ifndef NNET_MERGE_STREAM_H_
+#define NNET_MERGE_STREAM_H_
+
+namespace nnet {
+
+template void add_stream() {
+    // both inputs are the same size
+    constexpr auto inputSize = std::tuple_size::value_type>{};
+    constexpr auto outputSize = std::tuple_size::value_type>{};
+
+AddLoop:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) {
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    AddPack:
+        #pragma unroll
+        for (int j = 0; j < outputSize; j++) {
+            out_data[j] = static_cast::value_type::value_type>(in_data1[j] + in_data2[j]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void subtract_stream() {
+    // both inputs are the same size
+    constexpr auto inputSize = std::tuple_size::value_type>{};
+    constexpr auto outputSize = std::tuple_size::value_type>{};
+
+SubtractLoop:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) {
+
[[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + SubtractPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>(in_data1[j] - in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void multiply_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MultLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MultPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>(in_data1[j] * in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void average_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +AvgLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + AvgPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] + in_data2[j]) / (typename ExtractPipeType::value_type::value_type)2); + } + + res_pipe::write(out_data); + } +} + +template void maximum_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MaxLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MaxPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]); + } + + res_pipe::write(out_data); + } +} + +template void minimum_stream() { + // both inputs are the same size + constexpr auto inputSize = std::tuple_size::value_type>{}; + constexpr auto outputSize = std::tuple_size::value_type>{}; + +MinLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem / inputSize; i++) { + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + MinPack: + #pragma unroll + for (int j = 0; j < outputSize; j++) { + out_data[j] = static_cast::value_type::value_type>( + (in_data1[j] < in_data2[j]) ? 
in_data1[j] : in_data2[j]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void concatenate1d_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+    [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+ConcatLoop1:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0 / input1Size; i++) {
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+    ConcatPack1:
+        #pragma unroll
+        for (int j = 0; j < input1Size; j++) {
+            out_data[j + (i * input1Size)] =
+                static_cast::value_type::value_type>(in_data1[j]);
+        }
+    }
+
+ConcatLoop2:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0 / input2Size; i++) {
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+    ConcatPack2:
+        #pragma unroll
+        for (int j = 0; j < input2Size; j++) {
+            out_data[j + (i * input2Size) + (CONFIG_T::n_elem1_0)] =
+                static_cast::value_type::value_type>(in_data2[j]);
+        }
+    }
+    res_pipe::write(out_data);
+}
+
+template void concatenate2d_0_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+ConcatLoopHeight1:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    ConcatPackInput1:
+        #pragma unroll
+        for (int k = 0; k < input1Size; k++) {
+            out_data[k] = static_cast::value_type::value_type>(in_data1[k]);
+        }
+
+        res_pipe::write(out_data);
+    }
+
+ConcatLoopHeight2:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem2_0; i++) {
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    ConcatPackInput2:
+        #pragma unroll
+        for (int k = 0; k < input2Size; k++) {
+            out_data[k] = static_cast::value_type::value_type>(in_data2[k]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void concatenate2d_1_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+ConcatLoopHeight:
+    [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+        [[intel::fpga_register]] auto in_data2 = input2_pipe::read();
+        [[intel::fpga_register]] typename ExtractPipeType::value_type out_data;
+
+    ConcatPackInput1:
+        #pragma unroll
+        for (int k = 0; k < input1Size; k++) {
+            out_data[k] = static_cast::value_type::value_type>(in_data1[k]);
+        }
+
+    ConcatPackInput2:
+        #pragma unroll
+        for (int k = 0; k < input2Size; k++) {
+            out_data[input1Size + k] = static_cast::value_type::value_type>(in_data2[k]);
+        }
+
+        res_pipe::write(out_data);
+    }
+}
+
+template void concatenate2d_stream() {
+    if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) {
+        concatenate2d_1_stream();
+    } else {
+        concatenate2d_0_stream();
+    }
+}
+
+template void concatenate3d_0_stream() {
+    constexpr auto input1Size = std::tuple_size::value_type>{};
+    constexpr auto input2Size = std::tuple_size::value_type>{};
+
+ConcatLoopHeight1:
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+    ConcatLoopWidth1:
+        [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+
+            [[intel::fpga_register]] auto in_data1 = input1_pipe::read();
+            [[intel::fpga_register]] typename
ExtractPipeType::value_type out_data; + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + res_pipe::write(out_data); + } + } + +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_1_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + res_pipe::write(out_data); + } + ConcatLoopWidth2: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_2_stream() { + constexpr auto input1Size = std::tuple_size::value_type>{}; + constexpr auto input2Size = std::tuple_size::value_type>{}; + +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + [[intel::initiation_interval(1)]] for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + [[intel::fpga_register]] auto in_data1 = input1_pipe::read(); + [[intel::fpga_register]] auto in_data2 = input2_pipe::read(); + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + ConcatPackInput1: + #pragma unroll + for (int k = 0; k < input1Size; k++) { + out_data[k] = static_cast::value_type::value_type>(in_data1[k]); + } + + ConcatPackInput2: + #pragma unroll + for (int k = 0; k < input2Size; k++) { + out_data[input1Size + k] = + static_cast::value_type::value_type>(in_data2[k]); + } + + res_pipe::write(out_data); + } + } +} + +template void concatenate3d_stream() { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2_stream(); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1_stream(); + } else { + concatenate3d_0_stream(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h new file mode 100644 index 000000000..c7dfc2d7c --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_mult.h @@ -0,0 +1,113 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Different methods to perform the product of input and weight, depending on their types. 
+namespace product { + +class Product { + public: + static void limit(unsigned multiplier_limit) {} +}; + +template class both_binary : public Product { + public: + inline static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + return a == w; + } +}; + +template class weight_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + return a * w; + } + static void limit(unsigned multiplier_limit) { + // TODO: Implement for Quartus + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS + // pragma + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ac_fixed<2 * (w_T::second_type::width + x_T::width), (w_T::second_type::width + x_T::width), true>; + inline static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.second; + + // Negate or not depending on weight sign + return w.first == 1 ? 
y : static_cast(-y); + } +}; +} // namespace product + +// TO-DO: These may need extra variants if ac_int types are used in more places +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h new file mode 100644 index 000000000..e8e3d6509 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding.h @@ -0,0 +1,104 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +namespace nnet { + +struct padding1d_config { + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned n_chan = 10; + + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template void zeropad1d_cl(const data_T &data, res_T &res) { + + auto resIter = res.begin(); + auto dataIter = data.cbegin(); + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = static_cast(*(dataIter++)); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(resIter++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; + + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template void zeropad2d_cl(const data_T &data, res_T &res) { + + auto resIter = res.begin(); + auto dataIter = data.cbegin(); + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = static_cast(*(dataIter++)); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(resIter++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h new file mode 100644 index 000000000..adb2efee2 --- /dev/null +++ 
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h
new file mode 100644
index 000000000..adb2efee2
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_padding_stream.h
@@ -0,0 +1,81 @@
+#ifndef NNET_PADDING_STREAM_H_
+#define NNET_PADDING_STREAM_H_
+
+namespace nnet {
+
+template <class res_pipe, typename CONFIG_T> inline void fill_zero() {
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res_part;
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_chan; i++) {
+        res_part[i] = 0;
+    }
+    res_pipe::write(res_part);
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> inline void fill_data() {
+    [[intel::fpga_register]] auto data_part = data_pipe::read();
+    [[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res_part;
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_chan; i++) {
+        res_part[i] = data_part[i];
+    }
+    res_pipe::write(res_part);
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void zeropad1d_cl_stream() {
+PadLeft:
+    for (int i = 0; i < CONFIG_T::pad_left; i++) {
+        fill_zero<res_pipe, CONFIG_T>();
+    }
+
+CopyMain:
+    for (int i = 0; i < CONFIG_T::in_width; i++) {
+        fill_data<data_pipe, res_pipe, CONFIG_T>();
+    }
+
+PadRight:
+    for (int i = 0; i < CONFIG_T::pad_right; i++) {
+        fill_zero<res_pipe, CONFIG_T>();
+    }
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void zeropad2d_cl_stream() {
+PadTop:
+    [[intel::loop_coalesce(2)]] for (int i = 0; i < CONFIG_T::pad_top; i++) {
+    PadTopWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+    }
+
+PadMain:
+    [[intel::loop_coalesce(2)]] for (int i = 0; i < CONFIG_T::in_height; i++) {
+
+    PadLeft:
+        for (int j = 0; j < CONFIG_T::pad_left; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+
+    CopyMain:
+        for (int j = 0; j < CONFIG_T::in_width; j++) {
+            fill_data<data_pipe, res_pipe, CONFIG_T>();
+        }
+
+    PadRight:
+        for (int j = 0; j < CONFIG_T::pad_right; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+    }
+
+PadBottom:
+    for (int i = 0; i < CONFIG_T::pad_bottom; i++) {
+    PadBottomWidth:
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            fill_zero<res_pipe, CONFIG_T>();
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
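The streaming padder never buffers the image: each pipe beat carries n_chan values, and padding rows/columns are synthesized as all-zero beats. A host model with std::queue standing in for the pipe (all sizes hypothetical):

    #include <array>
    #include <cstdio>
    #include <queue>

    int main() {
        const int n_chan = 2, in_width = 3, pad_left = 1, pad_right = 1;
        std::queue<std::array<float, n_chan>> pipe; // models res_pipe
        auto fill_zero = [&] { pipe.push({0.0f, 0.0f}); };
        for (int i = 0; i < pad_left; i++) fill_zero();
        for (int i = 0; i < in_width; i++) pipe.push({1.0f * i, 1.0f * i});
        for (int i = 0; i < pad_right; i++) fill_zero();
        std::printf("beats in stream: %zu\n", pipe.size()); // 5 = 1 + 3 + 1
    }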
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h
new file mode 100644
index 000000000..d4ae91533
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling.h
@@ -0,0 +1,257 @@
+#ifndef NNET_POOLING_H_
+#define NNET_POOLING_H_
+
+#include "nnet_common.h"
+
+namespace nnet {
+
+// Returns the maximum value from an array of size N
+template <class T, int N, class accum_t> accum_t max(T x[N]) {
+    [[intel::fpga_register]] T y = x[0];
+
+    // Due to loop dependencies, pipelining & unrolling is not possible
+    // Explicitly disabling the pipeline significantly reduces resource usage
+    [[intel::disable_loop_pipelining]] for (int i = 1; i < N; i++) {
+        if (x[i] > y)
+            y = x[i];
+    }
+
+    return y;
+}
+
+// Returns the mean value of an array of size N
+template <class T, int N, class accum_t> accum_t avg(T x[N], unsigned length) {
+    [[intel::fpga_register]] accum_t y = 0;
+
+    // Due to loop dependencies, pipelining & unrolling is not possible
+    // Explicitly disabling the pipeline significantly reduces resource usage
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < N; i++) { y += x[i]; }
+
+    y /= length;
+    return y;
+}
+
+// Enumeration for pooling functions
+enum Pool_Op { Max, Average };
+template <class T, int N, Pool_Op op, class accum_t> accum_t pool_op(T x[N], unsigned length) {
+    switch (op) {
+    case Max:
+        return max<T, N, accum_t>(x);
+    case Average:
+        return avg<T, N, accum_t>(x, length);
+    }
+}
+
+template <class T, int N, Pool_Op op, class accum_t> accum_t pool_op(T (&x)[N]) {
+    return pool_op<T, N, op, accum_t>(x, N);
+}
+
+/*
+ * In TensorFlow, pooling ignores the values in the padded cells
+ * For Average pooling, return 0 (the divisor is modified to the area overlapping the unpadded image.)
+ * For Max pooling, return the most negative value for the type.
+ */
+template <class T, Pool_Op op> inline T pad_val() {
+    switch (op) {
+    case Max: {
+        T x = 0;
+        x[x.width - 1] = 1;
+        return x;
+    }
+    case Average:
+        return 0;
+    }
+}
+
+struct pooling1d_config {
+    // Pooling parameters
+    static const unsigned pool_width = 2;
+    static const unsigned stride_width = 2;
+
+    // I/O sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_out = (n_in - pool_width) / stride_width + 1;
+    static const unsigned n_filt = 4;
+
+    // Padding
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const bool count_pad = false;
+
+    // Pooling function
+    static const Pool_Op pool_op = Max;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void pooling1d_cl(const data_T &data, res_T &res) {
+    // Add padding and reduce input width to area covered by pooling function
+    static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right;
+    static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width;
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+    InputWidthLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int inp_col = 0; inp_col < restricted_padded_width;
+                                                inp_col += CONFIG_T::stride_width) {
+            [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::pool_width];
+
+            // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling
+            [[intel::fpga_register]] unsigned img_overlap = 0;
+
+        PoolWidthLoop:
+            #pragma unroll
+            [[intel::disable_loop_pipelining]] for (int pool_col = 0; pool_col < CONFIG_T::stride_width; pool_col++) {
+                if (inp_col + pool_col < CONFIG_T::pad_left ||
+                    inp_col + pool_col >= (full_padded_width - CONFIG_T::pad_right)) {
+                    // Add padding
+                    pool[pool_col] = pad_val<typename data_T::value_type, CONFIG_T::pool_op>();
+                    if (CONFIG_T::count_pad)
+                        img_overlap++;
+                } else {
+                    // Current element is from input image
+                    pool[pool_col] = data[(inp_col + pool_col - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt];
+                    img_overlap++;
+                }
+            }
+
+            // Pooling operation
+            res[(inp_col / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] =
+                static_cast<typename res_T::value_type>(
+                    pool_op<typename data_T::value_type, CONFIG_T::pool_width, CONFIG_T::pool_op,
+                            typename CONFIG_T::accum_t>(pool, img_overlap));
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void global_pooling1d_cl(const data_T &data, res_T &res) {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+        [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::n_in];
+
+    InputWidthLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int col = 0; col < CONFIG_T::n_in; col++) {
+            pool[col] = data[col * CONFIG_T::n_filt + filt];
+        }
+
+        res[filt] = static_cast<typename res_T::value_type>(
+            pool_op<typename data_T::value_type, CONFIG_T::n_in, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool));
+    }
+}
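The count_pad flag decides the Average divisor: TensorFlow-style pooling divides by the number of real (non-padded) pixels, which is what img_overlap tracks. A small worked check with hypothetical sizes:

    #include <cassert>

    int main() {
        // pool_width = 2, pad_left = 1: the first window covers [pad, x0].
        // With count_pad = false only x0 counts, so the divisor is 1.
        float window_sum = 6.0f;  // the padded cell contributes 0
        unsigned img_overlap = 1; // only one real pixel in the window
        assert(window_sum / img_overlap == 6.0f); // not 6.0f / 2 == 3.0f
    }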
+
+struct pooling2d_config {
+    // Pooling parameters
+    static const unsigned stride_height = 2;
+    static const unsigned stride_width = 2;
+    static const unsigned pool_height = 2;
+    static const unsigned pool_width = 2;
+
+    // I/O sizes
+    static const unsigned in_height = 10;
+    static const unsigned in_width = 10;
+    static const unsigned n_filt = 4;
+
+    static const unsigned out_height = (in_height - pool_height) / stride_height + 1;
+    static const unsigned out_width = (in_width - pool_width) / stride_width + 1;
+
+    // Padding
+    static const unsigned pad_top = 0;
+    static const unsigned pad_bottom = 0;
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const bool count_pad = false;
+
+    // Pooling function
+    static const Pool_Op pool_op = Max;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void pooling2d_cl(const data_T &data, res_T &res) {
+    // Add padding and reduce input width to area covered by pooling function
+    static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right;
+    static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom;
+    static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width;
+    static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height;
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+    InputHeightLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int inp_col = 0; inp_col < restricted_padded_height;
+                                                inp_col += CONFIG_T::stride_height) {
+        InputWidthLoop:
+            #pragma unroll
+            [[intel::disable_loop_pipelining]] for (int inp_width = 0; inp_width < restricted_padded_width;
+                                                    inp_width += CONFIG_T::stride_width) {
+                [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::pool_height * CONFIG_T::pool_width];
+
+                // Keep track of number of pixels in image vs padding region; needed for rescaling Average Pooling
+                [[intel::fpga_register]] unsigned img_overlap = 0;
+
+            PoolHeightLoop:
+                #pragma unroll
+                [[intel::disable_loop_pipelining]] for (int pool_col = 0; pool_col < CONFIG_T::stride_height;
+                                                        pool_col++) {
+                PoolWidthLoop:
+                    #pragma unroll
+                    [[intel::disable_loop_pipelining]] for (int pool_row = 0; pool_row < CONFIG_T::stride_width;
+                                                            pool_row++) {
+                        if (inp_col + pool_col < CONFIG_T::pad_top ||
+                            inp_col + pool_col >= (full_padded_height - CONFIG_T::pad_bottom) ||
+                            inp_width + pool_row < CONFIG_T::pad_left ||
+                            inp_width + pool_row >= (full_padded_width - CONFIG_T::pad_right)) {
+                            // Add padding
+                            pool[pool_col * CONFIG_T::stride_width + pool_row] =
+                                pad_val<typename data_T::value_type, CONFIG_T::pool_op>();
+                            if (CONFIG_T::count_pad)
+                                img_overlap++;
+                        } else {
+                            // Current element is from input image
+                            pool[pool_col * CONFIG_T::stride_width + pool_row] =
+                                data[(inp_col + pool_col - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt +
+                                     (inp_width + pool_row - CONFIG_T::pad_left) * CONFIG_T::n_filt + filt];
+                            img_overlap++;
+                        }
+                    }
+                }
+
+                // Pooling operation
+                res[(inp_col / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt +
+                    (inp_width / CONFIG_T::stride_width) * CONFIG_T::n_filt + filt] =
+                    static_cast<typename res_T::value_type>(
+                        pool_op<typename data_T::value_type, CONFIG_T::pool_height * CONFIG_T::pool_width,
+                                CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool, img_overlap));
+            }
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void global_pooling2d_cl(const data_T &data, res_T &res) {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+    assert(CONFIG_T::pool_height == CONFIG_T::stride_height);
+
+FiltLoop:
+    #pragma unroll
+    [[intel::disable_loop_pipelining]] for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+        [[intel::fpga_register]] typename data_T::value_type pool[CONFIG_T::in_height * CONFIG_T::in_width];
+
+    InputLoop:
+        #pragma unroll
+        [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) {
+            pool[i] = data[i * CONFIG_T::n_filt + filt];
+        }
+
+        res[filt] = static_cast<typename res_T::value_type>(
+            pool_op<typename data_T::value_type, CONFIG_T::in_height * CONFIG_T::in_width, CONFIG_T::pool_op,
+                    typename CONFIG_T::accum_t>(pool));
+    }
+}
+
+} // namespace nnet
+
+#endif
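The out_height/out_width defaults follow the usual valid-pooling size formula, (in - pool) / stride + 1. A hypothetical config can check the arithmetic at compile time:

    // Hypothetical sizes; the static_assert just restates the formula above.
    struct my_pool_cfg {
        static const unsigned in_height = 8, in_width = 8;
        static const unsigned pool_height = 2, pool_width = 2;
        static const unsigned stride_height = 2, stride_width = 2;
    };
    static_assert((my_pool_cfg::in_height - my_pool_cfg::pool_height) / my_pool_cfg::stride_height + 1 == 4,
                  "expected a 4x4 output");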
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h
new file mode 100644
index 000000000..9c30aab67
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_pooling_stream.h
@@ -0,0 +1,322 @@
+#ifndef NNET_POOLING_STREAM_H_
+#define NNET_POOLING_STREAM_H_
+
+#include "nnet_conv1d_stream.h"
+#include "nnet_conv2d_stream.h"
+#include "nnet_pooling.h"
+#include "nnet_types.h"
+
+namespace nnet {
+
+/*
+ * void compute_pool_buffer_1d(in_element, res_stream, line_buffer, kernel_window)
+ *
+ * Args:
+ *   in_element    - current elements from the input image; data_T is usually nnet::array, with the array size equal
+ *                   to the number of channels
+ *   res_stream    - output stream, passed by reference to allow direct writing
+ *   line_buffer   - chained array of shift registers, one for each row of the pool and channel
+ *   kernel_window - array of values from the input currently being pooled
+ *
+ * The function executes 4 steps:
+ *   (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and
+ *       removing the last elements
+ *   (2) Kernel shift - updates the elements of the kernel window, storing the new inputs and the elements popped
+ *       from the line buffer
+ *   (3) Pooling - performs the required pooling operation on the current window
+ *   (4) Counter housekeeping - keeps track of the current pixel and stride
+ */
+template <class data_T, class res_pipe, class data_window_T, typename CONFIG_T>
+void compute_pool_buffer_1d(const data_T &in_elem,
+                            nnet::shift_reg<typename data_T::value_type, CONFIG_T::pool_width>
+                                line_buffer[CONFIG_T::n_filt],
+                            data_window_T &kernel_window, int &pX, int &sX) {
+
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    // Thresholds
+    constexpr int lShiftX = CONFIG_T::pool_width - 1;
+
+    // Step 1 - Shift line buffer
+    [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::n_filt];
+    nnet::shift_line_buffer_1d<data_T, CONFIG_T>(in_elem, line_buffer, shift_buffer);
+
+    // Step 2 - Kernel shift
+    nnet::kernel_shift_1d<data_T, data_window_T, CONFIG_T>(shift_buffer, kernel_window);
+
+    // Check to see if we have a full pool window
+    if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) {
+        [[intel::fpga_register]] res_T res_pack;
+
+    FiltLoop:
+        #pragma unroll
+        for (int filter = 0; filter < CONFIG_T::n_filt; filter++) {
+            [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_width];
+
+            // Retrieve data for current channel
+        PoolLoop:
+            #pragma unroll
+            for (int i = 0; i < CONFIG_T::pool_width; i++) {
+                pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter];
+            }
+
+            // Step 3 - Pooling
+            res_pack[filter] = static_cast<typename res_T::value_type>(
+                pool_op<typename data_T::value_type, CONFIG_T::pool_width, CONFIG_T::pool_op,
+                        typename CONFIG_T::accum_t>(pool_window));
+        }
+
+        // Write result to output stream
+        res_pipe::write(res_pack);
+    }
+
+    // Reached end of image
+    if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
+        pX = 0;
+        sX = 0;
+        // Move to the right
+    } else {
+        pX++;
+        sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
+    }
+}
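The sX counter is the phase within a stride: it climbs to pool_width - 1, a window is emitted, and it rewinds by stride_width - 1. A scalar sketch of the sequence it produces, assuming pool_width == stride_width == 2 and a 6-pixel row:

    #include <cstdio>

    int main() {
        const int pool_width = 2, stride_width = 2, in_width = 6;
        const int lShiftX = pool_width - 1;
        int sX = 0;
        for (int pX = 0; pX < in_width; pX++) {
            bool emit = (sX - lShiftX) == 0 && pX > lShiftX - 1;
            std::printf("pX=%d sX=%d emit=%d\n", pX, sX, emit);
            sX = ((sX - lShiftX) == 0) ? (sX - stride_width + 1) : (sX + 1);
        }
        // Windows are emitted at pX = 1, 3, 5: exactly one per stride.
    }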
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void pooling1d_cl_stream() {
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+
+    using data_arr_T = typename ExtractPipeType<data_pipe>::value_type;
+    using data_element_T = typename data_arr_T::value_type;
+    using data_window_T = array<data_element_T, CONFIG_T::pool_width * CONFIG_T::n_filt>;
+
+    // Line buffer and kernel window
+    [[intel::fpga_register]] nnet::shift_reg<data_element_T, CONFIG_T::pool_width> line_buffer[CONFIG_T::n_filt];
+    [[intel::fpga_register]] data_window_T kernel_window;
+
+    // move former static variables outside the function calls
+    // X position pixel
+    int pX = 0;
+    // X strides
+    int sX = 0;
+
+// Read input image
+ReadInputWidth:
+    for (int col = 0; col < CONFIG_T::in_width; col++) {
+        compute_pool_buffer_1d<data_arr_T, res_pipe, data_window_T, CONFIG_T>(data_pipe::read(), line_buffer,
+                                                                              kernel_window, pX, sX);
+    }
+}
+
+/*
+ * void compute_pool_buffer_2d(in_element, res_stream, line_buffer, kernel_window)
+ *
+ * Args:
+ *   in_element    - current elements from the input image; data_T is usually nnet::array, with the array size equal
+ *                   to the number of channels
+ *   res_stream    - output stream, passed by reference to allow direct writing
+ *   line_buffer   - chained array of shift registers, one for each row of the pool and channel
+ *   kernel_window - array of values from the input currently being pooled
+ *
+ * The function executes 4 steps:
+ *   (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and
+ *       removing the last elements
+ *   (2) Kernel shift - updates the elements of the kernel window, storing the new inputs and the elements popped
+ *       from the line buffer
+ *   (3) Pooling - performs the required pooling operation on the current window
+ *   (4) Counter housekeeping - keeps track of the current pixel and stride
+ */
+template <class data_T, class res_pipe, class data_window_T, typename CONFIG_T>
+void compute_pool_buffer_2d(const data_T &in_elem,
+                            nnet::shift_reg<typename data_T::value_type, CONFIG_T::in_width>
+                                line_buffer[CONFIG_T::pool_height - 1][CONFIG_T::n_filt],
+                            data_window_T &kernel_window, int &pX, int &pY, int &sX, int &sY) {
+
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    // Thresholds
+    static constexpr int lShiftX = CONFIG_T::pool_width - 1;
+    static constexpr int lShiftY = CONFIG_T::pool_height - 1;
+
+    // Step 1 - Shift line buffer
+    [[intel::fpga_register]] typename data_T::value_type shift_buffer[CONFIG_T::pool_height][CONFIG_T::n_filt];
+    nnet::shift_line_buffer_2d<data_T, CONFIG_T>(in_elem, line_buffer, shift_buffer);
+
+    // Step 2 - Kernel shift
+    nnet::kernel_shift_2d<data_T, data_window_T, CONFIG_T>(shift_buffer, kernel_window);
+
+    // Check to see if we have a full pool window
+    if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > (lShiftY - 1) && pX > (lShiftX - 1)) {
+        [[intel::fpga_register]] res_T res_pack;
+
+    FiltLoop:
+        #pragma unroll
+        for (int filter = 0; filter < CONFIG_T::n_filt; filter++) {
+            [[intel::fpga_register]] typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width];
+
+            // Retrieve data for current channel
+        PoolLoop:
+            #pragma unroll
+            for (int i = 0; i < CONFIG_T::pool_height * CONFIG_T::pool_width; i++) {
+                pool_window[i] = kernel_window[i * CONFIG_T::n_filt + filter];
+            }
+
+            // Step 3 - Pooling
+            res_pack[filter] = static_cast<typename res_T::value_type>(
+                pool_op<typename data_T::value_type, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op,
+                        typename CONFIG_T::accum_t>(pool_window));
+        }
+
+        // Write result to output stream
+        res_pipe::write(res_pack);
+    }
+
+    // Reached end of image
+    if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right) &&
+        (pY + 1) == (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom)) {
+        pX = 0;
+        sX = 0;
+        pY = 0;
+        sY = 0;
+        // Reached end of row
+    } else if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
+        pX = 0;
+        sX = 0;
+        pY++;
+        sY = ((sY - lShiftY) == 0) ? (sY - CONFIG_T::stride_height + 1) : (sY + 1);
+        // Same row, same column, therefore, move to the right
+    } else {
+        pX++;
+        sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
+    }
+}
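In 2-D the same phase counters run in x and y, so a window is written only when both sX and sY rewind. A quick sketch of which pixels trigger a write, assuming a 4x4 input with 2x2 pool and stride (a simplification of the housekeeping above):

    #include <cstdio>

    int main() {
        const int pool = 2, stride = 2, H = 4, W = 4;
        const int lShift = pool - 1;
        int sX = 0, sY = 0;
        for (int pY = 0; pY < H; pY++) {
            for (int pX = 0; pX < W; pX++) {
                if ((sX - lShift) == 0 && (sY - lShift) == 0 && pX > lShift - 1 && pY > lShift - 1)
                    std::printf("window written at (x=%d, y=%d)\n", pX, pY);
                sX = ((sX - lShift) == 0) ? (sX - stride + 1) : (sX + 1);
            }
            sX = 0; // a new row restarts the x phase
            sY = ((sY - lShift) == 0) ? (sY - stride + 1) : (sY + 1);
        }
        // Expected: (1,1), (3,1), (1,3), (3,3) - one write per 2x2 tile.
    }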
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void pooling2d_cl_stream() {
+    assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width);
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
+
+    using data_arr_T = typename ExtractPipeType<data_pipe>::value_type;
+    using data_element_T = typename data_arr_T::value_type;
+    using data_window_T = array<data_element_T, CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt>;
+
+    // Line buffer and kernel window
+    [[intel::fpga_register]] nnet::shift_reg<data_element_T, CONFIG_T::in_width>
+        line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt];
+    [[intel::fpga_register]] data_window_T kernel_window;
+
+    // former static variables
+    // X, Y position pixels
+    int pX = 0;
+    int pY = 0;
+
+    // X, Y strides
+    int sX = 0;
+    int sY = 0;
+
+ReadInputHeight:
+    [[intel::loop_coalesce(2)]] for (int row = 0; row < CONFIG_T::in_height; row++) {
+        // Read input image
+    ReadInputWidth:
+        for (int col = 0; col < CONFIG_T::in_width; col++) {
+            compute_pool_buffer_2d<data_arr_T, res_pipe, data_window_T, CONFIG_T>(data_pipe::read(), line_buffer,
+                                                                                  kernel_window, pX, pY, sX, sY);
+        }
+    }
+}
+
+/*
+ * A function used with Global Pooling
+ * Updates the output pooling value
+ * Max : Return the maximum between the previous maximum and current input
+ * Avg : Returns the cumulative sum
+ */
+template <class T_y, class T_x, Pool_Op op> inline T_y reduce_global_pool(T_y y, T_x x) {
+    if (op == Max) {
+        return (x > y) ? (T_y)x : y;
+    } else {
+        return (T_y)(x + y);
+    }
+}
+
+/*
+ * A function used with Global Pooling
+ * For every filter, it updates the value by summing the current input (Average) or updating the maximum value (Max)
+ */
+template <class data_T, class res_T, typename CONFIG_T> void compute_global_pool(const data_T &in_elem, res_T &data_input) {
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_filt; i++) {
+        data_input[i] = reduce_global_pool<typename res_T::value_type, typename data_T::value_type, CONFIG_T::pool_op>(
+            data_input[i], in_elem[i]);
+    }
+}
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void global_pooling1d_cl_stream() {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    using accum_arr_t = array<typename CONFIG_T::accum_t, CONFIG_T::n_filt>;
+
+    [[intel::fpga_register]] accum_arr_t data_input;
+
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_filt; i++) {
+        data_input[i] = pad_val<typename accum_arr_t::value_type, CONFIG_T::pool_op>();
+    }
+
+    for (int i = 0; i < CONFIG_T::n_in; i++) {
+        compute_global_pool<data_T, accum_arr_t, CONFIG_T>(data_pipe::read(), data_input);
+    }
+
+    [[intel::fpga_register]] res_T res_pack;
+    if (CONFIG_T::pool_op == Average) {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] = static_cast<typename res_T::value_type>(data_input[i] / CONFIG_T::n_in);
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] = static_cast<typename res_T::value_type>(data_input[i]);
+        }
+    }
+
+    res_pipe::write(res_pack);
+}
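reduce_global_pool folds one element into a running value, so global pooling completes in a single pass over the stream. A host-side analogue of the Max case, with hypothetical data:

    #include <cassert>

    // Stands in for reduce_global_pool<T_y, T_x, Max>
    template <typename T> T reduce_max(T y, T x) { return (x > y) ? x : y; }

    int main() {
        float data[4] = {0.5f, -1.0f, 2.0f, 1.5f};
        float acc = -1e30f; // stands in for pad_val<T, Max>()
        for (float v : data)
            acc = reduce_max(acc, v);
        assert(acc == 2.0f);
    }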
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void global_pooling2d_cl_stream() {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+    using res_T = typename ExtractPipeType<res_pipe>::value_type;
+
+    using accum_arr_t = array<typename CONFIG_T::accum_t, CONFIG_T::n_filt>;
+
+    [[intel::fpga_register]] accum_arr_t data_input;
+
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_filt; i++) {
+        data_input[i] = pad_val<typename accum_arr_t::value_type, CONFIG_T::pool_op>();
+    }
+
+    for (int i = 0; i < CONFIG_T::in_height; i++) {
+        for (int j = 0; j < CONFIG_T::in_width; j++) {
+            compute_global_pool<data_T, accum_arr_t, CONFIG_T>(data_pipe::read(), data_input);
+        }
+    }
+
+    [[intel::fpga_register]] res_T res_pack;
+    if (CONFIG_T::pool_op == Average) {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] =
+                static_cast<typename res_T::value_type>(data_input[i] / (CONFIG_T::in_width * CONFIG_T::in_height));
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < CONFIG_T::n_filt; i++) {
+            res_pack[i] = static_cast<typename res_T::value_type>(data_input[i]);
+        }
+    }
+
+    res_pipe::write(res_pack);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h
new file mode 100644
index 000000000..5fec90d1a
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_printf.h
@@ -0,0 +1,18 @@
+#ifndef NNET_PRINTF_H_
+#define NNET_PRINTF_H_
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define CL_CONSTANT __attribute__((opencl_constant))
+#else
+#define CL_CONSTANT
+#endif
+
+using namespace sycl;
+
+#define PRINTF(format, ...) \
+    { \
+        static const CL_CONSTANT char _format[] = format; \
+        ext::oneapi::experimental::printf(_format, ##__VA_ARGS__); \
+    }
+
+#endif
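The macro places the format string in the address space device-side printf expects. A minimal usage sketch (the queue setup is the caller's responsibility; on the host path CL_CONSTANT expands to nothing, so the same call compiles in emulation):

    #include <sycl/sycl.hpp>
    #include "nnet_printf.h" // the header above, assumed on the include path

    void debug_print(sycl::queue &q) {
        q.single_task([=] { PRINTF("kernel alive, beat %d\n", 0); }).wait();
    }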
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h
new file mode 100644
index 000000000..4c20f28d1
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h
@@ -0,0 +1,566 @@
+#ifndef NNET_RECURRENT_H_
+#define NNET_RECURRENT_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recurrent_activation.h"
+
+namespace nnet {
+
+//----------------------
+// Utils
+//----------------------
+
+template
+void multiply_W(const data_T &input, res_T &out, const weight_t &weight) {
+MULTIPLY_W_LOOP_I:
+    #pragma unroll
+    for (int i = 0; i < N_OUT; i++) {
+        out[i] = 0;
+
+    MULTIPLY_W_LOOP_J:
+        #pragma unroll
+        for (int j = 0; j < N_IN; j++) {
+            out[i] += input[j] * weight[i * N_IN + j];
+        }
+    }
+}
+
+template
+void multiply_U(const data_T &input, res_T &out, const weight_t &weight) {
+MULTIPLY_U_LOOP_I:
+    #pragma unroll
+    for (int i = 0; i < N_OUT; i++) {
+        out[i] = 0;
+
+    MULTIPLY_U_LOOP_J:
+        #pragma unroll
+        for (int j = 0; j < N_OUT; j++) {
+            out[i] += input[j] * weight[i * N_OUT + j];
+        }
+    }
+}
+
+template
+void add_bias(const data_T &inputs, res_T &out, const bias_t &bias) {
+ADD_BIAS_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = inputs[i] + bias[i];
+    }
+}
+
+template
+void multiply_vectors(const data1_T &in1, const data2_T &in2, res_T &out) {
+MULTIPLY_VECT_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = in1[i] * in2[i];
+    }
+}
+
+template
+void add_vectors(const data1_T &in1, const data2_T &in2, res_T &out) {
+ADD_VECTOR_LOOP:
+    #pragma unroll
+    for (int i = 0; i < N; i++) {
+        out[i] = in1[i] + in2[i];
+    }
+}
+
+//----------------------
+// GRU
+//----------------------
+
+struct gru_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_units = 1;
+    static const unsigned n_timesteps = 1;
+    static const unsigned n_outputs = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template using activation_recr = nnet::activation::relu;
+
+    template using activation = nnet::activation::relu;
+};
+
+template
+void gru_cell(const data_T &x, h_T &h, const typename CONFIG_T::weight_t &weights,
+              const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias,
+              const typename CONFIG_T::recurrent_bias_t &recurrent_bias) {
+    static constexpr int recurrent_unroll_factor = CONFIG_T::n_units / CONFIG_T::reuse_factor;
+    // A matrix containing the values of the matrix product between the input (x) and the weights (weights), for the
+    // update, reset and candidate state gates, for each of the units
+
+    using accum_array_T = array;
+
+    [[intel::fpga_register]] accum_array_T mat_mul_x_w;
+    nnet::dense_resource(x, mat_mul_x_w, weights, bias);
+
+    // A matrix containing the values of the matrix product between the previous state (h) and the recurrent weights
+    // (recurrent_weights), for the update, reset and candidate state gates, for each of the units
+    [[intel::fpga_register]] accum_array_T mat_mul_h_wr;
+    nnet::dense_resource(h, mat_mul_h_wr, recurrent_weights, recurrent_bias);
+
+    // A vector containing both the values of z(t) and r(t) for every state
+    using z_activ_array_T = array;
+    [[intel::fpga_register]] z_activ_array_T z_r;
+
+    // Add the individual vectors from the multiplication of mat_mul_x_w = Wx*x(t) and mat_mul_h_wr = Wh*h(t-1)
+    // Unrolled fully, no DSPs used
+    #pragma unroll
+    for (int i = 0; i < (2 * CONFIG_T::n_units); i++) {
+        z_r[i] = mat_mul_x_w[i] + mat_mul_h_wr[i];
+    }
+
+    // Activation on z(t) and r(t)
+    [[intel::fpga_register]] z_activ_array_T z_r_act;
+    CONFIG_T::template activation_recr::activation(z_r, z_r_act);
+
+    // A matrix containing the values of the Hadamard product between r(t) = z_r_act[n_units:2*n_units] and h(t-1) = h
+    using h_activ_array_T = array;
+    [[intel::fpga_register]] h_activ_array_T hadamard_r_h;
+    #pragma unroll recurrent_unroll_factor
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        hadamard_r_h[i] = z_r_act[i + CONFIG_T::n_units] * mat_mul_h_wr[i + 2 * CONFIG_T::n_units];
+    }
+
+    // The candidate state; X * W_{hx} + hadamard(r(t), h_(t-1)) * W_{hh} + b_{h}
+    [[intel::fpga_register]] h_activ_array_T h_cand;
+    // Addition - can unroll fully; no DSPs used here
+    #pragma unroll
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        h_cand[i] = mat_mul_x_w[i + 2 * CONFIG_T::n_units] + hadamard_r_h[i];
+    }
+
+    // Activation on candidate state
+    [[intel::fpga_register]] h_activ_array_T h_cand_act;
+    CONFIG_T::template activation::activation(h_cand, h_cand_act);
+
+    // Update state
+    #pragma unroll recurrent_unroll_factor
+    for (int i = 0; i < (CONFIG_T::n_units); i++) {
+        h[i] = static_cast(h_cand_act[i] * (1 - z_r_act[i]) + h[i] * z_r_act[i]);
+    }
+}
+
+template
+void gru(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+         const typename CONFIG_T::recurrent_weight_t &recurrent_weights, const typename CONFIG_T::bias_t &bias,
+         const typename CONFIG_T::recurrent_bias_t &recurrent_bias) {
+
+    using h_T = array;
+    [[intel::fpga_register]] data_T x;
+    [[intel::fpga_register]] h_T h;
+
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_units; i++) {
+        h[i] = 0;
+    }
+
+    // Loop dependency - cannot pipeline
+    [[intel::disable_loop_pipelining]] for (int t = 0; t < CONFIG_T::n_timesteps; t++) {
+        // Get data at current time step
+        #pragma unroll
+        for (int j = 0; j < CONFIG_T::n_in; j++) {
+            x[j] = data[j + t * CONFIG_T::n_in];
+        }
+
+        nnet::gru_cell(x, h, weights, recurrent_weights, bias, recurrent_bias);
+
+        if (CONFIG_T::return_sequences) {
+            #pragma unroll
+            for (int i = 0; i < CONFIG_T::n_units; i++) {
+                res[CONFIG_T::n_units * t + i] = h[i];
+            }
+        }
+    }
+
+    if (!CONFIG_T::return_sequences) {
+        #pragma unroll
+        for (int i = 0; i < (CONFIG_T::n_units); i++) {
+            res[i] = h[i];
+        }
+    }
+}
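For one unit, the cell above reduces to the standard GRU update. A scalar reference in plain float (weights hypothetical, biases omitted) that mirrors the z/r/candidate flow and the final update loop:

    #include <cmath>
    #include <cstdio>

    float sigmoidf(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    // Scalar GRU step: z and r gate the previous state h, n is the candidate.
    float gru_step(float x, float h, float wz, float uz, float wr, float ur, float wn, float un) {
        float z = sigmoidf(wz * x + uz * h);
        float r = sigmoidf(wr * x + ur * h);
        float n = std::tanh(wn * x + r * (un * h)); // hadamard_r_h analogue
        return n * (1.0f - z) + h * z;              // matches the update-state loop
    }

    int main() {
        std::printf("h1 = %f\n", gru_step(0.5f, 0.0f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f));
    }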
+
+//----------------------
+// SimpleRNN
+//----------------------
+
+struct simpleRNN_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_outputs = 1;
+    static const unsigned n_timesteps = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template using activation_recr = nnet::activation::relu;
+
+    template using activation = nnet::activation::relu;
+};
+
+template
+void simple_rnn_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, const typename CONFIG_T::weight_t &kernel,
+                     const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) {
+
+    using accum_array_T = array;
+    // Weight multiplication
+    [[intel::fpga_register]] accum_array_T afterW;
+    multiply_W(inputs, afterW, kernel);
+
+    // Bias addition
+    [[intel::fpga_register]] accum_array_T afterBias;
+    add_bias(afterW, afterBias, bias);
+
+    // Hidden state
+    [[intel::fpga_register]] accum_array_T hiddenCand;
+    multiply_U(hidden_state, hiddenCand, rec_kernel);
+
+    // Vector addition
+    [[intel::fpga_register]] accum_array_T afterAdd;
+    add_vectors(afterBias, hiddenCand, afterAdd);
+
+    // Activation
+    CONFIG_T::template activation::activation(afterAdd, hidden_state_o);
+}
+
+template
+void simple_rnn(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &kernel,
+                const typename CONFIG_T::recurrent_weight_t &rec_kernel, const typename CONFIG_T::bias_t &bias) {
+
+    using in_T = array;
+    using h_T = array;
+
+    [[intel::fpga_register]] h_T hidden_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T hidden_state_temp;
+    [[intel::fpga_register]] h_T h;
+    [[intel::fpga_register]] in_T in;
+
+// Initially set the hidden state (output) to zero
+INIT_LOOP:
+    #pragma unroll
+    for (int x = 0; x < CONFIG_T::n_out; x++) {
+        hidden_state[0][x] = 0;
+    }
+
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
+
+        // Data at current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_in; x++) {
+            in[x] = data[x + i * CONFIG_T::n_in];
+        }
+
+        // Hidden state at current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state_temp[x] = hidden_state[i][x];
+        }
+
+        // Do SimpleRNN
+        simple_rnn_cell(in, hidden_state_temp, h, kernel, rec_kernel, bias);
+
+        // Write result
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state[i + 1][x] = h[x];
+        }
+    }
+
+    if (CONFIG_T::return_sequences == 0) {
+        // Output when return_sequences is false
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            res[x] = hidden_state[CONFIG_T::n_timesteps][x];
+        }
+    } else {
+        // Output when return_sequences is true
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
+            #pragma unroll
+            for (int h = 0; h < CONFIG_T::n_out; h++) {
+                res[x * CONFIG_T::n_out + h] = hidden_state[x + 1][h];
+            }
+        }
+    }
+}
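The SimpleRNN recurrence is just h(t) = act(W*x(t) + U*h(t-1) + b). A scalar sketch (tanh standing in for the configurable activation; all coefficients hypothetical):

    #include <cmath>
    #include <cstdio>

    float simple_rnn_step(float x, float h, float W, float U, float b) {
        return std::tanh(W * x + U * h + b);
    }

    int main() {
        float h = 0.0f; // hidden_state[0]
        const float xs[3] = {0.1f, 0.2f, 0.3f};
        for (float x : xs)
            h = simple_rnn_step(x, h, 0.5f, 0.9f, 0.0f);
        std::printf("final h = %f\n", h); // hidden_state[n_timesteps]
    }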
+
+//----------------------
+// LSTM
+//----------------------
+
+struct lstm_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 1;
+    static const unsigned n_out = 1;
+    static const unsigned n_outputs = 1;
+
+    static const unsigned n_timesteps = 1;
+    static const bool return_sequences = false;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+
+    // Activation
+    template using activation_recr = nnet::activation::relu;
+
+    template using activation = nnet::activation::relu;
+};
+
+template
+void lstm_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_state_o, h_T &cell_state, h_T &cell_state_o,
+               const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF,
+               const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO,
+               const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF,
+               const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO,
+               const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t &BF,
+               const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t &BO) {
+
+    using accum_array_T = array;
+
+    // Internal definitions
+    [[intel::fpga_register]] accum_array_T i_afterW;
+    [[intel::fpga_register]] accum_array_T i_afterBias;
+    [[intel::fpga_register]] accum_array_T c_afterW;
+    [[intel::fpga_register]] accum_array_T c_afterBias;
+    [[intel::fpga_register]] accum_array_T o_afterW;
+    [[intel::fpga_register]] accum_array_T o_afterBias;
+    [[intel::fpga_register]] accum_array_T f_afterW;
+    [[intel::fpga_register]] accum_array_T f_afterBias;
+
+    // Hidden state gate candidates, intermediate variables
+    [[intel::fpga_register]] accum_array_T i_hiddenCand;
+    [[intel::fpga_register]] accum_array_T f_hiddenCand;
+    [[intel::fpga_register]] accum_array_T c_hiddenCand;
+    [[intel::fpga_register]] accum_array_T o_hiddenCand;
+
+    // After addition, intermediate variables
+    [[intel::fpga_register]] accum_array_T i_afterAdd;
+    [[intel::fpga_register]] accum_array_T f_afterAdd;
+    [[intel::fpga_register]] accum_array_T c_afterAdd;
+    [[intel::fpga_register]] accum_array_T o_afterAdd;
+
+    // Gate outputs
+    [[intel::fpga_register]] accum_array_T gate_i;
+    [[intel::fpga_register]] accum_array_T gate_f;
+    [[intel::fpga_register]] accum_array_T gate_c;
+    [[intel::fpga_register]] accum_array_T gate_o;
+    [[intel::fpga_register]] accum_array_T gate_ic;
+    [[intel::fpga_register]] accum_array_T gate_forget;
+    [[intel::fpga_register]] accum_array_T h;
+
+    // Intermediate variables for the cell calculation
+    [[intel::fpga_register]] accum_array_T cell_act_multp;
+    [[intel::fpga_register]] accum_array_T cell_act_add;
+
+    //-----------Gate I Calculations
+    // Weight multiplication
+    multiply_W(inputs, i_afterW, WI);
+
+    // Bias addition
+    add_bias(i_afterW, i_afterBias, BI);
+
+    // Hidden candidate
+    multiply_U(hidden_state, i_hiddenCand, RWI);
+
+    // Vector addition
+    add_vectors(i_afterBias, i_hiddenCand, i_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr::activation(i_afterAdd, gate_i);
+
+    //-----------Gate F Calculations
+    // Weight multiplication
+    multiply_W(inputs, f_afterW, WF);
+
+    // Bias addition
+    add_bias(f_afterW, f_afterBias, BF);
+
+    // Hidden candidate
+    multiply_U(hidden_state, f_hiddenCand, RWF);
+
+    // Vector addition
+    add_vectors(f_afterBias, f_hiddenCand, f_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr::activation(f_afterAdd, gate_f);
+
+    //-----------Gate C Calculations
+    // Weight multiplication
+    multiply_W(inputs, c_afterW, WC);
+
+    // Bias addition
+    add_bias(c_afterW, c_afterBias, BC);
+
+    // Hidden candidate
+    multiply_U(hidden_state, c_hiddenCand, RWC);
+
+    // Vector addition
+    add_vectors(c_afterBias, c_hiddenCand, c_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation::activation(c_afterAdd, gate_c);
+
+    //-----------Gate I and C multiply
+    // Vector multiplication
+    multiply_vectors(gate_i, gate_c, gate_ic);
+
+    //-----------Gate O Calculations
+    // Weight multiplication
+    multiply_W(inputs, o_afterW, WO);
+
+    // Bias addition
+    add_bias(o_afterW, o_afterBias, BO);
+
+    // Hidden candidate
+    multiply_U(hidden_state, o_hiddenCand, RWO);
+
+    // Vector addition
+    add_vectors(o_afterBias, o_hiddenCand, o_afterAdd);
+
+    // Activation
+    CONFIG_T::template activation_recr::activation(o_afterAdd, gate_o);
+
+    //-----------Cell State Calculation
+    // Vector multiplication
+    multiply_vectors(gate_f, cell_state, cell_act_multp);
+
+    // Vector addition
+    add_vectors(gate_ic, cell_act_multp, cell_act_add);
+
+    //-----------Cell state activation
+    // Activation
+    CONFIG_T::template activation::activation(cell_act_add, gate_forget);
+
+    // Vector multiplication
+    multiply_vectors(gate_o, gate_forget, h);
+
+OUTPUT_WRITE_LOOP:
+    #pragma unroll
+    for (int x = (CONFIG_T::n_out - 1); x >= 0; x--) {
+        hidden_state_o[x] = h[x];
+        cell_state_o[x] = cell_act_add[x];
+    }
+}
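Scalar LSTM equations matching the gate order above (i, f, c, o); the recurrent contributions are folded into single coefficients here and all weights are hypothetical:

    #include <cmath>
    #include <cstdio>

    float sigmoid_(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    void lstm_step(float x, float &h, float &c,
                   float wi, float ui, float wf, float uf,
                   float wc, float uc, float wo, float uo) {
        float i = sigmoid_(wi * x + ui * h);  // input gate
        float f = sigmoid_(wf * x + uf * h);  // forget gate
        float g = std::tanh(wc * x + uc * h); // candidate (gate_c)
        float o = sigmoid_(wo * x + uo * h);  // output gate
        c = f * c + i * g;                    // cell_act_add
        h = o * std::tanh(c);                 // gate_o times activated cell state
    }

    int main() {
        float h = 0.0f, c = 0.0f;
        lstm_step(0.5f, h, c, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f);
        std::printf("h = %f, c = %f\n", h, c);
    }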
+
+template
+void lstm(const data_T &data, res_T &res, const typename CONFIG_T::weight_i_t &WI, const typename CONFIG_T::weight_f_t &WF,
+          const typename CONFIG_T::weight_c_t &WC, const typename CONFIG_T::weight_o_t &WO,
+          const typename CONFIG_T::recurrent_weight_i_t &RWI, const typename CONFIG_T::recurrent_weight_f_t &RWF,
+          const typename CONFIG_T::recurrent_weight_c_t &RWC, const typename CONFIG_T::recurrent_weight_o_t &RWO,
+          const typename CONFIG_T::bias_i_t &BI, const typename CONFIG_T::bias_f_t &BF,
+          const typename CONFIG_T::bias_c_t &BC, const typename CONFIG_T::bias_o_t &BO) {
+
+    // Note: currently this does not support recurrent bias
+
+    using in_T = array;
+    using h_T = array;
+
+    [[intel::fpga_register]] h_T hidden_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T hidden_state_temp;
+    [[intel::fpga_register]] h_T cell_state[CONFIG_T::n_timesteps + 1];
+    [[intel::fpga_register]] h_T cell_state_temp;
+    [[intel::fpga_register]] h_T h;
+    [[intel::fpga_register]] h_T c;
+    [[intel::fpga_register]] in_T in;
+
+// Initially set the hidden state (output) to zero
+INIT_LOOP:
+    #pragma unroll
+    for (int x = 0; x < CONFIG_T::n_out; x++) {
+        hidden_state[0][x] = 0;
+        cell_state[0][x] = 0;
+    }
+
+    // Input dimension
+    [[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
+        // Data at current time step
+        for (int x = 0; x < CONFIG_T::n_in; x++) {
+            in[x] = data[x + i * CONFIG_T::n_in];
+        }
+
+        // Hidden state at current time step
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state_temp[x] = hidden_state[i][x];
+            cell_state_temp[x] = cell_state[i][x];
+        }
+
+        // Do LSTM
+        lstm_cell(in, hidden_state_temp, h, cell_state_temp, c, WI, WF, WC, WO, RWI, RWF, RWC, RWO, BI, BF, BC, BO);
+
+        // Write result
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            hidden_state[i + 1][x] = h[x];
+            cell_state[i + 1][x] = c[x];
+        }
+    }
+
+    if (CONFIG_T::return_sequences == 0) {
+        // Output when return_sequences is false
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_out; x++) {
+            res[x] = hidden_state[CONFIG_T::n_timesteps][x];
+        }
+    } else {
+        // Output when return_sequences is true
+        #pragma unroll
+        for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
+            for (int h = 0; h < CONFIG_T::n_out; h++) {
+                res[x * CONFIG_T::n_out + h] = hidden_state[x + 1][h];
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h
new file mode 100644
index 000000000..893fd027c
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_activation.h
@@ -0,0 +1,47 @@
+#ifndef NNET_RECR_ACTIVATION_H_
+#define NNET_RECR_ACTIVATION_H_
+
+#include "nnet_activation.h"
+#include "nnet_common.h"
+
+namespace nnet {
+
+namespace activation {
+
+template <class data_T, class res_T, typename CONFIG_T> class Activation {
+  public:
+    // *************************************************
+    // Blank Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) {}
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class relu : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    // Relu Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::relu<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class sigmoid : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    // Sigmoid Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::sigmoid<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+template <class data_T, class res_T, typename CONFIG_T> class tanh : public Activation<data_T, res_T, CONFIG_T> {
+  public:
+    // *************************************************
+    // TanH Activation
+    // *************************************************
+    static void activation(const data_T &data, res_T &res) { nnet::dense_tanh<data_T, res_T, CONFIG_T>(data, res); }
+};
+
+} // namespace activation
+
+} // namespace nnet
+
+#endif
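These wrappers let a layer config select its activations at compile time through template aliases, which is how the gru/lstm configs above consume them. A hypothetical config fragment choosing sigmoid for the recurrent gates might look like:

    // Hypothetical fragment of a generated CONFIG_T; x_T/y_T are the I/O array
    // types and config_T the activation config, as in the classes above.
    struct my_rnn_config_fragment {
        template <class x_T, class y_T, class config_T>
        using activation_recr = nnet::activation::sigmoid<x_T, y_T, config_T>;
    };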
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h
new file mode 100644
index 000000000..7429419cd
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent_stream.h
@@ -0,0 +1,68 @@
+#ifndef NNET_RECURRENT_STREAM_H_
+#define NNET_RECURRENT_STREAM_H_
+
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recurrent_activation.h"
+
+namespace nnet {
+template
+void gru_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::recurrent_weight_t recurrent_weights,
+                typename CONFIG_T::bias_t bias, typename CONFIG_T::recurrent_bias_t recurrent_bias) {
+
+    using data_T = typename ExtractPipeType::value_type;
+    using res_T = typename ExtractPipeType::value_type;
+    using h_T = array;
+
+    constexpr auto datasize = std::tuple_size{};
+    constexpr auto ressize = std::tuple_size{};
+
+    [[intel::fpga_register]] h_T h;
+    #pragma unroll
+    for (int i = 0; i < CONFIG_T::n_units; i++) {
+        h[i] = 0;
+    }
+
+    [[intel::fpga_register]] data_T x;
+
+DataPropagation:
+    for (int i_in = 0; i_in < CONFIG_T::n_timesteps * CONFIG_T::n_in / datasize; i_in++) {
+        auto data_pack = data_pipe::read();
+
+    DataPack:
+        #pragma unroll
+        for (int i_pack = 0; i_pack < datasize; i_pack++) {
+            x[i_pack] = data_pack[i_pack];
+        }
+
+        nnet::gru_cell(x, h, weights, recurrent_weights, bias, recurrent_bias);
+
+        if (CONFIG_T::return_sequences) {
+            res_T res_pack;
+
+        ResPackRetSeq:
+            #pragma unroll
+            for (int i_pack = 0; i_pack < ressize; i_pack++) {
+                res_pack[i_pack] = h[i_pack];
+            }
+
+            res_pipe::write(res_pack);
+        }
+    }
+
+    if (!CONFIG_T::return_sequences) {
+        res_T res_pack;
+
+    ResPackNoRetSeq:
+        #pragma unroll
+        for (int i_pack = 0; i_pack < ressize; i_pack++) {
+            res_pack[i_pack] = h[i_pack];
+        }
+
+        res_pipe::write(res_pack);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h
new file mode 100644
index 000000000..c461e337d
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize.h
@@ -0,0 +1,36 @@
+#ifndef NNET_IMAGE_H_
+#define NNET_IMAGE_H_
+
+namespace nnet {
+
+struct resize_config {
+    static const unsigned height = 10;
+    static const unsigned width = 10;
+
+    static const unsigned new_height = 10;
+    static const unsigned new_width = 10;
+
+    static const unsigned n_chan = 10;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> void resize_nearest(const data_T &image, res_T &resized) {
+    int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1;
+    int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1;
+
+    for (int i = 0; i < CONFIG_T::new_height; i++) {
+        for (int j = 0; j < CONFIG_T::new_width; j++) {
+            int x = ((j * x_ratio) >> 16);
+            int y = ((i * y_ratio) >> 16);
+
+            #pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] =
+                    image[(y * CONFIG_T::width * CONFIG_T::n_chan) + x * CONFIG_T::n_chan + k];
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h
new file mode 100644
index 000000000..9a37f098e
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_resize_stream.h
@@ -0,0 +1,58 @@
+#ifndef NNET_IMAGE_STREAM_H_
+#define NNET_IMAGE_STREAM_H_
+
+#include "nnet_common.h"
+
+namespace nnet {
+
+template <class data_pipe, class res_pipe, typename CONFIG_T> void resize_nearest_stream() {
+    assert(CONFIG_T::new_height % CONFIG_T::height == 0);
+    assert(CONFIG_T::new_width % CONFIG_T::width == 0);
+
+    using data_T = typename ExtractPipeType<data_pipe>::value_type;
+
+    constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height;
+    constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width;
+
+ImageHeight:
+    for (unsigned h = 0; h < CONFIG_T::height; h++) {
+        [[intel::fpga_register]] data_T data_in_row[CONFIG_T::width];
+
+    ImageWidth:
+        for (unsigned i = 0; i < CONFIG_T::width; i++) {
+            [[intel::fpga_register]] auto in_data = data_pipe::read();
+
+        ImageChan:
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_chan; j++) {
+                data_in_row[i][j] = in_data[j];
+            }
+        }
+
+    ResizeHeight:
+        for (unsigned i = 0; i < ratio_height; i++) {
+
+        ImageWidth2:
+            for (unsigned l = 0; l < CONFIG_T::width; l++) {
+
+            ResizeWidth:
+                for (unsigned j = 0; j < ratio_width; j++) {
+
+                    [[intel::fpga_register]] data_T out_data;
+
+                ResizeChan:
+                    #pragma unroll
+                    for (unsigned k = 0; k < CONFIG_T::n_chan; k++) {
+                        out_data[k] = data_in_row[l][k];
+                    }
+
+                    res_pipe::write(out_data);
+                }
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h
new file mode 100644
index 000000000..6e5e86a58
--- /dev/null
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream.h
@@ -0,0 +1,126 @@
+#ifndef NNET_CLONE_H
+#define NNET_CLONE_H
+
+#include
"nnet_common.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned n_dupl = 2; +}; + +template void clone_stream() { + using data_T = typename ExtractPipeType::value_type; + using res1_T = typename ExtractPipeType::value_type; + using res2_T = typename ExtractPipeType::value_type; + constexpr auto datasize = std::tuple_size{}; +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + data_T in_data = data_pipe::read(); + res1_T out_data1; + res2_T out_data2; + + ClonePack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1_pipe::write(out_data1); + res2_pipe::write(out_data2); + } +} + +template void clone_stream() { + using data_T = typename ExtractPipeType::value_type; + using res1_T = typename ExtractPipeType::value_type; + using res2_T = typename ExtractPipeType::value_type; + using res3_T = typename ExtractPipeType::value_type; + constexpr auto datasize = std::tuple_size{}; +CloneLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + data_T in_data = data_pipe::read(); + res1_T out_data1; + res2_T out_data2; + res3_T out_data3; + + ClonePack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1_pipe::write(out_data1); + res2_pipe::write(out_data2); + res3_pipe::write(out_data3); + } +} + +template void repack_stream() { + using data_T = typename ExtractPipeType::value_type; + using res_T = typename ExtractPipeType::value_type; + constexpr auto datasize = std::tuple_size{}; + constexpr auto ressize = std::tuple_size{}; + + if constexpr (datasize == ressize) { + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + + [[intel::fpga_memory]] auto in_data = data_pipe::read(); + [[intel::fpga_memory]] res_T out_data; + + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data[j] = in_data[j]; + } + + res_pipe::write(out_data); + } + } else if constexpr (datasize > ressize) { + constexpr unsigned pack_diff = datasize / ressize; + + for (int i = 0; i < N / datasize; i++) { + + [[intel::fpga_memory]] auto in_data = data_pipe::read(); + [[intel::fpga_memory]] res_T out_data; + + [[intel::initiation_interval(1)]] for (int j = 0; j < pack_diff; j++) { + + #pragma unroll + for (int k = 0; k < ressize; k++) { + out_data[k] = in_data[j * ressize + k]; + } + res_pipe::write(out_data); + } + } + } else { // datasize < ressize + [[intel::fpga_memory]] res_T out_data; + constexpr unsigned pack_diff = ressize / datasize; + unsigned pack_cnt = 0; + [[intel::initiation_interval(1)]] for (int i = 0; i < N / datasize; i++) { + + [[intel::fpga_memory]] auto in_data = data_pipe::read(); + + #pragma unroll + for (int j = 0; j < datasize; j++) { + out_data[pack_cnt * datasize + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res_pipe::write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h new file mode 100644 index 000000000..2c4991a13 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose.h @@ -0,0 +1,48 @@ +#ifndef NNET_TRANSPOSE_H_ +#define NNET_TRANSPOSE_H_ + +namespace nnet { + 
+struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template void transpose_2d(const data_T &data, res_T &res) { + for (int i = 0; i < CONFIG_T::height; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::width; j++) { + res[j * CONFIG_T::height + i] = static_cast(data[i * CONFIG_T::width + j]); + } + } +} + +template void transpose_3d(const data_T &data, res_T &res) { + static constexpr unsigned dim_data[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + static constexpr unsigned dim_res[3] = {dim_data[CONFIG_T::perm[0]], dim_data[CONFIG_T::perm[1]], + dim_data[CONFIG_T::perm[2]]}; + + int index_data[3] = {0}, index_res[3] = {0}; + + for (index_data[0] = 0; index_data[0] < dim_data[0]; index_data[0]++) { + #pragma unroll + for (index_data[1] = 0; index_data[1] < dim_data[1]; index_data[1]++) { + #pragma unroll + for (index_data[2] = 0; index_data[2] < dim_data[2]; index_data[2]++) { + index_res[0] = index_data[CONFIG_T::perm[0]]; + index_res[1] = index_data[CONFIG_T::perm[1]]; + index_res[2] = index_data[CONFIG_T::perm[2]]; + + res[index_res[0] * dim_res[1] * dim_res[2] + index_res[1] * dim_res[2] + index_res[2]] = + static_cast( + data[index_data[0] * dim_data[1] * dim_data[2] + index_data[1] * dim_data[2] + index_data[2]]); + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h new file mode 100644 index 000000000..e15f63c13 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_transpose_stream.h @@ -0,0 +1,39 @@ +#ifndef NNET_TRANSPOSE_STREAM_H_ +#define NNET_TRANSPOSE_STREAM_H_ + +namespace nnet { + +template void transpose_2d_stream() { + + using data_T = typename ExtractPipeType::value_type; + using res_T = typename ExtractPipeType::value_type; + + constexpr auto data_size = std::tuple_size::value_type>{}; + constexpr auto res_size = std::tuple_size::value_type>{}; + + [[intel::fpga_register]] typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_size; i++) { + [[intel::fpga_register]] data_T in_data = data_pipe::read(); + + #pragma unroll + for (int j = 0; j < data_size; j++) { + data_array[i * data_size + j] = typename data_T::value_type(in_data[j]); + } + } + + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_size; i++) { + [[intel::fpga_register]] res_T out_data; + + #pragma unroll + for (int j = 0; j < res_size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_size + i]); + } + + res_pipe::write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h new file mode 100644 index 000000000..8cf883c1d --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -0,0 +1,71 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include +#include +#include +#include + +namespace nnet { + +// Define the pipe type that we use +template using array = std::array; + +// T should be an array +template constexpr T zero_array() { + T ar; + #pragma unroll + for (auto &a : ar) { + a = 0; + } + return ar; +} + +// This is a helper to extract the value_type of a pipe +template struct ExtractPipeType { typedef T 
value_type; }; + +template