diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json
index 1279ec22d..007f6da8d 100644
--- a/hls4ml/backends/vivado_accelerator/supported_boards.json
+++ b/hls4ml/backends/vivado_accelerator/supported_boards.json
@@ -11,6 +11,12 @@
         "python_drivers": {"axi_stream": "axi_stream_driver.py"},
         "c_drivers": {}
     },
+    "zcu104": {
+        "part": "xczu7ev-ffvc1156-2-e",
+        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+        "c_drivers": {}
+    },
     "alveo-u50": {
         "part": "xcu50-fsvh2104-2-e",
         "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
diff --git a/hls4ml/templates/vivado_accelerator/zcu104/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/zcu104/python_drivers/axi_stream_driver.py
new file mode 100644
index 000000000..1aac79f2d
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/zcu104/python_drivers/axi_stream_driver.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import Overlay, allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+    def __init__(
+        self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None
+    ):
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        # DMA channels and contiguous I/O buffers used to stream data to/from the accelerator
+        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
+        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        dt = timeb - timea
+        dts = dt.seconds + dt.microseconds * 10**-6
+        rate = N / dts
+        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be a numpy ndarray.
+        - dtype : the data type of the elements of the input/output vectors (set in the constructor).
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' is the correct dtype to use.
+                  If it uses 'ap_fixed<A,B>' instead, 'np.intA' is the correct one (note that A cannot be
+                  any integer value, but only {..., 8, 16, 32, ...}. Check the `numpy`
+                  doc for more info).
+                  In this case the encoding/decoding has to be computed by the PS. For example, for the
+                  'ap_fixed<16,6>' type the following two functions are the correct ones to use to encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                  def encode(xi):
+                      return np.int16(round(xi * 2**10))  # note 2**10 = 2**(A-B)
+                  def decode(yi):
+                      return yi * 2**-10
+                  encode_v = np.vectorize(encode)  # to apply them element-wise
+                  decode_v = np.vectorize(decode)
+                  ```
+        - debug : boolean. Set it to `True` to print the status of the DMA transfers.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inferences/s`.
+        - encode/decode : function pointers. See the `dtype` section for more information.
+        - return : an output array based on `np.ndarray`, with a shape equal to `y_shape` and a `dtype` equal to
+                  the namesake parameter.
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer diff --git a/hls4ml/templates/vivado_accelerator/zcu104/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/zcu104/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 000000000..38536d0a7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/zcu104/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,58 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xczu7ev-ffvc1156-2-e -force + +set_property board_part xilinx.com:zcu104:part0:1.1 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] + +set_property -dict [list CONFIG.PSU__USE__M_AXI_GP0 {1} CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins 
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD]
+endgroup
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
+endgroup
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r]
+
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk]
+group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0]
+
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/test/hls4ml-keras-test.sh b/test/hls4ml-keras-test.sh
index de1127810..337bbc694 100755
--- a/test/hls4ml-keras-test.sh
+++ b/test/hls4ml-keras-test.sh
@@ -13,6 +13,8 @@ VIVADO_VERSION=2020.1
 #./keras-to-hls.sh KERAS_1layer KERAS_conv1d_small
 ./keras-to-hls.sh -b alveo-u250 -B VivadoAccelerator -x xcu250-figd2104-2L-e KERAS_3layer
 ./keras-to-hls.sh -b pynq-z2 -B VivadoAccelerator -x xc7z020clg400-1 KERAS_3layer
+# ./keras-to-hls.sh -b zcu102 -B VivadoAccelerator -x xczu9eg-ffvb1156-2-e KERAS_3layer
+./keras-to-hls.sh -b zcu104 -B VivadoAccelerator -x xczu7ev-ffvc1156-2-e KERAS_3layer
 # KERAS_3layer b:pynq-z2 B:VivadoAccelerator x:xc7z020clg400-1 s:Resource

 # Build the projects generated by keras-to-hls script.
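For reference, here is a minimal usage sketch of the new driver on a ZCU104 running PYNQ. It is not part of the diff: the bitstream filename, the tensor shapes, the random test data, and the assumption that the generated driver sits next to the notebook as `axi_stream_driver.py` are placeholders; the actual names come from the hls4ml-generated project.

```python
# Hypothetical usage sketch (not part of the diff): exercising the generated
# NeuralNetworkOverlay on a ZCU104 board running PYNQ. Bitfile name, shapes and
# test data are placeholders for whatever the generated project produces.
import numpy as np

from axi_stream_driver import NeuralNetworkOverlay  # driver file added by this diff

# 10 samples with 16 input features and 5 output classes, float AXI-Stream interface
X = np.random.rand(10, 16).astype(np.float32)

nn = NeuralNetworkOverlay(
    'design_1_wrapper.bit',  # bitstream built by axi_stream_design.tcl
    x_shape=(10, 16),
    y_shape=(10, 5),
    dtype=np.float32,
)

# With profile=True, predict returns the output array plus total time and inferences/s
y, dt_s, rate = nn.predict(X, profile=True)
print(y.shape, rate)
```

For an `ap_fixed` interface, the vectorized `encode`/`decode` helpers described in the `predict` docstring would be passed through the `encode=` and `decode=` keyword arguments instead of using `np.float32` data directly.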