From 7985d042666df7f3791ee2a8c65162b7770591da Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 13 Aug 2024 15:36:13 -0600
Subject: [PATCH] Channel Broadcast Examples (#688)

* Stub out format of broadcast example

* First attempt a broadcast

* Add multi herd broadcast detection example

* Make channel names a little less error prone

* Updated broadcast example to use new XRTRunner class

* Update documentation on broadcast examples

* First attempt at exposing broadcast_shape ChannelOp addr to python bindings

* closer to working

* Add ChannelOp wrapper

* Broadcast example works!

* Fixed second broadcast example

* flip broadcast shape for variety

* update lit test and documentation

* Use ChannelOp wrapper in tests and other programming examples
---
 .../channel_examples/README.md                |   7 +-
 .../channel_examples/broadcast/README.md      |   4 +
 .../broadcast/multi_herd/Makefile             |  17 +++
 .../broadcast/multi_herd/broadcast.py         | 129 ++++++++++++++++++
 .../broadcast/multi_herd/run_makefile.lit     |   8 ++
 .../broadcast/single_herd/Makefile            |  17 +++
 .../broadcast/single_herd/broadcast.py        | 123 +++++++++++++++++
 .../broadcast/single_herd/run_makefile.lit    |   8 ++
 .../channel_size/channel_size.py              |   4 +-
 .../multi_segment/herd_to_herd.py             |   6 +-
 .../single_segment/herd_to_herd.py            |   6 +-
 .../hierarchical/hierarchical.py              |   8 +-
 .../worker_to_self/worker_to_self.py          |   6 +-
 .../worker_to_worker/worker_to_worker.py      |   6 +-
 .../channel/transpose.py                      |   4 +-
 .../multi_core_channel/multi_core_channel.py  |   4 +-
 .../multi_launch_channel.py                   |   4 +-
 .../single_core_channel.py                    |   4 +-
 .../multi_segment_channel/multi_segment.py    |   8 +-
 .../passthrough_channel.py                    |   4 +-
 .../passthrough_kernel/passthrough_kernel.py  |   4 +-
 python/air/dialects/_air_ops_ext.py           |  33 +++++
 python/test/dialect/channel_get_put.py        |   6 +-
 test/xrt/02_mul_shim_1x1/run.py               |   6 +-
 test/xrt/03_mul_L1L2_1x1/run.py               |  12 +-
 25 files changed, 391 insertions(+), 47 deletions(-)
 create mode 100644 programming_examples/channel_examples/broadcast/README.md
 create mode 100644 programming_examples/channel_examples/broadcast/multi_herd/Makefile
 create mode 100644 programming_examples/channel_examples/broadcast/multi_herd/broadcast.py
 create mode 100644 programming_examples/channel_examples/broadcast/multi_herd/run_makefile.lit
 create mode 100644 programming_examples/channel_examples/broadcast/single_herd/Makefile
 create mode 100644 programming_examples/channel_examples/broadcast/single_herd/broadcast.py
 create mode 100644 programming_examples/channel_examples/broadcast/single_herd/run_makefile.lit

diff --git a/programming_examples/channel_examples/README.md b/programming_examples/channel_examples/README.md
index ce5e89fc4..35ce7d234 100644
--- a/programming_examples/channel_examples/README.md
+++ b/programming_examples/channel_examples/README.md
@@ -16,7 +16,7 @@ Warning: The multi-segment example is a work in progress!
 
 #### ```channel-size```: Use the channel size argument
 
-This example ([channel_size/channel_size.py](channel_size/channel_size.py)) is a data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, only instead of using a separately defined channel for each tile/core, a bundle of channels is created (using the `ChannelOp` `size` parameter) and indexed into (the `ChannelGet` and `ChannelPut` `indices` parameter).
+This example ([channel_size/channel_size.py](channel_size/channel_size.py)) is a data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, only instead of using a separately defined channel for each tile/core, a bundle of channels is created (using the `Channel` `size` parameter) and indexed into (the `ChannelGet` and `ChannelPut` `indices` parameter).
 
 #### ```hierarchical```: Use channels for sending data from Launch to Segment to Herd and back again
 
@@ -34,6 +34,11 @@ This example ([worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_
 
 WARNING: This example currently fails for unknown reasons.
 
+#### ```broadcast```: Broadcast one input to multiple workers
+
+These examples ([broadcast/multi_herd/broadcast.py](broadcast/multi_herd/broadcast.py) and [broadcast/single_herd/broadcast.py](broadcast/single_herd/broadcast.py)) use channels to broadcast data to multiple workers.
+
+
 #### Usage (For All Examples)
 
 To generate AIR MLIR from Python:
diff --git a/programming_examples/channel_examples/broadcast/README.md b/programming_examples/channel_examples/broadcast/README.md
new file mode 100644
index 000000000..9162a2b7c
--- /dev/null
+++ b/programming_examples/channel_examples/broadcast/README.md
@@ -0,0 +1,4 @@
+# Broadcast Examples
+
+In both of these examples, we broadcast an input `a` to 3 workers. In `single_herd`, there is one herd with `sizes=[1, 3]`, whereas in `multi_herd` there are 3 herds with `sizes=[1, 1]`.
+Each worker then adds a unique value to every element of the input image and writes the resulting image to its own output.
\ No newline at end of file
diff --git a/programming_examples/channel_examples/broadcast/multi_herd/Makefile b/programming_examples/channel_examples/broadcast/multi_herd/Makefile
new file mode 100644
index 000000000..d0cd3ada1
--- /dev/null
+++ b/programming_examples/channel_examples/broadcast/multi_herd/Makefile
@@ -0,0 +1,17 @@
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+targetname := $(shell basename ${srcdir})
+
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/broadcast.py -p
+
+run:
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/broadcast.py
+
+clean:
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py b/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py
new file mode 100644
index 000000000..590dac2b2
--- /dev/null
+++ b/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py
@@ -0,0 +1,129 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
+
+range_ = for_
+
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
+
+INOUT_DATATYPE = np.int32
+
+OUTPUT_HERD_NAMES = ["ChanOutB", "ChanOutC", "ChanOutD"]
+
+
+@module_builder
+def build_module():
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
+
+    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    image_type_l1 = MemRefType.get(
+        shape=IMAGE_SIZE,
+        element_type=xrt_dtype,
+        memory_space=mem_space_l1,
+    )
+    # One input channel with broadcast_shape=[3, 1], plus one output channel per herd
+    Channel("ChanIn", size=[1, 1], broadcast_shape=[3, 1])
+    for name in OUTPUT_HERD_NAMES:
+        Channel(name)
+
+    # One image is sent in and three images (one per herd) are sent back out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut, memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1, arg2, arg3):
+
+        # a is the input image; b, c and d are the per-herd output images
+        @launch(operands=[arg0, arg1, arg2, arg3])
+        def launch_body(a, b, c, d):
+            # Put the input once, then collect one result from each output channel
+            ChannelPut("ChanIn", a)
+            ChannelGet(OUTPUT_HERD_NAMES[0], b)
+            ChannelGet(OUTPUT_HERD_NAMES[1], c)
+            ChannelGet(OUTPUT_HERD_NAMES[2], d)
+
+            @segment(name="seg")
+            def segment_body():
+
+                for herd_num in range(3):
+                    # Plain Python loop: each iteration emits its own 1x1 herd
+                    @herd(name="broadcastherd" + str(herd_num), sizes=[1, 1])
+                    def herd_body(_tx, _ty, _sx, _sy):
+
+                        # Allocate image-sized L1 buffers for the input and output
+                        image_in = AllocOp(image_type_l1, [], [])
+                        image_out = AllocOp(image_type_l1, [], [])
+
+                        ChannelGet("ChanIn", image_in, indices=[herd_num, 0])
+
+                        # Access every value in the image
+                        for i in range_(IMAGE_HEIGHT):
+                            for j in range_(IMAGE_WIDTH):
+                                # Load the input value
+                                val_in = load(image_in, [i, j])
+
+                                # Add this herd's unique offset (herd_num + 1)
+                                val_out = arith.addi(
+                                    val_in, arith.ConstantOp(T.i32(), herd_num + 1)
+                                )
+
+                                # Store the output value
+                                store(val_out, image_out, [i, j])
+                                yield_([])
+                            yield_([])
+
+                        ChannelPut(OUTPUT_HERD_NAMES[herd_num], image_out)
+
+                        DeallocOp(image_in)
+                        DeallocOp(image_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="broadcast.py",
+        description="Builds, runs, and tests the channel broadcast multi herd example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose output from the XRT runner",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+        help="Print the generated module and exit without running it",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        # With -p, only print the generated module; nothing is run.
+        print(mlir_module)
+        exit(0)
+
+    # Herd k (k = 0, 1, 2) adds k + 1 to every element of the broadcast input,
+    # so each expected output is the input image shifted by that constant.
+    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
+    output_b = input_a + 1
+    output_c = input_a + 2
+    output_d = input_a + 3
+
+    # The result of run_test() becomes the process exit status.
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b, output_c, output_d],
+        )
+    )
diff --git a/programming_examples/channel_examples/broadcast/multi_herd/run_makefile.lit b/programming_examples/channel_examples/broadcast/multi_herd/run_makefile.lit
new file mode 100644
index 000000000..f71210631
--- /dev/null
+++ b/programming_examples/channel_examples/broadcast/multi_herd/run_makefile.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+ // SPDX-License-Identifier: MIT
+ //
+ // REQUIRES: ryzen_ai
+ //
+ // RUN: make -f %S/Makefile clean
+ // RUN: make -f %S/Makefile run | FileCheck %s
+ // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/channel_examples/broadcast/single_herd/Makefile b/programming_examples/channel_examples/broadcast/single_herd/Makefile
new file mode 100644
index 000000000..d0cd3ada1
--- /dev/null
+++ b/programming_examples/channel_examples/broadcast/single_herd/Makefile
@@ -0,0 +1,17 @@
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+targetname := $(shell basename ${srcdir})
+
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/broadcast.py -p
+
+run:
+	mkdir -p ${srcdir}/build
+	cd ${srcdir}/build && ${powershell} python3 ${srcdir}/broadcast.py
+
+clean:
+	rm -rf ${srcdir}/build ${srcdir}/__pycache__
\ No newline at end of file
diff --git a/programming_examples/channel_examples/broadcast/single_herd/broadcast.py b/programming_examples/channel_examples/broadcast/single_herd/broadcast.py
new file mode 100644
index 000000000..b15d554b8
--- /dev/null
+++ b/programming_examples/channel_examples/broadcast/single_herd/broadcast.py
@@ -0,0 +1,123 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
+
+range_ = for_
+
+IMAGE_WIDTH = 8
+IMAGE_HEIGHT = 6
+IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]
+
+INOUT_DATATYPE = np.int32
+
+
+@module_builder
+def build_module():
+    xrt_dtype = type_mapper(INOUT_DATATYPE)
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
+
+    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    image_type_l1 = MemRefType.get(
+        shape=IMAGE_SIZE,
+        element_type=xrt_dtype,
+        memory_space=mem_space_l1,
+    )
+    # ChanIn uses broadcast_shape=[1, 3]; ChanOut is indexed once per worker
+    Channel("ChanIn", size=[1, 1], broadcast_shape=[1, 3])
+    Channel("ChanOut", size=[1, 3])
+
+    # One image is sent in and three images (one per worker) are sent back out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut, memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1, arg2, arg3):
+
+        # a is the input image; b, c and d are the per-worker output images
+        @launch(operands=[arg0, arg1, arg2, arg3])
+        def launch_body(a, b, c, d):
+            # Put the input once, then collect one result per ChanOut index
+            ChannelPut("ChanIn", a)
+            ChannelGet("ChanOut", b, indices=[0, 0])
+            ChannelGet("ChanOut", c, indices=[0, 1])
+            ChannelGet("ChanOut", d, indices=[0, 2])
+
+            @segment(name="seg")
+            def segment_body():
+
+                @herd(name="broadcastherd", sizes=[1, 3])
+                def herd_body(tx, ty, _sx, _sy):
+
+                    # Allocate image-sized L1 buffers for the input and output
+                    image_in = AllocOp(image_type_l1, [], [])
+                    image_out = AllocOp(image_type_l1, [], [])
+
+                    ChannelGet("ChanIn", image_in, indices=[tx, ty])
+
+                    # Access every value in the image
+                    for i in range_(IMAGE_HEIGHT):
+                        for j in range_(IMAGE_WIDTH):
+                            # Load the input value
+                            val_in = load(image_in, [i, j])
+
+                            # Add this worker's unique offset (ty + 1)
+                            val_out = arith.addi(val_in, arith.index_cast(T.i32(), ty))
+                            val_out = arith.addi(val_out, arith.ConstantOp(T.i32(), 1))
+
+                            # Store the output value
+                            store(val_out, image_out, [i, j])
+                            yield_([])
+                        yield_([])
+
+                    ChannelPut("ChanOut", image_out, indices=[tx, ty])
+
+                    DeallocOp(image_in)
+                    DeallocOp(image_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="broadcast.py",
+        description="Builds, runs, and tests the channel broadcast single herd example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose output from the XRT runner",
+    )
+    parser.add_argument(
+        "-p",
+        "--print-module-only",
+        action="store_true",
+        help="Print the generated module and exit without running it",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module()
+    if args.print_module_only:
+        # With -p, only print the generated module; nothing is run.
+        print(mlir_module)
+        exit(0)
+
+    # Worker ty (ty = 0, 1, 2) adds ty + 1 to every element of the broadcast input,
+    # so each expected output is the input image shifted by that constant.
+    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
+    output_b = input_a + 1
+    output_c = input_a + 2
+    output_d = input_a + 3
+
+    # The result of run_test() becomes the process exit status.
+    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
+    exit(
+        runner.run_test(
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b, output_c, output_d],
+        )
+    )
diff --git a/programming_examples/channel_examples/broadcast/single_herd/run_makefile.lit b/programming_examples/channel_examples/broadcast/single_herd/run_makefile.lit
new file mode 100644
index 000000000..f71210631
--- /dev/null
+++ b/programming_examples/channel_examples/broadcast/single_herd/run_makefile.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+ // SPDX-License-Identifier: MIT
+ //
+ // REQUIRES: ryzen_ai
+ //
+ // RUN: make -f %S/Makefile clean
+ // RUN: make -f %S/Makefile run | FileCheck %s
+ // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
index 89d335acf..dd755b87f 100644
--- a/programming_examples/channel_examples/channel_size/channel_size.py
+++ b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -32,8 +32,8 @@ def build_module():
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     # Create an input/output channel pair per worker
-    ChannelOp("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
-    ChannelOp("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
+    Channel("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
+    Channel("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
index 9f9e6a4c8..3db2acb29 100644
--- a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
@@ -36,11 +36,11 @@ def build_module():
 
     # Create two channels which will send/receive the
     # input/output data respectively
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
+    Channel("ChanIn")
+    Channel("ChanOut")
 
     # Create a channel we will use to pass data between works in two herds
-    ChannelOp("Herd2Herd")
+    Channel("Herd2Herd")
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
index 729802f4d..88052954a 100644
--- a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
@@ -38,11 +38,11 @@ def build_module():
 
     # Create two channels which will send/receive the
     # input/output data respectively
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
+    Channel("ChanIn")
+    Channel("ChanOut")
 
     # Create a channel we will use to pass data between works in two herds
-    ChannelOp("Herd2Herd")
+    Channel("Herd2Herd")
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/channel_examples/hierarchical/hierarchical.py b/programming_examples/channel_examples/hierarchical/hierarchical.py
index f9969bb82..568b49d49 100644
--- a/programming_examples/channel_examples/hierarchical/hierarchical.py
+++ b/programming_examples/channel_examples/hierarchical/hierarchical.py
@@ -38,10 +38,10 @@ def build_module():
         memory_space=mem_space_l2,
     )
 
-    ChannelOp("ChanInL2")
-    ChannelOp("ChanOutL2")
-    ChannelOp("ChanInL1")
-    ChannelOp("ChanOutL1")
+    Channel("ChanInL2")
+    Channel("ChanOutL2")
+    Channel("ChanInL1")
+    Channel("ChanOutL1")
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
index 0d19f24eb..cc8693211 100644
--- a/programming_examples/channel_examples/worker_to_self/worker_to_self.py
+++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
@@ -25,9 +25,9 @@ def build_module():
 
     # Type and method of input/output
     memrefTyInOut = T.MemRefType.get(IMAGE_SIZE, xrt_dtype)
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
-    ChannelOp("ToSelf")
+    Channel("ChanIn")
+    Channel("ChanOut")
+    Channel("ToSelf")
 
     mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
     image_type_l1 = MemRefType.get(
diff --git a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
index a88b5ba06..40b8b5361 100644
--- a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
+++ b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
@@ -33,9 +33,9 @@ def build_module():
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
     # Create an input/output channel pair per worker
-    ChannelOp("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
-    ChannelOp("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
-    ChannelOp(
+    Channel("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
+    Channel("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
+    Channel(
         "SwitchTiles", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH]
     )
 
diff --git a/programming_examples/data_transfer_transpose/channel/transpose.py b/programming_examples/data_transfer_transpose/channel/transpose.py
index f1fe6cf66..94638b969 100644
--- a/programming_examples/data_transfer_transpose/channel/transpose.py
+++ b/programming_examples/data_transfer_transpose/channel/transpose.py
@@ -23,8 +23,8 @@ def build_module(m, k, dtype):
     memrefTyIn = MemRefType.get(shape=[m, k], element_type=xrt_dtype)
     memrefTyOut = MemRefType.get(shape=[k, m], element_type=xrt_dtype)
 
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
+    Channel("ChanIn")
+    Channel("ChanOut")
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyIn, memrefTyOut)
diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
index 02a3f0157..53e1b60e9 100644
--- a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
@@ -29,8 +29,8 @@ def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
     # Create an input/output channel pair per worker
     for h in range(image_height // tile_height):
         for w in range(image_width // tile_width):
-            ChannelOp(format_name("ChanIn", h, w))
-            ChannelOp(format_name("ChanOut", h, w))
+            Channel(format_name("ChanIn", h, w))
+            Channel(format_name("ChanOut", h, w))
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
index 17249ecf7..c56688982 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
@@ -29,8 +29,8 @@ def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
     # Create an input/output channel pair per launch
     for h in range(image_height // tile_height):
         for w in range(image_width // tile_width):
-            ChannelOp(format_name("ChanIn", h, w))
-            ChannelOp(format_name("ChanOut", h, w))
+            Channel(format_name("ChanIn", h, w))
+            Channel(format_name("ChanOut", h, w))
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
index b48d41f82..766d6f8de 100644
--- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
+++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
@@ -24,8 +24,8 @@ def build_module(image_height, image_width, tile_height, tile_width, np_dtype):
 
     # Create two channels which will send/receive the
     # input/output data respectively
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
+    Channel("ChanIn")
+    Channel("ChanOut")
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
index 705cec06f..570be2840 100644
--- a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
@@ -33,10 +33,10 @@ def build_module():
         memory_space=mem_space_l1,
     )
 
-    ChannelOp("ChanInA")
-    ChannelOp("ChanInB")
-    ChannelOp("ChanOutC")
-    ChannelOp("ChanOutD")
+    Channel("ChanInA")
+    Channel("ChanInB")
+    Channel("ChanOutC")
+    Channel("ChanOutD")
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut, memrefTyInOut, memrefTyInOut)
diff --git a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
index 776576fa1..f438e68ec 100644
--- a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
+++ b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
@@ -22,8 +22,8 @@ def build_module(vector_size, num_subvectors):
 
     # Type and method of input/output
     memrefTyInOut = T.memref(vector_size, xrt_dtype)
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
+    Channel("ChanIn")
+    Channel("ChanOut")
 
     # The compute core splits input into subvectors for processing
     lineWidthInBytes = vector_size // num_subvectors
diff --git a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
index 7268cfadb..7bfd628ee 100644
--- a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
+++ b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
@@ -22,8 +22,8 @@ def build_module(vector_size, num_subvectors):
 
     # Type and method of input/output
     memrefTyInOut = T.memref(vector_size, xrt_dtype)
-    ChannelOp("ChanIn")
-    ChannelOp("ChanOut")
+    Channel("ChanIn")
+    Channel("ChanOut")
 
     # The compute core splits input into subvectors for processing
     lineWidthInBytes = vector_size // num_subvectors
diff --git a/python/air/dialects/_air_ops_ext.py b/python/air/dialects/_air_ops_ext.py
index 749583ea4..65f837938 100644
--- a/python/air/dialects/_air_ops_ext.py
+++ b/python/air/dialects/_air_ops_ext.py
@@ -5,6 +5,7 @@
 # SPDX-License-Identifier: MIT
 
 import functools
+from typing import Optional, Sequence, Union
 
 from ..ir import *
 from ._air_ops_gen import *
@@ -129,6 +130,38 @@ def __init__(
         self.regions[0].blocks.append(*operand_types)
 
 
+class Channel(ChannelOp):
+    """ChannelOp wrapper that can also attach a ``broadcast_shape`` attribute."""
+
+    def __init__(
+        self,
+        sym_name,
+        broadcast_shape: Optional[
+            Union[Sequence[Union[int, IntegerAttr, Operation, Value]], ArrayAttr]
+        ] = None,
+        size=None,
+        loc=None,
+        ip=None,
+    ):
+        super().__init__(sym_name=sym_name, size=size, loc=loc, ip=ip)
+        if broadcast_shape is None:
+            return
+        if not isinstance(broadcast_shape, ArrayAttr):
+            # ArrayAttr.get() only accepts Attributes: wrap ints, keep IntegerAttrs,
+            # and encode anything else as the dynamic-size marker (TODO confirm).
+            dynamic = IntegerAttr.get(T.index(), ShapedType.get_dynamic_size())
+            dims = []
+            for dim in broadcast_shape:
+                if isinstance(dim, IntegerAttr):
+                    dims.append(dim)
+                elif isinstance(dim, int):
+                    dims.append(IntegerAttr.get(T.index(), dim))
+                else:
+                    dims.append(dynamic)
+            broadcast_shape = ArrayAttr.get(dims)
+        self.attributes["broadcast_shape"] = broadcast_shape
+
+
 class ChannelGet(ChannelGetOp):
     def __init__(
         self,
diff --git a/python/test/dialect/channel_get_put.py b/python/test/dialect/channel_get_put.py
index ecec57e72..4da1994db 100644
--- a/python/test/dialect/channel_get_put.py
+++ b/python/test/dialect/channel_get_put.py
@@ -27,9 +27,9 @@ def build_module(shape, idtype, odtype):
     # CHECK: air.channel @ChanA
     # CHECK: air.channel @ChanB
     # CHECK: air.channel @ChanC
-    ChannelOp("ChanA")
-    ChannelOp("ChanB")
-    ChannelOp("ChanC")
+    Channel("ChanA")
+    Channel("ChanB")
+    Channel("ChanC")
 
     @FuncOp.from_py_func(memrefTyIn, memrefTyIn, memrefTyOut)
     def mul(arg0, arg1, arg2):
diff --git a/test/xrt/02_mul_shim_1x1/run.py b/test/xrt/02_mul_shim_1x1/run.py
index 04be0b7ae..fd957f43a 100644
--- a/test/xrt/02_mul_shim_1x1/run.py
+++ b/test/xrt/02_mul_shim_1x1/run.py
@@ -48,9 +48,9 @@ def to_type(dtype):
 def build_module(shape, idtype, odtype, tile_size):
     memrefTyIn = MemRefType.get(shape, to_type(idtype))
     memrefTyOut = MemRefType.get(shape, to_type(odtype))
-    ChannelOp("ChanA")
-    ChannelOp("ChanB")
-    ChannelOp("ChanC")
+    Channel("ChanA")
+    Channel("ChanB")
+    Channel("ChanC")
 
     @FuncOp.from_py_func(memrefTyIn, memrefTyIn, memrefTyOut)
     def mul(arg0, arg1, arg2):
diff --git a/test/xrt/03_mul_L1L2_1x1/run.py b/test/xrt/03_mul_L1L2_1x1/run.py
index 23496b0f6..4351503a1 100644
--- a/test/xrt/03_mul_L1L2_1x1/run.py
+++ b/test/xrt/03_mul_L1L2_1x1/run.py
@@ -72,12 +72,12 @@ def build_module(idtype, odtype, l3_shape, l2_shape, l1_shape):
         memory_space=l2_mem_space,
     )
 
-    ChannelOp("ChanL2A")
-    ChannelOp("ChanL2B")
-    ChannelOp("ChanL2C")
-    ChannelOp("ChanL1A")
-    ChannelOp("ChanL1B")
-    ChannelOp("ChanL1C")
+    Channel("ChanL2A")
+    Channel("ChanL2B")
+    Channel("ChanL2C")
+    Channel("ChanL1A")
+    Channel("ChanL1B")
+    Channel("ChanL1C")
 
     @FuncOp.from_py_func(memrefTyIn, memrefTyIn, memrefTyOut)
     def mul(arg0, arg1, arg2):