Skip to content

Commit

Permalink
Channel Broadcast Examples (#688)
Browse files Browse the repository at this point in the history
* Stub out format of broadcast example

* First attempt a broadcast

* Add multi herd broadcast detection example

* Make channel names a little less error prone

* Updated broadcast example to use new XRTRunner class

* Update documentation on broadcast examples

* First attempt at exposing broadcast_shape ChannelOp addr to python bindings

* closer to working

* Add ChannelOp wrapper

* Broadcast example works!

* Fixed second broadcast example

* flip broadcast shape for variety

* update lit test and documentation

* Use ChannelOp wrapper in tests and other programming examples
  • Loading branch information
hunhoffe authored Aug 13, 2024
1 parent 28638f2 commit 7985d04
Show file tree
Hide file tree
Showing 25 changed files with 391 additions and 47 deletions.
7 changes: 6 additions & 1 deletion programming_examples/channel_examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Warning: The multi-segment example is a work in progress!

#### ```channel-size```: Use the channel size argument

This example ([channel_size/channel_size.py](channel_size/channel_size.py)) is a data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, only instead of using a separately defined channel for each tile/core, a bundle of channels is created (using the `ChannelOp` `size` parameter) and indexed into (the `ChannelGet` and `ChannelPut` `indices` parameter).
This example ([channel_size/channel_size.py](channel_size/channel_size.py)) is a data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, only instead of using a separately defined channel for each tile/core, a bundle of channels is created (using the `Channel` `size` parameter) and indexed into (the `ChannelGet` and `ChannelPut` `indices` parameter).

#### ```hierarchical```: Use channels for sending data from Launch to Segment to Herd and back again

Expand All @@ -34,6 +34,11 @@ This example ([worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_

WARNING: This example currently fails for unknown reasons.

#### ```broadcast```:

These examples ([broadcast/multi_herd/broadcast.py](broadcast/multi_herd/broadcast.py) and [broadcast/single_herd/broadcast.py](broadcast/single_herd/broadcast.py)) are examples of using channels to broadcast data to multiple workers.


#### Usage (For All Examples)

To generate AIR MLIR from Python:
Expand Down
4 changes: 4 additions & 0 deletions programming_examples/channel_examples/broadcast/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Broadcast Examples

In both of these examples, we attempt to broadcast an input `a` to 3 workers. In `single_herd`, the herd `size=[1, 3]` whereas in `multi_herd` there are 3 herds of `size=[1, 1]`.
The workers then add a unique value to each element in the input image and output the new image to a unique per-worker output.
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
# Directory containing this Makefile (resolved from the first entry of
# MAKEFILE_LIST), so targets work regardless of the invoking directory.
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# Name of the example directory. Not referenced by any target in this file.
targetname := $(shell basename ${srcdir})

# Default target: build and run the example.
all: run

# Invoke broadcast.py with -p (print the module only).
# NOTE(review): ${powershell} is not defined in this file; presumably it is
# supplied by the environment and expands to nothing when unset -- confirm.
print:
${powershell} python3 ${srcdir}/broadcast.py -p

# Run broadcast.py from inside ${srcdir}/build, creating that directory first.
run:
mkdir -p ${srcdir}/build
cd ${srcdir}/build && ${powershell} python3 ${srcdir}/broadcast.py

# Remove the build directory and the Python bytecode cache.
clean:
rm -rf ${srcdir}/build ${srcdir}/__pycache__
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
import argparse
import numpy as np

from air.ir import *
from air.dialects.air import *
from air.dialects.memref import AllocOp, DeallocOp, load, store
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_
from air.backend.xrt_runner import XRTRunner, type_mapper

range_ = for_

IMAGE_WIDTH = 32
IMAGE_HEIGHT = 16
IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]

INOUT_DATATYPE = np.int32

OUTPUT_HERD_NAMES = ["ChanOutB", "ChanOutC", "ChanOutD"]


@module_builder
def build_module():
    """Build the AIR module for the multi-herd channel broadcast example.

    A single ``ChanIn`` channel declared with ``broadcast_shape=[3, 1]`` feeds
    three herds of size ``[1, 1]``. Herd ``n`` adds ``n + 1`` to every element
    of the image it receives and puts the result on ``OUTPUT_HERD_NAMES[n]``.
    """
    element_type = type_mapper(INOUT_DATATYPE)

    # Image-shaped memref type used for the function arguments
    image_type = MemRefType.get(IMAGE_SIZE, element_type)

    # Image-shaped memref type tagged with the L1 memory space
    l1_attr = IntegerAttr.get(T.i32(), MemorySpace.L1)
    image_type_l1 = MemRefType.get(
        shape=IMAGE_SIZE,
        element_type=element_type,
        memory_space=l1_attr,
    )

    # One broadcast input channel, plus one output channel per herd
    Channel("ChanIn", size=[1, 1], broadcast_shape=[3, 1])
    for channel_name in OUTPUT_HERD_NAMES:
        Channel(channel_name)

    # The function takes the input image followed by the three output images
    @FuncOp.from_py_func(image_type, image_type, image_type, image_type)
    def copy(arg0, arg1, arg2, arg3):

        # Forward the input and all outputs into the launch
        @launch(operands=[arg0, arg1, arg2, arg3])
        def launch_body(a, b, c, d):

            # Put the input once, then get one result per output channel
            ChannelPut("ChanIn", a)
            for channel_name, result in zip(OUTPUT_HERD_NAMES, (b, c, d)):
                ChannelGet(channel_name, result)

            @segment(name="seg")
            def segment_body():

                for herd_num in range(3):

                    @herd(name=f"broadcastherd{herd_num}", sizes=[1, 1])
                    def herd_body(_tx, _ty, _sx, _sy):

                        # L1 buffers holding the received and the produced image
                        received = AllocOp(image_type_l1, [], [])
                        produced = AllocOp(image_type_l1, [], [])

                        ChannelGet("ChanIn", received, indices=[herd_num, 0])

                        # Visit every element of the image
                        for row in range_(IMAGE_HEIGHT):
                            for col in range_(IMAGE_WIDTH):
                                pixel = load(received, [row, col])

                                # Each herd adds its own constant (herd_num + 1)
                                increment = arith.ConstantOp(T.i32(), herd_num + 1)
                                shifted = arith.addi(pixel, increment)

                                store(shifted, produced, [row, col])
                                yield_([])
                            yield_([])

                        ChannelPut(OUTPUT_HERD_NAMES[herd_num], produced)

                        DeallocOp(received)
                        DeallocOp(produced)


if __name__ == "__main__":
    # Command-line entry point: build the module, then either print it or run
    # it through XRTRunner and compare the results with the expected outputs.
    parser = argparse.ArgumentParser(
        # The script is invoked as broadcast.py (see the accompanying Makefile)
        prog="broadcast.py",
        description="Builds, runs, and tests the channel broadcast multi herd example",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="forwarded to XRTRunner as its verbose flag",
    )
    parser.add_argument(
        "-p",
        "--print-module-only",
        action="store_true",
        help="print the generated module and exit without running it",
    )
    args = parser.parse_args()

    mlir_module = build_module()
    if args.print_module_only:
        print(mlir_module)
        # SystemExit is used directly: the exit() builtin is provided by the
        # site module for interactive use and is not meant for programs.
        raise SystemExit(0)

    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)

    # Herd n adds n + 1 to every element, so outputs b, c and d are the input
    # shifted by 1, 2 and 3 respectively (dtype is preserved by the addition).
    expected_outputs = [input_a + offset for offset in (1, 2, 3)]

    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
    raise SystemExit(
        runner.run_test(
            mlir_module,
            inputs=[input_a],
            expected_outputs=expected_outputs,
        )
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
# Directory containing this Makefile (resolved from the first entry of
# MAKEFILE_LIST), so targets work regardless of the invoking directory.
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# Name of the example directory. Not referenced by any target in this file.
targetname := $(shell basename ${srcdir})

# Default target: build and run the example.
all: run

# Invoke broadcast.py with -p (print the module only).
# NOTE(review): ${powershell} is not defined in this file; presumably it is
# supplied by the environment and expands to nothing when unset -- confirm.
print:
${powershell} python3 ${srcdir}/broadcast.py -p

# Run broadcast.py from inside ${srcdir}/build, creating that directory first.
run:
mkdir -p ${srcdir}/build
cd ${srcdir}/build && ${powershell} python3 ${srcdir}/broadcast.py

# Remove the build directory and the Python bytecode cache.
clean:
rm -rf ${srcdir}/build ${srcdir}/__pycache__
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
import argparse
import numpy as np

from air.ir import *
from air.dialects.air import *
from air.dialects.memref import AllocOp, DeallocOp, load, store
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_
from air.backend.xrt_runner import XRTRunner, type_mapper

range_ = for_

IMAGE_WIDTH = 8
IMAGE_HEIGHT = 6
IMAGE_SIZE = [IMAGE_HEIGHT, IMAGE_WIDTH]

INOUT_DATATYPE = np.int32


@module_builder
def build_module():
    """Build the AIR module for the single-herd channel broadcast example.

    ``ChanIn`` is declared with ``broadcast_shape=[1, 3]`` and is read by every
    worker of one ``[1, 3]`` herd. The worker at column ``ty`` adds ``ty + 1``
    to each element and puts its image on ``ChanOut`` at index ``[tx, ty]``.
    """
    element_type = type_mapper(INOUT_DATATYPE)

    # Image-shaped memref type used for the function arguments
    image_type = MemRefType.get(IMAGE_SIZE, element_type)

    # Image-shaped memref type tagged with the L1 memory space
    l1_attr = IntegerAttr.get(T.i32(), MemorySpace.L1)
    image_type_l1 = MemRefType.get(
        shape=IMAGE_SIZE,
        element_type=element_type,
        memory_space=l1_attr,
    )

    # One broadcast input channel and a 1x3 bundle of output channels
    Channel("ChanIn", size=[1, 1], broadcast_shape=[1, 3])
    Channel("ChanOut", size=[1, 3])

    # The function takes the input image followed by the three output images
    @FuncOp.from_py_func(image_type, image_type, image_type, image_type)
    def copy(arg0, arg1, arg2, arg3):

        # Forward the input and all outputs into the launch
        @launch(operands=[arg0, arg1, arg2, arg3])
        def launch_body(a, b, c, d):

            # Put the input once, then get one result per output channel index
            ChannelPut("ChanIn", a)
            for column, result in enumerate((b, c, d)):
                ChannelGet("ChanOut", result, indices=[0, column])

            @segment(name="seg")
            def segment_body():

                @herd(name="broadcastherd", sizes=[1, 3])
                def herd_body(tx, ty, _sx, _sy):

                    # L1 buffers holding the received and the produced image
                    received = AllocOp(image_type_l1, [], [])
                    produced = AllocOp(image_type_l1, [], [])

                    ChannelGet("ChanIn", received, indices=[tx, ty])

                    # Visit every element of the image
                    for row in range_(IMAGE_HEIGHT):
                        for col in range_(IMAGE_WIDTH):
                            pixel = load(received, [row, col])

                            # Add the worker's column index (cast to i32) ...
                            column_offset = arith.index_cast(T.i32(), ty)
                            shifted = arith.addi(pixel, column_offset)
                            # ... plus one
                            one = arith.ConstantOp(T.i32(), 1)
                            shifted = arith.addi(shifted, one)

                            store(shifted, produced, [row, col])
                            yield_([])
                        yield_([])

                    ChannelPut("ChanOut", produced, indices=[tx, ty])

                    DeallocOp(received)
                    DeallocOp(produced)


if __name__ == "__main__":
    # Command-line entry point: build the module, then either print it or run
    # it through XRTRunner and compare the results with the expected outputs.
    parser = argparse.ArgumentParser(
        # The script is invoked as broadcast.py (see the accompanying Makefile)
        prog="broadcast.py",
        # This is the single-herd variant; the text previously said "multi herd"
        description="Builds, runs, and tests the channel broadcast single herd example",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="forwarded to XRTRunner as its verbose flag",
    )
    parser.add_argument(
        "-p",
        "--print-module-only",
        action="store_true",
        help="print the generated module and exit without running it",
    )
    args = parser.parse_args()

    mlir_module = build_module()
    if args.print_module_only:
        print(mlir_module)
        # SystemExit is used directly: the exit() builtin is provided by the
        # site module for interactive use and is not meant for programs.
        raise SystemExit(0)

    input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)

    # The worker at column ty adds ty + 1 to every element, so outputs b, c and
    # d are the input shifted by 1, 2 and 3 (dtype is preserved by the addition).
    expected_outputs = [input_a + offset for offset in (1, 2, 3)]

    runner = XRTRunner(verbose=args.verbose, experimental_passes=True)
    raise SystemExit(
        runner.run_test(
            mlir_module,
            inputs=[input_a],
            expected_outputs=expected_outputs,
        )
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def build_module():
memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)

# Create an input/output channel pair per worker
ChannelOp("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
ChannelOp("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
Channel("ChanIn", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])
Channel("ChanOut", size=[IMAGE_HEIGHT // TILE_HEIGHT, IMAGE_WIDTH // TILE_WIDTH])

# We will send an image worth of data in and out
@FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ def build_module():

# Create two channels which will send/receive the
# input/output data respectively
ChannelOp("ChanIn")
ChannelOp("ChanOut")
Channel("ChanIn")
Channel("ChanOut")

# Create a channel we will use to pass data between workers in two herds
ChannelOp("Herd2Herd")
Channel("Herd2Herd")

# We will send an image worth of data in and out
@FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ def build_module():

# Create two channels which will send/receive the
# input/output data respectively
ChannelOp("ChanIn")
ChannelOp("ChanOut")
Channel("ChanIn")
Channel("ChanOut")

# Create a channel we will use to pass data between workers in two herds
ChannelOp("Herd2Herd")
Channel("Herd2Herd")

# We will send an image worth of data in and out
@FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ def build_module():
memory_space=mem_space_l2,
)

ChannelOp("ChanInL2")
ChannelOp("ChanOutL2")
ChannelOp("ChanInL1")
ChannelOp("ChanOutL1")
Channel("ChanInL2")
Channel("ChanOutL2")
Channel("ChanInL1")
Channel("ChanOutL1")

# We will send an image worth of data in and out
@FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
Expand Down
Loading

0 comments on commit 7985d04

Please sign in to comment.