-
Notifications
You must be signed in to change notification settings - Fork 12
E2E Linux Example
This is a minimal guide to executing an E2E test of llvm-aie
on an AIE2 device (all environment specifics will be described below).
- Working AIE2 device with a driver and runtime installed (more info at https://github.com/amd/xdna-driver);
- This doc was written against the following system configuration:
System Configuration
  OS Name              : Linux
  Release              : 6.8.8
  Version              : #2 SMP PREEMPT_DYNAMIC Fri May 3 14:13:56 CDT 2024
  Machine              : x86_64
  CPU Cores            : 16
  Memory               : 94278 MB
  Distribution         : Ubuntu 22.04.3 LTS
  GLIBC                : 2.35
  Model                : F7BSC
  BIOS vendor          : American Megatrends International, LLC.
  BIOS version         : 1.04

XRT
  Version              : 2.18.0
  Branch               : HEAD
  Hash                 : c678a9469f9b20fcb9a04bbedb5c51f8473faec0
  Hash Date            : 2024-05-24 18:16:53
  XOCL                 : unknown, unknown
  XCLMGMT              : unknown, unknown
WARNING: xclmgmt version is unknown. Is xclmgmt driver loaded? Or is MSD/MPD running?
  AMDXDNA              : 2.18.0_20240524, 4ef6d95ad37a2de0aa22264c950dec8ec1bd9f52
  Firmware Version     : N/A

Devices present
BDF             :  Name
---------------------------------
[0000:c5:00.1]  :  RyzenAI-npu1
- Distro install of
llvm-aie
(tested on commit 70703e80a6ecf8f8cf3fa724191dd1f36951dea3);
- Here is a plausible CMake configure:
-C $LLVM_AIE_REPO_ROOT/clang/cmake/caches/Peano-AIE.cmake \
-DCMAKE_INSTALL_PREFIX=$LLVM_AIE_REPO_ROOT/install
- A python environment with
xaiepy
installed;
- A one-liner:
pip install xaiepy==0.0.1 -f https://github.com/nod-ai/prototype-aie-toolchain/releases/expanded_assets/release
All programs/scripts are "attached" below.
The example program is very simple and does exactly one thing:
// Minimal AIE2 core program: stores one float into a linker-placed global.
#include "aiev2_locks.h"
// Lock ids handed to the lock helpers declared in aiev2_locks.h.
#define ACQ_LOCK 48
#define REL_LOCK 49
// Not defined in this file: the linker script provides the symbol
// (`_anonymous0 = .;` at address 0x70400, followed by 4 reserved bytes).
extern float _anonymous0[1];
int main() {
  // NOTE(review): presumably waits until lock ACQ_LOCK holds a value >= 1 --
  // confirm against aiev2_locks.h.
  acquire_greater_equal(ACQ_LOCK, 1);
  // 5 * 3.14159 is a double expression (15.70795) narrowed to float on store.
  _anonymous0[0] = 5 * 3.14159;
  // NOTE(review): presumably signals REL_LOCK so a consumer may read the
  // value -- confirm against aiev2_locks.h.
  release(REL_LOCK, 1);
  return 0;
}
i.e., it stores 5 * 3.14159 == 15.70795
to a global array. To go along with this brilliant program you will need the following linker script:
/* Linker script for the single-tile example program (main.cpp). */
MEMORY
{
  /* Program memory: 0x20000 bytes starting at address 0. */
  program (RX) : ORIGIN = 0, LENGTH = 0x0020000
  /* Data region: starts right after _anonymous0 (0x70400 + 4) and runs up to
     0x80000 (0x70404 + 0xFBFC). */
  data (!RX) : ORIGIN = 0x70404, LENGTH = 0xFBFC
}
ENTRY(_main_init)
SECTIONS
{
  . = 0x0;
  .text : {
    /* the _main_init symbol from me_basic.o has to come at address zero. */
    *me_basic.o(.text)
    /* Everything below starts at offset 0x200 of .text. */
    . = 0x200;
    _ctors_start = .;
    _init_array_start = .;
    KEEP(SORT(*.init_array))
    _ctors_end = .;
    _init_array_end = .;
    _dtors_start = .;
    _dtors_end = .;
    *(.text)
  } > program
  .data : {
    *(.data*);
    *(.rodata*)
  } > data
  /* Stack: 0x400 bytes starting at 0x70000. */
  . = 0x70000;
  _sp_start_value_DM_stack = .;
  . += 0x400; /* stack */
  /* No tile with memory exists to the south. */
  . = 0x40000;
  . += 0x10000;
  /* No tile with memory exists to the west. */
  . = 0x50000;
  . += 0x10000;
  /* The 4-byte buffer main.cpp declares as `extern float _anonymous0[1]`. */
  . = 0x70400;
  _anonymous0 = .;
  . += 0x4;
  .bss : { *(.bss) } > data
  .bss.DMb.4 : { *(.bss.DMb.4) } > data
}
/* Supplies _main as an alias of main when nothing else defines it. */
PROVIDE(_main = main);
Explaining how to write such a linker script is beyond the scope of this intro.
Get all of your ducks in a row (save the above code as main.cpp and the linker script as main.ld.script, find Peano and set PEANO_INSTALL_DIR=...) and then incant the following magical incantations:
me@mydesk: $PEANO_INSTALL_DIR/bin/clang -O2 -I$PEANO_INSTALL_DIR/lib/clang/18/include \
-S --target=aie2-none-unknown-elf main.cpp -emit-llvm
me@mydesk: $PEANO_INSTALL_DIR/bin/clang -O2 --target=aie2-none-unknown-elf main.ll \
-ccc-install-dir $PEANO_INSTALL_DIR/bin -Wl,-T $PWD/main.ld.script \
-o fivepi.elf && $PEANO_INSTALL_DIR/bin/llvm-readelf -Ss fivepi.elf
If everything went according to plan you will see roughly the following as verification that your elf file is fully baked:
There are 8 section headers, starting at offset 0x1544:
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 00000000 000000 000000 00 0 0 0
[ 1] .text PROGBITS 00000000 001000 000260 00 AX 0 0 16
[ 2] .text._Exit PROGBITS 00000260 001260 000020 00 AX 0 0 16
[ 3] .text._main_init PROGBITS 00000280 001280 000050 00 AX 0 0 16
[ 4] .comment PROGBITS 00000000 0012d0 00007f 01 MS 0 0 1
[ 5] .symtab SYMTAB 00000000 001350 000100 10 7 3 4
[ 6] .shstrtab STRTAB 00000000 001450 000047 00 0 0 1
[ 7] .strtab STRTAB 00000000 001497 0000ad 00 0 0 1
Symbol table '.symtab' contains 16 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 00000000 0 NOTYPE LOCAL DEFAULT UND
1: 00000000 0 FILE LOCAL DEFAULT ABS main.cpp
2: 00000000 0 FILE LOCAL DEFAULT ABS crt1.cc
3: 00000200 64 FUNC GLOBAL DEFAULT 1 main
4: 00070400 0 NOTYPE GLOBAL DEFAULT 1 _anonymous0
5: 00000240 0 FUNC GLOBAL DEFAULT 1 __start
6: 00070000 0 NOTYPE GLOBAL DEFAULT 1 _sp_start_value_DM_stack
7: 00000280 80 FUNC GLOBAL DEFAULT 3 _main_init
8: 00000260 32 FUNC GLOBAL DEFAULT 2 _Exit
9: 00000200 0 FUNC GLOBAL DEFAULT 1 _main
10: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _ctors_start
11: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _init_array_start
12: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _ctors_end
13: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _init_array_end
14: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _dtors_start
15: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _dtors_end
There are two example scripts in xaiepy
that demonstrate how to configure the Phoenix device and run the program using XRT APIs:
- gen_example.py, which generates an .xclbin that can be loaded/run by XRT;
- xrt.py, which loads and runs the aforementioned .xclbin.
Run both of these, in that order, et voilà: you should see something resembling 15.70795.
In the next episode we'll explain what all of these things actually do...
// main.cpp -- minimal AIE2 core program: stores one float into a
// linker-placed global.
#include "aiev2_locks.h"
// Lock ids handed to the lock helpers declared in aiev2_locks.h.
#define ACQ_LOCK 48
#define REL_LOCK 49
// Not defined in this file: main.ld.script provides the symbol
// (`_anonymous0 = .;` at address 0x70400, followed by 4 reserved bytes).
extern float _anonymous0[1];
int main() {
  // NOTE(review): presumably waits until lock ACQ_LOCK holds a value >= 1 --
  // confirm against aiev2_locks.h.
  acquire_greater_equal(ACQ_LOCK, 1);
  // 5 * 3.14159 is a double expression (15.70795) narrowed to float on store.
  _anonymous0[0] = 5 * 3.14159;
  // NOTE(review): presumably signals REL_LOCK so a consumer may read the
  // value -- confirm against aiev2_locks.h.
  release(REL_LOCK, 1);
  return 0;
}
/* main.ld.script -- linker script for the single-tile example (main.cpp). */
MEMORY
{
  /* Program memory: 0x20000 bytes starting at address 0. */
  program (RX) : ORIGIN = 0, LENGTH = 0x0020000
  /* Data region: starts right after _anonymous0 (0x70400 + 4) and runs up to
     0x80000 (0x70404 + 0xFBFC). */
  data (!RX) : ORIGIN = 0x70404, LENGTH = 0xFBFC
}
ENTRY(_main_init)
SECTIONS
{
  . = 0x0;
  .text : {
    /* the _main_init symbol from me_basic.o has to come at address zero. */
    *me_basic.o(.text)
    /* Everything below starts at offset 0x200 of .text. */
    . = 0x200;
    /* Constructor/destructor table bounds. */
    _ctors_start = .;
    _init_array_start = .;
    KEEP(SORT(*.init_array))
    _ctors_end = .;
    _init_array_end = .;
    _dtors_start = .;
    _dtors_end = .;
    *(.text)
  } > program
  .data : {
    *(.data*);
    *(.rodata*)
  } > data
  /* Stack: 0x400 bytes starting at 0x70000. */
  . = 0x70000;
  _sp_start_value_DM_stack = .;
  . += 0x400; /* stack */
  /* No tile with memory exists to the south. */
  . = 0x40000;
  . += 0x10000;
  /* No tile with memory exists to the west. */
  . = 0x50000;
  . += 0x10000;
  /* The 4-byte buffer main.cpp declares as `extern float _anonymous0[1]`.
     NOTE(review): 0x70400 - 0x70000 = 0x400 = 1024, which matches the address
     gen_example.py passes to XAie_DmaSetAddrLen -- presumably these must stay
     in sync; confirm. */
  . = 0x70400;
  _anonymous0 = .;
  . += 0x4;
  .bss : { *(.bss) } > data
  .bss.DMb.4 : { *(.bss.DMb.4) } > data
}
/* Supplies _main as an alias of main when nothing else defines it. */
PROVIDE(_main = main);
# Build fivepi.elf from main.cpp with the Peano (llvm-aie) toolchain.
# Set this to the root of the Peano install tree before running.
PEANO_INSTALL_DIR=<fill me in.................>
# Step 1: compile main.cpp to LLVM IR; the next step consumes main.ll.
$PEANO_INSTALL_DIR/bin/clang -O2 -I$PEANO_INSTALL_DIR/lib/clang/18/include \
-S --target=aie2-none-unknown-elf main.cpp -emit-llvm
# Step 2: compile and link main.ll against main.ld.script (expected in the
# current directory), then dump the section headers and symbol table.
$PEANO_INSTALL_DIR/bin/clang -O2 --target=aie2-none-unknown-elf main.ll \
-ccc-install-dir $PEANO_INSTALL_DIR/bin -Wl,-T $PWD/main.ld.script \
-o fivepi.elf && $PEANO_INSTALL_DIR/bin/llvm-readelf -Ss fivepi.elf
#! /usr/bin/env python
import argparse
import json
import logging
import platform
from pathlib import Path
from xaiepy import bootgen, xclbinutil
from xaiepy.cdo import (
startCDOFileStream,
FileHeader,
configureHeader,
endCurrentCDOFileStream,
EnAXIdebug,
setEndianness,
Little_Endian,
)
# Configure the root logger to emit everything from DEBUG upwards as bare
# messages.
# NOTE(review): datefmt is only consulted when the format string contains
# %(asctime)s, which this one does not, so it currently has no effect.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(message)s",
    datefmt="%H:%M:%S",
)
from xaiepy import (
XAie_Config,
XAie_BackendType,
XAie_PartitionProp,
XAie_DevInst,
XAie_CfgInitialize,
XAie_LocType,
XAie_LoadElf,
XAie_SetupPartitionConfig,
XAie_UpdateNpiAddr,
XAie_CoreReset,
XAie_CoreUnreset,
XAie_LockSetValue,
XAie_Lock,
XAie_DmaDescInit,
XAie_DmaSetAddrLen,
XAie_DmaEnableBd,
XAie_DmaWriteBd,
XAie_DmaChannelSetStartQueue,
XAie_DmaChannelEnable,
XAie_StrmConnCctEnable,
XAie_CoreEnable,
StrmSwPortType,
XAie_EnableAieToShimDmaStrmPort,
XAie_DmaDesc,
)
if platform.system() != "Windows":
from xaiepy import XAie_ErrorHandlingInit
# Device-description values passed positionally to XAie_Config() in
# build_cdo().
XAIE_DEV_GEN_AIEML = 2
XAIE_BASE_ADDR = 0x4000_0000
XAIE_COL_SHIFT = 25
XAIE_ROW_SHIFT = 20
XAIE_SHIM_ROW = 0
XAIE_MEM_TILE_ROW_START = 1

# Defined for reference; build_cdo() does not read these three.
XAIE_PARTITION_BASE_ADDR = 0
XAIE_TRANSACTION_DISABLE_AUTO_FLUSH = 0
DDR_AIE_ADDR_OFFSET = 0x8000_0000

# Column index used for every tile location that build_cdo() creates.
col = 0
def build_cdo(which_pi):
    """Build ``<which_pi>.xclbin`` from ``<which_pi>.elf`` next to this script.

    The function records an AIE configuration into a CDO file through the
    xaiepy CDO backend, wraps that CDO into a PDI with ``bootgen``, and
    packages the PDI together with generated JSON metadata into an xclbin
    with ``xclbinutil``. Every file is read from / written to the directory
    containing this script:

    * input:  ``<which_pi>.elf``
    * output: ``<which_pi>_cdo.bin``, ``<which_pi>.bif``, ``<which_pi>.pdi``,
      ``<which_pi>_mem_topology.json``, ``<which_pi>_aie_partition.json``,
      ``<which_pi>_kernel.json`` and ``<which_pi>.xclbin``

    Args:
        which_pi: Base name of the ELF; also used as the kernel name. Names
            containing ``"two"`` get kernel id ``"0x902"``, all others
            ``"0x901"``.

    Raises:
        FileNotFoundError: If ``<which_pi>.elf`` does not exist. The check
            runs before any output file is created.
    """
    out_dir = Path(__file__).parent.absolute()

    # Fail fast with an explicit error: an ``assert`` would be stripped under
    # ``python -O`` and, placed later, would leave a half-written CDO behind.
    elf_path = out_dir / f"{which_pi}.elf"
    if not elf_path.exists():
        raise FileNotFoundError(f"ELF to load not found: {elf_path}")

    # NOTE(review): arguments look like (row, col), giving rows 0, 1 and 2 of
    # column `col` -- confirm against xaiepy's XAie_LocType.
    tile_0_0 = XAie_LocType(0, col)
    tile_0_1 = XAie_LocType(1, col)
    tile_0_2 = XAie_LocType(2, col)

    # NOTE(review): the bare literals are presumably, in order, the number of
    # rows (6), number of columns (5), number of mem-tile rows (1), first
    # core-tile row and number of core-tile rows -- confirm against aie-rt's
    # XAie_Config field order.
    configPtr = XAie_Config(
        XAIE_DEV_GEN_AIEML,
        XAIE_BASE_ADDR,
        XAIE_COL_SHIFT,
        XAIE_ROW_SHIFT,
        6,
        5,
        XAIE_SHIM_ROW,
        XAIE_MEM_TILE_ROW_START,
        1,
        (XAIE_MEM_TILE_ROW_START + 1),
        (6 - 1 - 1),
        XAie_PartitionProp(),
        XAie_BackendType.XAIE_IO_BACKEND_CDO,
    )
    devInst = XAie_DevInst()
    XAie_SetupPartitionConfig(devInst, 0, 1, 1)
    XAie_CfgInitialize(devInst, configPtr)
    XAie_UpdateNpiAddr(devInst, 0)

    # Open the CDO output stream; the XAie_* calls from here until
    # endCurrentCDOFileStream() are what ends up in <which_pi>_cdo.bin.
    EnAXIdebug()
    setEndianness(Little_Endian)
    cdo_fp = out_dir / f"{which_pi}_cdo.bin"
    startCDOFileStream(str(cdo_fp))
    FileHeader()

    # XAie_ErrorHandlingInit is only imported on non-Windows platforms.
    if platform.system() != "Windows":
        XAie_ErrorHandlingInit(devInst)

    # Load the program onto the row-2 tile and cycle its core reset.
    XAie_LoadElf(devInst, tile_0_2, str(elf_path), False)
    XAie_CoreReset(devInst, tile_0_2)
    XAie_CoreUnreset(devInst, tile_0_2)

    # NOTE(review): presumably initialises lock 0 to 1 and lock 1 to 0, the
    # pair the core program acquires/releases -- confirm the id mapping.
    XAie_LockSetValue(devInst, tile_0_2, XAie_Lock(0, 1))
    XAie_LockSetValue(devInst, tile_0_2, XAie_Lock(1, 0))

    # One DMA buffer descriptor on the row-2 tile.
    dmaTileBd = XAie_DmaDesc()
    XAie_DmaDescInit(devInst, dmaTileBd, tile_0_2)
    dmaTileBd.DmaMod.contents.SetLock(
        dmaTileBd, XAie_Lock(1, -1), XAie_Lock(0, 1), 1, 0
    )
    # NOTE(review): 1024 (0x400) with length 4 presumably addresses the 4-byte
    # `_anonymous0` buffer the linker script places at 0x70400 -- confirm.
    XAie_DmaSetAddrLen(dmaTileBd, 1024, 4)
    XAie_DmaEnableBd(dmaTileBd)
    XAie_DmaWriteBd(devInst, dmaTileBd, tile_0_2, 0)
    XAie_DmaChannelSetStartQueue(devInst, tile_0_2, 0, 1, 0, 1, 0)
    XAie_DmaChannelEnable(devInst, tile_0_2, 0, 1)

    # Stream-switch connections on the three tiles of the column.
    XAie_StrmConnCctEnable(
        devInst, tile_0_0, StrmSwPortType.CTRL, 0, StrmSwPortType.SOUTH, 0
    )
    XAie_StrmConnCctEnable(
        devInst, tile_0_0, StrmSwPortType.NORTH, 0, StrmSwPortType.SOUTH, 2
    )
    XAie_StrmConnCctEnable(
        devInst, tile_0_1, StrmSwPortType.NORTH, 0, StrmSwPortType.SOUTH, 0
    )
    XAie_StrmConnCctEnable(
        devInst, tile_0_2, StrmSwPortType.DMA, 0, StrmSwPortType.SOUTH, 0
    )
    XAie_EnableAieToShimDmaStrmPort(devInst, tile_0_0, 2)

    XAie_CoreEnable(devInst, tile_0_2)

    configureHeader()
    endCurrentCDOFileStream()

    # CDO -> BIF -> PDI.
    bif_fp = out_dir / f"{which_pi}.bif"
    with open(bif_fp, "w") as f:
        f.write(bootgen.emit_design_bif([cdo_fp]))
    pdi_fp = out_dir / f"{which_pi}.pdi"
    bootgen.make_design_pdi(str(bif_fp), str(pdi_fp))

    # JSON metadata consumed by make_xclbin below.
    mem_top_json_fp = out_dir / f"{which_pi}_mem_topology.json"
    with open(mem_top_json_fp, "w") as f:
        json.dump(xclbinutil.mem_topology, f, indent=2)

    aie_part_json_fp = out_dir / f"{which_pi}_aie_partition.json"
    kernel_id = "0x902" if "two" in which_pi else "0x901"
    pdi_spec = xclbinutil.pdi_spec(pdi_fp, kernel_ids=[kernel_id])
    with open(aie_part_json_fp, "w") as f:
        json.dump(xclbinutil.emit_partition([pdi_spec], num_cols=1), f, indent=2)

    kernels_json_fp = out_dir / f"{which_pi}_kernel.json"
    kernel_spec = xclbinutil.kernel_spec(
        kernel_name=which_pi, kernel_id=kernel_id, buffer_args=["c0"]
    )
    with open(kernels_json_fp, "w") as f:
        json.dump(xclbinutil.emit_design_kernel_json([kernel_spec]), f, indent=2)

    pi_xclbin_fp = out_dir / f"{which_pi}.xclbin"
    xclbinutil.make_xclbin(
        str(mem_top_json_fp),
        str(aie_part_json_fp),
        str(kernels_json_fp),
        str(pi_xclbin_fp),
    )
# Script entry point: build fivepi.xclbin from the fivepi.elf that sits next
# to this file.
if __name__ == "__main__":
    build_cdo("fivepi")
from pathlib import Path
import numpy as np
from xaiepy import pyxrt
from xaiepy.pyxrt import ert_cmd_state
def init_xrt_load_kernel(xclbin: Path):
    """Open XRT device 0 and register the xclbin stored at ``xclbin`` on it.

    Args:
        xclbin: Filesystem path of the ``.xclbin`` file to load.

    Returns:
        A ``(device, xclbin)`` tuple: the opened ``pyxrt.device`` and the
        ``pyxrt.xclbin`` object that was registered on it.
    """
    npu = pyxrt.device(0)
    image = pyxrt.xclbin(str(xclbin))
    npu.register_xclbin(image)
    return npu, image
# Fixed 17-word header placed in front of shim_instr_v to form the instruction
# buffer handed to the kernel. The words are opaque here and must stay exact.
# NOTE(review): words 7..14, read as little-endian bytes, spell the ASCII
# string "_ZN11__cling_N5911instr_wordsE" (a mangled symbol name), which
# suggests this blob was captured from another tool's output -- confirm origin.
_PROLOG = [
    0x00000011,
    0x01000405,
    0x01000100,
    0x0B590100,
    0x000055FF,
    0x00000001,
    0x00000010,
    0x314E5A5F,
    0x635F5F31,
    0x676E696C,
    0x39354E5F,
    0x6E693131,
    0x5F727473,
    0x64726F77,
    0x00004573,
    0x07BD9630,
    0x000055FF,
]
# 15 instruction words appended after _PROLOG to form the buffer handed to the
# kernel. The encoding is not described anywhere in this file; treat the
# values as opaque and keep them exact.
shim_instr_v = [
    0x06000100,
    0x00000000,
    0x00000001,
    0x00000000,
    0x00000000,
    0x00000000,
    0x80000000,
    0x00000000,
    0x00000000,
    0x02000000,
    0x02000000,
    0x0001D204,
    0x80000000,
    0x03000000,
    0x00010100,
]
# Base name shared by the .xclbin file and the kernel looked up inside it.
whichpi = "fivepi"
# Complete instruction stream (prolog, then shim words) as uint32.
instr_v = np.array(_PROLOG + shim_instr_v, dtype=np.uint32)
# Single float32 slot, zero-initialised, that go() copies into the in/out
# buffer before the run.
inout0 = np.zeros(1, dtype=np.float32)
# Runs at import time: opens the device and registers <whichpi>.xclbin, which
# is expected next to this script.
device, xclbin = init_xrt_load_kernel(
    Path(__file__).parent.absolute() / f"{whichpi}.xclbin"
)
def go():
    """Run the ``whichpi`` kernel once and check the float it wrote back.

    Uses the module-level ``device``, ``xclbin``, ``whichpi``, ``instr_v``
    and ``inout0``. The instruction stream and the (zeroed) in/out buffer are
    copied to the device, the kernel is launched and waited on, and the
    4-byte result is read back, printed and validated.

    Raises:
        StopIteration: If the xclbin contains no kernel named ``whichpi``.
        RuntimeError: If the run does not end in ``ERT_CMD_STATE_COMPLETED``.
        AssertionError: If the value read back is not close to
            ``5 * 3.14159``, the value the example program stores.
    """
    context = pyxrt.hw_context(device, xclbin.get_uuid())
    xkernel = next(k for k in xclbin.get_kernels() if k.get_name() == whichpi)
    kernel = pyxrt.kernel(context, xkernel.get_name())

    # Buffer sizes come from the arrays themselves rather than `len * 4`.
    bo_instr = pyxrt.bo(
        device, instr_v.nbytes, pyxrt.bo.cacheable, kernel.group_id(0)
    )
    bo_inout0 = pyxrt.bo(
        device, inout0.nbytes, pyxrt.bo.host_only, kernel.group_id(2)
    )

    bo_instr.write(instr_v, 0)
    bo_inout0.write(inout0, 0)

    bo_instr.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
    bo_inout0.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)

    h = kernel(bo_instr, len(instr_v), bo_inout0)
    # Wait outside of an `assert`: under `python -O` an assert statement is
    # removed entirely, which would also remove the wait itself.
    state = h.wait()
    if state != ert_cmd_state.ERT_CMD_STATE_COMPLETED:
        raise RuntimeError(f"kernel run finished in state {state}")

    bo_inout0.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
    entire_buffer = bo_inout0.read(inout0.nbytes, 0).view(np.float32)
    print(entire_buffer[0])
    v = entire_buffer[0].item()
    assert isinstance(v, float)
    # main.cpp stores 5 * 3.14159 (== 15.70795), not 3.14.
    expected = 5 * 3.14159
    assert np.isclose(v, expected), f"expected ~{expected}, got {v}"
# Run the example as soon as the script is executed (there is no __main__
# guard here, so importing this module also triggers the run).
go()