Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added 1 in 1 out support for new simplified testbench and updated additional designs #2065

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions aie_kernels/aie2/reduce_add.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@

static void _reduce_add_scalar(int32_t *restrict in, int32_t *restrict out,
const int32_t input_size) {
event0();
int32_t running_total = 0;
for (int32_t i = 0; i < input_size; i++) {
running_total = running_total + in[i];
}
*out = running_total;
event1();
return;
}

static void _reduce_add_vector(int32_t *restrict in, int32_t *restrict out,
const int32_t input_size) {
event0();
v16int32 zero = broadcast_to_v16int32((int32_t)0);
const int32_t vector_size = 16;
v16int32 after_vector;
Expand All @@ -37,6 +40,7 @@ static void _reduce_add_vector(int32_t *restrict in, int32_t *restrict out,
v16int32 fifth = add(fourth, fourth_shift);
int32_t last = extract_elem(fifth, 0U);
*(int32_t *)out = last;
event1();
return;
}

Expand Down
4 changes: 4 additions & 0 deletions aie_kernels/aie2/reduce_max.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
void _reduce_max_vector(int32_t *restrict in, int32_t *restrict out,
const int32_t input_size) {

event0();
v16int32 tiny = broadcast_to_v16int32((int32_t)INT32_MIN);
const int32_t vector_size = 16;
v16int32 after_vector;
Expand All @@ -29,17 +30,20 @@ void _reduce_max_vector(int32_t *restrict in, int32_t *restrict out,
v16int32 fifth = max(fourth, fourth_shift);
int32_t last = extract_elem(fifth, 0U);
*(int32_t *)out = last;
event1();
return;
}

void _reduce_max_scalar(int32_t *restrict in, int32_t *restrict out,
const int32_t input_size) {
event0();
int32_t running_max = (int32_t)INT32_MIN;
for (int32_t i = 0; i < input_size; i++) {
if (in[i] > running_max)
running_max = in[i];
}
*(int32_t *)out = running_max;
event1();

return;
}
Expand Down
4 changes: 4 additions & 0 deletions aie_kernels/aie2/reduce_min.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
void _reduce_min_vector(int32_t *restrict in, int32_t *restrict out,
const int32_t input_size) {

event0();
v16int32 massive = broadcast_to_v16int32((int32_t)INT32_MAX);
const int32_t vector_size = 16;
v16int32 after_vector;
Expand All @@ -29,17 +30,20 @@ void _reduce_min_vector(int32_t *restrict in, int32_t *restrict out,
v16int32 fifth = min(fourth, fourth_shift);
int32_t last = extract_elem(fifth, 0U);
*(int32_t *)out = last;
event1();
return;
}

void _reduce_min_scalar(int32_t *restrict in, int32_t *restrict out,
const int32_t input_size) {
event0();
int32_t running_min = (int32_t)INT32_MAX;
for (int32_t i = 0; i < input_size; i++) {
if (in[i] < running_min)
running_min = in[i];
}
*(int32_t *)out = running_min;
event1();

return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ else()
set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif ()

set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
set(IN1_SIZE 4096 CACHE STRING "in1 buffer size")
set(OUT_SIZE 4096 CACHE STRING "out buffer size")
set(TARGET_NAME test CACHE STRING "Target to be built")

SET (ProjectName ${TARGET_NAME})
Expand All @@ -46,8 +47,9 @@ add_executable(${currentTarget}
)

target_compile_definitions(${currentTarget} PUBLIC
PASSTHROUGH_SIZE=${PASSTHROUGH_SIZE}
DISABLE_ABI_CHECK=1
IN1_SIZE=${IN1_SIZE}
OUT_SIZE=${OUT_SIZE}
DISABLE_ABI_CHECK=1
)

target_include_directories (${currentTarget} PUBLIC
Expand Down
25 changes: 12 additions & 13 deletions programming_examples/basic/passthrough_kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,16 @@
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

include ${srcdir}/../../makefile-common
VPATH := ${srcdir}/../../../aie_kernels/generic

devicename ?= $(if $(filter 1,$(NPU2)),npu2,npu)
targetname = passthrough_kernel
VPATH := ${srcdir}/../../../aie_kernels/generic
data_size = 4096
in1_size = 4096 # in bytes
out_size = 4096 # in bytes, should always be equal to in1_size
trace_size = 8192
PASSTHROUGH_SIZE = ${data_size}
CHESS ?= false

# Tag build artifacts with the configured input size rather than the literal
# text "in1_size". $(strip ...) drops the trailing whitespace that the inline
# "# in bytes" comment leaves in in1_size's value, which would otherwise split
# target names such as build/final_${data_size}.xclbin into two words.
data_size = $(strip ${in1_size})
aie_py_src=${targetname}.py
use_alt?=0

Expand All @@ -32,13 +34,13 @@ all: build/final_${data_size}.xclbin

build/aie2_lineBased_8b_${data_size}.mlir: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${devicename} ${data_size} 0 > $@
python3 $< ${devicename} ${in1_size} ${out_size} 0 > $@

build/aie2_trace_lineBased_8b_${data_size}.mlir: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${devicename} ${data_size} ${trace_size} > $@
python3 $< ${devicename} ${in1_size} ${out_size} ${trace_size} > $@

build/passThrough.cc.o: passThrough.cc
build/%.cc.o: %.cc
mkdir -p ${@D}
ifeq ($(devicename),npu)
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
Expand All @@ -64,7 +66,7 @@ build/final_trace_${data_size}.xclbin: build/aie2_trace_lineBased_8b_${data_size
${targetname}_${data_size}.exe: ${srcdir}/test.cpp
rm -rf _build
mkdir -p _build
cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} -DPASSTHROUGH_SIZE=${data_size}
cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} -DIN1_SIZE=${in1_size} -DOUT_SIZE=${out_size}
cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
cp _build/${targetname}.exe $@
Expand All @@ -75,24 +77,21 @@ endif
run: ${targetname}_${data_size}.exe build/final_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} ./$< -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE

#run-g: ${targetname}.exe build/final_${data_size}.xclbin build/insts.txt
# ${powershell} ./$< -x build/final_${data_size}.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size}

run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} python3 ${srcdir}/test.py -s ${data_size} -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE
${powershell} python3 ${srcdir}/test.py -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -i1s ${in1_size} -os ${out_size}

trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size}
${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie2_trace_lineBased_8b_${data_size}.mlir --colshift 1 > trace_passthrough_kernel.json
${srcdir}/../../utils/get_trace_summary.py --filename trace_passthrough_kernel.json

trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -os ${out_size}
${srcdir}/../../utils/parse_trace.py --filename trace.txt --mlir build/aie2_trace_lineBased_8b_${data_size}.mlir --colshift 1 > trace_passthrough_kernel.json
${srcdir}/../../utils/get_trace_summary.py --filename trace_passthrough_kernel.json

clean_trace:
rm -rf tmpTrace trace.txt parse*json trace*json

clean:
clean: clean_trace
rm -rf build _build ${targetname}*.exe
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,21 @@
dev = NPU2()
else:
raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
vector_size = int(sys.argv[2])
if vector_size % 64 != 0 or vector_size < 512:
print("Vector size must be a multiple of 64 and greater than or equal to 512")
in1_size = int(sys.argv[2])
if in1_size % 64 != 0 or in1_size < 512:
print(
"In1 buffer size must be a multiple of 64 and greater than or equal to 512"
)
raise ValueError
trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
out_size = int(sys.argv[3])
trace_size = 0 if (len(sys.argv) != 5) else int(sys.argv[4])
except ValueError:
print("Argument has inappropriate value")

# Define tensor types
line_size = vector_size // 4
line_size = in1_size // 4
line_type = np.ndarray[(line_size,), np.dtype[np.uint8]]
vector_type = np.ndarray[(vector_size,), np.dtype[np.uint8]]
vector_type = np.ndarray[(in1_size,), np.dtype[np.uint8]]

# Dataflow with ObjectFifos
of_in = ObjectFifo(line_type, name="in")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
import aie.utils.trace as trace_utils


def passthroughKernel(dev, vector_size, trace_size):
N = vector_size
def passthrough_kernel(dev, in1_size, out_size, trace_size):
N = in1_size
lineWidthInBytes = N // 4 # chop input in 4 sub-tensors

@device(dev)
Expand All @@ -36,7 +36,7 @@ def device_body():
ComputeTile2 = tile(0, 2)

# Set up a packet-switched flow from core to shim for tracing information
tiles_to_trace = [ComputeTile2]
tiles_to_trace = [ComputeTile2, ShimTile]
if trace_size > 0:
trace_utils.configure_packet_tracing_flow(tiles_to_trace, ShimTile)

Expand All @@ -56,17 +56,13 @@ def core_body():
of_in.release(ObjectFifoPort.Consume, 1)
of_out.release(ObjectFifoPort.Produce, 1)

# print(ctx.module.operation.verify())

@runtime_sequence(vector_ty, vector_ty, vector_ty)
def sequence(inTensor, outTensor, notUsed):
if trace_size > 0:
trace_utils.configure_packet_tracing_aie2(
tiles_to_trace=tiles_to_trace,
shim=ShimTile,
trace_size=trace_size,
trace_offset=N,
ddr_id=1,
)

in_task = shim_dma_single_bd_task(
Expand All @@ -82,21 +78,30 @@ def sequence(inTensor, outTensor, notUsed):
trace_utils.gen_trace_done_aie2(ShimTile)


try:
device_name = str(sys.argv[1])
if device_name == "npu":
dev = AIEDevice.npu1_1col
elif device_name == "npu2":
dev = AIEDevice.npu2
else:
raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
vector_size = int(sys.argv[2])
if vector_size % 64 != 0 or vector_size < 512:
print("Vector size must be a multiple of 64 and greater than or equal to 512")
raise ValueError
trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
except ValueError:
print("Argument has inappropriate value")
if len(sys.argv) < 4:
raise ValueError("[ERROR] Need at least 4 arguments (dev, in1_size, out_size)")

device_name = str(sys.argv[1])
if device_name == "npu":
dev = AIEDevice.npu1_1col
elif device_name == "npu2":
dev = AIEDevice.npu2
else:
raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
in1_size = int(sys.argv[2])
if in1_size % 64 != 0 or in1_size < 512:
raise ValueError(
"[ERROR] In1 buffer size ("
+ str(in1_size)
+ ") must be a multiple of 64 and greater than or equal to 512"
)
out_size = int(sys.argv[3])
trace_size = 0 if (len(sys.argv) != 5) else int(sys.argv[4])

with mlir_mod_ctx() as ctx:
passthroughKernel(dev, vector_size, trace_size)
print(ctx.module)
passthrough_kernel(dev, in1_size, out_size, trace_size)
res = ctx.module.operation.verify()
if res == True:
print(ctx.module)
else:
print(res)
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,7 @@
// RUN: make -f %S/Makefile clean
// RUN: env use_alt=1 make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run_py
// RUN: make -f %S/Makefile clean
// RUN: env CHESS=false use_alt=1 %run_on_npu make -f %S/Makefile trace
// RUN: env CHESS=false use_alt=1 %run_on_npu make -f %S/Makefile trace_py

Loading
Loading