From 1ce4147e464ac83de26e21b52023d105613792f8 Mon Sep 17 00:00:00 2001 From: Jeff Fifield Date: Wed, 15 May 2024 14:44:02 -0600 Subject: [PATCH 1/3] Delete some examples directories --- examples/transform_partition/air.mlir | 65 --------------------------- 1 file changed, 65 deletions(-) delete mode 100644 examples/transform_partition/air.mlir diff --git a/examples/transform_partition/air.mlir b/examples/transform_partition/air.mlir deleted file mode 100644 index c6f42da13..000000000 --- a/examples/transform_partition/air.mlir +++ /dev/null @@ -1,65 +0,0 @@ -//===- air.mlir ------------------------------------------------*- MLIR -*-===// -// -// Copyright (C) 2022, Xilinx Inc. All rights reserved. -// Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. -// SPDX-License-Identifier: MIT -// -//===----------------------------------------------------------------------===// - -// RUN: air-opt %s -air-transform='filename=%s' - -#map = affine_map<()[s0] -> (s0 * 32)> -module attributes {torch.debug_module_name = "mmult"} { - func.func @forward(%a0: memref<64x64xi32>, %a1: memref<64x64xi32>, %a2: memref<64x64xi32>) { - air.segment @segment0 args(%arg0=%a0, %arg1=%a1, %arg2=%a2) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> { - %c2 = arith.constant 2 : index - %c0_i32 = arith.constant 0 : i32 - %0 = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32> - linalg.fill ins(%c0_i32 : i32) outs(%0 : memref<64x64xi32>) - %1 = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32> - memref.copy %0, %1 : memref<64x64xi32> to memref<64x64xi32> - air.herd @herd_0 tile (%arg3, %arg4) in (%arg5=%c2, %arg6=%c2) args(%arg7=%arg0, %arg8=%arg1, %arg9=%1) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> { - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c32 = arith.constant 32 : index - %2 = affine.apply #map()[%arg3] - %3 = affine.apply #map()[%arg4] - scf.for %arg10 = %c0 to %c64 step %c32 { - %4 = memref.alloc() : memref<32x32xi32, 2> - %5 = memref.alloc() : memref<32x32xi32, 2> - %6 = memref.alloc() : memref<32x32xi32, 2> - air.dma_memcpy_nd (%4[] [] [], %arg7[%2, %arg10] [%c32, %c32] [%c64, %c1]) {id = 1 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>) - air.dma_memcpy_nd (%5[] [] [], %arg8[%arg10, %3] [%c32, %c32] [%c64, %c1]) {id = 2 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>) - air.dma_memcpy_nd (%6[] [] [], %arg9[%2, %3] [%c32, %c32] [%c64, %c1]) {id = 3 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>) - linalg.matmul ins(%4, %5 : memref<32x32xi32, 2>, memref<32x32xi32, 2>) outs(%6 : memref<32x32xi32, 2>) - air.dma_memcpy_nd (%arg9[%2, %3] [%c32, %c32] [%c64, %c1], %6[] [] []) {id = 4 : i32} : (memref<64x64xi32>, memref<32x32xi32, 2>) - memref.dealloc %4 : memref<32x32xi32, 2> - memref.dealloc %5 : memref<32x32xi32, 2> - memref.dealloc %6 : memref<32x32xi32, 2> - } - air.herd_terminator - } - memref.copy %1, %arg2 : memref<64x64xi32> to memref<64x64xi32> - air.segment_terminator - } - return - } -} - -transform.with_pdl_patterns { -^bb0(%arg0: !pdl.operation): - pdl.pattern @match_segments : benefit(1) { - %args = operands - %results = types - %op = operation "air.segment"(%args : !pdl.range) -> (%results : !pdl.range) - rewrite %op with "transform.dialect" - } - - sequence %arg0 failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %0 = pdl_match @match_segments in %arg1 - %1 = transform.air.segment_to_aie %0 - transform.test_print_remark_at_operand %1, "found segment" - } -} \ No newline at end of file From 055e1cace1c4d7129958669895d28459de8be384 Mon Sep 17 00:00:00 2001 From: Jeff Fifield Date: Wed, 15 May 2024 14:44:55 -0600 Subject: [PATCH 2/3] Delete examples/partitioned_L2 directory --- examples/partitioned_L2/mmult.py | 47 ---------- examples/partitioned_L2/output.mlir | 135 ---------------------------- 2 files changed, 182 deletions(-) delete mode 100644 examples/partitioned_L2/mmult.py delete mode 100644 examples/partitioned_L2/output.mlir diff --git a/examples/partitioned_L2/mmult.py b/examples/partitioned_L2/mmult.py deleted file mode 100644 index 69b3b3c45..000000000 --- a/examples/partitioned_L2/mmult.py +++ /dev/null @@ -1,47 +0,0 @@ -# mmult.py -*- Python -*- -# -# Copyright (C) 2022, Advanced Micro Devices. All rights reserved. -# SPDX-License-Identifier: MIT - -import air.compiler.util - -from air.dialects import func -from air.dialects import linalg -from air.ir import * -import air.passmanager - -import sys - -def matmul_on_tensors(m, n, k, dtype): - module = Module.create() - with InsertionPoint(module.body): - @func.FuncOp.from_py_func( - RankedTensorType.get((m, k), dtype), RankedTensorType.get((k, n), dtype), - RankedTensorType.get((m, n), dtype)) - def matmul(lhs, rhs, out): - linalg.matmul(lhs, rhs, outs=[out]) - return module - - -with air.ir.Context(), Location.unknown(): - - air_module = matmul_on_tensors(512, 512, 512, BF16Type.get()) - - # convert linalg on tensors to linalg on memrefs - pm = air.passmanager.PassManager.parse(air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE) - pm.run(air_module) - - pipeline = ",".join([ - "air-linalg-codegen{l1-tile-size=32,32,32 l1-tile-permute=2,0,1 l2-tile-size=64,64,32 l2-promote=false}", - "affine-to-air{herd-assign-depth=1}", - "canonicalize", "cse", - "air-specialize-dma", - "air-promote-dma", - "canonicalize", "cse", - "air-pipeline-to-affine", - "canonicalize", "cse", - ]) - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module) - - print (air_module) diff --git a/examples/partitioned_L2/output.mlir b/examples/partitioned_L2/output.mlir deleted file mode 100644 index 3c5b0820e..000000000 --- a/examples/partitioned_L2/output.mlir +++ /dev/null @@ -1,135 +0,0 @@ -//===- output.mlir ---------------------------------------------*- MLIR -*-===// -// -// Copyright (C) 2022, Xilinx Inc. All rights reserved. -// Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved. -// SPDX-License-Identifier: MIT -// -//===----------------------------------------------------------------------===// - -#map = affine_map<()[s0] -> (s0 * 32)> -#set0 = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0)> -#set1 = affine_set<()[s0, s1] : (s0 - 1 == 0, s1 >= 0)> -#set2 = affine_set<()[s0, s1] : (s1 == 0, s0 >= 0)> -#set3 = affine_set<()[s0, s1] : (s1 - 1 == 0, s0 >= 0)> -module { - func @matmul(%arg0: memref<512x512xbf16>, %arg1: memref<512x512xbf16>, %arg2: memref<512x512xbf16>) { - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %c512 = arith.constant 512 : index - %c0 = arith.constant 0 : index - %c2 = arith.constant 2 : index - %c1 = arith.constant 1 : index - %0 = memref.alloc() {alignment = 128 : i64} : memref<512x512xbf16> - memref.copy %arg2, %0 : memref<512x512xbf16> to memref<512x512xbf16> - scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c512, %c512) step (%c64, %c64) { - scf.for %arg5 = %c0 to %c512 step %c32 { - %1 = memref.alloc() : memref<32x32xbf16, 1> - air.dma_memcpy_nd (%1[] [] [], %arg0[%arg3, %arg5] [%c32, %c32] [%c512, %c1]) {id = 1 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>) - %2 = memref.alloc() : memref<32x32xbf16, 1> - %3 = arith.addi %arg3, %c32 : index - air.dma_memcpy_nd (%2[] [] [], %arg0[%3, %arg5] [%c32, %c32] [%c512, %c1]) {id = 1 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>) - %4 = memref.alloc() : memref<32x32xbf16, 1> - air.dma_memcpy_nd (%4[] [] [], %arg1[%arg5, %arg4] [%c32, %c32] [%c512, %c1]) {id = 2 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>) - %5 = memref.alloc() : memref<32x32xbf16, 1> - %6 = arith.addi %arg4, %c32 : index - air.dma_memcpy_nd (%5[] [] [], %arg1[%arg5, %6] [%c32, %c32] [%c512, %c1]) {id = 2 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>) - air.launch_herd tile (%arg6, %arg7) in (%arg8=%c2, %arg9=%c2) args(%arg10=%arg3, %arg11=%arg4, %arg12=%0, %arg13=%1, %arg14=%2, %arg15=%4, %arg16=%5) : index, index, memref<512x512xbf16>, memref<32x32xbf16, 1>, memref<32x32xbf16, 1>, memref<32x32xbf16, 1>, memref<32x32xbf16, 1> { - %c32_0 = arith.constant 32 : index - %c512_1 = arith.constant 512 : index - %c1_2 = arith.constant 1 : index - %7 = affine.apply #map()[%arg6] - %8 = affine.apply #map()[%arg7] - %9 = arith.addi %arg10, %7 : index - %10 = arith.addi %arg11, %8 : index - %11 = memref.alloc() : memref<32x32xbf16, 2> - %12 = memref.alloc() : memref<32x32xbf16, 2> - %13 = memref.alloc() : memref<32x32xbf16, 2> - affine.if #set0()[%arg6, %arg7] { - air.dma_memcpy_nd (%11[] [] [], %arg13[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>) - } - affine.if #set1()[%arg6, %arg7] { - air.dma_memcpy_nd (%11[] [] [], %arg14[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>) - } - affine.if #set2()[%arg6, %arg7] { - air.dma_memcpy_nd (%12[] [] [], %arg15[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>) - } - affine.if #set3()[%arg6, %arg7] { - air.dma_memcpy_nd (%12[] [] [], %arg16[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>) - } - air.dma_memcpy_nd (%13[] [] [], %arg12[%9, %10] [%c32_0, %c32_0] [%c512_1, %c1_2]) {id = 3 : i32} : (memref<32x32xbf16, 2>, memref<512x512xbf16>) - linalg.matmul {cast = #linalg.type_fn} ins(%11, %12 : memref<32x32xbf16, 2>, memref<32x32xbf16, 2>) outs(%13 : memref<32x32xbf16, 2>) - air.dma_memcpy_nd (%arg12[%9, %10] [%c32_0, %c32_0] [%c512_1, %c1_2], %13[] [] []) {id = 4 : i32} : (memref<512x512xbf16>, memref<32x32xbf16, 2>) - memref.dealloc %11 : memref<32x32xbf16, 2> - memref.dealloc %12 : memref<32x32xbf16, 2> - memref.dealloc %13 : memref<32x32xbf16, 2> - air.herd_terminator - } - } - scf.yield - } - return - } -} - - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'index' at index: 6 - of type 'memref<512x512xbf16>' at index: 7 - of type 'index' at index: 8 - of type 'memref<512x512xbf16>' at index: 9 - of type 'memref<512x512xbf16>' at index: 10 - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'index' at index: 6 - of type 'memref<512x512xbf16>' at index: 7 - of type 'index' at index: 8 - of type 'memref<512x512xbf16>' at index: 9 - of type 'memref<512x512xbf16>' at index: 10 - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'index' at index: 6 - of type 'memref<512x512xbf16>' at index: 7 - of type 'index' at index: 8 - of type 'memref<512x512xbf16>' at index: 9 - of type 'memref<512x512xbf16>' at index: 10 - of type 'memref<32x32xbf16, 1>' at index: 11 - of type 'memref<32x32xbf16, 1>' at index: 12 - of type 'memref<32x32xbf16, 1>' at index: 13 - of type 'memref<32x32xbf16, 1>' at index: 14 - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'index' at index: 6 - of type 'index' at index: 7 - of type 'memref<512x512xbf16>' at index: 8 - of type 'memref<32x32xbf16, 1>' at index: 9 - of type 'memref<32x32xbf16, 1>' at index: 10 - of type 'memref<32x32xbf16, 1>' at index: 11 - of type 'memref<32x32xbf16, 1>' at index: 12 - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'memref<512x512xbf16>' at index: 6 - of type 'memref<32x32xbf16, 1>' at index: 7 - of type 'memref<32x32xbf16, 1>' at index: 8 - of type 'memref<32x32xbf16, 1>' at index: 9 - of type 'memref<32x32xbf16, 1>' at index: 10 - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'memref<512x512xbf16>' at index: 6 - of type 'memref<32x32xbf16, 1>' at index: 7 - of type 'memref<32x32xbf16, 1>' at index: 8 - of type 'memref<32x32xbf16, 1>' at index: 9 - of type 'memref<32x32xbf16, 1>' at index: 10 - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'memref<512x512xbf16>' at index: 6 - of type 'memref<32x32xbf16, 1>' at index: 7 - of type 'memref<32x32xbf16, 1>' at index: 8 - of type 'memref<32x32xbf16, 1>' at index: 9 - of type 'memref<32x32xbf16, 1>' at index: 10 - of type 'index' at index: 4 - of type 'index' at index: 5 - of type 'memref<512x512xbf16>' at index: 6 - of type 'memref<32x32xbf16, 1>' at index: 7 - of type 'memref<32x32xbf16, 1>' at index: 8 - of type 'memref<32x32xbf16, 1>' at index: 9 - of type 'memref<32x32xbf16, 1>' at index: 10 From 898e5ab15ac5750ce971f8693d2d22299280fc67 Mon Sep 17 00:00:00 2001 From: Jeff Fifield Date: Wed, 15 May 2024 14:45:45 -0600 Subject: [PATCH 3/3] Delete examples/air_to_npu directory --- examples/air_to_npu/aie.py | 157 --------------------- examples/air_to_npu/aie_w_pack.py | 180 ------------------------ examples/air_to_npu/test.cpp | 225 ------------------------------ 3 files changed, 562 deletions(-) delete mode 100644 examples/air_to_npu/aie.py delete mode 100644 examples/air_to_npu/aie_w_pack.py delete mode 100644 examples/air_to_npu/test.cpp diff --git a/examples/air_to_npu/aie.py b/examples/air_to_npu/aie.py deleted file mode 100644 index a6f66c83e..000000000 --- a/examples/air_to_npu/aie.py +++ /dev/null @@ -1,157 +0,0 @@ -import air -import air.compiler.util -from air.dialects import linalg, tensor, arith, func, memref -from air.ir import * -import air.passmanager -from air.dialects import air as airdialect -from air.compiler.util import run_transform -import sys -def matmul_on_tensors(m, n, k, dtype): - module = Module.create() - with InsertionPoint(module.body): - @func.FuncOp.from_py_func( - MemRefType.get((m, k), dtype), MemRefType.get((k, n), dtype)) - def forward(lhs, rhs): - out = memref.AllocOp(MemRefType.get((m, n), dtype), [], []) - zero = arith.ConstantOp(dtype, 0) - zero_fill = linalg.fill(zero, outs=[out]) - linalg.matmul(lhs, rhs, outs=[out]) - return out - return module - -with air.ir.Context() as ctx, Location.unknown(): - air_module = matmul_on_tensors(2048, 2048, 2048, IntegerType.get_signless(width = 32)) - - ################################################ - ## Tiling - ################################################ - - pm = air.passmanager.PassManager.parse(air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE) - pm.run(air_module.operation) - with open('air_input.mlir', 'w') as f: - f.write(str(air_module)) - - transform_ir_string = """ - transform.with_pdl_patterns { - ^bb0(%arg0: !pdl.operation): - transform.sequence %arg0 : !pdl.operation failures(propagate) { - ^bb1(%arg1: !pdl.operation): - %fill = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation - %matmul_1, %loops:2 = transform.air.linalg_tile %matmul [64, 64, 0] - %fill_1 = transform.air.fuse_into_containing_op %fill into %loops#1 - transform.air.linalg_promote %fill_1 {"operands_to_promote"=[1], "memory_space"="L2"} - transform.air.linalg_promote %matmul_1 {"operands_to_promote"=[2], "memory_space"="L2"} - %matmul_2, %reduction_loop_1 = transform.air.linalg_tile %matmul_1 [0, 0, 256] - transform.air.linalg_promote %matmul_2 {"operands_to_promote"=[0,1], "memory_space"="L2"} - %matmul_3, %loops_2:2 = transform.air.linalg_tile %matmul_2 [32, 32, 0] - %fill_2 = transform.air.fuse_into_containing_op %fill_1 into %loops_2#1 - transform.air.linalg_promote %fill_2 {"operands_to_promote"=[1], "memory_space"="L1"} - transform.air.linalg_promote %matmul_3 {"operands_to_promote"=[2], "memory_space"="L1"} - %matmul_4, %reduction_loop_2 = transform.air.linalg_tile %matmul_3 [0, 0, 32] - transform.air.linalg_promote %matmul_4 {"operands_to_promote"=[0,1], "memory_space"="L1"} - } - } - """ - - transform_ir = Module.parse(transform_ir_string) - run_transform(transform_ir, air_module) - - with open('air_tiled.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## Binding scf.paralell to air hierarchies - ################################################ - - pipeline = "builtin.module("+",".join([ - "buffer-results-to-out-params", - "air-par-to-herd{depth=1}", - "air-par-to-launch{has-air-segment=true}", - "air-copy-to-dma", - "canonicalize", "cse", - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - - with open('air_sync.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## Extract event dependency and optimize schedule - ################################################ - - pipeline = "builtin.module("+",".join([ - "air-dependency", - "air-dependency-schedule-opt", - "air-specialize-dma-broadcast", - "air-dma-to-channel", - "canonicalize", "cse", - "air-dependency-canonicalize", - "canonicalize", "cse", - "air-label-scf-for-to-ping-pong", - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - # Not sure why parsing the ir solves the segmentation fault... - air_module = Module.parse(str(air_module)) - pipeline = "builtin.module("+",".join([ - "air-ping-pong-transform{keep-memref-dealloc=true}", - "canonicalize", "cse", - "air-isolate-async-dma-loop-nests", - "air-specialize-channel-wrap-and-stride", - "canonicalize", "cse", - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('aircc_input.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## Place herd to segment - ################################################ - - air_async_module = Module.parse(str(air_module)) - pipeline = "builtin.module("+",".join([ - "func.func(air-collapse-herd)", - 'canonicalize', 'cse', - "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=0}", - 'canonicalize', 'cse', - 'func.func(air-renumber-dma)', - 'func.func(convert-linalg-to-loops)', - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('air_placed.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## MLIR-AIR to MLIR-AIE - ################################################ - - pipeline = "builtin.module("+",".join([ - 'air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}', - 'canonicalize', - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('aircc_decomp_aiecc.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## MLIR-AIR runtime lowering - ################################################ - - pipeline = "builtin.module("+",".join([ - 'air-to-std', - 'canonicalize', - 'func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})', - 'func.func(air-unroll-outer-affine-loops{depth=2})', - 'affine-expand-index-ops', - 'airrt-to-npu', - 'canonicalize', - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('aie.mlir', 'w') as f: - f.write(str(air_module)) diff --git a/examples/air_to_npu/aie_w_pack.py b/examples/air_to_npu/aie_w_pack.py deleted file mode 100644 index 4b847d513..000000000 --- a/examples/air_to_npu/aie_w_pack.py +++ /dev/null @@ -1,180 +0,0 @@ -import air -import air.compiler.util -from air.dialects import linalg, tensor, arith, func, memref -from air.ir import * -import air.passmanager -from air.dialects import air as airdialect -from air.compiler.util import run_transform -import sys - -with air.ir.Context() as ctx, Location.unknown(): - - ################################################ - ## Tiling - ################################################ - - air_tiled_ir_string = """ - #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> - #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> - #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> - module { - func.func @forward(%arg0: memref<2048x2048xi32>, %arg1: memref<2048x2048xi32>) -> memref<2048x2048xi32> { - %c32 = arith.constant 32 : index - %c256 = arith.constant 256 : index - %c2048 = arith.constant 2048 : index - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c0_i32 = arith.constant 0 : i32 - %alloc = memref.alloc() : memref<2048x2048xi32> - scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) { - %subview = memref.subview %alloc[%arg2, %arg3] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>> - %alloc_0 = memref.alloc() : memref<1x1x64x64xi32, 1> - scf.parallel (%arg4, %arg5) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) { - %alloc_2 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2> - linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<1x1x4x8x4x8xi32, 2>) - %subview_3 = memref.subview %alloc_0[0, 0, %arg4, %arg5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1> - scf.for %arg6 = %c0 to %c2048 step %c256 { - %subview_5 = memref.subview %arg0[%arg2, %arg6] [64, 256] [1, 1] : memref<2048x2048xi32> to memref<64x256xi32, strided<[2048, 1], offset: ?>> - %subview_6 = memref.subview %arg1[%arg6, %arg3] [256, 64] [1, 1] : memref<2048x2048xi32> to memref<256x64xi32, strided<[2048, 1], offset: ?>> - %alloc_7 = memref.alloc() : memref<1x1x64x256xi32, 1> - %alloc_8 = memref.alloc() : memref<1x1x256x64xi32, 1> - air.dma_memcpy_nd (%alloc_7[] [] [], %subview_5[] [] []) : (memref<1x1x64x256xi32, 1>, memref<64x256xi32, strided<[2048, 1], offset: ?>>) - air.dma_memcpy_nd (%alloc_8[] [] [], %subview_6[] [] []) : (memref<1x1x256x64xi32, 1>, memref<256x64xi32, strided<[2048, 1], offset: ?>>) - scf.for %arg7 = %c0 to %c256 step %c32 { - %subview_9 = memref.subview %alloc_7[0, 0, %arg4, %arg7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x256xi32, 1> to memref<1x1x32x32xi32, strided<[16384, 16384, 256, 1], offset: ?>, 1> - %subview_10 = memref.subview %alloc_8[0, 0, %arg7, %arg5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x256x64xi32, 1> to memref<1x1x32x32xi32, strided<[16384, 16384, 64, 1], offset: ?>, 1> - %alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2> - %alloc_12 = memref.alloc() : memref<1x1x4x4x8x8xi32, 2> - %expand_shape = memref.expand_shape %subview_9 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[16384, 16384, 256, 1], offset: ?>, 1> into memref<1x1x8x4x4x8xi32, strided<[16384, 16384, 1024, 256, 8, 1], offset: ?>, 1> - %transpose_13 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[16384, 16384, 1024, 256, 8, 1], offset: ?>, 1> to memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 8, 1024, 256, 1], offset: ?>, 1> - air.dma_memcpy_nd (%alloc_11[] [] [], %transpose_13[] [] []) : (memref<1x1x4x8x4x8xi32, 2>, memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 8, 1024, 256, 1], offset: ?>, 1>) - %expand_shape_14 = memref.expand_shape %subview_10 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[16384, 16384, 64, 1], offset: ?>, 1> into memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 512, 64, 8, 1], offset: ?>, 1> - %transpose_15 = memref.transpose %expand_shape_14 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 512, 64, 8, 1], offset: ?>, 1> to memref<1x1x4x4x8x8xi32, strided<[16384, 16384, 8, 512, 64, 1], offset: ?>, 1> - air.dma_memcpy_nd (%alloc_12[] [] [], %transpose_15[] [] []) : (memref<1x1x4x4x8x8xi32, 2>, memref<1x1x4x4x8x8xi32, strided<[16384, 16384, 8, 512, 64, 1], offset: ?>, 1>) - linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2>, memref<1x1x4x4x8x8xi32, 2>) outs(%alloc_2 : memref<1x1x4x8x4x8xi32, 2>) { - ^bb0(%in: i32, %in_16: i32, %out: i32): - %0 = arith.muli %in, %in_16 : i32 - %1 = arith.addi %out, %0 : i32 - linalg.yield %1 : i32 - } - memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2> - memref.dealloc %alloc_12 : memref<1x1x4x4x8x8xi32, 2> - } - memref.dealloc %alloc_7 : memref<1x1x64x256xi32, 1> - memref.dealloc %alloc_8 : memref<1x1x256x64xi32, 1> - } - %transpose_4 = memref.transpose %alloc_2 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x4x8x4x8xi32, 2> to memref<1x1x8x4x4x8xi32, strided<[1024, 1024, 32, 8, 256, 1]>, 2> - air.dma_memcpy_nd (%subview_3[] [] [], %transpose_4[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1>, memref<1x1x8x4x4x8xi32, strided<[1024, 1024, 32, 8, 256, 1]>, 2>) - memref.dealloc %alloc_2 : memref<1x1x4x8x4x8xi32, 2> - scf.reduce - } - %subview_1 = memref.subview %alloc_0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1> to memref<64x64xi32, 1> - %transpose = memref.transpose %subview_1 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1> to memref<64x64xi32, strided<[64, 1]>, 1> - air.dma_memcpy_nd (%subview[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1>) - memref.dealloc %alloc_0 : memref<1x1x64x64xi32, 1> - scf.reduce - } - return %alloc : memref<2048x2048xi32> - } - } - """ - air_module = Module.parse(air_tiled_ir_string) - - with open('air_tiled.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## Binding scf.paralell to air hierarchies - ################################################ - - pipeline = "builtin.module("+",".join([ - "buffer-results-to-out-params", - "air-par-to-herd{depth=1}", - "air-par-to-launch{has-air-segment=true}", - "air-copy-to-dma", - "canonicalize", "cse", - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - - with open('air_sync.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## Extract event dependency and optimize schedule - ################################################ - - pipeline = "builtin.module("+",".join([ - "air-dependency", - "air-dependency-schedule-opt", - "air-specialize-dma-broadcast", - "air-dma-to-channel", - "canonicalize", "cse", - "air-dependency-canonicalize", - "canonicalize", "cse", - "func.func(air-loop-fusion)", - "air-label-scf-for-to-ping-pong", - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - # Not sure why parsing the ir solves the segmentation fault... - air_module = Module.parse(str(air_module)) - pipeline = "builtin.module("+",".join([ - "air-ping-pong-transform{keep-memref-dealloc=true}", - "canonicalize", "cse", - "air-specialize-channel-wrap-and-stride", - "canonicalize", "cse", - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('aircc_input.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## Place herd to segment - ################################################ - - air_async_module = Module.parse(str(air_module)) - pipeline = "builtin.module("+",".join([ - "func.func(air-collapse-herd)", - 'canonicalize', 'cse', - "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=0}", - 'canonicalize', 'cse', - 'func.func(air-renumber-dma)', - 'func.func(convert-linalg-to-loops)', - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('air_placed.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## MLIR-AIR to MLIR-AIE - ################################################ - - pipeline = "builtin.module("+",".join([ - 'air-to-aie{row-offset=2 col-offset=0 device=npu emit-while-loop=true}', - 'canonicalize', - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('aircc_decomp_aiecc.mlir', 'w') as f: - f.write(str(air_module)) - - ################################################ - ## MLIR-AIR runtime lowering - ################################################ - - pipeline = "builtin.module("+",".join([ - 'air-to-std', - 'canonicalize', - 'func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})', - 'func.func(air-unroll-outer-affine-loops{depth=2})', - 'affine-expand-index-ops', - 'airrt-to-npu', - 'canonicalize', - ])+')' - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(air_module.operation) - with open('aie.mlir', 'w') as f: - f.write(str(air_module)) diff --git a/examples/air_to_npu/test.cpp b/examples/air_to_npu/test.cpp deleted file mode 100644 index 7128685bc..000000000 --- a/examples/air_to_npu/test.cpp +++ /dev/null @@ -1,225 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -#define M 2048 -#define N 2048 -#define K 2048 - -#define A_VOLUME M *K -#define B_VOLUME N *K -#define C_VOLUME M *N - -#define A_DATATYPE int32_t -#define B_DATATYPE int32_t -#define C_DATATYPE int32_t - -constexpr int A_SIZE = (A_VOLUME * sizeof(A_DATATYPE)); -constexpr int B_SIZE = (B_VOLUME * sizeof(B_DATATYPE)); -constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE)); -constexpr int TRACE_SIZE = (0 * sizeof(uint32_t)); - -namespace po = boost::program_options; - -void check_arg_file_exists(po::variables_map &vm_in, std::string name) { - if (!vm_in.count(name)) { - throw std::runtime_error("Error: no " + name + " file was provided\n"); - } else { - std::ifstream test(vm_in[name].as()); - if (!test) { - throw std::runtime_error("The " + name + " file " + - vm_in[name].as() + - " does not exist.\n"); - } - } -} - -std::vector load_instr_sequence(std::string instr_path) { - std::ifstream instr_file(instr_path); - std::string line; - std::vector instr_v; - while (std::getline(instr_file, line)) { - std::istringstream iss(line); - uint32_t a; - if (!(iss >> std::hex >> a)) { - throw std::runtime_error("Unable to parse instruction file\n"); - } - instr_v.push_back(a); - } - return instr_v; -} - -template -void mm_out(std::vector a, std::vector b, std::vector &r) { - for (size_t m1 = 0; m1 < M; m1++) { - for (size_t n1 = 0; n1 < N; n1++) { - size_t idx = m1 * N + n1; - r[idx] = (T)(0); - for (size_t k1 = 0; k1 < K; k1++) { - T _a = a[k1 + m1 * K]; - T _b = b[n1 + k1 * N]; - r[idx] += _a * _b; - } - } - } -} - -int main(int argc, const char *argv[]) { - - // Program arguments parsing - po::options_description desc("Allowed options"); - desc.add_options()("help,h", "produce help message")( - "xclbin,x", po::value()->required(), - "the input xclbin path")( - "kernel,k", po::value()->required(), - "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( - "verbosity,v", po::value()->default_value(0), - "the verbosity of the output")( - "instr,i", po::value()->required(), - "path of file containing userspace instructions to be sent to the LX6"); - po::variables_map vm; - - try { - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - std::cerr << "Usage:\n" << desc << "\n"; - return 1; - } - - check_arg_file_exists(vm, "xclbin"); - check_arg_file_exists(vm, "instr"); - - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); - - int verbosity = vm["verbosity"].as(); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // Start the XRT test code - // Get a device handle - unsigned int device_index = 0; - auto device = xrt::device(device_index); - - // Load the xclbin - if (verbosity >= 1) - std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; - auto xclbin = xrt::xclbin(vm["xclbin"].as()); - - if (verbosity >= 1) - std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; - std::string Node = vm["kernel"].as(); - - // Get the kernel from the xclbin - auto xkernels = xclbin.get_kernels(); - auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [Node](xrt::xclbin::kernel &k) { - auto name = k.get_name(); - std::cout << "Name: " << name << std::endl; - return name.rfind(Node, 0) == 0; - }); - auto kernelName = xkernel.get_name(); - - if (verbosity >= 1) - std::cout << "Registering xclbin: " << vm["xclbin"].as() - << "\n"; - - device.register_xclbin(xclbin); - - // get a hardware context - if (verbosity >= 1) - std::cout << "Getting hardware context.\n"; - xrt::hw_context context(device, xclbin.get_uuid()); - - // get a kernel handle - if (verbosity >= 1) - std::cout << "Getting handle to kernel:" << kernelName << "\n"; - auto kernel = xrt::kernel(context, kernelName); - - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_a = - xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_b = - xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_c = - xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - A_DATATYPE *bufA = bo_a.map(); - std::vector AVec; - for (int i = 0; i < A_VOLUME; i++) - AVec.push_back(rand() % UINT16_MAX); - memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); - B_DATATYPE *bufB = bo_b.map(); - std::vector BVec; - for (int i = 0; i < B_VOLUME; i++) - BVec.push_back(rand() % UINT16_MAX); - memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE))); - C_DATATYPE *bufC = bo_c.map(); - std::vector CVec; - for (int i = 0; i < C_VOLUME; i++) - CVec.push_back(0); - memcpy(bufC, CVec.data(), (CVec.size() * sizeof(C_DATATYPE))); - - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c); - run.wait(); - - bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - C_DATATYPE *bufOut = bo_c.map(); - - int errors = 0; - int max_errors = 100; - - std::vector output_ref0; - for (uint32_t i = 0; i < C_VOLUME; i++) - output_ref0.push_back(0); - mm_out(AVec, BVec, output_ref0); - - for (uint32_t i = 0; i < C_VOLUME; i++) { - if (bufOut[i] != output_ref0[i]) { - errors++; - if (errors < max_errors) { - std::cout << "\nerror, id " << i << " expected " - << std::to_string(output_ref0[i]) << ", got" - << std::to_string(bufOut[i]) << "\n"; - } - } - } - - if (!errors) { - std::cout << "\nPASS!\n\n"; - return 0; - } else { - std::cout << "\nerror count: " << errors << "\n\n"; - std::cout << "\nfailed.\n\n"; - return 1; - } -}