From 1ce4147e464ac83de26e21b52023d105613792f8 Mon Sep 17 00:00:00 2001
From: Jeff Fifield <jeff.fifield@amd.com>
Date: Wed, 15 May 2024 14:44:02 -0600
Subject: [PATCH 1/3] Delete some examples directories

---
 examples/transform_partition/air.mlir | 65 ---------------------------
 1 file changed, 65 deletions(-)
 delete mode 100644 examples/transform_partition/air.mlir
diff --git a/examples/transform_partition/air.mlir b/examples/transform_partition/air.mlir
deleted file mode 100644
index c6f42da13..000000000
--- a/examples/transform_partition/air.mlir
+++ /dev/null
@@ -1,65 +0,0 @@
-//===- air.mlir ------------------------------------------------*- MLIR -*-===//
-//
-// Copyright (C) 2022, Xilinx Inc. All rights reserved.
-// Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-//===----------------------------------------------------------------------===//
-
-// RUN: air-opt %s -air-transform='filename=%s'
-
-#map = affine_map<()[s0] -> (s0 * 32)>
-module attributes {torch.debug_module_name = "mmult"} {
-  func.func @forward(%a0: memref<64x64xi32>, %a1: memref<64x64xi32>, %a2: memref<64x64xi32>) {
-    air.segment @segment0 args(%arg0=%a0, %arg1=%a1, %arg2=%a2) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> {
-      %c2 = arith.constant 2 : index
-      %c0_i32 = arith.constant 0 : i32
-      %0 = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32>
-      linalg.fill ins(%c0_i32 : i32) outs(%0 : memref<64x64xi32>)
-      %1 = memref.alloc() {alignment = 128 : i64} : memref<64x64xi32>
-      memref.copy %0, %1 : memref<64x64xi32> to memref<64x64xi32>
-      air.herd @herd_0  tile (%arg3, %arg4) in (%arg5=%c2, %arg6=%c2) args(%arg7=%arg0, %arg8=%arg1, %arg9=%1) : memref<64x64xi32>, memref<64x64xi32>, memref<64x64xi32> {
-        %c1 = arith.constant 1 : index
-        %c0 = arith.constant 0 : index
-        %c64 = arith.constant 64 : index
-        %c32 = arith.constant 32 : index
-        %2 = affine.apply #map()[%arg3]
-        %3 = affine.apply #map()[%arg4]
-        scf.for %arg10 = %c0 to %c64 step %c32 {
-          %4 = memref.alloc() : memref<32x32xi32, 2>
-          %5 = memref.alloc() : memref<32x32xi32, 2>
-          %6 = memref.alloc() : memref<32x32xi32, 2>
-          air.dma_memcpy_nd (%4[] [] [], %arg7[%2, %arg10] [%c32, %c32] [%c64, %c1]) {id = 1 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
-          air.dma_memcpy_nd (%5[] [] [], %arg8[%arg10, %3] [%c32, %c32] [%c64, %c1]) {id = 2 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
-          air.dma_memcpy_nd (%6[] [] [], %arg9[%2, %3] [%c32, %c32] [%c64, %c1]) {id = 3 : i32} : (memref<32x32xi32, 2>, memref<64x64xi32>)
-          linalg.matmul ins(%4, %5 : memref<32x32xi32, 2>, memref<32x32xi32, 2>) outs(%6 : memref<32x32xi32, 2>)
-          air.dma_memcpy_nd (%arg9[%2, %3] [%c32, %c32] [%c64, %c1], %6[] [] []) {id = 4 : i32} : (memref<64x64xi32>, memref<32x32xi32, 2>)
-          memref.dealloc %4 : memref<32x32xi32, 2>
-          memref.dealloc %5 : memref<32x32xi32, 2>
-          memref.dealloc %6 : memref<32x32xi32, 2>
-        }
-        air.herd_terminator
-      }
-      memref.copy %1, %arg2 : memref<64x64xi32> to memref<64x64xi32>
-      air.segment_terminator
-    }
-    return
-  }
-}
-
-transform.with_pdl_patterns {
-^bb0(%arg0: !pdl.operation):
-  pdl.pattern @match_segments : benefit(1) {
-    %args = operands
-    %results = types
-    %op = operation "air.segment"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
-    rewrite %op with "transform.dialect"
-  }
-
-  sequence %arg0 failures(propagate) {
-  ^bb1(%arg1: !pdl.operation):
-    %0 = pdl_match @match_segments in %arg1
-    %1 = transform.air.segment_to_aie %0
-    transform.test_print_remark_at_operand %1, "found segment"
-  }
-}
\ No newline at end of file

From 055e1cace1c4d7129958669895d28459de8be384 Mon Sep 17 00:00:00 2001
From: Jeff Fifield <jeff.fifield@amd.com>
Date: Wed, 15 May 2024 14:44:55 -0600
Subject: [PATCH 2/3] Delete examples/partitioned_L2 directory

---
 examples/partitioned_L2/mmult.py    |  47 ----------
 examples/partitioned_L2/output.mlir | 135 ----------------------------
 2 files changed, 182 deletions(-)
 delete mode 100644 examples/partitioned_L2/mmult.py
 delete mode 100644 examples/partitioned_L2/output.mlir

diff --git a/examples/partitioned_L2/mmult.py b/examples/partitioned_L2/mmult.py
deleted file mode 100644
index 69b3b3c45..000000000
--- a/examples/partitioned_L2/mmult.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# mmult.py -*- Python -*-
-#
-# Copyright (C) 2022, Advanced Micro Devices. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-import air.compiler.util
-
-from air.dialects import func
-from air.dialects import linalg
-from air.ir import *
-import air.passmanager
-
-import sys
-
-def matmul_on_tensors(m, n, k, dtype):
-    module = Module.create()
-    with InsertionPoint(module.body):
-        @func.FuncOp.from_py_func(
-            RankedTensorType.get((m, k), dtype), RankedTensorType.get((k, n), dtype),
-            RankedTensorType.get((m, n), dtype))
-        def matmul(lhs, rhs, out):
-            linalg.matmul(lhs, rhs, outs=[out])
-    return module
-
-
-with air.ir.Context(), Location.unknown():
-
-    air_module = matmul_on_tensors(512, 512, 512, BF16Type.get())
-    
-    # convert linalg on tensors to linalg on memrefs
-    pm = air.passmanager.PassManager.parse(air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE)
-    pm.run(air_module)
-
-    pipeline = ",".join([
-        "air-linalg-codegen{l1-tile-size=32,32,32 l1-tile-permute=2,0,1 l2-tile-size=64,64,32 l2-promote=false}",
-        "affine-to-air{herd-assign-depth=1}",
-        "canonicalize", "cse",
-        "air-specialize-dma",
-        "air-promote-dma",
-        "canonicalize", "cse",
-        "air-pipeline-to-affine",
-        "canonicalize", "cse",
-    ])
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module)
-
-    print (air_module)
diff --git a/examples/partitioned_L2/output.mlir b/examples/partitioned_L2/output.mlir
deleted file mode 100644
index 3c5b0820e..000000000
--- a/examples/partitioned_L2/output.mlir
+++ /dev/null
@@ -1,135 +0,0 @@
-//===- output.mlir ---------------------------------------------*- MLIR -*-===//
-//
-// Copyright (C) 2022, Xilinx Inc. All rights reserved.
-// Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-//===----------------------------------------------------------------------===//
-
-#map = affine_map<()[s0] -> (s0 * 32)>
-#set0 = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0)>
-#set1 = affine_set<()[s0, s1] : (s0 - 1 == 0, s1 >= 0)>
-#set2 = affine_set<()[s0, s1] : (s1 == 0, s0 >= 0)>
-#set3 = affine_set<()[s0, s1] : (s1 - 1 == 0, s0 >= 0)>
-module {
-  func @matmul(%arg0: memref<512x512xbf16>, %arg1: memref<512x512xbf16>, %arg2: memref<512x512xbf16>) {
-    %c32 = arith.constant 32 : index
-    %c64 = arith.constant 64 : index
-    %c512 = arith.constant 512 : index
-    %c0 = arith.constant 0 : index
-    %c2 = arith.constant 2 : index
-    %c1 = arith.constant 1 : index
-    %0 = memref.alloc() {alignment = 128 : i64} : memref<512x512xbf16>
-    memref.copy %arg2, %0 : memref<512x512xbf16> to memref<512x512xbf16>
-    scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c512, %c512) step (%c64, %c64) {
-      scf.for %arg5 = %c0 to %c512 step %c32 {
-        %1 = memref.alloc() : memref<32x32xbf16, 1>
-        air.dma_memcpy_nd (%1[] [] [], %arg0[%arg3, %arg5] [%c32, %c32] [%c512, %c1]) {id = 1 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>)
-        %2 = memref.alloc() : memref<32x32xbf16, 1>
-        %3 = arith.addi %arg3, %c32 : index
-        air.dma_memcpy_nd (%2[] [] [], %arg0[%3, %arg5] [%c32, %c32] [%c512, %c1]) {id = 1 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>)
-        %4 = memref.alloc() : memref<32x32xbf16, 1>
-        air.dma_memcpy_nd (%4[] [] [], %arg1[%arg5, %arg4] [%c32, %c32] [%c512, %c1]) {id = 2 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>)
-        %5 = memref.alloc() : memref<32x32xbf16, 1>
-        %6 = arith.addi %arg4, %c32 : index
-        air.dma_memcpy_nd (%5[] [] [], %arg1[%arg5, %6] [%c32, %c32] [%c512, %c1]) {id = 2 : i32} : (memref<32x32xbf16, 1>, memref<512x512xbf16>)
-        air.launch_herd  tile (%arg6, %arg7) in (%arg8=%c2, %arg9=%c2) args(%arg10=%arg3, %arg11=%arg4, %arg12=%0, %arg13=%1, %arg14=%2, %arg15=%4, %arg16=%5) : index, index, memref<512x512xbf16>, memref<32x32xbf16, 1>, memref<32x32xbf16, 1>, memref<32x32xbf16, 1>, memref<32x32xbf16, 1> {
-          %c32_0 = arith.constant 32 : index
-          %c512_1 = arith.constant 512 : index
-          %c1_2 = arith.constant 1 : index
-          %7 = affine.apply #map()[%arg6]
-          %8 = affine.apply #map()[%arg7]
-          %9 = arith.addi %arg10, %7 : index
-          %10 = arith.addi %arg11, %8 : index
-          %11 = memref.alloc() : memref<32x32xbf16, 2>
-          %12 = memref.alloc() : memref<32x32xbf16, 2>
-          %13 = memref.alloc() : memref<32x32xbf16, 2>
-          affine.if #set0()[%arg6, %arg7] {
-            air.dma_memcpy_nd (%11[] [] [], %arg13[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>)
-          }
-          affine.if #set1()[%arg6, %arg7] {
-            air.dma_memcpy_nd (%11[] [] [], %arg14[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>)
-          }
-          affine.if #set2()[%arg6, %arg7] {
-            air.dma_memcpy_nd (%12[] [] [], %arg15[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>)
-          }
-          affine.if #set3()[%arg6, %arg7] {
-            air.dma_memcpy_nd (%12[] [] [], %arg16[] [] []) : (memref<32x32xbf16, 2>, memref<32x32xbf16, 1>)
-          }
-          air.dma_memcpy_nd (%13[] [] [], %arg12[%9, %10] [%c32_0, %c32_0] [%c512_1, %c1_2]) {id = 3 : i32} : (memref<32x32xbf16, 2>, memref<512x512xbf16>)
-          linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%11, %12 : memref<32x32xbf16, 2>, memref<32x32xbf16, 2>) outs(%13 : memref<32x32xbf16, 2>)
-          air.dma_memcpy_nd (%arg12[%9, %10] [%c32_0, %c32_0] [%c512_1, %c1_2], %13[] [] []) {id = 4 : i32} : (memref<512x512xbf16>, memref<32x32xbf16, 2>)
-          memref.dealloc %11 : memref<32x32xbf16, 2>
-          memref.dealloc %12 : memref<32x32xbf16, 2>
-          memref.dealloc %13 : memref<32x32xbf16, 2>
-          air.herd_terminator
-        }
-      }
-      scf.yield
-    }
-    return
-  }
-}
-
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'index' at index: 6
-<block argument> of type 'memref<512x512xbf16>' at index: 7
-<block argument> of type 'index' at index: 8
-<block argument> of type 'memref<512x512xbf16>' at index: 9
-<block argument> of type 'memref<512x512xbf16>' at index: 10
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'index' at index: 6
-<block argument> of type 'memref<512x512xbf16>' at index: 7
-<block argument> of type 'index' at index: 8
-<block argument> of type 'memref<512x512xbf16>' at index: 9
-<block argument> of type 'memref<512x512xbf16>' at index: 10
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'index' at index: 6
-<block argument> of type 'memref<512x512xbf16>' at index: 7
-<block argument> of type 'index' at index: 8
-<block argument> of type 'memref<512x512xbf16>' at index: 9
-<block argument> of type 'memref<512x512xbf16>' at index: 10
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 11
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 12
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 13
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 14
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'index' at index: 6
-<block argument> of type 'index' at index: 7
-<block argument> of type 'memref<512x512xbf16>' at index: 8
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 9
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 10
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 11
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 12
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'memref<512x512xbf16>' at index: 6
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 7
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 8
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 9
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 10
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'memref<512x512xbf16>' at index: 6
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 7
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 8
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 9
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 10
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'memref<512x512xbf16>' at index: 6
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 7
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 8
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 9
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 10
-<block argument> of type 'index' at index: 4
-<block argument> of type 'index' at index: 5
-<block argument> of type 'memref<512x512xbf16>' at index: 6
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 7
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 8
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 9
-<block argument> of type 'memref<32x32xbf16, 1>' at index: 10

From 898e5ab15ac5750ce971f8693d2d22299280fc67 Mon Sep 17 00:00:00 2001
From: Jeff Fifield <jeff.fifield@amd.com>
Date: Wed, 15 May 2024 14:45:45 -0600
Subject: [PATCH 3/3] Delete examples/air_to_npu directory

---
 examples/air_to_npu/aie.py        | 157 ---------------------
 examples/air_to_npu/aie_w_pack.py | 180 ------------------------
 examples/air_to_npu/test.cpp      | 225 ------------------------------
 3 files changed, 562 deletions(-)
 delete mode 100644 examples/air_to_npu/aie.py
 delete mode 100644 examples/air_to_npu/aie_w_pack.py
 delete mode 100644 examples/air_to_npu/test.cpp

diff --git a/examples/air_to_npu/aie.py b/examples/air_to_npu/aie.py
deleted file mode 100644
index a6f66c83e..000000000
--- a/examples/air_to_npu/aie.py
+++ /dev/null
@@ -1,157 +0,0 @@
-import air
-import air.compiler.util
-from air.dialects import linalg, tensor, arith, func, memref
-from air.ir import *
-import air.passmanager
-from air.dialects import air as airdialect
-from air.compiler.util import run_transform
-import sys
-def matmul_on_tensors(m, n, k, dtype):
-    module = Module.create()
-    with InsertionPoint(module.body):
-        @func.FuncOp.from_py_func(
-            MemRefType.get((m, k), dtype), MemRefType.get((k, n), dtype))
-        def forward(lhs, rhs):
-            out = memref.AllocOp(MemRefType.get((m, n), dtype), [], [])
-            zero = arith.ConstantOp(dtype, 0)
-            zero_fill = linalg.fill(zero, outs=[out])
-            linalg.matmul(lhs, rhs, outs=[out])
-            return out
-    return module
-
-with air.ir.Context() as ctx, Location.unknown():
-    air_module = matmul_on_tensors(2048, 2048, 2048, IntegerType.get_signless(width = 32))
-    
-    ################################################
-    ## Tiling
-    ################################################
-
-    pm = air.passmanager.PassManager.parse(air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE)
-    pm.run(air_module.operation)
-    with open('air_input.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    transform_ir_string = """
-    transform.with_pdl_patterns {
-    ^bb0(%arg0: !pdl.operation):
-        transform.sequence %arg0 : !pdl.operation failures(propagate) {
-        ^bb1(%arg1: !pdl.operation):
-            %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!pdl.operation) -> !pdl.operation
-            %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!pdl.operation) -> !pdl.operation
-            %matmul_1, %loops:2 = transform.air.linalg_tile %matmul [64, 64, 0]
-            %fill_1 = transform.air.fuse_into_containing_op %fill into %loops#1
-            transform.air.linalg_promote %fill_1 {"operands_to_promote"=[1], "memory_space"="L2"}
-            transform.air.linalg_promote %matmul_1 {"operands_to_promote"=[2], "memory_space"="L2"}
-            %matmul_2, %reduction_loop_1 = transform.air.linalg_tile %matmul_1 [0, 0, 256]
-            transform.air.linalg_promote %matmul_2 {"operands_to_promote"=[0,1], "memory_space"="L2"}
-            %matmul_3, %loops_2:2 = transform.air.linalg_tile %matmul_2 [32, 32, 0]
-            %fill_2 = transform.air.fuse_into_containing_op %fill_1 into %loops_2#1
-            transform.air.linalg_promote %fill_2 {"operands_to_promote"=[1], "memory_space"="L1"}
-            transform.air.linalg_promote %matmul_3 {"operands_to_promote"=[2], "memory_space"="L1"}
-            %matmul_4, %reduction_loop_2 = transform.air.linalg_tile %matmul_3 [0, 0, 32]
-            transform.air.linalg_promote %matmul_4 {"operands_to_promote"=[0,1], "memory_space"="L1"}
-        }
-    }
-    """
-    
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
-    
-    with open('air_tiled.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## Binding scf.paralell to air hierarchies
-    ################################################
-
-    pipeline = "builtin.module("+",".join([
-        "buffer-results-to-out-params",
-        "air-par-to-herd{depth=1}",
-        "air-par-to-launch{has-air-segment=true}",
-        "air-copy-to-dma",
-        "canonicalize", "cse",
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-
-    with open('air_sync.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## Extract event dependency and optimize schedule
-    ################################################
-
-    pipeline = "builtin.module("+",".join([
-        "air-dependency",
-        "air-dependency-schedule-opt",
-        "air-specialize-dma-broadcast",
-        "air-dma-to-channel",
-        "canonicalize", "cse",
-        "air-dependency-canonicalize",
-        "canonicalize", "cse",
-        "air-label-scf-for-to-ping-pong",
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    # Not sure why parsing the ir solves the segmentation fault...
-    air_module = Module.parse(str(air_module))
-    pipeline = "builtin.module("+",".join([
-        "air-ping-pong-transform{keep-memref-dealloc=true}",
-        "canonicalize", "cse",
-        "air-isolate-async-dma-loop-nests",
-        "air-specialize-channel-wrap-and-stride",
-        "canonicalize", "cse",
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('aircc_input.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## Place herd to segment
-    ################################################
-
-    air_async_module = Module.parse(str(air_module))
-    pipeline = "builtin.module("+",".join([
-        "func.func(air-collapse-herd)",
-        'canonicalize', 'cse',
-        "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=0}",
-        'canonicalize', 'cse',
-        'func.func(air-renumber-dma)',
-        'func.func(convert-linalg-to-loops)',
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('air_placed.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## MLIR-AIR to MLIR-AIE
-    ################################################
-    
-    pipeline = "builtin.module("+",".join([
-        'air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}',
-        'canonicalize',
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('aircc_decomp_aiecc.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## MLIR-AIR runtime lowering
-    ################################################
-
-    pipeline = "builtin.module("+",".join([
-      'air-to-std',
-      'canonicalize',
-      'func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})',
-      'func.func(air-unroll-outer-affine-loops{depth=2})',
-      'affine-expand-index-ops',
-      'airrt-to-npu',
-      'canonicalize',
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('aie.mlir', 'w') as f:
-        f.write(str(air_module))
diff --git a/examples/air_to_npu/aie_w_pack.py b/examples/air_to_npu/aie_w_pack.py
deleted file mode 100644
index 4b847d513..000000000
--- a/examples/air_to_npu/aie_w_pack.py
+++ /dev/null
@@ -1,180 +0,0 @@
-import air
-import air.compiler.util
-from air.dialects import linalg, tensor, arith, func, memref
-from air.ir import *
-import air.passmanager
-from air.dialects import air as airdialect
-from air.compiler.util import run_transform
-import sys
-
-with air.ir.Context() as ctx, Location.unknown():
-    
-    ################################################
-    ## Tiling
-    ################################################
-    
-    air_tiled_ir_string = """
-    #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>
-    #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>
-    #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>
-    module {
-      func.func @forward(%arg0: memref<2048x2048xi32>, %arg1: memref<2048x2048xi32>) -> memref<2048x2048xi32> {
-        %c32 = arith.constant 32 : index
-        %c256 = arith.constant 256 : index
-        %c2048 = arith.constant 2048 : index
-        %c0 = arith.constant 0 : index
-        %c64 = arith.constant 64 : index
-        %c0_i32 = arith.constant 0 : i32
-        %alloc = memref.alloc() : memref<2048x2048xi32>
-        scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
-          %subview = memref.subview %alloc[%arg2, %arg3] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
-          %alloc_0 = memref.alloc() : memref<1x1x64x64xi32, 1>
-          scf.parallel (%arg4, %arg5) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
-            %alloc_2 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2>
-            linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<1x1x4x8x4x8xi32, 2>)
-            %subview_3 = memref.subview %alloc_0[0, 0, %arg4, %arg5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1>
-            scf.for %arg6 = %c0 to %c2048 step %c256 {
-              %subview_5 = memref.subview %arg0[%arg2, %arg6] [64, 256] [1, 1] : memref<2048x2048xi32> to memref<64x256xi32, strided<[2048, 1], offset: ?>>
-              %subview_6 = memref.subview %arg1[%arg6, %arg3] [256, 64] [1, 1] : memref<2048x2048xi32> to memref<256x64xi32, strided<[2048, 1], offset: ?>>
-              %alloc_7 = memref.alloc() : memref<1x1x64x256xi32, 1>
-              %alloc_8 = memref.alloc() : memref<1x1x256x64xi32, 1>
-              air.dma_memcpy_nd (%alloc_7[] [] [], %subview_5[] [] []) : (memref<1x1x64x256xi32, 1>, memref<64x256xi32, strided<[2048, 1], offset: ?>>)
-              air.dma_memcpy_nd (%alloc_8[] [] [], %subview_6[] [] []) : (memref<1x1x256x64xi32, 1>, memref<256x64xi32, strided<[2048, 1], offset: ?>>)
-              scf.for %arg7 = %c0 to %c256 step %c32 {
-                %subview_9 = memref.subview %alloc_7[0, 0, %arg4, %arg7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x256xi32, 1> to memref<1x1x32x32xi32, strided<[16384, 16384, 256, 1], offset: ?>, 1>
-                %subview_10 = memref.subview %alloc_8[0, 0, %arg7, %arg5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x256x64xi32, 1> to memref<1x1x32x32xi32, strided<[16384, 16384, 64, 1], offset: ?>, 1>
-                %alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2>
-                %alloc_12 = memref.alloc() : memref<1x1x4x4x8x8xi32, 2>
-                %expand_shape = memref.expand_shape %subview_9 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[16384, 16384, 256, 1], offset: ?>, 1> into memref<1x1x8x4x4x8xi32, strided<[16384, 16384, 1024, 256, 8, 1], offset: ?>, 1>
-                %transpose_13 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[16384, 16384, 1024, 256, 8, 1], offset: ?>, 1> to memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 8, 1024, 256, 1], offset: ?>, 1>
-                air.dma_memcpy_nd (%alloc_11[] [] [], %transpose_13[] [] []) : (memref<1x1x4x8x4x8xi32, 2>, memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 8, 1024, 256, 1], offset: ?>, 1>)
-                %expand_shape_14 = memref.expand_shape %subview_10 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[16384, 16384, 64, 1], offset: ?>, 1> into memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 512, 64, 8, 1], offset: ?>, 1>
-                %transpose_15 = memref.transpose %expand_shape_14 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x4x8xi32, strided<[16384, 16384, 512, 64, 8, 1], offset: ?>, 1> to memref<1x1x4x4x8x8xi32, strided<[16384, 16384, 8, 512, 64, 1], offset: ?>, 1>
-                air.dma_memcpy_nd (%alloc_12[] [] [], %transpose_15[] [] []) : (memref<1x1x4x4x8x8xi32, 2>, memref<1x1x4x4x8x8xi32, strided<[16384, 16384, 8, 512, 64, 1], offset: ?>, 1>)
-                linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2>, memref<1x1x4x4x8x8xi32, 2>) outs(%alloc_2 : memref<1x1x4x8x4x8xi32, 2>) {
-                ^bb0(%in: i32, %in_16: i32, %out: i32):
-                  %0 = arith.muli %in, %in_16 : i32
-                  %1 = arith.addi %out, %0 : i32
-                  linalg.yield %1 : i32
-                }
-                memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2>
-                memref.dealloc %alloc_12 : memref<1x1x4x4x8x8xi32, 2>
-              }
-              memref.dealloc %alloc_7 : memref<1x1x64x256xi32, 1>
-              memref.dealloc %alloc_8 : memref<1x1x256x64xi32, 1>
-            }
-            %transpose_4 = memref.transpose %alloc_2 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x4x8x4x8xi32, 2> to memref<1x1x8x4x4x8xi32, strided<[1024, 1024, 32, 8, 256, 1]>, 2>
-            air.dma_memcpy_nd (%subview_3[] [] [], %transpose_4[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1>, memref<1x1x8x4x4x8xi32, strided<[1024, 1024, 32, 8, 256, 1]>, 2>)
-            memref.dealloc %alloc_2 : memref<1x1x4x8x4x8xi32, 2>
-            scf.reduce 
-          }
-          %subview_1 = memref.subview %alloc_0[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1> to memref<64x64xi32, 1>
-          %transpose = memref.transpose %subview_1 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1> to memref<64x64xi32, strided<[64, 1]>, 1>
-          air.dma_memcpy_nd (%subview[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1>)
-          memref.dealloc %alloc_0 : memref<1x1x64x64xi32, 1>
-          scf.reduce 
-        }
-        return %alloc : memref<2048x2048xi32>
-      }
-    }
-    """
-    air_module = Module.parse(air_tiled_ir_string)
-    
-    with open('air_tiled.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## Binding scf.paralell to air hierarchies
-    ################################################
-
-    pipeline = "builtin.module("+",".join([
-        "buffer-results-to-out-params",
-        "air-par-to-herd{depth=1}",
-        "air-par-to-launch{has-air-segment=true}",
-        "air-copy-to-dma",
-        "canonicalize", "cse",
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-
-    with open('air_sync.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## Extract event dependency and optimize schedule
-    ################################################
-
-    pipeline = "builtin.module("+",".join([
-        "air-dependency",
-        "air-dependency-schedule-opt",
-        "air-specialize-dma-broadcast",
-        "air-dma-to-channel",
-        "canonicalize", "cse",
-        "air-dependency-canonicalize",
-        "canonicalize", "cse",
-        "func.func(air-loop-fusion)",
-        "air-label-scf-for-to-ping-pong",
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    # Not sure why parsing the ir solves the segmentation fault...
-    air_module = Module.parse(str(air_module))
-    pipeline = "builtin.module("+",".join([
-        "air-ping-pong-transform{keep-memref-dealloc=true}",
-        "canonicalize", "cse",
-        "air-specialize-channel-wrap-and-stride",
-        "canonicalize", "cse",
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('aircc_input.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## Place herd to segment
-    ################################################
-
-    air_async_module = Module.parse(str(air_module))
-    pipeline = "builtin.module("+",".join([
-        "func.func(air-collapse-herd)",
-        'canonicalize', 'cse',
-        "air-place-herds{num-rows=4 num-cols=1 row-anchor=2 col-anchor=0}",
-        'canonicalize', 'cse',
-        'func.func(air-renumber-dma)',
-        'func.func(convert-linalg-to-loops)',
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('air_placed.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## MLIR-AIR to MLIR-AIE
-    ################################################
-    
-    pipeline = "builtin.module("+",".join([
-        'air-to-aie{row-offset=2 col-offset=0 device=npu emit-while-loop=true}',
-        'canonicalize',
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('aircc_decomp_aiecc.mlir', 'w') as f:
-        f.write(str(air_module))
-    
-    ################################################
-    ## MLIR-AIR runtime lowering
-    ################################################
-
-    pipeline = "builtin.module("+",".join([
-      'air-to-std',
-      'canonicalize',
-      'func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})',
-      'func.func(air-unroll-outer-affine-loops{depth=2})',
-      'affine-expand-index-ops',
-      'airrt-to-npu',
-      'canonicalize',
-    ])+')'
-    pm = air.passmanager.PassManager.parse(pipeline)
-    pm.run(air_module.operation)
-    with open('aie.mlir', 'w') as f:
-        f.write(str(air_module))
diff --git a/examples/air_to_npu/test.cpp b/examples/air_to_npu/test.cpp
deleted file mode 100644
index 7128685bc..000000000
--- a/examples/air_to_npu/test.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-#include <boost/program_options.hpp>
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-#define M 2048
-#define N 2048
-#define K 2048
-
-#define A_VOLUME M *K
-#define B_VOLUME N *K
-#define C_VOLUME M *N
-
-#define A_DATATYPE int32_t
-#define B_DATATYPE int32_t
-#define C_DATATYPE int32_t
-
-constexpr int A_SIZE = (A_VOLUME * sizeof(A_DATATYPE));
-constexpr int B_SIZE = (B_VOLUME * sizeof(B_DATATYPE));
-constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE));
-constexpr int TRACE_SIZE = (0 * sizeof(uint32_t));
-
-namespace po = boost::program_options;
-
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
-  if (!vm_in.count(name)) {
-    throw std::runtime_error("Error: no " + name + " file was provided\n");
-  } else {
-    std::ifstream test(vm_in[name].as<std::string>());
-    if (!test) {
-      throw std::runtime_error("The " + name + " file " +
-                               vm_in[name].as<std::string>() +
-                               " does not exist.\n");
-    }
-  }
-}
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
-template <typename T>
-void mm_out(std::vector<T> a, std::vector<T> b, std::vector<T> &r) {
-  for (size_t m1 = 0; m1 < M; m1++) {
-    for (size_t n1 = 0; n1 < N; n1++) {
-      size_t idx = m1 * N + n1;
-      r[idx] = (T)(0);
-      for (size_t k1 = 0; k1 < K; k1++) {
-        T _a = a[k1 + m1 * K];
-        T _b = b[n1 + k1 * N];
-        r[idx] += _a * _b;
-      }
-    }
-  }
-}
-
-int main(int argc, const char *argv[]) {
-
-  // Program arguments parsing
-  po::options_description desc("Allowed options");
-  desc.add_options()("help,h", "produce help message")(
-      "xclbin,x", po::value<std::string>()->required(),
-      "the input xclbin path")(
-      "kernel,k", po::value<std::string>()->required(),
-      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
-      "verbosity,v", po::value<int>()->default_value(0),
-      "the verbosity of the output")(
-      "instr,i", po::value<std::string>()->required(),
-      "path of file containing userspace instructions to be sent to the LX6");
-  po::variables_map vm;
-
-  try {
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return 1;
-    }
-  } catch (const std::exception &ex) {
-    std::cerr << ex.what() << "\n\n";
-    std::cerr << "Usage:\n" << desc << "\n";
-    return 1;
-  }
-
-  check_arg_file_exists(vm, "xclbin");
-  check_arg_file_exists(vm, "instr");
-
-  std::vector<uint32_t> instr_v =
-      load_instr_sequence(vm["instr"].as<std::string>());
-
-  int verbosity = vm["verbosity"].as<int>();
-  if (verbosity >= 1)
-    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-  // Start the XRT test code
-  // Get a device handle
-  unsigned int device_index = 0;
-  auto device = xrt::device(device_index);
-
-  // Load the xclbin
-  if (verbosity >= 1)
-    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
-  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
-
-  if (verbosity >= 1)
-    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
-  std::string Node = vm["kernel"].as<std::string>();
-
-  // Get the kernel from the xclbin
-  auto xkernels = xclbin.get_kernels();
-  auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
-                               [Node](xrt::xclbin::kernel &k) {
-                                 auto name = k.get_name();
-                                 std::cout << "Name: " << name << std::endl;
-                                 return name.rfind(Node, 0) == 0;
-                               });
-  auto kernelName = xkernel.get_name();
-
-  if (verbosity >= 1)
-    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
-              << "\n";
-
-  device.register_xclbin(xclbin);
-
-  // get a hardware context
-  if (verbosity >= 1)
-    std::cout << "Getting hardware context.\n";
-  xrt::hw_context context(device, xclbin.get_uuid());
-
-  // get a kernel handle
-  if (verbosity >= 1)
-    std::cout << "Getting handle to kernel:" << kernelName << "\n";
-  auto kernel = xrt::kernel(context, kernelName);
-
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-  auto bo_a =
-      xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-  auto bo_b =
-      xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_c =
-      xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-  if (verbosity >= 1)
-    std::cout << "Writing data into buffer objects.\n";
-  A_DATATYPE *bufA = bo_a.map<A_DATATYPE *>();
-  std::vector<A_DATATYPE> AVec;
-  for (int i = 0; i < A_VOLUME; i++)
-    AVec.push_back(rand() % UINT16_MAX);
-  memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE)));
-  B_DATATYPE *bufB = bo_b.map<B_DATATYPE *>();
-  std::vector<B_DATATYPE> BVec;
-  for (int i = 0; i < B_VOLUME; i++)
-    BVec.push_back(rand() % UINT16_MAX);
-  memcpy(bufB, BVec.data(), (BVec.size() * sizeof(B_DATATYPE)));
-  C_DATATYPE *bufC = bo_c.map<C_DATATYPE *>();
-  std::vector<C_DATATYPE> CVec;
-  for (int i = 0; i < C_VOLUME; i++)
-    CVec.push_back(0);
-  memcpy(bufC, CVec.data(), (CVec.size() * sizeof(C_DATATYPE)));
-
-  void *bufInstr = bo_instr.map<void *>();
-  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  if (verbosity >= 1)
-    std::cout << "Running Kernel.\n";
-  auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
-  run.wait();
-
-  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  C_DATATYPE *bufOut = bo_c.map<C_DATATYPE *>();
-
-  int errors = 0;
-  int max_errors = 100;
-
-  std::vector<C_DATATYPE> output_ref0;
-  for (uint32_t i = 0; i < C_VOLUME; i++)
-    output_ref0.push_back(0);
-  mm_out(AVec, BVec, output_ref0);
-
-  for (uint32_t i = 0; i < C_VOLUME; i++) {
-    if (bufOut[i] != output_ref0[i]) {
-      errors++;
-      if (errors < max_errors) {
-        std::cout << "\nerror, id " << i << " expected "
-                  << std::to_string(output_ref0[i]) << ", got"
-                  << std::to_string(bufOut[i]) << "\n";
-      }
-    }
-  }
-
-  if (!errors) {
-    std::cout << "\nPASS!\n\n";
-    return 0;
-  } else {
-    std::cout << "\nerror count: " << errors << "\n\n";
-    std::cout << "\nfailed.\n\n";
-    return 1;
-  }
-}