Correctly generate offsets for DMA memcpys for AIEs (#713)
* No need to insert an add instruction when loop bounds are correct

* Fix offset computation when folding loops

* clang-format

* fix format and syntax errors

* Add a test that demonstrates correctly splitting dma memcpys

* Add a test covering folding of offsets into strides in dma memcpys

* Remove debug print
akkothar authored Aug 16, 2024
1 parent 388bd8b commit 74d1b73
Showing 4 changed files with 1,011 additions and 52 deletions.
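The core of this change is folding a loop nest around a DMA memcpy into the op's offsets/sizes/strides list instead of leaving the loop in the generated code. As a rough, self-contained illustration of what "folding of offsets into strides" means here (the sizes below are invented, not taken from the new tests):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical original form: scf.for %iv = 0 to 4 { 1-D copy of 16 elements at offset %iv * 64 }
  const int64_t trip_count = 4, loop_stride = 64, length = 16;
  std::vector<int64_t> looped, folded;
  for (int64_t iv = 0; iv < trip_count; ++iv)
    for (int64_t e = 0; e < length; ++e)
      looped.push_back(iv * loop_stride + e);

  // Folded form: a single copy with sizes = [4, 16], strides = [64, 1], offset = 0.
  const int64_t sizes[2] = {trip_count, length};
  const int64_t strides[2] = {loop_stride, 1};
  for (int64_t i = 0; i < sizes[0]; ++i)
    for (int64_t e = 0; e < sizes[1]; ++e)
      folded.push_back(i * strides[0] + e * strides[1]);

  // Both enumerate the same element offsets.
  std::cout << (looped == folded ? "same access pattern" : "mismatch") << "\n";
  return 0;
}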
7 changes: 2 additions & 5 deletions mlir/lib/Conversion/AIRRtToNpuPass.cpp
@@ -742,11 +742,8 @@ void tileIllegalWrapDim(airrt::DmaMemcpyNdOp memcpy_op) {
// Innermost tiled affine.for loop induction variable as lowest offset, if
// original rank exceeds hw limit.
new_opers.insert(new_opers.end(), offsets.begin(), offsets.end() - 1);
auto new_inner_offset = builder.create<arith::AddIOp>(
loc,
builder.create<arith::IndexCastOp>(loc, IntegerType::get(ctx, 64),
inner_affine_for_iv),
offsets.back());
auto new_inner_offset = builder.create<arith::IndexCastOp>(
loc, IntegerType::get(ctx, 64), inner_affine_for_iv);
new_opers.push_back(new_inner_offset);
} else
new_opers.insert(new_opers.end(), offsets.begin(), offsets.end());
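The flat rendering above shows both versions of new_inner_offset back to back; going by the hunk's "2 additions & 5 deletions", the arith::AddIOp form is the code being removed and the bare arith::IndexCastOp form is its replacement. The commit message's rationale is that the tiled affine.for already iterates over the correct offsets, so adding offsets.back() on top of the induction variable would apply that offset twice. A tiny numeric illustration of that reading (the bounds are made up, not taken from the pass or its tests):

#include <cstdint>
#include <iostream>

int main() {
  // Assume the tiled loop was generated as: affine.for %iv = 96 to 160 step 32,
  // where 96 is the original innermost offset (hypothetical numbers).
  const int64_t original_offset = 96;
  const int64_t iv_first_iter = 96; // the induction variable already starts at that offset

  int64_t with_add = iv_first_iter + original_offset; // old lowering (AddIOp): 192, offset applied twice
  int64_t without_add = iv_first_iter;                // new lowering (IndexCastOp only): 96
  std::cout << with_add << " vs " << without_add << "\n";
  return 0;
}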
109 changes: 62 additions & 47 deletions mlir/lib/Util/Util.cpp
@@ -907,6 +907,7 @@ LogicalResult eraseWrapNStrideDim(OpBuilder builder,
offset_expr = offset_expr.replaceDimsAndSymbols({}, symReplacements);
auto next_offset_map = AffineMap::get(0, 1, offset_expr);
affine_apply.setMap(next_offset_map);
offsets[i] = affine_apply;
offsets[i + 1] = offsets[i];
}
erased |= multiplyAdjWraps(builder, i, sizes);
@@ -927,7 +928,6 @@ LogicalResult air::canonicalizeWrapAndStrideList(OpBuilder builder,
SmallVector<Value> &sizes,
SmallVector<Value> &strides,
int memref_volume) {

bool listsHaveChanged = false;
// Match offsets size with sizes and strides
auto max_dim_size =
@@ -1004,33 +1004,85 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(

// Fold for loops int channel op's wrap and stride fields
SmallVector<Operation *> for_loops;
SmallVector<Value> ivs;
Operation *parent = channel_op;
while (parent != for_op) {
parent = parent->getParentOp();
if (isa<scf::ForOp>(parent))
if (auto sfo = dyn_cast<scf::ForOp>(parent)) {
for_loops.push_back(parent);
else if (isa<affine::AffineForOp>(parent))
ivs.push_back(sfo.getInductionVar());
} else if (auto afo = dyn_cast<affine::AffineForOp>(parent)) {
for_loops.push_back(parent);
ivs.push_back(afo.getInductionVar());
}
}

// First traversal inserting new dimensions from loops
std::map<Operation *, int> op_to_count;
for (auto o : for_loops) {
uint64_t ind_var_factor = 0;
int64_t stepSize = -1;
int loop_lower_bound = 0;
Value iv = nullptr;
if (auto afo = dyn_cast<affine::AffineForOp>(o)) {
iv = afo.getInductionVar();
loop_lower_bound = afo.getConstantLowerBound();
stepSize = afo.getStepAsInt();
} else if (auto sfo = dyn_cast<scf::ForOp>(o)) {
iv = sfo.getInductionVar();
if (auto cst_lower_bound = mlir::getConstantIntValue(sfo.getLowerBound()))
loop_lower_bound = *cst_lower_bound;
stepSize = *mlir::getConstantIntValue(sfo.getStep());
}
int64_t ind_var_factor = 0;
for (int i = offsets.size() - 1; i >= 0; i--) {
Value iv = nullptr;
if (auto afo = dyn_cast<affine::AffineForOp>(o))
iv = afo.getInductionVar();
else if (auto sfo = dyn_cast<scf::ForOp>(o))
iv = sfo.getInductionVar();
if (iv && offsets[i] == iv) {
ind_var_factor = *getConstantIntValue(strides[i]);
offsets[i] = builder.template create<arith::ConstantIndexOp>(
loc, loop_lower_bound);
break;
} else if (iv && offsets[i].getDefiningOp()) {
Operation *iv_consumer = offsets[i].getDefiningOp();
if (auto exec = dyn_cast<air::ExecuteOp>(iv_consumer))
iv_consumer = exec.getChildOp();
if (auto affop = dyn_cast<affine::AffineApplyOp>(iv_consumer)) {
// The induction variable must be the input to the affine op
if (affop.getSymbolOperands().size() == 1) {
bool iv_is_symbol = false;
for (auto val : affop.getSymbolOperands()) {
if (val == iv) {
iv_is_symbol = true;
break;
}
}
if (iv_is_symbol) {
auto map = affop.getAffineMap();
ind_var_factor = air::evaluateConstantsInMap(
map,
SmallVector<std::optional<int64_t>>{
std::optional<int64_t>{stepSize}},
for_op->getContext())
.value();
offsets[i] = builder.template create<arith::ConstantIndexOp>(
loc, loop_lower_bound);
break;
}
}
}
if (llvm::is_contained(iv_consumer->getOperands(), iv)) {
if (op_to_count.find(iv_consumer) == op_to_count.end()) {
op_to_count[iv_consumer] = 0;
for (auto operand : iv_consumer->getOperands()) {
for (auto iv_val : ivs) {
if (iv_val == operand)
op_to_count[iv_consumer]++;
}
}
}
op_to_count[iv_consumer]--;
ind_var_factor = *getConstantIntValue(strides[i]);
if (!op_to_count[iv_consumer]) {
offsets[i] = builder.template create<arith::ConstantIndexOp>(
loc, loop_lower_bound);
}
break;
}
}
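One piece of the new offset-matching logic above worth calling out: when an offset is produced by an op that consumes induction variables from more than one of the loops being folded (for example an affine.apply wrapped in an air.execute), the op_to_count bookkeeping defers rewriting that offset to a constant until every contributing loop has been processed. A small standalone sketch of that counting idea, using hypothetical names rather than the pass's own data structures:

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Hypothetical: one offset is defined by an op using the IVs of two folded loops.
  std::vector<std::string> folded_loop_ivs = {"%iv0", "%iv1"};
  const std::string consumer = "affine.apply(%iv0, %iv1)"; // op defining the offset
  std::map<std::string, int> op_to_count;                  // consumer -> IVs not yet folded

  for (const auto &iv : folded_loop_ivs) {
    if (op_to_count.find(consumer) == op_to_count.end())
      op_to_count[consumer] = (int)folded_loop_ivs.size(); // first visit: count contributing IVs
    op_to_count[consumer]--; // this loop's IV is now folded into a wrap/stride dimension
    std::cout << "after folding " << iv << ": " << op_to_count[consumer] << " IV(s) left\n";
    if (op_to_count[consumer] == 0)
      std::cout << "offset can now be replaced by the constant loop lower bound\n";
  }
  return 0;
}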
Expand All @@ -1042,11 +1094,6 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
trip_count = *getStaticScfForTripCountAsInt(sfo);
Value new_wrap =
builder.template create<arith::ConstantIndexOp>(loc, trip_count);
int stepSize = -1;
if (auto afo = dyn_cast<affine::AffineForOp>(o))
stepSize = afo.getStepAsInt();
else if (auto sfo = dyn_cast<scf::ForOp>(o))
stepSize = *mlir::getConstantIntValue(sfo.getStep());
int64_t new_stride_value =
(stepSize * ind_var_factor) % getTensorVolume(memref.getType());
Value new_stride =
@@ -1069,38 +1116,6 @@ LogicalResult air::foldForLoopNestAsExtendedSizesAndStrides(
wraps.insert(wraps.begin(), new_wrap);
strides.insert(strides.begin(), new_stride);
}

// Second traversal updating existing offsets
for (auto o : for_loops) {
for (int i = offsets.size() - 1; i >= 0; i--) {
Value iv = nullptr;
int loop_lower_bound = 0;
if (auto afo = dyn_cast<affine::AffineForOp>(o)) {
iv = afo.getInductionVar();
loop_lower_bound = afo.getConstantLowerBound();
} else if (auto sfo = dyn_cast<scf::ForOp>(o)) {
iv = sfo.getInductionVar();
if (auto cst_lower_bound =
mlir::getConstantIntValue(sfo.getLowerBound()))
loop_lower_bound = *cst_lower_bound;
}
if (iv && offsets[i] == iv) {
// Replace offset with for loop lower bound
offsets[i] = builder.template create<arith::ConstantIndexOp>(
loc, loop_lower_bound);
break;
} else if (iv && offsets[i].getDefiningOp()) {
Operation *iv_consumer = offsets[i].getDefiningOp();
if (auto exec = dyn_cast<air::ExecuteOp>(iv_consumer))
iv_consumer = exec.getChildOp();
if (llvm::is_contained(iv_consumer->getOperands(), iv)) {
offsets[i] = builder.template create<arith::ConstantIndexOp>(
loc, loop_lower_bound);
break;
}
}
}
}
return success();
}

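With the loop step (stepSize) now gathered once per loop before the offset matching, the folded dimension's wrap is the loop trip count and its stride is stepSize times ind_var_factor, wrapped at the memref volume, as in the hunk above. A quick numeric check of that formula with invented sizes:

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical values, for illustration only.
  const int64_t stepSize = 2;           // loop step
  const int64_t ind_var_factor = 32;    // stride of the dimension the IV addresses
  const int64_t tensor_volume = 4 * 32; // e.g. a memref<4x32xi32>
  const int64_t trip_count = 4;

  const int64_t new_wrap = trip_count;
  const int64_t new_stride_value = (stepSize * ind_var_factor) % tensor_volume;
  std::cout << "wrap=" << new_wrap << " stride=" << new_stride_value << "\n"; // wrap=4 stride=64
  return 0;
}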
(The two remaining changed files, the new tests added by this commit, are not rendered on this page.)
