[compiler] support decompose of byteir.softmax #454

Merged: 1 commit, Oct 9, 2024
@@ -53,6 +53,69 @@ struct DecomposeByteIRAddN : public OpRewritePattern<mhlo::CustomCallOp> {
}
};

struct DecomposeByteIRSoftmax : public OpRewritePattern<mhlo::CustomCallOp> {
  using OpRewritePattern<mhlo::CustomCallOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(mhlo::CustomCallOp op,
                                PatternRewriter &rewriter) const override {
    if (op.getCallTargetName() != getSoftmaxName())
      return failure();

    // The attribute may be absent; dyn_cast_or_null keeps the null check
    // meaningful (a plain cast would assert on a null attribute).
    DictionaryAttr byteirAttrs =
        dyn_cast_or_null<DictionaryAttr>(op->getAttr(getCustomCallAttrName()));
    if (!byteirAttrs)
      return failure();
    auto axisAttr = cast<IntegerAttr>(byteirAttrs.get("axis"));

    RankedTensorType inType =
        cast<RankedTensorType>(op.getOperand(0).getType());
    // softmax(x) = exp(x) / sum(exp(x)) along `axis`.
    Value exp = rewriter.create<mhlo::ExpOp>(op.getLoc(), op.getOperand(0));
    Value reduce;
    {
      // The reduction drops `axis` from the result shape.
      SmallVector<int64_t> reduceResultShape(inType.getShape());
      reduceResultShape.erase(reduceResultShape.begin() + axisAttr.getInt());
      RankedTensorType reduceResultType =
          RankedTensorType::get(reduceResultShape, inType.getElementType());

      Value initValue = rewriter.create<mhlo::ConstantOp>(
          op.getLoc(),
          DenseElementsAttr::get(
              RankedTensorType::get({}, inType.getElementType()),
              {APFloat::getZero(cast<mlir::FloatType>(inType.getElementType())
                                    .getFloatSemantics())}));
      auto reduceOp = rewriter.create<mhlo::ReduceOp>(
          op.getLoc(), reduceResultType, exp, initValue,
          rewriter.getI64TensorAttr({axisAttr.getInt()}));

      // Build the reduction body: a scalar mhlo.add over the two block args.
      Block &block = reduceOp.getBody().emplaceBlock();
      OpBuilder::InsertionGuard guard(rewriter);
      rewriter.setInsertionPointToStart(&block);
      auto blockValArgumentType =
          RankedTensorType::get({}, inType.getElementType());
      block.addArgument(blockValArgumentType, op->getLoc());
      block.addArgument(blockValArgumentType, op->getLoc());
      auto *firstValArg = block.args_begin();
      auto *secondValArg = std::next(firstValArg);
      Value result = rewriter.create<mhlo::AddOp>(op->getLoc(), *firstValArg,
                                                  *secondValArg);
      rewriter.create<mhlo::ReturnOp>(op->getLoc(), result);

      reduce = reduceOp.getResults()[0];
    }

    // Broadcast the sums back to the input shape (every dimension except
    // `axis` maps through) and divide elementwise.
    SmallVector<int64_t> broadcastDim =
        llvm::to_vector(llvm::seq<int64_t>(0, inType.getRank()));
    broadcastDim.erase(broadcastDim.begin() + axisAttr.getInt());
    Value broadcast = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
        op->getLoc(), inType, reduce,
        rewriter.create<shape::ShapeOfOp>(op.getLoc(), exp),
        rewriter.getI64TensorAttr(broadcastDim));
    Value result = rewriter.create<mhlo::DivOp>(op->getLoc(), exp, broadcast);
    rewriter.replaceOp(op, result);
    return success();
  }
};
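
For reference, here is a sketch of the IR this pattern should produce for the rank-2, axis = 1 case (a hand-written illustration inferred from the ops built above; the SSA names and exact pretty-printed forms are assumptions, not output captured from the PR):

%exp = mhlo.exponential %arg0 : tensor<4x4xf32>
%zero = mhlo.constant dense<0.000000e+00> : tensor<f32>
%sum = mhlo.reduce(%exp init: %zero) applies mhlo.add across dimensions = [1] : (tensor<4x4xf32>, tensor<f32>) -> tensor<4xf32>
%shape = shape.shape_of %exp : tensor<4x4xf32> -> tensor<2xindex>
%bcast = "mhlo.dynamic_broadcast_in_dim"(%sum, %shape) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor<2xindex>) -> tensor<4x4xf32>
%out = mhlo.divide %exp, %bcast : tensor<4x4xf32>

This realizes softmax(x)_i = exp(x_i) / Σ_j exp(x_j) along the chosen axis: the reduction drops the axis from the shape, and broadcast_dimensions = [0] maps the remaining dimension back so the division is elementwise.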

struct DecomposeByteIRArgMaxMin : public OpRewritePattern<mhlo::CustomCallOp> {
DecomposeByteIRArgMaxMin(MLIRContext *context, llvm::StringRef customCallName)
: OpRewritePattern<mhlo::CustomCallOp>(context),
@@ -230,6 +293,9 @@ struct DecomposeMhloCustomCallOpsPass
if (!legalOpsSet.contains(getAddNName())) {
patterns.add<DecomposeByteIRAddN>(context);
}
if (!legalOpsSet.contains(getSoftmaxName())) {
patterns.add<DecomposeByteIRSoftmax>(context);
}
if (!legalOpsSet.contains(getArgMaxName())) {
patterns.add<DecomposeByteIRArgMaxMin>(context, getArgMaxName());
}
@@ -10,6 +10,19 @@ func.func @byteir.addn(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>, %arg2: tensor
// CHECK: mhlo.add
// CHECK: return

func.func @byteir.softmax(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> {
%0 = mhlo.custom_call @byteir.softmax(%arg0) {byteir_attrs = {axis = 1 : i64}} : (tensor<4x4xf32>) -> tensor<4x4xf32>
return %0 : tensor<4x4xf32>
}
// CHECK-LABEL: func.func @byteir.softmax
// CHECK-NOT: byteir.softmax
// CHECK: mhlo.exp
// CHECK: mhlo.reduce
// CHECK-SAME: mhlo.add
// CHECK: mhlo.broadcast_in_dim
// CHECK: mhlo.div
// CHECK: return
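
A note on the CHECK lines: `CHECK-SAME: mhlo.add` relies on mhlo's compact pretty printer folding a single-op reduction body into the `mhlo.reduce` line, so the add appears on the same printed line, roughly as below (the exact text is an assumption about the printed form, not taken from the PR):

%1 = mhlo.reduce(%0 init: %cst) applies mhlo.add across dimensions = [1] : (tensor<4x4xf32>, tensor<f32>) -> tensor<4xf32>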

func.func @byteir.arg_max$return_1(%arg0: tensor<3x4xf32>) -> tensor<3xi64> {
%0 = mhlo.custom_call @byteir.arg_max(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x4xf32>) -> tensor<3xi64>
return %0 : tensor<3xi64>
@@ -0,0 +1,4 @@
func.func @byteir.softmax(%arg0: tensor<10x128xf32>) -> tensor<10x128xf32> {
%0 = stablehlo.custom_call @byteir.softmax(%arg0) {byteir_attrs = {axis = 1 : i64}} : (tensor<10x128xf32>) -> tensor<10x128xf32>
return %0 : tensor<10x128xf32>
}
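
This stablehlo variant carries no FileCheck directives, so it presumably feeds a numerical end-to-end test (an assumption; the test harness is not shown in the diff). As a quick sanity check of the math along axis = 1: for a row x = [0, ln 3], exp(x) = [1, 3], the row sum is 4, so softmax(x) = [0.25, 0.75], and each row of the 10x128 result sums to 1.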