diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 87a798183d6e19..e484c3969fe228 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4798,6 +4798,12 @@ def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> { let Prototype = "int(unsigned int, unsigned int, int)"; } +def HLSLDot4AddU8Packed : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_dot4add_u8packed"]; + let Attributes = [NoThrow, Const]; + let Prototype = "unsigned int(unsigned int, unsigned int, unsigned int)"; +} + def HLSLFirstBitHigh : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_elementwise_firstbithigh"]; let Attributes = [NoThrow, Const]; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5c3df5124517d6..0ef9058640db6a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18881,6 +18881,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/C->getType(), ID, ArrayRef{A, B, C}, nullptr, "hlsl.dot4add.i8packed"); } + case Builtin::BI__builtin_hlsl_dot4add_u8packed: { + Value *A = EmitScalarExpr(E->getArg(0)); + Value *B = EmitScalarExpr(E->getArg(1)); + Value *C = EmitScalarExpr(E->getArg(2)); + + Intrinsic::ID ID = CGM.getHLSLRuntime().getDot4AddU8PackedIntrinsic(); + return Builder.CreateIntrinsic( + /*ReturnType=*/C->getType(), ID, ArrayRef{A, B, C}, nullptr, + "hlsl.dot4add.u8packed"); + } case Builtin::BI__builtin_hlsl_elementwise_firstbithigh: { Value *X = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index ff810cc535c087..caf8777fd95a9f 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -90,6 +90,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot) GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot) GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddI8Packed, dot4add_i8packed) + GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddU8Packed, dot4add_u8packed) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane) GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitUHigh, firstbituhigh) diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 12d3aa418449c0..2ee3827d720495 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -942,7 +942,13 @@ uint64_t dot(uint64_t4, uint64_t4); _HLSL_AVAILABILITY(shadermodel, 6.4) _HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_i8packed) -int dot4add_i8packed(unsigned int, unsigned int, int); +int dot4add_i8packed(uint, uint, int); + +/// \fn uint dot4add_u8packed(uint A, uint B, uint C) + +_HLSL_AVAILABILITY(shadermodel, 6.4) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_u8packed) +uint dot4add_u8packed(uint, uint, uint); //===----------------------------------------------------------------------===// // exp builtins diff --git a/clang/test/CodeGenHLSL/builtins/dot4add_u8packed.hlsl b/clang/test/CodeGenHLSL/builtins/dot4add_u8packed.hlsl new file mode 100644 index 00000000000000..c388cf58cd17c0 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/dot4add_u8packed.hlsl @@ -0,0 +1,18 @@ + +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s -DTARGET=spv + +// Test basic lowering to runtime function call. + +// CHECK-LABEL: define {{.*}}test +uint test(uint a, uint b, uint c) { + // CHECK: %[[RET:.*]] = call [[TY:i32]] @llvm.[[TARGET]].dot4add.u8packed([[TY]] %[[#]], [[TY]] %[[#]], [[TY]] %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return dot4add_u8packed(a, b, c); +} + +// CHECK: declare [[TY]] @llvm.[[TARGET]].dot4add.u8packed([[TY]], [[TY]], [[TY]]) diff --git a/clang/test/SemaHLSL/BuiltIns/dot4add_u8packed-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot4add_u8packed-errors.hlsl new file mode 100644 index 00000000000000..f1fa41902b9687 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/dot4add_u8packed-errors.hlsl @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +int test_too_few_arg0() { + return __builtin_hlsl_dot4add_u8packed(); + // expected-error@-1 {{too few arguments to function call, expected 3, have 0}} +} + +int test_too_few_arg1(int p0) { + return __builtin_hlsl_dot4add_u8packed(p0); + // expected-error@-1 {{too few arguments to function call, expected 3, have 1}} +} + +int test_too_few_arg2(uint p0) { + return __builtin_hlsl_dot4add_u8packed(p0, p0); + // expected-error@-1 {{too few arguments to function call, expected 3, have 2}} +} + +int test_too_many_arg(uint p0) { + return __builtin_hlsl_dot4add_u8packed(p0, p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 3, have 4}} +} + +struct S { float f; }; + +int test_expr_struct_type_check(S p0, uint p1) { + return __builtin_hlsl_dot4add_u8packed(p1, p1, p0); + // expected-error@-1 {{no viable conversion from 'S' to 'unsigned int'}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index c3a935c39ddc30..43267033f024a7 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -69,7 +69,8 @@ def int_dx_udot : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, Commutative] >; - def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; +def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; +def int_dx_dot4add_u8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 629d6759dd65fd..e93d6fa83de61b 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -84,6 +84,7 @@ let TargetPrefix = "spv" in { [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, Commutative] >; def int_spv_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_spv_dot4add_u8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index aab94e95a40ff9..1aabff90e5ec6e 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -822,6 +822,16 @@ def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> { let stages = [Stages]; } +def Dot4AddU8Packed : DXILOp<164, dot4AddPacked> { + let Doc = "unsigned dot product of 4 x i8 vectors packed into i32, with " + "accumulate to i32"; + let LLVMIntrinsic = int_dx_dot4add_u8packed; + let arguments = [Int32Ty, Int32Ty, Int32Ty]; + let result = Int32Ty; + let attributes = [Attributes]; + let stages = [Stages]; +} + def AnnotateHandle : DXILOp<216, annotateHandle> { let Doc = "annotate handle with resource properties"; let arguments = [HandleTy, ResPropsTy]; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index be38b22f70c583..414583aea91e64 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -2687,6 +2687,11 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, STI.isAtLeastSPIRVVer(VersionTuple(1, 6))) return selectDot4AddPacked(ResVReg, ResType, I); return selectDot4AddPackedExpansion(ResVReg, ResType, I); + case Intrinsic::spv_dot4add_u8packed: + if (STI.canUseExtension(SPIRV::Extension::SPV_KHR_integer_dot_product) || + STI.isAtLeastSPIRVVer(VersionTuple(1, 6))) + return selectDot4AddPacked(ResVReg, ResType, I); + return selectDot4AddPackedExpansion(ResVReg, ResType, I); case Intrinsic::spv_all: return selectAll(ResVReg, ResType, I); case Intrinsic::spv_any: diff --git a/llvm/test/CodeGen/DirectX/dot4add_u8packed.ll b/llvm/test/CodeGen/DirectX/dot4add_u8packed.ll new file mode 100644 index 00000000000000..3836b4a4bc16c9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/dot4add_u8packed.ll @@ -0,0 +1,10 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +define void @main(i32 %a, i32 %b, i32 %c) { +entry: +; CHECK: call i32 @dx.op.dot4AddPacked(i32 164, i32 %a, i32 %b, i32 %c) + %0 = call i32 @llvm.dx.dot4add.u8packed(i32 %a, i32 %b, i32 %c) + ret void +} + +declare i32 @llvm.dx.dot4add.u8packed(i32, i32, i32) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_u8packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_u8packed.ll new file mode 100644 index 00000000000000..8acdb7d2ea3241 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_u8packed.ll @@ -0,0 +1,65 @@ +; RUN: llc -O0 -mtriple=spirv1.5-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-EXP +; RUN: llc -O0 -mtriple=spirv1.6-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT +; RUN: llc -O0 -mtriple=spirv-unknown-unknown -spirv-ext=+SPV_KHR_integer_dot_product %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-EXT +; RUN: %if spirv-tools %{ llc -verify-machineinstrs -O0 -mtriple=spirv1.5-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -verify-machineinstrs -O0 -mtriple=spirv1.6-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown -spirv-ext=+SPV_KHR_integer_dot_product %s -o - -filetype=obj | spirv-val %} + +; CHECK-DOT: OpCapability DotProduct +; CHECK-DOT: OpCapability DotProductInput4x8BitPacked +; CHECK-EXT: OpExtension "SPV_KHR_integer_dot_product" + +; CHECK: %[[#int_32:]] = OpTypeInt 32 0 +; CHECK-EXP-DAG: %[[#int_8:]] = OpTypeInt 8 0 +; CHECK-EXP-DAG: %[[#zero:]] = OpConstantNull %[[#int_8]] +; CHECK-EXP-DAG: %[[#eight:]] = OpConstant %[[#int_8]] 8 +; CHECK-EXP-DAG: %[[#sixteen:]] = OpConstant %[[#int_8]] 16 +; CHECK-EXP-DAG: %[[#twentyfour:]] = OpConstant %[[#int_8]] 24 + +; CHECK-LABEL: Begin function test_dot +define noundef i32 @test_dot(i32 noundef %a, i32 noundef %b, i32 noundef %c) { +entry: +; CHECK: %[[#A:]] = OpFunctionParameter %[[#int_32]] +; CHECK: %[[#B:]] = OpFunctionParameter %[[#int_32]] +; CHECK: %[[#C:]] = OpFunctionParameter %[[#int_32]] + +; Test that we use the dot product op when capabilities allow + +; CHECK-DOT: %[[#DOT:]] = OpUDot %[[#int_32]] %[[#A]] %[[#B]] +; CHECK-DOT: %[[#RES:]] = OpIAdd %[[#int_32]] %[[#DOT]] %[[#C]] + +; Test expansion is used when spirv dot product capabilities aren't available: + +; First element of the packed vector +; CHECK-EXP: %[[#A0:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#zero]] %[[#eight]] +; CHECK-EXP: %[[#B0:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#zero]] %[[#eight]] +; CHECK-EXP: %[[#MUL0:]] = OpIMul %[[#int_32]] %[[#A0]] %[[#B0]] +; CHECK-EXP: %[[#MASK0:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL0]] %[[#zero]] %[[#eight]] +; CHECK-EXP: %[[#ACC0:]] = OpIAdd %[[#int_32]] %[[#C]] %[[#MASK0]] + +; Second element of the packed vector +; CHECK-EXP: %[[#A1:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#eight]] %[[#eight]] +; CHECK-EXP: %[[#B1:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#eight]] %[[#eight]] +; CHECK-EXP: %[[#MUL1:]] = OpIMul %[[#int_32]] %[[#A1]] %[[#B1]] +; CHECK-EXP: %[[#MASK1:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL1]] %[[#zero]] %[[#eight]] +; CHECK-EXP: %[[#ACC1:]] = OpIAdd %[[#int_32]] %[[#ACC0]] %[[#MASK1]] + +; Third element of the packed vector +; CHECK-EXP: %[[#A2:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#sixteen]] %[[#eight]] +; CHECK-EXP: %[[#B2:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#sixteen]] %[[#eight]] +; CHECK-EXP: %[[#MUL2:]] = OpIMul %[[#int_32]] %[[#A2]] %[[#B2]] +; CHECK-EXP: %[[#MASK2:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL2]] %[[#zero]] %[[#eight]] +; CHECK-EXP: %[[#ACC2:]] = OpIAdd %[[#int_32]] %[[#ACC1]] %[[#MASK2]] + +; Fourth element of the packed vector +; CHECK-EXP: %[[#A3:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#twentyfour]] %[[#eight]] +; CHECK-EXP: %[[#B3:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#twentyfour]] %[[#eight]] +; CHECK-EXP: %[[#MUL3:]] = OpIMul %[[#int_32]] %[[#A3]] %[[#B3]] +; CHECK-EXP: %[[#MASK3:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL3]] %[[#zero]] %[[#eight]] + +; CHECK-EXP: %[[#RES:]] = OpIAdd %[[#int_32]] %[[#ACC2]] %[[#MASK3]] +; CHECK: OpReturnValue %[[#RES]] + %spv.dot = call i32 @llvm.spv.dot4add.u8packed(i32 %a, i32 %b, i32 %c) + + ret i32 %spv.dot +}