Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[HLSL][SPIRV][DXIL] Implement dot4add_u8packed intrinsic #115068

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions clang/include/clang/Basic/Builtins.td
Original file line number Diff line number Diff line change
Expand Up @@ -4798,6 +4798,12 @@ def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> {
let Prototype = "int(unsigned int, unsigned int, int)";
}

// HLSL-only builtin spelled __builtin_hlsl_dot4add_u8packed. It takes three
// unsigned int operands and returns an unsigned int; it is declared NoThrow
// and Const.
def HLSLDot4AddU8Packed : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_dot4add_u8packed"];
let Attributes = [NoThrow, Const];
let Prototype = "unsigned int(unsigned int, unsigned int, unsigned int)";
}

def HLSLFirstBitHigh : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_elementwise_firstbithigh"];
let Attributes = [NoThrow, Const];
Expand Down
10 changes: 10 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18881,6 +18881,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
/*ReturnType=*/C->getType(), ID, ArrayRef<Value *>{A, B, C}, nullptr,
"hlsl.dot4add.i8packed");
}
// Lower __builtin_hlsl_dot4add_u8packed to the target-specific
// dot4add_u8packed intrinsic chosen by the HLSL runtime.
case Builtin::BI__builtin_hlsl_dot4add_u8packed: {
// Emit the three scalar operands in argument order.
Value *A = EmitScalarExpr(E->getArg(0));
Value *B = EmitScalarExpr(E->getArg(1));
Value *C = EmitScalarExpr(E->getArg(2));

Intrinsic::ID ID = CGM.getHLSLRuntime().getDot4AddU8PackedIntrinsic();
// The call's result type is taken from the third operand's type.
return Builder.CreateIntrinsic(
/*ReturnType=*/C->getType(), ID, ArrayRef<Value *>{A, B, C}, nullptr,
"hlsl.dot4add.u8packed");
}
case Builtin::BI__builtin_hlsl_elementwise_firstbithigh: {

Value *X = EmitScalarExpr(E->getArg(0));
Expand Down
1 change: 1 addition & 0 deletions clang/lib/CodeGen/CGHLSLRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ class CGHLSLRuntime {
GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot)
GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot)
GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddI8Packed, dot4add_i8packed)
GENERATE_HLSL_INTRINSIC_FUNCTION(Dot4AddU8Packed, dot4add_u8packed)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane)
GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane)
GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitUHigh, firstbituhigh)
Expand Down
8 changes: 7 additions & 1 deletion clang/lib/Headers/hlsl/hlsl_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,13 @@ uint64_t dot(uint64_t4, uint64_t4);

_HLSL_AVAILABILITY(shadermodel, 6.4)
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_i8packed)
int dot4add_i8packed(unsigned int, unsigned int, int);
int dot4add_i8packed(uint, uint, int);
joaosaffran marked this conversation as resolved.
Show resolved Hide resolved

/// \fn uint dot4add_u8packed(uint A, uint B, uint C)
/// \brief Unsigned dot product of the four 8-bit values packed into \a A and
/// \a B, accumulated with \a C.
/// \param A The first packed input value.
/// \param B The second packed input value.
/// \param C The accumulator value added to the dot product.

_HLSL_AVAILABILITY(shadermodel, 6.4)
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot4add_u8packed)
uint dot4add_u8packed(uint, uint, uint);

//===----------------------------------------------------------------------===//
// exp builtins
Expand Down
18 changes: 18 additions & 0 deletions clang/test/CodeGenHLSL/builtins/dot4add_u8packed.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

// RUN: %clang_cc1 -finclude-default-header -triple \
// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \
// RUN: FileCheck %s -DTARGET=dx
// RUN: %clang_cc1 -finclude-default-header -triple \
// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \
// RUN: FileCheck %s -DTARGET=spv

// Test basic lowering to runtime function call.

// CHECK-LABEL: define {{.*}}test
uint test(uint a, uint b, uint c) {
// CHECK: %[[RET:.*]] = call [[TY:i32]] @llvm.[[TARGET]].dot4add.u8packed([[TY]] %[[#]], [[TY]] %[[#]], [[TY]] %[[#]])
// CHECK: ret [[TY]] %[[RET]]
return dot4add_u8packed(a, b, c);
}

// CHECK: declare [[TY]] @llvm.[[TARGET]].dot4add.u8packed([[TY]], [[TY]], [[TY]])
28 changes: 28 additions & 0 deletions clang/test/SemaHLSL/BuiltIns/dot4add_u8packed-errors.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify

// Calling the builtin with no arguments must be diagnosed.
int test_too_few_arg0() {
return __builtin_hlsl_dot4add_u8packed();
// expected-error@-1 {{too few arguments to function call, expected 3, have 0}}
}

// Calling the builtin with a single argument must be diagnosed.
int test_too_few_arg1(int p0) {
return __builtin_hlsl_dot4add_u8packed(p0);
// expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
}

// Calling the builtin with two arguments must be diagnosed.
int test_too_few_arg2(uint p0) {
return __builtin_hlsl_dot4add_u8packed(p0, p0);
// expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
}

// Calling the builtin with four arguments must be diagnosed.
int test_too_many_arg(uint p0) {
return __builtin_hlsl_dot4add_u8packed(p0, p0, p0, p0);
// expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
}

// Helper type with no conversion to 'unsigned int'.
struct S { float f; };

// Passing a struct as the third argument must be diagnosed as a failed
// conversion.
int test_expr_struct_type_check(S p0, uint p1) {
return __builtin_hlsl_dot4add_u8packed(p1, p1, p0);
// expected-error@-1 {{no viable conversion from 'S' to 'unsigned int'}}
}
3 changes: 2 additions & 1 deletion llvm/include/llvm/IR/IntrinsicsDirectX.td
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def int_dx_udot :
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_dx_dot4add_u8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;

def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/IR/IntrinsicsSPIRV.td
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ let TargetPrefix = "spv" in {
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
def int_spv_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_spv_dot4add_u8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/DirectX/DXIL.td
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,16 @@ def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
let stages = [Stages<DXIL1_0, [all_stages]>];
}

// DXIL operation with opcode 164 in the dot4AddPacked op class, mapped from
// the int_dx_dot4add_u8packed intrinsic. It takes three i32 operands and
// produces an i32 result.
def Dot4AddU8Packed : DXILOp<164, dot4AddPacked> {
let Doc = "unsigned dot product of 4 x i8 vectors packed into i32, with "
"accumulate to i32";
let LLVMIntrinsic = int_dx_dot4add_u8packed;
let arguments = [Int32Ty, Int32Ty, Int32Ty];
let result = Int32Ty;
let attributes = [Attributes<DXIL1_0, [ReadNone]>];
let stages = [Stages<DXIL1_0, [all_stages]>];
}

def AnnotateHandle : DXILOp<216, annotateHandle> {
let Doc = "annotate handle with resource properties";
let arguments = [HandleTy, ResPropsTy];
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2687,6 +2687,11 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
STI.isAtLeastSPIRVVer(VersionTuple(1, 6)))
return selectDot4AddPacked<true>(ResVReg, ResType, I);
return selectDot4AddPackedExpansion<true>(ResVReg, ResType, I);
// Packed u8 dot4add: use selectDot4AddPacked when the
// SPV_KHR_integer_dot_product extension can be used or the target is at
// least SPIR-V 1.6; otherwise fall back to selectDot4AddPackedExpansion.
// NOTE(review): the <false> template argument presumably selects the
// unsigned variant (the call just above this case passes <true>) - confirm
// against the helper definitions.
case Intrinsic::spv_dot4add_u8packed:
if (STI.canUseExtension(SPIRV::Extension::SPV_KHR_integer_dot_product) ||
STI.isAtLeastSPIRVVer(VersionTuple(1, 6)))
return selectDot4AddPacked<false>(ResVReg, ResType, I);
return selectDot4AddPackedExpansion<false>(ResVReg, ResType, I);
case Intrinsic::spv_all:
return selectAll(ResVReg, ResType, I);
case Intrinsic::spv_any:
Expand Down
10 changes: 10 additions & 0 deletions llvm/test/CodeGen/DirectX/dot4add_u8packed.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s

define void @main(i32 %a, i32 %b, i32 %c) {
entry:
; CHECK: call i32 @dx.op.dot4AddPacked(i32 164, i32 %a, i32 %b, i32 %c)
%0 = call i32 @llvm.dx.dot4add.u8packed(i32 %a, i32 %b, i32 %c)
ret void
}

declare i32 @llvm.dx.dot4add.u8packed(i32, i32, i32)
65 changes: 65 additions & 0 deletions llvm/test/CodeGen/SPIRV/hlsl-intrinsics/dot4add_u8packed.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
; RUN: llc -O0 -mtriple=spirv1.5-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-EXP
; RUN: llc -O0 -mtriple=spirv1.6-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
; RUN: llc -O0 -mtriple=spirv-unknown-unknown -spirv-ext=+SPV_KHR_integer_dot_product %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-EXT
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this in the existing test as well. Perhaps you can remind me why we can expect both the integer dot product operations as well as the expansion in this case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK-EXP will check for the expansion variant, whereas CHECK-EXT is just used to check if the extension is emitted from the module

; RUN: %if spirv-tools %{ llc -verify-machineinstrs -O0 -mtriple=spirv1.5-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; RUN: %if spirv-tools %{ llc -verify-machineinstrs -O0 -mtriple=spirv1.6-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; RUN: %if spirv-tools %{ llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown -spirv-ext=+SPV_KHR_integer_dot_product %s -o - -filetype=obj | spirv-val %}

; CHECK-DOT: OpCapability DotProduct
; CHECK-DOT: OpCapability DotProductInput4x8BitPacked
; CHECK-EXT: OpExtension "SPV_KHR_integer_dot_product"

; CHECK: %[[#int_32:]] = OpTypeInt 32 0
; CHECK-EXP-DAG: %[[#int_8:]] = OpTypeInt 8 0
; CHECK-EXP-DAG: %[[#zero:]] = OpConstantNull %[[#int_8]]
; CHECK-EXP-DAG: %[[#eight:]] = OpConstant %[[#int_8]] 8
; CHECK-EXP-DAG: %[[#sixteen:]] = OpConstant %[[#int_8]] 16
; CHECK-EXP-DAG: %[[#twentyfour:]] = OpConstant %[[#int_8]] 24

; CHECK-LABEL: Begin function test_dot
define noundef i32 @test_dot(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
entry:
; CHECK: %[[#A:]] = OpFunctionParameter %[[#int_32]]
; CHECK: %[[#B:]] = OpFunctionParameter %[[#int_32]]
; CHECK: %[[#C:]] = OpFunctionParameter %[[#int_32]]

; Test that we use the dot product op when capabilities allow

; CHECK-DOT: %[[#DOT:]] = OpUDot %[[#int_32]] %[[#A]] %[[#B]]
; CHECK-DOT: %[[#RES:]] = OpIAdd %[[#int_32]] %[[#DOT]] %[[#C]]

; Test expansion is used when spirv dot product capabilities aren't available:

; First element of the packed vector
; CHECK-EXP: %[[#A0:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#zero]] %[[#eight]]
; CHECK-EXP: %[[#B0:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#zero]] %[[#eight]]
; CHECK-EXP: %[[#MUL0:]] = OpIMul %[[#int_32]] %[[#A0]] %[[#B0]]
; CHECK-EXP: %[[#MASK0:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL0]] %[[#zero]] %[[#eight]]
; CHECK-EXP: %[[#ACC0:]] = OpIAdd %[[#int_32]] %[[#C]] %[[#MASK0]]

; Second element of the packed vector
; CHECK-EXP: %[[#A1:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#eight]] %[[#eight]]
; CHECK-EXP: %[[#B1:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#eight]] %[[#eight]]
; CHECK-EXP: %[[#MUL1:]] = OpIMul %[[#int_32]] %[[#A1]] %[[#B1]]
; CHECK-EXP: %[[#MASK1:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL1]] %[[#zero]] %[[#eight]]
; CHECK-EXP: %[[#ACC1:]] = OpIAdd %[[#int_32]] %[[#ACC0]] %[[#MASK1]]

; Third element of the packed vector
; CHECK-EXP: %[[#A2:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#sixteen]] %[[#eight]]
; CHECK-EXP: %[[#B2:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#sixteen]] %[[#eight]]
; CHECK-EXP: %[[#MUL2:]] = OpIMul %[[#int_32]] %[[#A2]] %[[#B2]]
; CHECK-EXP: %[[#MASK2:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL2]] %[[#zero]] %[[#eight]]
; CHECK-EXP: %[[#ACC2:]] = OpIAdd %[[#int_32]] %[[#ACC1]] %[[#MASK2]]

; Fourth element of the packed vector
; CHECK-EXP: %[[#A3:]] = OpBitFieldUExtract %[[#int_32]] %[[#A]] %[[#twentyfour]] %[[#eight]]
; CHECK-EXP: %[[#B3:]] = OpBitFieldUExtract %[[#int_32]] %[[#B]] %[[#twentyfour]] %[[#eight]]
; CHECK-EXP: %[[#MUL3:]] = OpIMul %[[#int_32]] %[[#A3]] %[[#B3]]
; CHECK-EXP: %[[#MASK3:]] = OpBitFieldUExtract %[[#int_32]] %[[#MUL3]] %[[#zero]] %[[#eight]]

; CHECK-EXP: %[[#RES:]] = OpIAdd %[[#int_32]] %[[#ACC2]] %[[#MASK3]]
; CHECK: OpReturnValue %[[#RES]]
%spv.dot = call i32 @llvm.spv.dot4add.u8packed(i32 %a, i32 %b, i32 %c)

ret i32 %spv.dot
}
Loading