diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index a10c8afd7..52d0ef901 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -951,6 +951,33 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { return {out, 256}; } +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.b`. D represents the number of elements in the output vector to be updated + * (i.e. for vd.2s D = 2). Only 2 or 4 are valid. Returns correctly formatted + * RegisterValue. */ +template +RegisterValue vecUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + out[i] = vd[i]; + for (int j = 0; j < 4; j++) { + out[i] += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * i) + j])); + } + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `udot vd.s, vn.b, * vm.4b[index]`. * D represents the number of elements in the output vector to be updated (i.e. diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index a058c354f..505520287 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6550,6 +6550,10 @@ void Instruction::execute() { metadata_, VL_bits); break; } + case Opcode::AArch64_UDOTv16i8: { // udot vd.4s, vn.16b, vm.16b + results_[0] = vecUdot<4>(sourceValues_, metadata_); + break; + } case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); break; diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 1621cbbda..6271023ea 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3681,6 +3681,30 @@ TEST_P(InstNeon, udot) { CHECK_NEON(3, uint32_t, {0xd328, 0x288e8, 0x27e25, 0x2b87f}); CHECK_NEON(4, uint32_t, {0xc333, 0x2731b, 0x0, 0x0}); CHECK_NEON(5, uint32_t, {0x1fe2, 0x8e62, 0xad7e, 0xb52f}); + + // udot by vector + initialHeapData_.resize(128); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFFFFFF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA98FFFFFFFF; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + + udot v2.4s, v1.16b, v0.16b + )"); + CHECK_NEON(0, uint64_t, {0xDEADBEEFFFFFFFFF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA98FFFFFFFF, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0x3F807, 0x288E7, 0x27C6E, 0xB52C}); } TEST_P(InstNeon, uzp) {