diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 814a85b4..3d520a34 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,6 +19,11 @@ jobs: steps: - uses: actions/checkout@v4 - run: apt -y update - - run: apt -y install g++-multilib libboost-dev make nasm yasm + - run: apt -y install g++-multilib libboost-dev make nasm yasm wget xz-utils python3 - run: make test - run: make -C sample CXXFLAGS="-DXBYAK_NO_EXCEPTION" + - run: | + cd test + wget https://downloadmirror.intel.com/831748/sde-external-9.44.0-2024-08-22-lin.tar.xz + tar xvf sde-external-9.44.0-2024-08-22-lin.tar.xz + env XED=sde-external-9.44.0-2024-08-22-lin/xed64 make xed_test diff --git a/.gitignore b/.gitignore index 24b0b1de..507091e0 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /build* # cmake +*CVS diff --git a/CMakeLists.txt b/CMakeLists.txt index 79b0f517..72dad78a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.5) -project(xbyak LANGUAGES CXX VERSION 7.09.1) +project(xbyak LANGUAGES CXX VERSION 7.10) file(GLOB headers xbyak/*.h) diff --git a/doc/changelog.md b/doc/changelog.md index af0f6aab..5e25c2dd 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,5 +1,6 @@ # History +* 2024/Oct/13 ver 7.10 support AVX10 integer and fp16 vnni, media new instructions. setDefaultEncoding is extended. * 2024/Oct/10 ver 7.09.1 fix the names of vpcompressb and vpcompressw * 2024/Oct/08 ver 7.09 support YMM embedded rounding of AVX10.2 and fix some mnemonics with {sae}/{er}. * 2024/Oct/07 ver 7.08 support rdfsbase etc. 
diff --git a/doc/usage.md b/doc/usage.md index 0911b914..5b255130 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -106,18 +106,37 @@ vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, yword_b [rax+64], 5); // broadcast 64-bit to 256-bit +``` + +## Selecting AVX512-VNNI, AVX-VNNI, AVX-VNNI-INT8 etc. +Some mnemonics have two types of encodings: VEX and EVEX. +The functions for these mnemonics include an optional parameter as the last argument to specify the encoding. +The default behavior depends on the order in which the instruction was introduced (whether VEX or EVEX came first), +and can be specified using setDefaultEncoding. -vpdpbusd(xm0, xm1, xm2); // default encoding is EVEX +``` +vpdpbusd(xm0, xm1, xm2); // default encoding: EVEX (AVX512-VNNI) vpdpbusd(xm0, xm1, xm2, EvexEncoding); // same as the above -vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX encoding +vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX (AVX-VNNI) setDefaultEncoding(VexEncoding); // default encoding is VEX -vpdpbusd(xm0, xm1, xm2); // VEX encoding +vpdpbusd(xm0, xm1, xm2); // VEX + +vmpsadbw(xm1, xm3, xm15, 3); // default encoding: VEX (AVX-VNNI) +vmpsadbw(xm1, xm3, xm15, 3, VexEncoding); // same as the above +vmpsadbw(xm1, xm3, xm15, 3, EvexEncoding); // EVEX (AVX10.2) +setDefaultEncoding(VexEncoding, EvexEncoding); // use 2nd argument. +vmpsadbw(xm1, xm3, xm15, 3); // EVEX ``` -- setDefaultEncoding(PreferredEncoding encoding); - - Set the default encoding to select EVEX or VEX. - - The default value is EvexEncoding. - - This function affects only an instruction that has a PreferredEncoding argument such as vpdpbusd. 
+- `setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding)` +Control the default encoding of mnemonics with `Xbyak::PreferredEncoding` param. + +param|vnniEnc|avx10Enc +-|-|- +EvexEncoding|AVX512-VNNI|AVX10.2 +VexEncoding|AVX-VNNI|AVX-VNNI-INT8 +default|EvexEncoding|VexEncoding +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud vpdpwsuds vpdpwusd vpdpwusds vpdpwuud, vpdpwuuds ### Remark * `k1`, ..., `k7` are opmask registers. diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 79ec79aa..2b8a3286 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -137,8 +137,6 @@ void putVcmp() printf("void %s(const Opmask& k, const Xmm& x, const Operand& op%s) { opAVX_K_X_XM(k, x, op, %s, 0x%02X%s); }\n" , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? ", imm" : ""); } - puts("void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); }"); - puts("void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); }"); } void putVcmpAlias() @@ -198,6 +196,19 @@ void putX_XM() { 0x7C, "vcvttph2w", T_66 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_SAE_Z }, { 0x7D, "vcvtuw2ph", T_F2 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z }, { 0x7D, "vcvtw2ph", T_F3 | T_MAP5 | T_MUST_EVEX | T_YMM | T_EW0 | T_B16 | T_ER_Z }, + + { 0x51, "vsqrtnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + + { 0x2F, "vcomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2E, "vucomish", T_MUST_EVEX | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + + { 0x2F, "vcomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2F, "vcomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2F, "vcomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, + + { 0x2E, 
"vucomxsd", T_MUST_EVEX | T_F3 | T_0F | T_EW1 | T_SAE_X | T_N8 }, + { 0x2E, "vucomxsh", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_SAE_X | T_N2 }, + { 0x2E, "vucomxss", T_MUST_EVEX | T_F2 | T_0F | T_EW0 | T_SAE_X | T_N4 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -316,6 +327,9 @@ void putX_X_XM_IMM() { 0x77, "vpermi2ps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false }, { 0x77, "vpermi2pd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0x25, "vpternlogd", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, true }, { 0x25, "vpternlogq", T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, true }, @@ -401,6 +415,38 @@ void putX_X_XM_IMM() { 0x5A, "vcvtsh2sd", T_F3 | T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, false }, { 0x13, "vcvtsh2ss", T_MAP6 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, false }, { 0x1D, "vcvtss2sh", T_MAP5 | T_MUST_EVEX | T_EW0 | T_ER_X | T_N4, false }, + + { 0x58, "vaddnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5E, "vdivnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5F, "vmaxpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x5D, "vminpbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x59, "vmulnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16,false }, + { 0x5C, "vsubnepbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + + { 0x98, "vfmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xA8, "vfmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xB8, "vfmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9C, 
"vfnmadd132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAC, "vfnmadd213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBC, "vfnmadd231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9A, "vfmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAA, "vfmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBA, "vfmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x9E, "vfnmsub132nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xAE, "vfnmsub213nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0xBE, "vfnmsub231nepbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + + { 0x67, "vcvt2ps2phx", T_MUST_EVEX | T_66 | T_0F38 | T_EW0 | T_YMM | T_B32 | T_ER_Y | T_ER_Z, false }, + { 0x74, "vcvtne2ph2bf8", T_MUST_EVEX | T_F2 | T_0F38 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x74, "vcvtne2ph2bf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x18, "vcvtne2ph2hf8", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, + + { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -410,6 +456,45 @@ void putX_X_XM_IMM() } } +void putX_X_XM_IMM_AVX10() +{ + const struct Tbl { + uint8_t code; + const char *name; + uint64_t type; + uint64_t typeVex; + uint64_t typeEvex; + int sel; + bool hasIMM; + } tbl[] = { + // vpdpb[su,uu,ss]d[,s] + { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x50, "vpdpbuud", T_0F38|T_YMM, 
T_W0, T_EW0|T_B32, 1, false }, + { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + + // vpdpw[su,us,uu]d[,s] + { 0xD2, "vpdpwsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwusd", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwusds", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + + { 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl *p = &tbl[i]; + std::string s = type2String(p->type); + std::string sVex = type2String(p->typeVex); + std::string sEvex = type2String(p->typeEvex); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, %s, 0x%02X, encoding, %s, %s, %s, %d); }\n" + , p->name, p->hasIMM ? ", uint8_t imm" : "", s.c_str(), p->code, p->hasIMM ? 
"imm" : "NONE", sVex.c_str(), sEvex.c_str(), p->sel); + } +} + void putShift() { const struct Tbl { @@ -571,6 +656,8 @@ void putCvt() { 0x2A, "vcvtsi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, { 0x7B, "vcvtusi2sh", T_F3 | T_MAP5 | T_MUST_EVEX | T_ER_R | T_M_K, 6 }, + + { 0x72, "vcvtneps2bf16", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 2 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl& p = tbl[i]; @@ -758,6 +845,15 @@ void putX_XM_IMM() { 0x62, "vpexpandb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_SAE_Z | T_N1, false }, { 0x62, "vpexpandw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_SAE_Z | T_N2, false }, + + { 0x2F, "vcomsbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_N2, false }, + { 0x42, "vgetexppbf16", T_MUST_EVEX | T_66 | T_MAP5 | T_EW0 | T_YMM | T_B16, false }, + { 0x26, "vgetmantpbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x4C, "vrcppbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0x56, "vreducenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x08, "vrndscalenepbf16", T_MUST_EVEX | T_F2 | T_0F3A | T_EW0 | T_YMM | T_B16, true }, + { 0x4E, "vrsqrtpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, + { 0x2C, "vscalefpbf16", T_MUST_EVEX | T_MAP6 | T_EW0 | T_YMM | T_B16, false }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -952,6 +1048,41 @@ void putFP16() putFP16_2(); } +void putAVX10_2() +{ + puts("void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); }"); + puts("void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); }"); + + const struct Tbl { + uint8_t code; + const char *name; + uint64_t type; + } tbl1[] = { + { 0x74, "vcvtbiasph2bf8", T_MUST_EVEX | T_0F38 | T_EW0 |T_YMM | 
T_B16 }, + { 0x74, "vcvtbiasph2bf8s", T_MUST_EVEX | T_MAP5 | T_EW0 |T_YMM | T_B16 }, + { 0x18, "vcvtbiasph2hf8", T_MUST_EVEX | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x1B, "vcvtbiasph2hf8s", T_MUST_EVEX | T_MAP5 | T_EW0 | T_YMM | T_B16 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl1); i++) { + const Tbl *p = &tbl1[i]; + std::string s = type2String(p->type); + printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, %s, 0x%02X); }\n" , p->name, s.c_str(), p->code); + } + puts("void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); }"); + + const Tbl tbl2[] = { + { 0x74, "vcvtneph2bf8", T_MUST_EVEX | T_F3 | T_0F38 | T_EW0 | T_YMM | T_B16 }, + { 0x74, "vcvtneph2bf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x18, "vcvtneph2hf8", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + { 0x1B, "vcvtneph2hf8s", T_MUST_EVEX | T_F3 |T_MAP5 | T_EW0 | T_YMM | T_B16 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl2); i++) { + const Tbl *p = &tbl2[i]; + std::string s = type2String(p->type); + printf("void %s(const Xmm& x, const Operand& op) { opCvt2(x, op, %s, 0x%02X); }\n" , p->name, s.c_str(), p->code); + } +} + int main(int argc, char *[]) { bool only64bit = argc == 2; @@ -966,6 +1097,7 @@ int main(int argc, char *[]) putM_X(); putXM_X(); putX_X_XM_IMM(); + putX_X_XM_IMM_AVX10(); putShift(); putExtractInsert(); putCvt(); @@ -977,4 +1109,5 @@ int main(int argc, char *[]) putScatter(); putV4FMA(); putFP16(); + putAVX10_2(); } diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index ad6806b4..a22c12b2 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -57,7 +57,7 @@ void putX_X_XM(bool omitOnly) { 0x0C, "blendps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x41, "dppd", T_0F3A | T_66 | T_W0, true, true, 3 }, { 0x40, "dpps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, - { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 
3 }, + { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 1 }, { 0x0E, "pblendw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x02, "pblendd", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 2 }, { 0x0B, "roundsd", T_0F3A | T_66 | T_W0, true, true, 3 }, @@ -1802,7 +1802,6 @@ void put() const Tbl& p = tbl[i]; printf("void %s(const Xmm& x, const Address& addr) { opVex(x, 0, addr, %s, 0x%02X); }\n", p.name, type2String(p.type).c_str(), p.code); } - printf("void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, %s|orEvexIf(encoding), 0x72); }\n", type2String(T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32).c_str()); } // haswell gpr(reg, reg, r/m) { @@ -1893,8 +1892,6 @@ void put() { 0x51, "vpdpbusds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x52, "vpdpwssd", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, { 0x53, "vpdpwssds", T_66 | T_0F38 | T_YMM | T_EW0 | T_SAE_Z | T_B32}, - { 0xB4, "vpmadd52luq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, - { 0xB5, "vpmadd52huq", T_66 | T_0F38 | T_YMM | T_EW1 | T_B64 }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1904,25 +1901,26 @@ void put() } // avx-vnni-int8 // avx-vnni-int16 +#if 0 { const struct Tbl { uint8_t code; const char *name; uint64_t type; } tbl[] = { - { 0x50, "vpdpbssd", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, - { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, - - { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbssd", T_F2 | T_0F38 
| T_W0 | T_YMM }, +// { 0x51, "vpdpbssds", T_F2 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, +// { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, + +// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1930,6 +1928,7 @@ void put() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code); } } +#endif } void put32() diff --git a/meson.build b/meson.build index 0fea416a..3fb5e511 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,7 @@ project( 'xbyak', 'cpp', - version: '7.09.1', + version: '7.10', license: 'BSD-3-Clause', default_options: 'b_ndebug=if-release' ) diff --git a/readme.md b/readme.md index 3ee7dd1d..49f0a9d7 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,5 @@ -# Xbyak 7.09.1 [![Badge Build]][Build Status] +# Xbyak 7.10 [![Badge Build]][Build Status] *A C++ JIT assembler for x86 (IA32), x64 (AMD64, x86-64)* diff --git a/readme.txt b/readme.txt index 768049b6..deabcd8b 100644 --- a/readme.txt +++ b/readme.txt @@ -1,5 +1,5 @@ - C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.09.1 + C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 7.10 ----------------------------------------------------------------------------- ◎概要 diff --git a/test/Makefile b/test/Makefile index ca2f0bb0..336dcaf8 100644 --- a/test/Makefile +++ b/test/Makefile @@ -60,9 +60,9 @@ apx: apx.cpp $(XBYAK_INC) avx10_test: avx10_test.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 -TEST_FILES=avx10.txt 
misc.txt +TEST_FILES=old.txt new-ymm.txt bf16.txt comp.txt convert.txt xed_test: - @for target in $(addprefix target/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done + @for target in $(addprefix avx10/, $(TEST_FILES)); do ./test_by_xed.sh $$target; done test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen diff --git a/test/avx10/bf16.txt b/test/avx10/bf16.txt new file mode 100644 index 00000000..c544e02c --- /dev/null +++ b/test/avx10/bf16.txt @@ -0,0 +1,210 @@ +vaddnepbf16(xm1, xm2, xm3); +vaddnepbf16(ym1|k1, ym2, ptr[rax+128]); +vaddnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vaddnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vdivnepbf16(xm1, xm2, xm3); +vdivnepbf16(ym1|k1, ym2, ptr[rax+128]); +vdivnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vdivnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vmaxpbf16(xm1, xm2, xm3); +vmaxpbf16(ym1|k1, ym2, ptr[rax+128]); +vmaxpbf16(ym1|k1, ym2, ptr_b[rax+128]); +vmaxpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vminpbf16(xm1, xm2, xm3); +vminpbf16(ym1|k1, ym2, ptr[rax+128]); +vminpbf16(ym1|k1, ym2, ptr_b[rax+128]); +vminpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vmulnepbf16(xm1, xm2, xm3); +vmulnepbf16(ym1|k1, ym2, ptr[rax+128]); +vmulnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vmulnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vscalefpbf16(xm1, xm2, xm3); +vscalefpbf16(ym1|k1, ym2, ptr[rax+128]); +vscalefpbf16(ym1|k1, ym2, ptr_b[rax+128]); +vscalefpbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vsubnepbf16(xm1, xm2, xm3); +vsubnepbf16(ym1|k1, ym2, ptr[rax+128]); +vsubnepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vsubnepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// madd +vfmadd132nepbf16(xm1, xm2, xm3); +vfmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmadd213nepbf16(xm1, xm2, xm3); +vfmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmadd231nepbf16(xm1, xm2, xm3); 
+vfmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// nmadd +vfnmadd132nepbf16(xm1, xm2, xm3); +vfnmadd132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmadd213nepbf16(xm1, xm2, xm3); +vfnmadd213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmadd231nepbf16(xm1, xm2, xm3); +vfnmadd231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmadd231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmadd231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// msub +vfmsub132nepbf16(xm1, xm2, xm3); +vfmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmsub213nepbf16(xm1, xm2, xm3); +vfmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfmsub231nepbf16(xm1, xm2, xm3); +vfmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); +// nmsub +vfnmsub132nepbf16(xm1, xm2, xm3); +vfnmsub132nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub132nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub132nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmsub213nepbf16(xm1, xm2, xm3); +vfnmsub213nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub213nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub213nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vfnmsub231nepbf16(xm1, xm2, xm3); +vfnmsub231nepbf16(ym1|k1, ym2, ptr[rax+128]); +vfnmsub231nepbf16(ym1|k1, ym2, ptr_b[rax+128]); +vfnmsub231nepbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]); + +vcmppbf16(k1, xm5, xm4, 5); +vcmppbf16(k2, ym5, ym4, 6); +vcmppbf16(k3, ym15, ptr_b[rax+128], 7); +vcmppbf16(k4, zm30, zm20, 8); +vcmppbf16(k5, zm1, ptr[rax+128], 
9); +vcmppbf16(k6, zm10, ptr_b[rax+128], 10); + +vfpclasspbf16(k1, xm4, 5); +vfpclasspbf16(k2|k5, ym4, 6); +vfpclasspbf16(k3|k5, zm20, 7); +vfpclasspbf16(k3|k5, xword[rax+128], 8); +vfpclasspbf16(k3, xword_b[rax+128], 9); +vfpclasspbf16(k5|k5, yword[rax+128], 10); +vfpclasspbf16(k6|k5, yword_b[rax+128], 11); +vfpclasspbf16(k7|k5, zword[rax+128], 12); +vfpclasspbf16(k7|k5, zword_b[rax+128], 13); + +vcomsbf16(xm2, xm3); +vcomsbf16(xm2, ptr[rax+128]); + +vgetexppbf16(xm1|k3, xmm2); +vgetexppbf16(xm1|k3, ptr[rax+128]); +vgetexppbf16(xm1|k3, ptr_b[rax+128]); + +vgetexppbf16(ym1|k3, ymm2); +vgetexppbf16(ym1|k3, ptr[rax+128]); +vgetexppbf16(ym1|k3, ptr_b[rax+128]); + +vgetexppbf16(zm1|k3, zmm2); +vgetexppbf16(zm1|k3, ptr[rax+128]); +vgetexppbf16(zm1|k3, ptr_b[rax+128]); + +vgetmantpbf16(xm1|k3, xmm2, 3); +vgetmantpbf16(xm1|k3, ptr[rax+128], 5); +vgetmantpbf16(xm1|k3, ptr_b[rax+128], 9); + +vgetmantpbf16(ym1|k3, ymm2, 3); +vgetmantpbf16(ym1|k3, ptr[rax+128], 5); +vgetmantpbf16(ym1|k3, ptr_b[rax+128], 9); + +vgetmantpbf16(zm1|k3, zmm2, 3); +vgetmantpbf16(zm1|k3, ptr[rax+128], 5); +vgetmantpbf16(zm1|k3, ptr_b[rax+128], 9); + +vrcppbf16(xm1|k5, xm2); +vrcppbf16(xm1|k5, ptr[rcx+128]); +vrcppbf16(xm1|k5, ptr_b[rcx+128]); + +vrcppbf16(ym1|k5, ym2); +vrcppbf16(ym1|k5, ptr[rcx+128]); +vrcppbf16(ym1|k5, ptr_b[rcx+128]); + +vrcppbf16(zm1|k5, zm2); +vrcppbf16(zm1|k5, ptr[rcx+128]); +vrcppbf16(zm1|k5, ptr_b[rcx+128]); + +vreducenepbf16(xm1|k4, xm2, 1); +vreducenepbf16(xm1|k4, ptr[rax+128], 1); +vreducenepbf16(xm1|k4, ptr_b[rax+128], 1); + +vreducenepbf16(ym1|k4, ym2, 1); +vreducenepbf16(ym1|k4, ptr[rax+128], 1); +vreducenepbf16(ym1|k4, ptr_b[rax+128], 1); + +vreducenepbf16(zm1|k4, zm2, 1); +vreducenepbf16(zm1|k4, ptr[rax+128], 1); +vreducenepbf16(zm1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(xm1|k4, xm2, 1); +vrndscalenepbf16(xm1|k4, ptr[rax+128], 1); +vrndscalenepbf16(xm1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(ym1|k4, ym2, 1); +vrndscalenepbf16(ym1|k4, ptr[rax+128], 1); 
+vrndscalenepbf16(ym1|k4, ptr_b[rax+128], 1); + +vrndscalenepbf16(zm1|k4, zm2, 1); +vrndscalenepbf16(zm1|k4, ptr[rax+128], 1); +vrndscalenepbf16(zm1|k4, ptr_b[rax+128], 1); + +vrsqrtpbf16(xm1|k5, xm2); +vrsqrtpbf16(xm1|k5, ptr[rcx+128]); +vrsqrtpbf16(xm1|k5, ptr_b[rcx+128]); + +vrsqrtpbf16(ym1|k5, ym2); +vrsqrtpbf16(ym1|k5, ptr[rcx+128]); +vrsqrtpbf16(ym1|k5, ptr_b[rcx+128]); + +vrsqrtpbf16(zm1|k5, zm2); +vrsqrtpbf16(zm1|k5, ptr[rcx+128]); +vrsqrtpbf16(zm1|k5, ptr_b[rcx+128]); + +vscalefpbf16(xm1|k5, xm5, xm2); +vscalefpbf16(xm1|k5, xm5, ptr[rcx+128]); +vscalefpbf16(xm1|k5, xm5, ptr_b[rcx+128]); + +vscalefpbf16(ym1|k5, ym9, ym2); +vscalefpbf16(ym1|k5, ym9, ptr[rcx+128]); +vscalefpbf16(ym1|k5, ym9, ptr_b[rcx+128]); + +vscalefpbf16(zm1|k5, zm30, zm2); +vscalefpbf16(zm1|k5, zm30, ptr[rcx+128]); +vscalefpbf16(zm1|k5, zm30, ptr_b[rcx+128]); + +vsqrtnepbf16(xm5|k3, xmm4); +vsqrtnepbf16(xm5|k3, ptr[rax+128]); +vsqrtnepbf16(xm5|k3, ptr_b[rax+128]); + +vsqrtnepbf16(ym5|k3, ymm4); +vsqrtnepbf16(ym5|k3, ptr[rax+128]); +vsqrtnepbf16(ym5|k3, ptr_b[rax+128]); + +vsqrtnepbf16(zm5|k3, zmm4); +vsqrtnepbf16(zm5|k3, ptr[rax+128]); +vsqrtnepbf16(zm5|k3, ptr_b[rax+128]); diff --git a/test/avx10/comp.txt b/test/avx10/comp.txt new file mode 100644 index 00000000..bfc883e0 --- /dev/null +++ b/test/avx10/comp.txt @@ -0,0 +1,17 @@ +vcomxsd(xm1, xm2|T_sae); +vcomxsd(xm1, ptr[rax+128]); + +vcomxsh(xm1, xm2|T_sae); +vcomxsh(xm1, ptr[rax+128]); + +vcomxss(xm1, xm2|T_sae); +vcomxss(xm1, ptr[rax+128]); + +vucomxsd(xm1, xm2|T_sae); +vucomxsd(xm1, ptr[rax+128]); + +vucomxsh(xm1, xm2|T_sae); +vucomxsh(xm1, ptr[rax+128]); + +vucomxss(xm1, xm2|T_sae); +vucomxss(xm1, ptr[rax+128]); diff --git a/test/avx10/convert.txt b/test/avx10/convert.txt new file mode 100644 index 00000000..836fcca8 --- /dev/null +++ b/test/avx10/convert.txt @@ -0,0 +1,176 @@ +vcvt2ps2phx(xm1|k5, xm2, xm3); +vcvt2ps2phx(xm1|k5, xm2, ptr[rax+128]); +vcvt2ps2phx(xm1|k5, xm2, ptr_b[rax+128]); + +vcvt2ps2phx(ym1|k5, ym2, ym3); 
+vcvt2ps2phx(ym1|k5, ym2, ptr[rax+128]); +vcvt2ps2phx(ym1|k5, ym2, ptr_b[rax+128]); + +vcvt2ps2phx(zm1|k5, zm2, zm3); +vcvt2ps2phx(zm1|k5, zm2, ptr[rax+128]); +vcvt2ps2phx(zm1|k5, zm2, ptr_b[rax+128]); + +// vcvtbiasph2hf8 +vcvtbiasph2bf8(xm1|k2, xm3, xm5); +vcvtbiasph2bf8(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2bf8(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2bf8(xm1|k2, ym3, ym5); +vcvtbiasph2bf8(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2bf8(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2bf8(ym1|k2, zm3, zm5); +vcvtbiasph2bf8(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2bf8(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2bf8s +vcvtbiasph2bf8s(xm1|k2, xm3, xm5); +vcvtbiasph2bf8s(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2bf8s(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2bf8s(xm1|k2, ym3, ym5); +vcvtbiasph2bf8s(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2bf8s(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2bf8s(ym1|k2, zm3, zm5); +vcvtbiasph2bf8s(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2bf8s(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2hf8 +vcvtbiasph2hf8(xm1|k2, xm3, xm5); +vcvtbiasph2hf8(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2hf8(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2hf8(xm1|k2, ym3, ym5); +vcvtbiasph2hf8(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2hf8(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2hf8(ym1|k2, zm3, zm5); +vcvtbiasph2hf8(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2hf8(ym1|k2, zm3, ptr_b[rax+128]); + +// vcvtbiasph2hf8s +vcvtbiasph2hf8s(xm1|k2, xm3, xm5); +vcvtbiasph2hf8s(xm1|k2, xm3, ptr[rax+128]); +vcvtbiasph2hf8s(xm1|k2, xm3, ptr_b[rax+128]); + +vcvtbiasph2hf8s(xm1|k2, ym3, ym5); +vcvtbiasph2hf8s(xm1|k2, ym3, ptr[rax+128]); +vcvtbiasph2hf8s(xm1|k2, ym3, ptr_b[rax+128]); + +vcvtbiasph2hf8s(ym1|k2, zm3, zm5); +vcvtbiasph2hf8s(ym1|k2, zm3, ptr[rax+128]); +vcvtbiasph2hf8s(ym1|k2, zm3, ptr_b[rax+128]); + +vcvthf82ph(xm1|k5|T_z, xm2); +vcvthf82ph(xm1|k5|T_z, ptr[rax+128]); + +vcvthf82ph(ym1|k5|T_z, xm2); +vcvthf82ph(ym1|k5|T_z, ptr[rax+128]); + +vcvthf82ph(zm1|k5|T_z, ym2); 
+vcvthf82ph(zm1|k5|T_z, ptr[rax+128]); + +// +vcvtne2ph2bf8(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2bf8(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2bf8(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2bf8(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2bf8(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2bf8(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2bf8(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2bf8(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2bf8(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2bf8s(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2bf8s(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2bf8s(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2bf8s(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2bf8s(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2bf8s(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2bf8s(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2bf8s(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2bf8s(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2hf8(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2hf8(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2hf8(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2hf8(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2hf8(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2hf8(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2hf8(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2hf8(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2hf8(zm1|T_z, zm2, ptr_b[rax+128]); + +// +vcvtne2ph2hf8s(xm1|k4|T_z, xm2, xm3); +vcvtne2ph2hf8s(xm1|k4, xm2, ptr[rax+128]); +vcvtne2ph2hf8s(xm1|T_z, xm2, ptr_b[rax+128]); + +vcvtne2ph2hf8s(ym1|k4|T_z, ym2, ym3); +vcvtne2ph2hf8s(ym1|k4, ym2, ptr[rax+128]); +vcvtne2ph2hf8s(ym1|T_z, ym2, ptr_b[rax+128]); + +vcvtne2ph2hf8s(zm1|k4|T_z, zm2, zm3); +vcvtne2ph2hf8s(zm1|k4, zm2, ptr[rax+128]); +vcvtne2ph2hf8s(zm1|T_z, zm2, ptr_b[rax+128]); + +// vcvtneph2bf8 +vcvtneph2bf8(xmm1|k2|T_z, xmm2); +vcvtneph2bf8(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2bf8(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2bf8(xmm1|k2|T_z, ymm2); +vcvtneph2bf8(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2bf8(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2bf8(ymm1|k2|T_z, zmm2); +vcvtneph2bf8(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2bf8(ymm1|k2|T_z, 
zword_b[rax+128]); + +// vcvtneph2bf8s +vcvtneph2bf8s(xmm1|k2|T_z, xmm2); +vcvtneph2bf8s(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2bf8s(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2bf8s(xmm1|k2|T_z, ymm2); +vcvtneph2bf8s(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2bf8s(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2bf8s(ymm1|k2|T_z, zmm2); +vcvtneph2bf8s(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2bf8s(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2hf8 +vcvtneph2hf8(xmm1|k2|T_z, xmm2); +vcvtneph2hf8(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2hf8(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2hf8(xmm1|k2|T_z, ymm2); +vcvtneph2hf8(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2hf8(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2hf8(ymm1|k2|T_z, zmm2); +vcvtneph2hf8(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2hf8(ymm1|k2|T_z, zword_b[rax+128]); + +// vcvtneph2hf8s +vcvtneph2hf8s(xmm1|k2|T_z, xmm2); +vcvtneph2hf8s(xmm1|k2|T_z, xword [rax+128]); +vcvtneph2hf8s(xmm1|k2|T_z, xword_b[rax+128]); + +vcvtneph2hf8s(xmm1|k2|T_z, ymm2); +vcvtneph2hf8s(xmm1|k2|T_z, yword[rax+128]); +vcvtneph2hf8s(xmm1|k2|T_z, yword_b[rax+128]); + +vcvtneph2hf8s(ymm1|k2|T_z, zmm2); +vcvtneph2hf8s(ymm1|k2|T_z, zword[rax+128]); +vcvtneph2hf8s(ymm1|k2|T_z, zword_b[rax+128]); diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt new file mode 100644 index 00000000..9464d034 --- /dev/null +++ b/test/avx10/misc.txt @@ -0,0 +1,167 @@ +vdpphps(xm1, xm2, xm3); +vdpphps(xm1, xm2, ptr[rax+128]); +vdpphps(xm1, xm2, ptr_b[rax+128]); + +vdpphps(ym1, ym2, ym3); +vdpphps(ym1, ym2, ptr[rax+128]); +vdpphps(ym1, ym2, ptr_b[rax+128]); + +vdpphps(zm1, zm2, zm3); +vdpphps(zm1, zm2, ptr[rax+128]); +vdpphps(zm1, zm2, ptr_b[rax+128]); +// +vmpsadbw(xm1, xm3, xm15, 3); +vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); + +vmpsadbw(ym1|k4, ym3, ym15, 3); +vmpsadbw(ym1, ym4, ptr[rax+128], 5); + +vmpsadbw(zm1|k4, zm3, zm15, 3); +vmpsadbw(zm1, zm4, ptr[rax+128], 5); +// +vpdpbssd(xm1, xm2, xm3); +vpdpbssd(xm1, xm2, ptr[rax+128]); +vpdpbssd(xm1, xm2, ptr_b[rax+128]); 
+ +vpdpbssd(ym1, ym2, ym3); +vpdpbssd(ym1, ym2, ptr[rax+128]); +vpdpbssd(ym1, ym2, ptr_b[rax+128]); + +vpdpbssd(zm1, zm2, zm3); +vpdpbssd(zm1, zm2, ptr[rax+128]); +vpdpbssd(zm1, zm2, ptr_b[rax+128]); +// +vpdpbssds(xm1, xm2, xm3); +vpdpbssds(xm1, xm2, ptr[rax+128]); +vpdpbssds(xm1, xm2, ptr_b[rax+128]); + +vpdpbssds(ym1, ym2, ym3); +vpdpbssds(ym1, ym2, ptr[rax+128]); +vpdpbssds(ym1, ym2, ptr_b[rax+128]); + +vpdpbssds(zm1, zm2, zm3); +vpdpbssds(zm1, zm2, ptr[rax+128]); +vpdpbssds(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsud(xm1, xm2, xm3); +vpdpbsud(xm1, xm2, ptr[rax+128]); +vpdpbsud(xm1, xm2, ptr_b[rax+128]); + +vpdpbsud(ym1, ym2, ym3); +vpdpbsud(ym1, ym2, ptr[rax+128]); +vpdpbsud(ym1, ym2, ptr_b[rax+128]); + +vpdpbsud(zm1, zm2, zm3); +vpdpbsud(zm1, zm2, ptr[rax+128]); +vpdpbsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbsuds(xm1, xm2, xm3); +vpdpbsuds(xm1, xm2, ptr[rax+128]); +vpdpbsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpbsuds(ym1, ym2, ym3); +vpdpbsuds(ym1, ym2, ptr[rax+128]); +vpdpbsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbsuds(zm1, zm2, zm3); +vpdpbsuds(zm1, zm2, ptr[rax+128]); +vpdpbsuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpbuud(xm1, xm2, xm3); +vpdpbuud(xm1, xm2, ptr[rax+128]); +vpdpbuud(xm1, xm2, ptr_b[rax+128]); + +vpdpbuud(ym1, ym2, ym3); +vpdpbuud(ym1, ym2, ptr[rax+128]); +vpdpbuud(ym1, ym2, ptr_b[rax+128]); + +vpdpbuud(zm1, zm2, zm3); +vpdpbuud(zm1, zm2, ptr[rax+128]); +vpdpbuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpbuuds(xm1, xm2, xm3); +vpdpbuuds(xm1, xm2, ptr[rax+128]); +vpdpbuuds(xm1, xm2, ptr_b[rax+128]); + +vpdpbuuds(ym1, ym2, ym3); +vpdpbuuds(ym1, ym2, ptr[rax+128]); +vpdpbuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpbuuds(zm1, zm2, zm3); +vpdpbuuds(zm1, zm2, ptr[rax+128]); +vpdpbuuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwsud(xm1, xm2, xm3); +vpdpwsud(xm1, xm2, ptr[rax+128]); +vpdpwsud(xm1, xm2, ptr_b[rax+128]); + +vpdpwsud(ym1, ym2, ym3); +vpdpwsud(ym1, ym2, ptr[rax+128]); +vpdpwsud(ym1, ym2, ptr_b[rax+128]); + +vpdpwsud(zm1, zm2, zm3); +vpdpwsud(zm1, zm2, 
ptr[rax+128]); +vpdpwsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwsuds(xm1, xm2, xm3); +vpdpwsuds(xm1, xm2, ptr[rax+128]); +vpdpwsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwsuds(ym1, ym2, ym3); +vpdpwsuds(ym1, ym2, ptr[rax+128]); +vpdpwsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwsuds(zm1, zm2, zm3); +vpdpwsuds(zm1, zm2, ptr[rax+128]); +vpdpwsuds(zm1, zm2, ptr_b[rax+128]); +// +vpdpwsud(xm1, xm2, xm3); +vpdpwsud(xm1, xm2, ptr[rax+128]); +vpdpwsud(xm1, xm2, ptr_b[rax+128]); + +vpdpwsud(ym1, ym2, ym3); +vpdpwsud(ym1, ym2, ptr[rax+128]); +vpdpwsud(ym1, ym2, ptr_b[rax+128]); + +vpdpwsud(zm1, zm2, zm3); +vpdpwsud(zm1, zm2, ptr[rax+128]); +vpdpwsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwsuds(xm1, xm2, xm3); +vpdpwsuds(xm1, xm2, ptr[rax+128]); +vpdpwsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwsuds(ym1, ym2, ym3); +vpdpwsuds(ym1, ym2, ptr[rax+128]); +vpdpwsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwsuds(zm1, zm2, zm3); +vpdpwsuds(zm1, zm2, ptr[rax+128]); +vpdpwsuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwuud(xm1, xm2, xm3); +vpdpwuud(xm1, xm2, ptr[rax+128]); +vpdpwuud(xm1, xm2, ptr_b[rax+128]); + +vpdpwuud(ym1, ym2, ym3); +vpdpwuud(ym1, ym2, ptr[rax+128]); +vpdpwuud(ym1, ym2, ptr_b[rax+128]); + +vpdpwuud(zm1, zm2, zm3); +vpdpwuud(zm1, zm2, ptr[rax+128]); +vpdpwuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwuuds(xm1, xm2, xm3); +vpdpwuuds(xm1, xm2, ptr[rax+128]); +vpdpwuuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwuuds(ym1, ym2, ym3); +vpdpwuuds(ym1, ym2, ptr[rax+128]); +vpdpwuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwuuds(zm1, zm2, zm3); +vpdpwuuds(zm1, zm2, ptr[rax+128]); +vpdpwuuds(zm1, zm2, ptr_b[rax+128]); diff --git a/test/target/avx10.txt b/test/avx10/new-ymm.txt similarity index 100% rename from test/target/avx10.txt rename to test/avx10/new-ymm.txt diff --git a/test/avx10/old.txt b/test/avx10/old.txt new file mode 100644 index 00000000..9e4f097d --- /dev/null +++ b/test/avx10/old.txt @@ -0,0 +1,657 @@ +v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); +v4fmaddss(xmm15, xmm8, ptr [rax + 64]); +v4fnmaddps(zmm5 | k5, 
zmm2, ptr [rcx + 0x80]); +v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); +vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); +vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); +vaesdec(xmm20, xmm30, ptr [rcx + 64]); +vaesdec(ymm1, ymm2, ptr [rcx + 64]); +vaesdec(zmm1, zmm2, ptr [rcx + 64]); +vaesdeclast(xmm20, xmm30, ptr [rax + 64]); +vaesdeclast(ymm20, ymm30, ptr [rax + 64]); +vaesdeclast(zmm20, zmm30, ptr [rax + 64]); +vaesenc(xmm20, xmm30, ptr [rcx + 64]); +vaesenc(ymm1, ymm2, ptr [rcx + 64]); +vaesenc(zmm1, zmm2, ptr [rcx + 64]); +vaesenclast(xmm20, xmm30, ptr [rax + 64]); +vaesenclast(ymm20, ymm30, ptr [rax + 64]); +vaesenclast(zmm20, zmm30, ptr [rax + 64]); +vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3); +vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3); +vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3); +vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3); +vpcompressb(ptr[rax + 64], xmm1); +vpcompressb(xmm30 | k5, xmm1); +vpcompressb(ptr[rax + 64], ymm1); +vpcompressb(ymm30 | k3 |T_z, ymm1); +vpcompressb(ptr[rax + 64], zmm1); +vpcompressb(zmm30 | k2 |T_z, zmm1); +vpcompressw(ptr[rax + 64], xmm1); +vpcompressw(xmm30 | k5, xmm1); +vpcompressw(ptr[rax + 64], ymm1); +vpcompressw(ymm30 | k3 |T_z, ymm1); +vpcompressw(ptr[rax + 64], zmm1); +vpcompressw(zmm30 | k2 |T_z, zmm1); +vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 
0x40]); +vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); +vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); +vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); +vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); +vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); +vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); +vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); +vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); +vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); +vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]); 
+vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]); +vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]); +vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); +vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); +vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); +vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); +vpexpandb(xmm5|k3|T_z, xmm30); +vpexpandb(ymm5|k3|T_z, ymm30); +vpexpandb(zmm5|k3|T_z, zmm30); +vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(xmm5|k3|T_z, xmm30); 
+vpexpandw(ymm5|k3|T_z, ymm30); +vpexpandw(zmm5|k3|T_z, zmm30); +vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]); +vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]); +vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]); +gf2p8affineinvqb(xmm1, xmm2, 3); +gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8affineqb(xmm1, xmm2, 3); +gf2p8affineqb(xmm1, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm1, xmm5, xmm2, 3); +vgf2p8affineqb(ymm1, ymm5, ymm2, 3); +vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3); +vgf2p8affineqb(xmm30, xmm31, xmm4, 5); +vgf2p8affineqb(ymm30, ymm31, ymm4, 5); +vgf2p8affineqb(zmm30, zmm31, zmm4, 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); +vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); +vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); +gf2p8mulb(xmm1, xmm2); +gf2p8mulb(xmm1, ptr [rax + 0x40]); +vgf2p8mulb(xmm1, xmm5, xmm2); +vgf2p8mulb(ymm1, ymm5, ymm2); +vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]); 
+vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(xmm30, xmm31, xmm4); +vgf2p8mulb(ymm30, ymm31, ymm4); +vgf2p8mulb(zmm30, zmm31, zmm4); +vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]); +vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]); +vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]); +vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); +vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); +vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); +vcvtneps2bf16(xmm0, xword [rax + 64]); +vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); +vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); +vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); +vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); +vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); +ldtilecfg(ptr[rax + rcx * 4 + 64]); +sttilecfg(ptr[rsp + rax * 8 + 128]); +tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]); +tileloaddt1(tmm4, ptr[r8 + r9 + 32]); +tilerelease(); +tilestored(ptr[r10 + r11 * 2 + 32], tmm2); +tilezero(tmm7); +tdpbssd(tmm1, tmm2, tmm3); +tdpbsud(tmm2, tmm3, tmm4); +tdpbusd(tmm3, tmm4, tmm5); +tdpbuud(tmm4, tmm5, tmm6); +tdpbf16ps(tmm5, tmm6, tmm7); +tileloadd(tmm1, ptr[r8+r8]); +tileloadd(tmm1, ptr[rax+rcx*4]); +tileloadd(tmm1, ptr[r8+r9*1+0x40]); +vaddph(zmm0, zmm1, ptr[rax+64]); +vaddph(ymm0, ymm1, ptr[rax+64]); +vaddph(xmm0, xmm1, ptr[rax+64]); +vaddph(zmm0, zmm1, ptr_b[rax+64]); +vaddph(ymm0, ymm1, ptr_b[rax+64]); +vaddph(xmm0, xmm1, ptr_b[rax+64]); +vaddsh(xmm0, xmm15, ptr[rax+64]); +vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3); +vcmpph(k1, xm15, ptr[rax+64], 1); +vcmpph(k2, ym15, ptr[rax+64], 2); +vcmpph(k3, zm15, ptr[rax+64], 3); +vcmpph(k1, xm15, ptr_b[rax+64], 1); +vcmpph(k2, ym15, ptr_b[rax+64], 2); +vcmpph(k3, zm15, ptr_b[rax+64], 3); +vcmpsh(k1, xm15, ptr[rax+64], 1); +vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4); +vcomish(xmm1, ptr[rax+64]); +vcomish(xmm1|T_sae, xmm15); +vucomish(xmm1, ptr [rax+0x40]); +vucomish(xmm1|T_sae, xmm15); +vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]); 
+vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(xmm1|k3, xmm2, xmm5); +vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]); +vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]); +vfmaddsub213ph(ymm1|k3, ymm2, ymm5); +vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]); +vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5); +vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]); +vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]); +vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]); +vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]); +vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]); +vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); +vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); +vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]); +vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]); +vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]); +vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5); 
+vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]); +vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]); +vfmaddcph(xm1, xm2, ptr[rax+0x40]); +vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]); +vfmaddcph(zm1, zm2, ptr_b[rax+0x40]); +vfcmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vfmulcph(xmm1, xmm2, ptr [rax+0x40]); +vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); +vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]); +vrcpph(xmm1, ptr [rax+0x40]); +vrcpph(xmm1, ptr_b [rax+0x40]); +vrcpph(ymm1, ptr [rax+0x40]); +vrcpph(ymm1, ptr_b [rax+0x40]); +vrcpph(zmm1, ptr [rax+0x40]); +vrcpph(zmm1, ptr_b [rax+0x40]); +vrcpsh(xmm1, xmm3, ptr [rax+0x40]); +vrsqrtph(xmm1, ptr [rax+0x40]); +vrsqrtph(xmm1, ptr_b [rax+0x40]); +vrsqrtph(ymm2, ptr [rax+0x40]); +vrsqrtph(ymm2, ptr_b [rax+0x40]); +vrsqrtph(zmm2, ptr [rax+0x40]); +vrsqrtph(zmm2, ptr_b [rax+0x40]); +vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]); +vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]); +vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]); +vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7); +vscalefph(xmm1, xmm5, ptr [rax+0x40]); +vscalefph(xmm1, xmm5, ptr_b [rax+0x40]); +vscalefph(ymm1, ymm5, ptr [rax+0x40]); +vscalefph(ymm1, ymm5, ptr_b [rax+0x40]); +vscalefph(zmm1, zmm5, ptr [rax+0x40]); +vscalefph(zmm1, zmm5, ptr_b [rax+0x40]); +vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7); +vscalefsh(xmm1, xmm5, ptr [rax+0x40]); +vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7); +vreduceph(xmm1, ptr [rax+0x40], 0x1); +vreduceph(xmm1, ptr_b [rax+0x40], 0x2); +vreduceph(ymm1, ptr [rax+0x40], 0x3); +vreduceph(ymm1, ptr_b [rax+0x40], 0x4); +vreduceph(zmm1, ptr [rax+0x40], 0x5); +vreduceph(zmm1, ptr_b [rax+0x40], 0x6); +vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vreducesh(xmm1, xmm3, ptr [rax+0x40], 
0x1); +vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vrndscaleph(xmm1, ptr [rax+0x40], 0x1); +vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2); +vrndscaleph(ymm1, ptr [rax+0x40], 0x3); +vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4); +vrndscaleph(zmm1, ptr [rax+0x40], 0x5); +vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6); +vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7); +vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1); +vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); +vfpclassph(k1, xword [rax+0x40], 0x1); +vfpclassph(k1, xword_b[rax+0x40], 0x2); +vfpclassph(k1, yword [rax+0x40], 0x3); +vfpclassph(k1, yword_b[rax+0x40], 0x4); +vfpclassph(k1, zword [rax+0x40], 0x5); +vfpclassph(k1, zword_b[rax+0x40], 0x6); +vfpclasssh(k1|k2, xmm3, 0x5); +vfpclasssh(k1|k2, ptr [rax+0x40], 0x5); +vgetexpph(xmm1, ptr [rax+0x40]); +vgetexpph(ymm1, ptr_b [rax+0x40]); +vgetexpph(zmm1, ptr [rax+0x40]); +vgetexpph(zmm1|k1|T_z|T_sae, zmm5); +vgetexpsh(xmm1, xmm5, ptr [rax+0x40]); +vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5); +vgetmantph(xmm1, ptr [rax+0x40], 0x1); +vgetmantph(ymm1, ptr_b [rax+0x40], 0x2); +vgetmantph(zmm1, ptr [rax+0x40], 0x3); +vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4); +vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5); +vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); +vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); +vmovsh(ptr [rax+0x40]|k1, xmm1); +vmovsh(xmm1|k2|T_z, xmm3, xmm5); +vmovw(xmm1, r13d); +vmovw(xmm3, ptr [rax+0x40]); +vmovw(r9d, xmm1); +vmovw(ptr [rax+0x40], xmm7); +vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3); +vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]); +vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); +vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]); +vcvtsh2si(edx|T_rd_sae, xmm1); +vcvtsh2si(edx, ptr [rax+0x40]); +vcvtsh2si(rdx|T_rd_sae, xmm1); +vcvtsh2si(r8, ptr [rax+0x40]); +vcvtph2dq(xmm1, xmm5); +vcvtph2dq(xmm1, ptr [rax+0x40]); +vcvtph2dq(xmm1, ptr_b 
[rax+0x40]); +vcvtph2dq(ymm1|k2|T_z, xmm5); +vcvtph2dq(ymm1, ptr [rax+0x40]); +vcvtph2dq(ymm1, ptr_b [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2psx(xmm1, xmm5); +vcvtph2psx(xmm1, ptr [rax+0x40]); +vcvtph2psx(xmm1, ptr_b [rax+0x40]); +vcvtph2psx(ymm1|k2|T_z, xmm5); +vcvtph2psx(ymm1, ptr [rax+0x40]); +vcvtph2psx(ymm1, ptr_b [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3); +vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2udq(xmm1, xmm5); +vcvtph2udq(xmm1, ptr [rax+0x40]); +vcvtph2udq(xmm1, ptr_b [rax+0x40]); +vcvtph2udq(ymm1|k2|T_z, xmm5); +vcvtph2udq(ymm1, ptr [rax+0x40]); +vcvtph2udq(ymm1, ptr_b [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3); +vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2dq(xmm1, xmm5); +vcvttph2dq(xmm1, ptr [rax+0x40]); +vcvttph2dq(xmm1, ptr_b [rax+0x40]); +vcvttph2dq(ymm1|k2|T_z, xmm5); +vcvttph2dq(ymm1, ptr [rax+0x40]); +vcvttph2dq(ymm1, ptr_b [rax+0x40]); +vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2udq(xmm1, xmm5); +vcvttph2udq(xmm1, ptr [rax+0x40]); +vcvttph2udq(xmm1, ptr_b [rax+0x40]); +vcvttph2udq(ymm1|k2|T_z, xmm5); +vcvttph2udq(ymm1, ptr [rax+0x40]); +vcvttph2udq(ymm1, ptr_b [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3); +vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2pd(xmm1, xmm5); +vcvtph2pd(xmm1, ptr [rax+0x40]); +vcvtph2pd(xmm1, ptr_b [rax+0x40]); +vcvtph2pd(ymm1|k2|T_z, xmm5); +vcvtph2pd(ymm1, ptr [rax+0x40]); +vcvtph2pd(ymm1, ptr_b [rax+0x40]); +vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3); +vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2qq(xmm1, xmm5); +vcvtph2qq(xmm1, ptr [rax+0x40]); +vcvtph2qq(xmm1, ptr_b [rax+0x40]); +vcvtph2qq(ymm1|k2|T_z, xmm5); 
+vcvtph2qq(ymm1, ptr [rax+0x40]); +vcvtph2qq(ymm1, ptr_b [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtph2uqq(xmm1, xmm5); +vcvtph2uqq(xmm1, ptr [rax+0x40]); +vcvtph2uqq(xmm1, ptr_b [rax+0x40]); +vcvtph2uqq(ymm1|k2|T_z, xmm5); +vcvtph2uqq(ymm1, ptr [rax+0x40]); +vcvtph2uqq(ymm1, ptr_b [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3); +vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvttph2uqq(xmm1, xmm5); +vcvttph2uqq(xmm1, ptr [rax+0x40]); +vcvttph2uqq(xmm1, ptr_b [rax+0x40]); +vcvttph2uqq(ymm1|k2|T_z, xmm5); +vcvttph2uqq(ymm1, ptr [rax+0x40]); +vcvttph2uqq(ymm1, ptr_b [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtdq2ph(xmm1, xmm5); +vcvtdq2ph(xmm1, xword [rax+0x40]); +vcvtdq2ph(xmm1, xword_b [rax+0x40]); +vcvtdq2ph(xmm1, yword [rax+0x40]); +vcvtdq2ph(xmm1, yword_b [rax+0x40]); +vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtdq2ph(ymm1, ptr [rax+0x40]); +vcvtdq2ph(ymm1, ptr_b [rax+0x40]); +vcvtps2phx(xmm1, xmm5); +vcvtps2phx(xmm1, xword [rax+0x40]); +vcvtps2phx(xmm1, xword_b [rax+0x40]); +vcvtps2phx(xmm1, yword [rax+0x40]); +vcvtps2phx(xmm1, yword_b [rax+0x40]); +vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtps2phx(ymm1, ptr [rax+0x40]); +vcvtps2phx(ymm1, ptr_b [rax+0x40]); +vcvtudq2ph(xmm1, xmm5); +vcvtudq2ph(xmm1, xword [rax+0x40]); +vcvtudq2ph(xmm1, xword_b [rax+0x40]); +vcvtudq2ph(xmm1, yword [rax+0x40]); +vcvtudq2ph(xmm1, yword_b [rax+0x40]); +vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); +vcvtudq2ph(ymm1, ptr [rax+0x40]); +vcvtudq2ph(ymm1, ptr_b [rax+0x40]); +vcvtpd2ph(xmm1, xmm5); +vcvtpd2ph(xmm1, ymm5); +vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtpd2ph(xmm1, xword [rax+0x40]); +vcvtpd2ph(xmm1, xword_b [rax+0x40]); +vcvtpd2ph(xmm1, yword [rax+0x40]); +vcvtpd2ph(xmm1, yword_b [rax+0x40]); +vcvtpd2ph(xmm1, zword [rax+0x40]); 
+vcvtpd2ph(xmm1, zword_b [rax+0x40]); +vcvtqq2ph(xmm1, xmm5); +vcvtqq2ph(xmm1, ymm5); +vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtqq2ph(xmm1, xword [rax+0x40]); +vcvtqq2ph(xmm1, xword_b [rax+0x40]); +vcvtqq2ph(xmm1, yword [rax+0x40]); +vcvtqq2ph(xmm1, yword_b [rax+0x40]); +vcvtqq2ph(xmm1, zword [rax+0x40]); +vcvtqq2ph(xmm1, zword_b [rax+0x40]); +vcvtuqq2ph(xmm1, xmm5); +vcvtuqq2ph(xmm1, ymm5); +vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuqq2ph(xmm1, xword [rax+0x40]); +vcvtuqq2ph(xmm1, xword_b [rax+0x40]); +vcvtuqq2ph(xmm1, yword [rax+0x40]); +vcvtuqq2ph(xmm1, yword_b [rax+0x40]); +vcvtuqq2ph(xmm1, zword [rax+0x40]); +vcvtuqq2ph(xmm1, zword_b [rax+0x40]); +vcvtph2uw(xmm1, xmm5); +vcvtph2uw(xmm1, ptr [rax+0x40]); +vcvtph2uw(xmm1, ptr_b [rax+0x40]); +vcvtph2uw(ymm1, ptr [rax+0x40]); +vcvtph2uw(ymm1, ptr_b [rax+0x40]); +vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2uw(zmm1, ptr [rax+0x40]); +vcvtph2uw(zmm1, ptr_b [rax+0x40]); +vcvtph2w(xmm1, xmm5); +vcvtph2w(xmm1, ptr [rax+0x40]); +vcvtph2w(xmm1, ptr_b [rax+0x40]); +vcvtph2w(ymm1, ptr [rax+0x40]); +vcvtph2w(ymm1, ptr_b [rax+0x40]); +vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtph2w(zmm1, ptr [rax+0x40]); +vcvtph2w(zmm1, ptr_b [rax+0x40]); +vcvttph2uw(xmm1, xmm5); +vcvttph2uw(xmm1, ptr [rax+0x40]); +vcvttph2uw(xmm1, ptr_b [rax+0x40]); +vcvttph2uw(ymm1, ptr [rax+0x40]); +vcvttph2uw(ymm1, ptr_b [rax+0x40]); +vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2uw(zmm1, ptr [rax+0x40]); +vcvttph2uw(zmm1, ptr_b [rax+0x40]); +vcvttph2w(xmm1, xmm5); +vcvttph2w(xmm1, ptr [rax+0x40]); +vcvttph2w(xmm1, ptr_b [rax+0x40]); +vcvttph2w(ymm1, ptr [rax+0x40]); +vcvttph2w(ymm1, ptr_b [rax+0x40]); +vcvttph2w(zmm1|k2|T_z|T_sae, zmm5); +vcvttph2w(zmm1, ptr [rax+0x40]); +vcvttph2w(zmm1, ptr_b [rax+0x40]); +vcvtuw2ph(xmm1, xmm5); +vcvtuw2ph(xmm1, ptr [rax+0x40]); +vcvtuw2ph(xmm1, ptr_b [rax+0x40]); +vcvtuw2ph(ymm1, ptr [rax+0x40]); +vcvtuw2ph(ymm1, ptr_b [rax+0x40]); +vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtuw2ph(zmm1, ptr [rax+0x40]); 
+vcvtuw2ph(zmm1, ptr_b [rax+0x40]); +vcvtw2ph(xmm1, xmm5); +vcvtw2ph(xmm1, ptr [rax+0x40]); +vcvtw2ph(xmm1, ptr_b [rax+0x40]); +vcvtw2ph(ymm1, ptr [rax+0x40]); +vcvtw2ph(ymm1, ptr_b [rax+0x40]); +vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); +vcvtw2ph(zmm1, ptr [rax+0x40]); +vcvtw2ph(zmm1, ptr_b [rax+0x40]); +vcvtps2ph(xmm1, xmm2, 0x1); +vcvtps2ph(ptr [rax+0x40], xmm2, 0x2); +vcvtps2ph(xmm1, ymm2, 0x3); +vcvtps2ph(ptr [rax+0x40], ymm2, 0x4); +vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5); +vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6); +vcvtps2ph(xmm1|k2, ymm4, 0x7); +vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8); +vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9); +vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa); +vcvtsh2usi(ecx|T_rd_sae, xmm1); +vcvtsh2usi(eax, ptr [rax+0x40]); +vcvtsh2usi(r9|T_rd_sae, xmm1); +vcvtsh2usi(r13, ptr [rax+0x40]); +vcvttsh2si(ecx|T_sae, xmm1); +vcvttsh2si(eax, ptr [rax+0x40]); +vcvttsh2si(r9|T_sae, xmm1); +vcvttsh2si(r13, ptr [rax+0x40]); +vcvttsh2usi(ecx|T_sae, xmm1); +vcvttsh2usi(eax, ptr [rax+0x40]); +vcvttsh2usi(r9|T_sae, xmm1); +vcvttsh2usi(r13, ptr [rax+0x40]); +vcvttph2qq(xmm1, xmm5); +vcvttph2qq(xmm1, ptr [rax+0x40]); +vcvttph2qq(xmm1, ptr_b [rax+0x40]); +vcvttph2qq(ymm1|k2|T_z, xmm5); +vcvttph2qq(ymm1, ptr [rax+0x40]); +vcvttph2qq(ymm1, ptr_b [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3); +vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]); +vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax); +vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax); +vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]); +vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9); +vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]); +aadd(ptr[rax], ecx); +aadd(ptr[eax], ecx); +aadd(ptr[rax], r10); +aand(ptr[rax], ecx); +aand(ptr[eax], ecx); +aand(ptr[rax], r10); +aor(ptr[rax], ecx); +aor(ptr[eax], ecx); +aor(ptr[rax], r10); +axor(ptr[rax], ecx); +axor(ptr[eax], ecx); +axor(ptr[rax], r10); +cmpbexadd(ptr[rax+r10*4], 
rcx, rdx); +cmpbxadd(ptr[rax+r10*4], rcx, rdx); +cmplexadd(ptr[rax+r10*4], rcx, rdx); +cmplxadd(ptr[rax+r10*4], rcx, rdx); +cmpnbexadd(ptr[rax+r10*4], rcx, rdx); +cmpnbxadd(ptr[rax+r10*4], rcx, rdx); +cmpnlexadd(ptr[rax+r10*4], rcx, rdx); +cmpnlxadd(ptr[rax+r10*4], rcx, rdx); +cmpnoxadd(ptr[rax+r10*4], rcx, rdx); +cmpnpxadd(ptr[rax+r10*4], rcx, rdx); +cmpnsxadd(ptr[rax+r10*4], rcx, rdx); +cmpnzxadd(ptr[rax+r10*4], rcx, rdx); +cmpoxadd(ptr[rax+r10*4], rcx, rdx); +cmppxadd(ptr[rax+r10*4], rcx, rdx); +cmpsxadd(ptr[rax+r10*4], rcx, rdx); +cmpzxadd(ptr[rax+r10*4], rcx, rdx); +vsha512msg1(ymm3, xmm5); +vsha512msg2(ymm9, ymm10); +vsha512rnds2(ymm1, ymm3, xmm2); +vsm3msg1(xmm1, xmm2, xmm3); +vsm3msg1(xmm1, xmm2, ptr [rax]); +vsm3msg2(xmm5, xmm7, xmm3); +vsm3msg2(xmm5, xmm6, ptr [rax]); +vsm3rnds2(xmm5, xmm7, xmm3, 0x12); +vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34); +vsm4key4(xmm1, xmm2, xmm3); +vsm4key4(xmm1, xmm2, ptr [rdx]); +vsm4rnds4(xmm1, xmm2, xmm3); +vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]); +vpdpbssd(xmm1, xmm2, xmm3); +vpdpbssd(ymm1, ymm2, ptr [rax]); +vpdpbssds(xmm1, xmm2, xmm3); +vpdpbssds(ymm1, ymm2, ptr [rax]); +vpdpbsud(xmm1, xmm2, xmm3); +vpdpbsud(ymm1, ymm2, ptr [rax]); +vpdpbsuds(xmm1, xmm2, xmm3); +vpdpbsuds(ymm1, ymm2, ptr [rax]); +vpdpbuud(xmm1, xmm2, xmm3); +vpdpbuud(ymm1, ymm2, ptr [rax]); +vpdpbuuds(xmm1, xmm2, xmm3); +vpdpbuuds(ymm1, ymm2, ptr [rax]); +vpdpwsud(xmm1, xmm2, xmm3); +vpdpwsud(ymm1, ymm2, ptr [rax]); +vpdpwsuds(xmm1, xmm2, xmm3); +vpdpwsuds(ymm1, ymm2, ptr [rax]); +vpdpwusd(xmm1, xmm2, xmm3); +vpdpwusd(ymm1, ymm2, ptr [rax]); +vpdpwusds(xmm1, xmm2, xmm3); +vpdpwusds(ymm1, ymm2, ptr [rax]); +vpdpwuud(xmm1, xmm2, xmm3); +vpdpwuud(ymm1, ymm2, ptr [rax]); +vpdpwuuds(xmm1, xmm2, xmm3); +vpdpwuuds(ymm1, ymm2, ptr [rax]); diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp index 9a4a8480..5f742fe7 100644 --- a/test/avx10_test.cpp +++ b/test/avx10_test.cpp @@ -228,3 +228,27 @@ CYBOZU_TEST_AUTO(ymm_with_sae) CYBOZU_TEST_EQUAL(c.getSize(), n); 
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + +CYBOZU_TEST_AUTO(vmpsadbw) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + setDefaultEncoding(); + vmpsadbw(xm1, xm3, xm15, 3); // vex(avx) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2) + setDefaultEncoding(VexEncoding, EvexEncoding); + vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2) + } + } c; + const uint8_t tbl[] = { + 0xc4, 0xc3, 0x61, 0x42, 0xcf, 0x03, + 0xc4, 0xe3, 0x65, 0x42, 0x88, 0x80, 0x00, 0x00, 0x00, 0x03, + 0x62, 0xd3, 0x66, 0x28, 0x42, 0xcf, 0x03, + 0x62, 0xf3, 0x66, 0x28, 0x42, 0x48, 0x04, 0x03, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index 08dc8afe..ddac779a 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -1,10 +1,14 @@ #include #include +using namespace Xbyak; + struct Code : Xbyak::CodeGenerator { Code() + : Xbyak::CodeGenerator(4096*8) { -#include "cpp.txt" + setDefaultEncoding(VexEncoding, EvexEncoding); +#include "tmp.cpp" } }; diff --git a/test/test_by_xed.py b/test/test_by_xed.py index f24d7f6b..afd77d8a 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -7,6 +7,25 @@ def __init__(self, s): self.name = s def __str__(self): return self.name + def __eq__(self, rhs): + return self.name == rhs.name + def __lt__(self, rhs): + return self.name < rhs.name + +g_xmmTbl = ''' +xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 +xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 +xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 +xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 +ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 +ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 +ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 +ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 +zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 +zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 +zmm16 zmm17 zmm18 zmm19 
zmm20 zmm21 zmm22 zmm23 +zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 +'''.split() g_regTbl = ''' eax ecx edx ebx esp ebp esi edi @@ -22,49 +41,53 @@ def __str__(self): r8b r9b r10b r11b r12b r13b r14b r15b r16b r17b r18b r19b r20b r21b r22b r23b r24b r25b r26b r27b r28b r29b r30b r31b spl bpl sil dil -xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 -xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 -xmm16 xmm17 xmm18 xmm19 xmm20 xmm21 xmm22 xmm23 -xmm24 xmm25 xmm26 xmm27 xmm28 xmm29 xmm30 xmm31 -ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 -ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 -ymm16 ymm17 ymm18 ymm19 ymm20 ymm21 ymm22 ymm23 -ymm24 ymm25 ymm26 ymm27 ymm28 ymm29 ymm30 ymm31 -zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 -zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 -zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 -zmm24 zmm25 zmm26 zmm27 zmm28 zmm29 zmm30 zmm31 -'''.split() +tmm0 tmm1 tmm2 tmm3 tmm4 tmm5 tmm6 tmm7 +'''.split()+g_xmmTbl # define global constants for e in g_regTbl: globals()[e] = Reg(e) +g_maskTbl = [k1, k2, k3, k4, k5, k6, k7] + g_replaceCharTbl = '{}();|,' g_replaceChar = str.maketrans(g_replaceCharTbl, ' '*len(g_replaceCharTbl)) g_sizeTbl = ['byte', 'word', 'dword', 'qword', 'xword', 'yword', 'zword'] -g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae'] #, 'T_z'] -g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae'] +g_xedSizeTbl = ['xmmword', 'ymmword', 'zmmword'] +g_attrTbl = ['T_sae', 'T_rn_sae', 'T_rd_sae', 'T_ru_sae', 'T_rz_sae', 'T_z'] +g_attrXedTbl = ['sae', 'rne-sae', 'rd-sae', 'ru-sae', 'rz-sae', 'z'] class Attr: def __init__(self, s): self.name = s def __str__(self): return self.name + def __eq__(self, rhs): + return self.name == rhs.name + def __lt__(self, rhs): + return self.name < rhs.name for e in g_attrTbl: globals()[e] = Attr(e) +def newReg(s): + if type(s) == str: + return Reg(s) + return s + class Memory: - def __init__(self, size=0, base=None, index=None, scale=0, disp=0): + def __init__(self, size=0, 
base=None, index=None, scale=0, disp=0, broadcast=False): self.size = size - self.base = base - self.index = index + self.base = newReg(base) + self.index = newReg(index) self.scale = scale self.disp = disp + self.broadcast = broadcast def __str__(self): s = 'ptr' if self.size == 0 else g_sizeTbl[int(math.log2(self.size))] + if self.broadcast: + s += '_b' s += ' [' needPlus = False if self.base: @@ -84,47 +107,72 @@ def __str__(self): s += ']' return s - def __eq__(self, rhs): - return str(self) == str(rhs) + # xbyak uses ptr if it is automatically detected, so xword == ptr is true + if self.broadcast != rhs.broadcast: return False +# if not self.broadcast and 0 < self.size <= 8 and 0 < rhs.size <= 8 and self.size != rhs.size: return False + if not self.broadcast and self.size > 0 and rhs.size > 0 and self.size != rhs.size: return False + r = self.base == rhs.base and self.index == rhs.index and self.scale == rhs.scale and self.disp == rhs.disp + return r + +def parseBroadcast(s): + if '_b' in s: + return (s.replace('_b', ''), True) + r = re.search(r'({1to\d+})', s) + if not r: + return (s, False) + return (s.replace(r.group(1), ''), True) -def parseMemory(s): - sizeTbl = { - 'byte': 1, 'word': 2, 'dword': 4, 'qword': 8, - 'xword': 16, 'yword': 32, 'zword': 64 - } +def parseMemory(s, broadcast=False): + org_s = s s = s.replace(' ', '').lower() - # Parse size size = 0 + base = index = None + scale = 0 + disp = 0 + + if not broadcast: + (s, broadcast) = parseBroadcast(s) + + # Parse size for i in range(len(g_sizeTbl)): w = g_sizeTbl[i] if s.startswith(w): size = 1< 0: s += ', ' s += str(self.args[i]) - for e in self.attrs: - s += f'|{e}' + if i == 0 and self.attrs: + for e in self.attrs: + s += f'|{e}' s += ');' return s + def __eq__(self, rhs): + return self.name == rhs.name and self.args == rhs.args and self.attrs == rhs.attrs def parseNmemonic(s): + args = [] + attrs = [] + + # remove Xbyak::{Evex,Vex}Encoding + r = re.search(r'(,[^,]*Encoding)', s) + if r: + s = 
s.replace(r.group(1), '') + + (s, broadcast) = parseBroadcast(s) + + # replace xm0 with xmm0 + while True: + r = re.search(r'([xyz])m(\d\d?)', s) + if not r: + break + s = s.replace(r.group(0), r.group(1) + 'mm' + r.group(2)) + + # check 'zmm0{k7}' + r = re.search(r'({k[1-7]})', s) + if r: + idx = int(r.group(1)[2]) + attrs.append(g_maskTbl[idx-1]) + s = s.replace(r.group(1), '') + # check 'zmm0|k7' + r = re.search(r'(\|\s*k[1-7])', s) + if r: + idx = int(r.group(1)[-1]) + attrs.append(g_maskTbl[idx-1]) + s = s.replace(r.group(1), '') + s = s.translate(g_replaceChar) # reconstruct memory string @@ -168,13 +249,12 @@ def parseNmemonic(s): inMemory = False else: v.append(e) - if e in g_sizeTbl or e == 'ptr': + if e in g_sizeTbl or e in g_xedSizeTbl or e.startswith('ptr'): v[-1] += ' ' # to avoid 'byteptr' - inMemory = True + if ']' not in v[-1]: + inMemory = True name = v[0] - args = [] - attrs = [] for e in v[1:]: if e.startswith('0x'): args.append(int(e, 16)) @@ -185,9 +265,12 @@ def parseNmemonic(s): elif e in g_attrXedTbl: attrs.append(Attr(g_attrTbl[g_attrXedTbl.index(e)])) elif e in g_regTbl: - args.append(e) + args.append(Reg(e)) + # xed special format : xmm8+3 + elif e[:-2] in g_xmmTbl and e.endswith('+3'): + args.append(Reg(e[:-2])) else: - args.append(parseMemory(e)) + args.append(parseMemory(e, broadcast)) return Nmemonic(name, args, attrs) def loadFile(name): @@ -195,7 +278,7 @@ def loadFile(name): r = [] for line in f.read().split('\n'): if line: - if line[0] == '#': + if line[0] == '#' or line.startswith('//'): continue r.append(line) return r @@ -209,19 +292,27 @@ def removeExtraInfo(s): def run(cppText, xedText): cpp = loadFile(cppText) xed = loadFile(xedText) - for i in range(len(cpp)): + n = len(cpp) + if n != len(xed): + raise Exception(f'different line {n} {len(xed)}') + + for i in range(n): line1 = cpp[i] line2 = removeExtraInfo(xed[i]) m1 = parseNmemonic(line1) m2 = parseNmemonic(line2) - assertEqualStr(m1, m2, f'{i}') - print('run ok') + 
assertEqual(m1, m2, f'{i+1}') + print('run ok', n) def assertEqualStr(a, b, msg=None): if str(a) != str(b): raise Exception(f'assert fail {msg}:', str(a), str(b)) +def assertEqual(a, b, msg=None): + if a != b: + raise Exception(f'assert fail {msg}:', str(a), str(b)) + def MemoryTest(): tbl = [ (Memory(0, rax), 'ptr [rax]'), @@ -231,18 +322,23 @@ def MemoryTest(): (Memory(8, None, rcx, 4), 'qword [rcx*4]'), (Memory(8, rax, None, 0, 5), 'qword [rax+0x5]'), (Memory(8, None, None, 0, 255), 'qword [0xff]'), + (Memory(0, r8, r9, 1, 32), 'ptr [r8+r9+0x20]'), ] for (m, expected) in tbl: assertEqualStr(m, expected) + assertEqual(Memory(16, rax), Memory(0, rax)) + def parseMemoryTest(): print('parseMemoryTest') tbl = [ ('[]', Memory()), ('[rax]', Memory(0, rax)), ('ptr[rax]', Memory(0, rax)), + ('ptr_b[rax]', Memory(0, rax, broadcast=True)), ('dword[rbx]', Memory(4, rbx)), ('xword ptr[rcx]', Memory(16, rcx)), + ('xmmword ptr[rcx]', Memory(16, rcx)), ('xword ptr[rdx*8]', Memory(16, None, rdx, 8)), ('[12345]', Memory(0, None, None, 0, 12345)), ('[0x12345]', Memory(0, None, None, 0, 0x12345)), @@ -262,10 +358,19 @@ def parseNmemonicTest(): ('mov(rax, ptr [rcx + rdx * 8 ] );', Nmemonic('mov', [rax, Memory(0, rcx, rdx, 8)])), ('vcmppd(k1, ymm2, ymm3 |T_sae, 3);', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])), ('vcmppd k1{sae}, ymm2, ymm3, 0x3', Nmemonic('vcmppd', [k1, ymm2, ymm3, 3], [T_sae])), + ('v4fmaddps zmm1, zmm8+3, xmmword ptr [rdx+0x40]', Nmemonic('v4fmaddps', [zmm1, zmm8, Memory(16, rdx, None, 0, 0x40)])), + ('vp4dpwssd zmm23{k7}{z}, zmm1+3, xmmword ptr [rax+0x40]', Nmemonic('vp4dpwssd', [zmm23, zmm1, Memory(16, rax, None, 0, 0x40)], [k7, T_z])), + ('v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);', Nmemonic('v4fnmaddps', [zmm5, zmm2, Memory(0, rcx, None, 0, 0x80)], [k5])), + ('vpcompressw(zmm30 | k2 |T_z, zmm1);', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), + ('vpcompressw zmm30{k2}{z}, zmm1', Nmemonic('vpcompressw', [zmm30, zmm1], [k2, T_z])), + 
('vpshldw(xmm9|k3|T_z, xmm2, ptr [rax + 0x40], 5);', Nmemonic('vpshldw', [xmm9, xmm2, Memory(0, rax, None, 0, 0x40), 5], [k3, T_z])), + ('vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), + ('vpshrdd xmm5{k3}{z}, xmm2, dword ptr [rax+0x40]{1to4}, 0x5', Nmemonic('vpshrdd', [xmm5, xmm2, Memory(0, rax, None, 0, 0x40, True), 5], [k3, T_z])), + ('vcmpph(k1, xm15, ptr[rax+64], 1);', Nmemonic('vcmpph', [k1, xm15, Memory(0, rax, None, 0, 64), 1])), ] for (s, expected) in tbl: e = parseNmemonic(s) - assertEqualStr(e, expected) + assertEqual(e, expected) def test(): print('test start') diff --git a/test/test_by_xed.sh b/test/test_by_xed.sh index 6d820bd7..905b8a01 100755 --- a/test/test_by_xed.sh +++ b/test/test_by_xed.sh @@ -4,6 +4,7 @@ set -e XED=${XED:=xed} CXX=${CXX:=g++} PYTHON=${PYTHON:=python3} +echo $XED if [ $# -ne 1 ]; then echo "./test_by_xed.sh " @@ -15,9 +16,9 @@ TARGET=$1 CFLAGS="-Wall -Wextra -I ../" echo "test:" $TARGET -cp $TARGET cpp.txt +cp $TARGET tmp.cpp $CXX $CFLAGS test_by_xed.cpp -o test_by_xed ./test_by_xed $XED -64 -ir bin > out.txt -$PYTHON test_by_xed.py cpp.txt out.txt +$PYTHON test_by_xed.py $TARGET out.txt diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index f0d99db5..552e451e 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7091 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7100 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -2559,6 +2559,18 @@ class CodeGenerator : public CodeArray { Operand::Kind kind = op.isBit(128) ? Operand::XMM : op.isBit(256) ? 
Operand::YMM : Operand::ZMM; opVex(x.copyAndSetKind(kind), &xm0, op, type, code); } + // (x, x, x/m), (x, y, y/m), (y, z, z/m) + void opCvt6(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code) + { + int b1 = x1.getBit(); + int b2 = x2.getBit(); + int b3 = op.getBit(); + if ((b1 == 128 && (b2 == 128 || b2 == 256) && (b2 == b3 || op.isMEM())) || (b1 == 256 && b2 == 512 && (b3 == b2 || op.isMEM()))) { + opVex(x1, &x2, op, type, code); + return; + } + XBYAK_THROW(ERR_BAD_COMBINATION); + } const Xmm& cvtIdx0(const Operand& x) const { return x.isZMM() ? zm0 : x.isYMM() ? ym0 : xm0; @@ -2649,21 +2661,21 @@ class CodeGenerator : public CodeArray { if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); } - int orEvexIf(PreferredEncoding encoding) { + int orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { if (encoding == DefaultEncoding) { - encoding = defaultEncoding_; + encoding = defaultEncoding_[sel]; } if (encoding == EvexEncoding) { #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return T_MUST_EVEX; + return T_MUST_EVEX | typeEvex; } - return 0; + return typeVex; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -2833,7 +2845,7 @@ class CodeGenerator : public CodeArray { #endif private: bool isDefaultJmpNEAR_; - PreferredEncoding defaultEncoding_; + PreferredEncoding defaultEncoding_[2]; // 0:vnni, 1:vmpsadbw public: void L(const std::string& label) { 
labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -3119,8 +3131,9 @@ class CodeGenerator : public CodeArray { , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) - , defaultEncoding_(EvexEncoding) { + // select avx512-vnni, vmpsadbw(avx) + setDefaultEncoding(); labelMgr_.set(this); } void reset() @@ -3157,8 +3170,11 @@ class CodeGenerator : public CodeArray { #undef jnl #endif - // set default encoding to select Vex or Evex - void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } + // set default encoding + // vnniEnc : control AVX512_VNNI (evex:default) or AVX-VNNI (vex) + // avx10Enc : control mpsadbw, AVX-VNNI-INT8 (vex:default) or AVX10.2 (evex) + void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding avx10Enc = VexEncoding) + { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = avx10Enc; } void sha1msg12(const Xmm& x, const Operand& op) { diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 8316bd92..0397ffdc 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.09.1"; } +const char *getVersionString() const { return "7.10"; } void aadd(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } @@ -1213,7 +1213,6 @@ void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3| void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38|T_W0|T_YMM, 
0xB0); } -void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32|orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -1370,7 +1369,6 @@ void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x10); } void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x11); } void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x10); } -void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x42, imm); } void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59); } void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59); } void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59); } @@ -1421,22 +1419,10 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, 
T_66|T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); } -void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); } -void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); } void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); } void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } -void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x51); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } -void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, 
T_66|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD3); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); } @@ -1468,8 +1454,6 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, 
T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB5, encoding); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_YMM|T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM|T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -2047,6 +2031,7 @@ void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0x9B); } void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); } void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0xAB); } +void vaddnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x58); } void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58); } void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58); } void valignd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x03, imm); } @@ -2175,6 +2160,7 @@ void vcmpordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, void vcmpordps(const Opmask& k, const Xmm& x, const Operand& op) { 
vcmpps(k, x, op, 7); } void vcmpordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 7); } void vcmpordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 7); } +void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); } void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0xC2, imm); } void vcmpph(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0xC2, imm); } void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0xC2, imm); } @@ -2197,11 +2183,30 @@ void vcmpunordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x void vcmpunordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 3); } void vcmpunordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 3); } void vcmpunordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 3); } -void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); } +void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); } +void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); } +void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxsh(const Xmm& x, const Operand& 
op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); } +void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtbiasph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x5B); } +void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); } +void vcvtne2ph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } +void 
vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtneph2hf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } +void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW1|T_ER_Z|T_MUST_EVEX|T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } @@ -2258,9 +2263,11 @@ void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2 void vcvtuw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vcvtw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x42, imm); } +void vdivnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5E); } void vdivph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5E); } void vdivsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5E); } void 
vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52); } +void vdpphps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x52); } void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); } void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); } void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x88); } @@ -2279,38 +2286,51 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); } void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } +void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); } +void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); } void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, 
T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); } void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); } +void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); } void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); } void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); } void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); } void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); } +void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); } +void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 
0xAA); } void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); } +void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); } void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); } void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); } void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); } void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); } +void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); } +void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213sh(const Xmm& x1, const 
Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); } +void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); } +void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); } +void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); } +void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); } +void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, 
op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); } void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } void vfpclassph(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B16, 0x66, imm); } void vfpclassps(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } @@ -2329,12 +2349,14 @@ void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8|T_66|T_0 void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); } void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); } void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); } +void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); } void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); } void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { 
opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } +void vgetmantpbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantpd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x26, imm); } void vgetmantph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x26, imm); } @@ -2349,8 +2371,10 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3A, imm); } void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x38, imm); } void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3A, imm); } +void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); } void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); } void vmaxsh(const Xmm& xmm, const 
Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); } +void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); } void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); } void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); } void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); } @@ -2371,6 +2395,8 @@ void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } +void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); } +void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); } void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); 
} void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); } @@ -2413,6 +2439,18 @@ void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x63); } void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); } +void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = 
DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } @@ -2437,6 +2475,8 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_6 void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x44); } 
+void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB5); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x39); } @@ -2530,14 +2570,17 @@ void vrcp28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_ void vrcp28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCA); } void vrcp28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCB); } void vrcp28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCB); } +void vrcppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4D); } +void vreducenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x56, imm); } void vreducepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x56, imm); } void vreduceph(const Xmm& x, const Operand& op, uint8_t imm) { 
opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x56, imm); } void vreduceps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x56, imm); } void vreducesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducesh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducess(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } +void vrndscalenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscalepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x09, imm); } void vrndscaleph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscaleps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x08, imm); } @@ -2552,8 +2595,11 @@ void vrsqrt28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | void vrsqrt28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCC); } void vrsqrt28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCD); } void vrsqrt28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCD); } +void vrsqrtpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtph(const Xmm& 
x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4F); } +void vscalefpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); } void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x2C); } void vscalefph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x2C); } void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x2C); } @@ -2576,11 +2622,16 @@ void vshuff32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm); } void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm); } void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x43, imm); } +void vsqrtnepbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x51); } void vsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x51); } void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_F3|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x51); } +void vsubnepbf16(const Xmm& x1, const 
Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); } void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); } void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); } -void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); } +void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F3|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F2|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } #ifdef XBYAK64 void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); }