From 3efe2eb9e3dfb49cb110c53e3430caeae4599f52 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 8 May 2024 22:24:27 -0700 Subject: [PATCH] bn: Move dispatching logic from x86_64-mont5.pl to C. CL originally uploaded by Brian Smith at https://boringssl-review.googlesource.com/c/boringssl/+/65569 Bug: 673 Change-Id: If84d34cae1c44cc883fc292dd048542e2b341f41 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68347 Reviewed-by: Bob Beck Auto-Submit: David Benjamin Commit-Queue: Bob Beck --- crypto/fipsmodule/bn/asm/x86_64-mont5.pl | 91 ++++++++++-------------- crypto/fipsmodule/bn/bn_test.cc | 28 ++++++-- crypto/fipsmodule/bn/exponentiation.c | 46 +++++++++++- crypto/fipsmodule/bn/internal.h | 56 +++++++++------ gen/bcm/x86_64-mont5-apple.S | 62 ++++++++-------- gen/bcm/x86_64-mont5-linux.S | 65 ++++++++--------- gen/bcm/x86_64-mont5-win.asm | 73 ++++++++++--------- 7 files changed, 241 insertions(+), 180 deletions(-) diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl index 88d98af298..a944739fe7 100755 --- a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl +++ b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl @@ -50,7 +50,7 @@ # output, so this isn't useful anyway. $addx = 1; -# int bn_mul_mont_gather5( +# int bn_mul_mont_gather5_nohw( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, $bp="%rdx"; # const BN_ULONG *bp, @@ -72,29 +72,17 @@ $code=<<___; .text -.extern OPENSSL_ia32cap_P - -.globl bn_mul_mont_gather5 -.type bn_mul_mont_gather5,\@function,6 +.globl bn_mul_mont_gather5_nohw +.type bn_mul_mont_gather5_nohw,\@function,6 .align 64 -bn_mul_mont_gather5: +bn_mul_mont_gather5_nohw: .cfi_startproc _CET_ENDBR + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. Zero the upper half to normalize it. 
mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax - test \$7,${num}d - jnz .Lmul_enter -___ -$code.=<<___ if ($addx); - leaq OPENSSL_ia32cap_P(%rip),%r11 - mov 8(%r11),%r11d -___ -$code.=<<___; - jmp .Lmul4x_enter - -.align 16 -.Lmul_enter: movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument push %rbx .cfi_push %rbx @@ -454,27 +442,21 @@ .Lmul_epilogue: ret .cfi_endproc -.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +.size bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw ___ {{{ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; +.globl bn_mul4x_mont_gather5 .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: .cfi_startproc + _CET_ENDBR .byte 0x67 mov %rsp,%rax .cfi_def_cfa_register %rax -.Lmul4x_enter: -___ -$code.=<<___ if ($addx); - and \$0x80108,%r11d - cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 - je .Lmulx4x_enter -___ -$code.=<<___; push %rbx .cfi_push %rbx push %rbp @@ -490,6 +472,9 @@ .Lmul4x_prologue: .byte 0x67 + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. 
shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num @@ -1079,7 +1064,7 @@ }}} {{{ ###################################################################### -# void bn_power5( +# void bn_power5_nohw( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const BN_ULONG *table, @@ -1094,23 +1079,14 @@ my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; -.globl bn_power5 -.type bn_power5,\@function,6 +.globl bn_power5_nohw +.type bn_power5_nohw,\@function,6 .align 32 -bn_power5: +bn_power5_nohw: .cfi_startproc _CET_ENDBR mov %rsp,%rax .cfi_def_cfa_register %rax -___ -$code.=<<___ if ($addx); - leaq OPENSSL_ia32cap_P(%rip),%r11 - mov 8(%r11),%r11d - and \$0x80108,%r11d - cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 - je .Lpowerx5_enter -___ -$code.=<<___; push %rbx .cfi_push %rbx push %rbp @@ -1125,6 +1101,9 @@ .cfi_push %r15 .Lpower5_prologue: + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, come before the first access. shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num neg $num @@ -1233,7 +1212,7 @@ .Lpower5_epilogue: ret .cfi_endproc -.size bn_power5,.-bn_power5 +.size bn_power5_nohw,.-bn_power5_nohw .globl bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -2108,13 +2087,14 @@ my $bp="%rdx"; # restore original value $code.=<<___; +.globl bn_mulx4x_mont_gather5 .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: .cfi_startproc + _CET_ENDBR mov %rsp,%rax .cfi_def_cfa_register %rax -.Lmulx4x_enter: push %rbx .cfi_push %rbx push %rbp @@ -2129,6 +2109,9 @@ .cfi_push %r15 .Lmulx4x_prologue: + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. 
shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num @@ -2583,7 +2566,7 @@ ___ } { ###################################################################### -# void bn_power5( +# void bn_powerx5( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const BN_ULONG *table, @@ -2598,13 +2581,14 @@ my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; +.globl bn_powerx5 .type bn_powerx5,\@function,6 .align 32 bn_powerx5: .cfi_startproc + _CET_ENDBR mov %rsp,%rax .cfi_def_cfa_register %rax -.Lpowerx5_enter: push %rbx .cfi_push %rbx push %rbp @@ -2619,6 +2603,9 @@ .cfi_push %r15 .Lpowerx5_prologue: + # num is declared as an int, a 32-bit parameter, so the upper half is + # undefined. It is important that this write to ${num}, which zeros the + # upper half, predates the first access. shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num @@ -3705,17 +3692,17 @@ .section .pdata .align 4 - .rva .LSEH_begin_bn_mul_mont_gather5 - .rva .LSEH_end_bn_mul_mont_gather5 - .rva .LSEH_info_bn_mul_mont_gather5 + .rva .LSEH_begin_bn_mul_mont_gather5_nohw + .rva .LSEH_end_bn_mul_mont_gather5_nohw + .rva .LSEH_info_bn_mul_mont_gather5_nohw .rva .LSEH_begin_bn_mul4x_mont_gather5 .rva .LSEH_end_bn_mul4x_mont_gather5 .rva .LSEH_info_bn_mul4x_mont_gather5 - .rva .LSEH_begin_bn_power5 - .rva .LSEH_end_bn_power5 - .rva .LSEH_info_bn_power5 + .rva .LSEH_begin_bn_power5_nohw + .rva .LSEH_end_bn_power5_nohw + .rva .LSEH_info_bn_power5_nohw ___ $code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont_gather5 @@ -3733,7 +3720,7 @@ .section .xdata .align 8 -.LSEH_info_bn_mul_mont_gather5: +.LSEH_info_bn_mul_mont_gather5_nohw: .byte 9,0,0,0 .rva mul_handler .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] @@ -3743,7 +3730,7 @@ .rva mul_handler .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 -.LSEH_info_bn_power5: +.LSEH_info_bn_power5_nohw: 
.byte 9,0,0,0 .rva mul_handler .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc index fcc59e00f6..710b60f8c5 100644 --- a/crypto/fipsmodule/bn/bn_test.cc +++ b/crypto/fipsmodule/bn/bn_test.cc @@ -2946,17 +2946,35 @@ TEST_F(BNTest, BNMulMont5ABI) { } CHECK_ABI(bn_gather5, r.data(), words, table.data(), 13); - CHECK_ABI(bn_mul_mont_gather5, r.data(), r.data(), table.data(), m->d, + if (bn_mulx4x_mont_gather5_capable(words)) { + CHECK_ABI(bn_mulx4x_mont_gather5, r.data(), r.data(), table.data(), m->d, + mont->n0, words, 13); + CHECK_ABI(bn_mulx4x_mont_gather5, r.data(), a.data(), table.data(), m->d, + mont->n0, words, 13); + } + if (bn_mul4x_mont_gather5_capable(words)) { + CHECK_ABI(bn_mul4x_mont_gather5, r.data(), r.data(), table.data(), m->d, + mont->n0, words, 13); + CHECK_ABI(bn_mul4x_mont_gather5, r.data(), a.data(), table.data(), m->d, + mont->n0, words, 13); + } + CHECK_ABI(bn_mul_mont_gather5_nohw, r.data(), r.data(), table.data(), m->d, mont->n0, words, 13); - CHECK_ABI(bn_mul_mont_gather5, r.data(), a.data(), table.data(), m->d, + CHECK_ABI(bn_mul_mont_gather5_nohw, r.data(), a.data(), table.data(), m->d, mont->n0, words, 13); - if (words % 8 == 0) { - CHECK_ABI(bn_power5, r.data(), r.data(), table.data(), m->d, mont->n0, + if (bn_powerx5_capable(words)) { + CHECK_ABI(bn_powerx5, r.data(), r.data(), table.data(), m->d, mont->n0, words, 13); - CHECK_ABI(bn_power5, r.data(), a.data(), table.data(), m->d, mont->n0, + CHECK_ABI(bn_powerx5, r.data(), a.data(), table.data(), m->d, mont->n0, words, 13); } + if (bn_power5_capable(words)) { + CHECK_ABI(bn_power5_nohw, r.data(), r.data(), table.data(), m->d, + mont->n0, words, 13); + CHECK_ABI(bn_power5_nohw, r.data(), a.data(), table.data(), m->d, + mont->n0, words, 13); + } } } #endif // OPENSSL_BN_ASM_MONT5 && SUPPORTS_ABI_TEST diff --git a/crypto/fipsmodule/bn/exponentiation.c 
b/crypto/fipsmodule/bn/exponentiation.c index 53c6142bd7..9030aa8cbe 100644 --- a/crypto/fipsmodule/bn/exponentiation.c +++ b/crypto/fipsmodule/bn/exponentiation.c @@ -119,6 +119,50 @@ #include "internal.h" #include "rsaz_exp.h" +#if defined(OPENSSL_BN_ASM_MONT5) + +// bn_mul_mont_gather5 loads index |power| of |table|, multiplies it +// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| +// words long and represented in Montgomery form. |n0| is a pointer to the +// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least +// 16 bytes. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. +// However, even if they are fully reduced, the output may not be. +static void bn_mul_mont_gather5( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power) { + if (bn_mulx4x_mont_gather5_capable(num)) { + bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else if (bn_mul4x_mont_gather5_capable(num)) { + bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else { + bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power); + } +} + +// bn_power5 squares |ap| five times and multiplies it by the value stored at +// index |power| of |table|, modulo |np|. It stores the result in |rp|. The +// values are |num| words long and represented in Montgomery form. |n0| is a +// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible +// by 8. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. +// However, even if they are fully reduced, the output may not be. 
+static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, + int power) { + assert(bn_power5_capable(num)); + if (bn_powerx5_capable(num)) { + bn_powerx5(rp, ap, table, np, n0, num, power); + } else { + bn_power5_nohw(rp, ap, table, np, n0, num, power); + } +} + +#endif // defined(OPENSSL_BN_ASM_MONT5) int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { int i, bits, ret = 0; @@ -1079,7 +1123,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, // Scan the exponent one window at a time starting from the most // significant bits. - if (top & 7) { + if (!bn_power5_capable(top)) { while (bits >= 0) { for (wvalue = 0, i = 0; i < 5; i++, bits--) { wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h index 02711603ea..679e249ce9 100644 --- a/crypto/fipsmodule/bn/internal.h +++ b/crypto/fipsmodule/bn/internal.h @@ -438,18 +438,26 @@ int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) #define OPENSSL_BN_ASM_MONT5 -// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it -// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| -// words long and represented in Montgomery form. |n0| is a pointer to the -// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least -// 16 bytes. |power| must be less than 32 and is treated as secret. -// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. -// However, even if they are fully reduced, the output may not be. 
-void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, - const BN_ULONG *table, const BN_ULONG *np, - const BN_ULONG *n0, int num, int power); +// The following functions implement |bn_mul_mont_gather5|. See +// |bn_mul_mont_gather5| for details. +OPENSSL_INLINE int bn_mul4x_mont_gather5_capable(int num) { + return (num & 7) == 0; +} +void bn_mul4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + +OPENSSL_INLINE int bn_mulx4x_mont_gather5_capable(int num) { + return bn_mul4x_mont_gather5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +void bn_mulx4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); + +void bn_mul_mont_gather5_nohw(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power); // bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of // |table| are |num| words long. |power| must be less than 32 and is treated as @@ -463,17 +471,19 @@ void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table, // is treated as secret. |table| must be aligned to at least 16 bytes. void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power); -// bn_power5 squares |ap| five times and multiplies it by the value stored at -// index |power| of |table|, modulo |np|. It stores the result in |rp|. The -// values are |num| words long and represented in Montgomery form. |n0| is a -// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible -// by 8. |power| must be less than 32 and is treated as secret. -// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. 
-// However, even if they are fully reduced, the output may not be. -void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, - const BN_ULONG *np, const BN_ULONG *n0, int num, int power); +// The following functions implement |bn_power5|. See |bn_power5| for details. +void bn_power5_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, int power); + +OPENSSL_INLINE int bn_power5_capable(int num) { return (num & 7) == 0; } + +OPENSSL_INLINE int bn_powerx5_capable(int num) { + return bn_power5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +void bn_powerx5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, int power); + #endif // !OPENSSL_NO_ASM && OPENSSL_X86_64 uint64_t bn_mont_n0(const BIGNUM *n); diff --git a/gen/bcm/x86_64-mont5-apple.S b/gen/bcm/x86_64-mont5-apple.S index bd63d91c8a..5cf770f1d5 100644 --- a/gen/bcm/x86_64-mont5-apple.S +++ b/gen/bcm/x86_64-mont5-apple.S @@ -6,26 +6,18 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - - -.globl _bn_mul_mont_gather5 -.private_extern _bn_mul_mont_gather5 +.globl _bn_mul_mont_gather5_nohw +.private_extern _bn_mul_mont_gather5_nohw .p2align 6 -_bn_mul_mont_gather5: +_bn_mul_mont_gather5_nohw: _CET_ENDBR + + movl %r9d,%r9d movq %rsp,%rax - testl $7,%r9d - jnz L$mul_enter - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp L$mul4x_enter - -.p2align 4 -L$mul_enter: movd 8(%rsp),%xmm5 pushq %rbx @@ -452,17 +444,16 @@ L$mul_epilogue: ret +.globl _bn_mul4x_mont_gather5 +.private_extern _bn_mul4x_mont_gather5 .p2align 5 -bn_mul4x_mont_gather5: +_bn_mul4x_mont_gather5: +_CET_ENDBR .byte 0x67 movq %rsp,%rax -L$mul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$mulx4x_enter pushq %rbx pushq %rbp @@ -478,6 +469,9 @@ L$mul4x_enter: L$mul4x_prologue: .byte 0x67 + + + shll $3,%r9d 
leaq (%r9,%r9,2),%r10 negq %r9 @@ -1087,20 +1081,15 @@ L$inner4x: jmp L$sqr4x_sub_entry -.globl _bn_power5 -.private_extern _bn_power5 +.globl _bn_power5_nohw +.private_extern _bn_power5_nohw .p2align 5 -_bn_power5: +_bn_power5_nohw: _CET_ENDBR movq %rsp,%rax - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$powerx5_enter pushq %rbx pushq %rbp @@ -1115,6 +1104,9 @@ _CET_ENDBR L$power5_prologue: + + + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -2066,13 +2058,15 @@ L$sqr4x_sub_entry: ret +.globl _bn_mulx4x_mont_gather5 +.private_extern _bn_mulx4x_mont_gather5 .p2align 5 -bn_mulx4x_mont_gather5: +_bn_mulx4x_mont_gather5: +_CET_ENDBR movq %rsp,%rax -L$mulx4x_enter: pushq %rbx pushq %rbp @@ -2087,6 +2081,9 @@ L$mulx4x_enter: L$mulx4x_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -2603,13 +2600,15 @@ L$mulx4x_inner: jmp L$sqrx4x_sub_entry +.globl _bn_powerx5 +.private_extern _bn_powerx5 .p2align 5 -bn_powerx5: +_bn_powerx5: +_CET_ENDBR movq %rsp,%rax -L$powerx5_enter: pushq %rbx pushq %rbp @@ -2624,6 +2623,9 @@ L$powerx5_enter: L$powerx5_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 diff --git a/gen/bcm/x86_64-mont5-linux.S b/gen/bcm/x86_64-mont5-linux.S index 14ab4f72a4..dcc02fc7fb 100644 --- a/gen/bcm/x86_64-mont5-linux.S +++ b/gen/bcm/x86_64-mont5-linux.S @@ -6,27 +6,18 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl bn_mul_mont_gather5 -.hidden bn_mul_mont_gather5 -.type bn_mul_mont_gather5,@function +.globl bn_mul_mont_gather5_nohw +.hidden bn_mul_mont_gather5_nohw +.type bn_mul_mont_gather5_nohw,@function .align 64 -bn_mul_mont_gather5: +bn_mul_mont_gather5_nohw: .cfi_startproc _CET_ENDBR + + movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax - testl $7,%r9d - jnz .Lmul_enter - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp .Lmul4x_enter - -.align 16 
-.Lmul_enter: movd 8(%rsp),%xmm5 pushq %rbx .cfi_offset %rbx,-16 @@ -452,18 +443,17 @@ _CET_ENDBR .Lmul_epilogue: ret .cfi_endproc -.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +.size bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw +.globl bn_mul4x_mont_gather5 +.hidden bn_mul4x_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: .cfi_startproc +_CET_ENDBR .byte 0x67 movq %rsp,%rax .cfi_def_cfa_register %rax -.Lmul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -479,6 +469,9 @@ bn_mul4x_mont_gather5: .Lmul4x_prologue: .byte 0x67 + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -1088,20 +1081,15 @@ mul4x_internal: jmp .Lsqr4x_sub_entry .cfi_endproc .size mul4x_internal,.-mul4x_internal -.globl bn_power5 -.hidden bn_power5 -.type bn_power5,@function +.globl bn_power5_nohw +.hidden bn_power5_nohw +.type bn_power5_nohw,@function .align 32 -bn_power5: +bn_power5_nohw: .cfi_startproc _CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1116,6 +1104,9 @@ _CET_ENDBR .cfi_offset %r15,-56 .Lpower5_prologue: + + + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -1224,7 +1215,7 @@ _CET_ENDBR .Lpower5_epilogue: ret .cfi_endproc -.size bn_power5,.-bn_power5 +.size bn_power5_nohw,.-bn_power5_nohw .globl bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -2067,13 +2058,15 @@ __bn_post4x_internal: ret .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal +.globl bn_mulx4x_mont_gather5 +.hidden bn_mulx4x_mont_gather5 .type bn_mulx4x_mont_gather5,@function .align 32 bn_mulx4x_mont_gather5: .cfi_startproc +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax -.Lmulx4x_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2088,6 +2081,9 @@ bn_mulx4x_mont_gather5: .cfi_offset %r15,-56 .Lmulx4x_prologue: 
+ + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -2604,13 +2600,15 @@ mulx4x_internal: jmp .Lsqrx4x_sub_entry .cfi_endproc .size mulx4x_internal,.-mulx4x_internal +.globl bn_powerx5 +.hidden bn_powerx5 .type bn_powerx5,@function .align 32 bn_powerx5: .cfi_startproc +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax -.Lpowerx5_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2625,6 +2623,9 @@ bn_powerx5: .cfi_offset %r15,-56 .Lpowerx5_prologue: + + + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 diff --git a/gen/bcm/x86_64-mont5-win.asm b/gen/bcm/x86_64-mont5-win.asm index 46aae51739..3b12405d0d 100644 --- a/gen/bcm/x86_64-mont5-win.asm +++ b/gen/bcm/x86_64-mont5-win.asm @@ -14,16 +14,14 @@ default rel section .text code align=64 -EXTERN OPENSSL_ia32cap_P - -global bn_mul_mont_gather5 +global bn_mul_mont_gather5_nohw ALIGN 64 -bn_mul_mont_gather5: +bn_mul_mont_gather5_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_bn_mul_mont_gather5: +$L$SEH_begin_bn_mul_mont_gather5_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -34,17 +32,11 @@ $L$SEH_begin_bn_mul_mont_gather5: _CET_ENDBR + + mov r9d,r9d mov rax,rsp - test r9d,7 - jnz NEAR $L$mul_enter - lea r11,[OPENSSL_ia32cap_P] - mov r11d,DWORD[8+r11] - jmp NEAR $L$mul4x_enter - -ALIGN 16 -$L$mul_enter: movd xmm5,DWORD[56+rsp] push rbx @@ -472,7 +464,8 @@ $L$mul_epilogue: mov rsi,QWORD[16+rsp] ret -$L$SEH_end_bn_mul_mont_gather5: +$L$SEH_end_bn_mul_mont_gather5_nohw: +global bn_mul4x_mont_gather5 ALIGN 32 bn_mul4x_mont_gather5: @@ -489,13 +482,10 @@ $L$SEH_begin_bn_mul4x_mont_gather5: +_CET_ENDBR DB 0x67 mov rax,rsp -$L$mul4x_enter: - and r11d,0x80108 - cmp r11d,0x80108 - je NEAR $L$mulx4x_enter push rbx push rbp @@ -511,6 +501,9 @@ $L$mul4x_enter: $L$mul4x_prologue: DB 0x67 + + + shl r9d,3 lea r10,[r9*2+r9] neg r9 @@ -1122,14 +1115,14 @@ $L$inner4x: jmp NEAR $L$sqr4x_sub_entry -global bn_power5 +global bn_power5_nohw ALIGN 32 -bn_power5: +bn_power5_nohw: mov QWORD[8+rsp],rdi 
;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_bn_power5: +$L$SEH_begin_bn_power5_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1142,11 +1135,6 @@ $L$SEH_begin_bn_power5: _CET_ENDBR mov rax,rsp - lea r11,[OPENSSL_ia32cap_P] - mov r11d,DWORD[8+r11] - and r11d,0x80108 - cmp r11d,0x80108 - je NEAR $L$powerx5_enter push rbx push rbp @@ -1161,6 +1149,9 @@ _CET_ENDBR $L$power5_prologue: + + + shl r9d,3 lea r10d,[r9*2+r9] neg r9 @@ -1271,7 +1262,7 @@ $L$power5_epilogue: mov rsi,QWORD[16+rsp] ret -$L$SEH_end_bn_power5: +$L$SEH_end_bn_power5_nohw: global bn_sqr8x_internal @@ -2113,6 +2104,7 @@ $L$sqr4x_sub_entry: ret +global bn_mulx4x_mont_gather5 ALIGN 32 bn_mulx4x_mont_gather5: @@ -2129,9 +2121,9 @@ $L$SEH_begin_bn_mulx4x_mont_gather5: +_CET_ENDBR mov rax,rsp -$L$mulx4x_enter: push rbx push rbp @@ -2146,6 +2138,9 @@ $L$mulx4x_enter: $L$mulx4x_prologue: + + + shl r9d,3 lea r10,[r9*2+r9] neg r9 @@ -2664,6 +2659,7 @@ $L$mulx4x_inner: jmp NEAR $L$sqrx4x_sub_entry +global bn_powerx5 ALIGN 32 bn_powerx5: @@ -2680,9 +2676,9 @@ $L$SEH_begin_bn_powerx5: +_CET_ENDBR mov rax,rsp -$L$powerx5_enter: push rbx push rbp @@ -2697,6 +2693,9 @@ $L$powerx5_enter: $L$powerx5_prologue: + + + shl r9d,3 lea r10,[r9*2+r9] neg r9 @@ -3804,17 +3803,17 @@ $L$common_seh_tail: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase - DD $L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase - DD $L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase + DD $L$SEH_begin_bn_mul_mont_gather5_nohw wrt ..imagebase + DD $L$SEH_end_bn_mul_mont_gather5_nohw wrt ..imagebase + DD $L$SEH_info_bn_mul_mont_gather5_nohw wrt ..imagebase DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase - DD $L$SEH_begin_bn_power5 wrt ..imagebase - DD $L$SEH_end_bn_power5 wrt ..imagebase - DD $L$SEH_info_bn_power5 wrt ..imagebase + DD $L$SEH_begin_bn_power5_nohw wrt ..imagebase + DD 
$L$SEH_end_bn_power5_nohw wrt ..imagebase + DD $L$SEH_info_bn_power5_nohw wrt ..imagebase DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase @@ -3828,7 +3827,7 @@ ALIGN 4 section .xdata rdata align=8 ALIGN 8 -$L$SEH_info_bn_mul_mont_gather5: +$L$SEH_info_bn_mul_mont_gather5_nohw: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase @@ -3838,7 +3837,7 @@ $L$SEH_info_bn_mul4x_mont_gather5: DD mul_handler wrt ..imagebase DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase ALIGN 8 -$L$SEH_info_bn_power5: +$L$SEH_info_bn_power5_nohw: DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase