// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Multiply-add with single-word multiplier, z := z + c * y
// Inputs c, y[n]; outputs function return (carry-out) and z[k]
//
//    extern uint64_t bignum_cmadd
//     (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
//
// Does the "z := z + c * y" operation where y is n digits and the result z
// is p digits; p is the size passed as the argument k in the prototype above.
// Truncates the result in general.
//
// The return value is a high/carry word that is meaningful when p = n + 1, or
// more generally when n <= p and the result fits in p + 1 digits. In these
// cases it gives the top digit of the (p + 1)-digit result.
//
// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd)
        .text

// Arguments as they sit in registers once any ABI shuffle is done. The
// number of digits in z is called p in the code below; it is the first
// argument, written k in the register lists above.

#define p rdi
#define z rsi
#define n rcx
#define y r8

// Home for the multiplier c, which arrives in rdx; mul writes rdx

#define c r9

// Working state: digit index i, running high word h, and a carry mask
// (0 or all ones) held in rbx, which is saved and restored around its use

#define i r10
#define h r11
#define cmask rbx

// 32-bit views of h and i; writing them clears the full 64-bit register

#define h32 r11d
#define i32 r10d

S2N_BN_SYMBOL(bignum_cmadd):
        _CET_ENDBR

// Under the Microsoft ABI rdi and rsi are callee-saved, so preserve them and
// move the arguments into the standard-ABI registers. The fifth argument is
// at [rsp+40] on entry, which is [rsp+56] after the two pushes.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
        mov     rcx, r9
        mov     r8, [rsp+56]
#endif

// rbx carries the carry mask in the main loop; save it here

        push    rbx

// Clamp n := min(p,n): digits of y at or beyond position p cannot affect a
// p-digit result. Then p := p - n is the number of digits of z above the
// part that receives product terms.

        cmp     p, n
        cmovc   n, p
        sub     p, n

// Start with h = 0 and, when there is nothing to multiply (n = 0),
// go straight to the exit and return that zero with z untouched

        xor     h, h
        test    n, n
        jz      bignum_cmadd_done

// Get c out of rdx before the first mul overwrites it

        mov     c, rdx

// Digit 0: z_0 := low(z_0 + c * y_0), leaving the high product word in h
// and the carry out of the addition in CF. The mov and dec that follow do
// not change CF.

        mov     rax, [y]
        mul     c
        add     [z], rax
        mov     h, rdx
        mov     i32, 1
        dec     n
        jz      bignum_cmadd_foldcarry

// Digits 1..n-1. On entry to each pass, h is the previous high word and CF
// the previous carry. Add both to z_i and keep the carry from that as
// cmask = -CF. After the multiply, subtracting cmask adds that carry to the
// high product word; this cannot wrap because the high word of a 64x64-bit
// product is at most 2^64 - 2. The final add of the low product word sets
// CF for the next pass; inc and dec leave CF alone.

bignum_cmadd_mainloop:
        adc     h, [z+8*i]
        sbb     cmask, cmask
        mov     rax, [y+8*i]
        mul     c
        sub     rdx, cmask
        add     rax, h
        mov     [z+8*i], rax
        mov     h, rdx
        inc     i
        dec     n
        jnz     bignum_cmadd_mainloop

// Absorb the outstanding carry so that h alone is the word still to add

bignum_cmadd_foldcarry:
        adc     h, 0

// If z has no digits above the multiplied part (p = 0), h is the result.
// Otherwise add h at position i and ripple the carry upwards.

bignum_cmadd_tail:
        test    p, p
        jz      bignum_cmadd_done

// h is zeroed with mov, not xor, so that the carry from the add survives

        add     [z+8*i], h
        mov     h32, 0
        inc     i
        dec     p
        jz      bignum_cmadd_tailcarry

// With h = 0 each adc adds just the incoming carry to the next digit

bignum_cmadd_tailloop:
        adc     [z+8*i], h
        inc     i
        dec     p
        jnz     bignum_cmadd_tailloop

// The carry out of the top digit becomes the returned word

bignum_cmadd_tailcarry:
        adc     h, 0

// Hand back the high/carry word and restore the saved registers

bignum_cmadd_done:
        mov     rax, h

        pop     rbx
#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif