// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Multiply z := x * y
// Inputs x[8], y[8]; output z[16]
//
//    extern void bignum_mul_8_16_alt
//      (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
//
// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
//
// Method: straight-line column-by-column multiplication. Result term k is
// the sum of every product of an x word and a y word whose word indices add
// up to k, accumulated on top of the carry left over from term k-1. The
// running sum lives in a 3-word register window; once a term is complete
// its low word is stored to the output and the window roles rotate by one.
//
// Registers written: rax, rcx, rdx, r8, r9, r10 and the flags. With
// WINDOWS_ABI, rdi and rsi are also written but are saved and restored
// on the stack; otherwise the stack is not used.
//
// NOTE(review): output words are stored while later input words are still
// to be read, so the output buffer presumably must not overlap either
// input buffer. Confirm against the library documentation.
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt)
        .text

// Output pointer and first input pointer: these stay in the registers
// where the standard ABI delivers them.

#define z rdi
#define x rsi

// Second input pointer. It arrives in rdx but is moved to rcx, because
// every mul instruction overwrites rdx with the high half of its product.

#define y rcx

// Other variables used as a rotating 3-word window to add terms to

#define t0 r8
#define t1 r9
#define t2 r10

// Macro for the key "multiply and add to (c,h,l)" step: the 128-bit product
// of the two memory operands is added into the 3-word accumulator, with
// l the low word, h the middle word and c the top (carry) word.
// Overwrites rax, rdx and the flags.

#define combadd(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, 0

// A minutely shorter form for when c = 0 initially: adding c to itself
// with carry then leaves just the carry bit in c, with no immediate
// operand needed. Only valid while c is still zero, so it is used for
// the first product after the top word has been cleared.

#define combadz(c,h,l,numa,numb)                \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx;                         \
        adc     c, c

// A short form with no top word, for places where no carry out of h is
// expected. Any carry out of h would be silently lost.

#define combads(h,l,numa,numb)                  \
        mov     rax, numa;                      \
        mul     QWORD PTR numb;                 \
        add     l, rax;                         \
        adc     h, rdx

S2N_BN_SYMBOL(bignum_mul_8_16_alt):
        _CET_ENDBR

// With the Microsoft x64 ABI, save rdi and rsi (restored before returning)
// and shuffle the three arguments into the standard ABI registers so that
// the rest of the code is common to both conventions.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
#endif

// Copy y into a safe register to start with

        mov     y, rdx

// Result term 0: a single product. Its low half is output word 0 and its
// high half seeds the window. t1 is cleared to act as the next word up.

        mov     rax, [x]
        mul     QWORD PTR [y]

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1: window is (t2,t1,t0). The first product uses the short
// form: t1 is zero beforehand, so adding a product high word plus a carry
// bit cannot overflow it.

        xor     t2, t2
        combads(t1,t0,[x],[y+8])
        combadz(t2,t1,t0,[x+8],[y])
        mov     [z+8], t0

// Result term 2: window is (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+16])
        combadd(t0,t2,t1,[x+8],[y+8])
        combadd(t0,t2,t1,[x+16],[y])
        mov     [z+16], t1

// Result term 3: window is (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+24])
        combadd(t1,t0,t2,[x+8],[y+16])
        combadd(t1,t0,t2,[x+16],[y+8])
        combadd(t1,t0,t2,[x+24],[y])
        mov     [z+24], t2

// Result term 4: window is (t2,t1,t0)

        xor     t2, t2
        combadz(t2,t1,t0,[x],[y+32])
        combadd(t2,t1,t0,[x+8],[y+24])
        combadd(t2,t1,t0,[x+16],[y+16])
        combadd(t2,t1,t0,[x+24],[y+8])
        combadd(t2,t1,t0,[x+32],[y])
        mov     [z+32], t0

// Result term 5: window is (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+40])
        combadd(t0,t2,t1,[x+8],[y+32])
        combadd(t0,t2,t1,[x+16],[y+24])
        combadd(t0,t2,t1,[x+24],[y+16])
        combadd(t0,t2,t1,[x+32],[y+8])
        combadd(t0,t2,t1,[x+40],[y])
        mov     [z+40], t1

// Result term 6: window is (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+48])
        combadd(t1,t0,t2,[x+8],[y+40])
        combadd(t1,t0,t2,[x+16],[y+32])
        combadd(t1,t0,t2,[x+24],[y+24])
        combadd(t1,t0,t2,[x+32],[y+16])
        combadd(t1,t0,t2,[x+40],[y+8])
        combadd(t1,t0,t2,[x+48],[y])
        mov     [z+48], t2

// Result term 7: window is (t2,t1,t0). This is the longest column, with
// all eight words of each input taking part.

        xor     t2, t2
        combadz(t2,t1,t0,[x],[y+56])
        combadd(t2,t1,t0,[x+8],[y+48])
        combadd(t2,t1,t0,[x+16],[y+40])
        combadd(t2,t1,t0,[x+24],[y+32])
        combadd(t2,t1,t0,[x+32],[y+24])
        combadd(t2,t1,t0,[x+40],[y+16])
        combadd(t2,t1,t0,[x+48],[y+8])
        combadd(t2,t1,t0,[x+56],[y])
        mov     [z+56], t0

// Result term 8: window is (t0,t2,t1). From here on the columns shrink
// by one product per term.

        xor     t0, t0
        combadz(t0,t2,t1,[x+8],[y+56])
        combadd(t0,t2,t1,[x+16],[y+48])
        combadd(t0,t2,t1,[x+24],[y+40])
        combadd(t0,t2,t1,[x+32],[y+32])
        combadd(t0,t2,t1,[x+40],[y+24])
        combadd(t0,t2,t1,[x+48],[y+16])
        combadd(t0,t2,t1,[x+56],[y+8])
        mov     [z+64], t1

// Result term 9: window is (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x+16],[y+56])
        combadd(t1,t0,t2,[x+24],[y+48])
        combadd(t1,t0,t2,[x+32],[y+40])
        combadd(t1,t0,t2,[x+40],[y+32])
        combadd(t1,t0,t2,[x+48],[y+24])
        combadd(t1,t0,t2,[x+56],[y+16])
        mov     [z+72], t2

// Result term 10: window is (t2,t1,t0)

        xor     t2, t2
        combadz(t2,t1,t0,[x+24],[y+56])
        combadd(t2,t1,t0,[x+32],[y+48])
        combadd(t2,t1,t0,[x+40],[y+40])
        combadd(t2,t1,t0,[x+48],[y+32])
        combadd(t2,t1,t0,[x+56],[y+24])
        mov     [z+80], t0

// Result term 11: window is (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x+32],[y+56])
        combadd(t0,t2,t1,[x+40],[y+48])
        combadd(t0,t2,t1,[x+48],[y+40])
        combadd(t0,t2,t1,[x+56],[y+32])
        mov     [z+88], t1

// Result term 12: window is (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x+40],[y+56])
        combadd(t1,t0,t2,[x+48],[y+48])
        combadd(t1,t0,t2,[x+56],[y+40])
        mov     [z+96], t2

// Result term 13: window is (t2,t1,t0)

        xor     t2, t2
        combadz(t2,t1,t0,[x+48],[y+56])
        combadd(t2,t1,t0,[x+56],[y+48])
        mov     [z+104], t0

// Result term 14: the final product, added into the two remaining window
// words (t2,t1). The short form suffices because the full product of two
// 8-word numbers fits in 16 words, so nothing can carry out of t2.

        combads(t2,t1,[x+56],[y+56])
        mov     [z+112], t1

// Result term 15: the top output word is whatever remains in the window

        mov     [z+120], t2

// Return, first restoring the registers saved on entry in the Windows case
// (popped in the reverse of the order they were pushed)

#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret

// On Linux ELF targets emit an empty GNU-stack note section, the usual
// marker that this object does not need an executable stack

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif