// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // ---------------------------------------------------------------------------- // Square z := x^2 // Input x[n]; output z[k] // // extern void bignum_sqr // (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); // // Does the "z := x^2" operation where x is n digits and result z is k. // Truncates the result in general unless k >= 2 * n // // Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x // Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x // ---------------------------------------------------------------------------- #include "s2n_bignum_internal.h" .intel_syntax noprefix S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr) S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr) .text // First three are where arguments come in, but n is moved. #define p rdi #define z rsi #define x rcx #define n r8 // These are always local scratch since multiplier result is in these #define a rax #define d rdx // Other variables #define i rbx #define ll rbp #define hh r9 #define k r10 #define y r11 #define htop r12 #define l r13 #define h r14 #define c r15 // Short versions #define llshort ebp S2N_BN_SYMBOL(bignum_sqr): _CET_ENDBR #if WINDOWS_ABI push rdi push rsi mov rdi, rcx mov rsi, rdx mov rdx, r8 mov rcx, r9 #endif // We use too many registers, and also we need rax:rdx for multiplications push rbx push rbp push r12 push r13 push r14 push r15 mov n, rdx // If p = 0 the result is trivial and nothing needs doing test p, p jz end // initialize (hh,ll) = 0 xor llshort, llshort xor hh, hh // Iterate outer loop from k = 0 ... k = p - 1 producing result digits xor k, k outerloop: // First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n // We want to accumulate all x[i] * x[k - i] for bot <= i < top // For the optimization of squaring we avoid duplication and do // 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n // Initialize i = bot; in fact just compute bot as i directly. xor c, c lea i, [k+1] mov htop, i shr htop, 1 sub i, n cmovc i, c cmp htop, n cmovnc htop, n // Initialize the three-part local sum (c,h,l); c was already done above xor l, l xor h, h // If htop <= bot then main doubled part of the sum is empty cmp i, htop jnc nosumming // Use a moving pointer for [y] = x[k-i] for the cofactor mov a, k sub a, i lea y, [x+8*a] // Do the main part of the sum x[i] * x[k - i] for 2 * i < k innerloop: mov a, [x+8*i] mul QWORD PTR [y] add l, a adc h, d adc c, 0 sub y, 8 inc i cmp i, htop jc innerloop // Now double it add l, l adc h, h adc c, c // If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term nosumming: test k, 1 jnz innerend cmp i, n jnc innerend mov a, [x+8*i] mul a add l, a adc h, d adc c, 0 // Now add the local sum into the global sum, store and shift innerend: add l, ll mov [z+8*k], l adc h, hh mov ll, h adc c, 0 mov hh, c inc k cmp k, p jc outerloop // Restore registers and return end: pop r15 pop r14 pop r13 pop r12 pop rbp pop rbx #if WINDOWS_ABI pop rsi pop rdi #endif ret #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif