/*
 * AES256-GCM, based on the "Intel Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode" paper and reference code, using the aggregated reduction method.
 * Originally adapted by Romain Dolbeau.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "core.h"
#include "crypto_aead_aes256gcm.h"
#include "export.h"
#include "private/common.h"
#include "private/sse2_64_32.h"
#include "randombytes.h"
#include "runtime.h"
#include "utils.h"

#if defined(HAVE_TMMINTRIN_H) && defined(HAVE_WMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("ssse3")
#  pragma GCC target("aes")
#  pragma GCC target("pclmul")
# endif

# include <tmmintrin.h>
# include <wmmintrin.h>

#ifndef ENOSYS
# define ENOSYS ENXIO
#endif

#if defined(__INTEL_COMPILER) || defined(_bswap64)
#elif defined(_MSC_VER)
# define _bswap64(a) _byteswap_uint64(a)
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
# define _bswap64(a) __builtin_bswap64(a)
#else
static inline uint64_t
_bswap64(const uint64_t x)
{
    return
        ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
        ((x << 24) & 0x0000FF0000000000UL) | ((x <<  8) & 0x000000FF00000000UL) |
        ((x >>  8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
        ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
}
#endif

typedef struct context {
    CRYPTO_ALIGN(16) unsigned char H[16];
    __m128i rkeys[16];
} context;

static inline void
aesni_key256_expand(const unsigned char *key, __m128i * const rkeys)
{
    __m128i X0, X1, X2, X3;
    int     i = 0;

    X0 = _mm_loadu_si128((const __m128i *) &key[0]);
    rkeys[i++] = X0;

    X2 = _mm_loadu_si128((const __m128i *) &key[16]);
    rkeys[i++] = X2;

#define EXPAND_KEY_1(S) do { \
    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X2, (S)), 0xff); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x10)); \
    X0 = _mm_xor_si128(X0, X3); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x8c)); \
    X0 = _mm_xor_si128(_mm_xor_si128(X0, X3), X1); \
    rkeys[i++] = X0; \
} while (0)

#define EXPAND_KEY_2(S) do { \
    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X0, (S)), 0xaa); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x10)); \
    X2 = _mm_xor_si128(X2, X3); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x8c)); \
    X2 = _mm_xor_si128(_mm_xor_si128(X2, X3), X1); \
    rkeys[i++] = X2; \
} while (0)

    X3 = _mm_setzero_si128();
    EXPAND_KEY_1(0x01); EXPAND_KEY_2(0x01);
    EXPAND_KEY_1(0x02); EXPAND_KEY_2(0x02);
    EXPAND_KEY_1(0x04); EXPAND_KEY_2(0x04);
    EXPAND_KEY_1(0x08); EXPAND_KEY_2(0x08);
    EXPAND_KEY_1(0x10); EXPAND_KEY_2(0x10);
    EXPAND_KEY_1(0x20); EXPAND_KEY_2(0x20);
    EXPAND_KEY_1(0x40);
}

/** single, by-the-book AES encryption with AES-NI */
static inline void
aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
{
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);

    temp = _mm_aesenc_si128(temp, rkeys[1]);
    temp = _mm_aesenc_si128(temp, rkeys[2]);
    temp = _mm_aesenc_si128(temp, rkeys[3]);
    temp = _mm_aesenc_si128(temp, rkeys[4]);
    temp = _mm_aesenc_si128(temp, rkeys[5]);
    temp = _mm_aesenc_si128(temp, rkeys[6]);
    temp = _mm_aesenc_si128(temp, rkeys[7]);
    temp = _mm_aesenc_si128(temp, rkeys[8]);
    temp = _mm_aesenc_si128(temp, rkeys[9]);
    temp = _mm_aesenc_si128(temp, rkeys[10]);
    temp = _mm_aesenc_si128(temp, rkeys[11]);
    temp = _mm_aesenc_si128(temp, rkeys[12]);
    temp = _mm_aesenc_si128(temp, rkeys[13]);
    temp = _mm_aesenclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *) out, temp);
}
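/* In this file, aesni_encrypt1() covers the two single-block values GCM needs
 * besides the bulk keystream: the GHASH key H = AES_k(0^128), computed once in
 * crypto_aead_aes256gcm_beforenm(), and the per-nonce tag mask
 * T = AES_k(IV || 0^31 || 1), which is later XORed with the GHASH output to
 * form the 16-byte authentication tag. */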
/** multiple-blocks-at-once AES encryption with AES-NI;
    on Haswell, aesenc has a latency of 7 and a throughput of 1,
    so the sequence of aesenc should be bubble-free if you have at least 8 blocks.
    Let's build an arbitrary-sized function */

/* Step 1: loading the nonce */
/* load & increment the n vector (non-vectorized, unused for now) */
#define NVDECLx(a) \
    __m128i nv##a

#define NVx(a) \
    nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
    n[3]++

/* Step 2: define value in round one (xor with subkey #0, aka key) */
#define TEMPDECLx(a) \
    __m128i temp##a

#define TEMPx(a) \
    temp##a = _mm_xor_si128(nv##a, rkeys[0])

/* Step 3: one round of AES */
#define AESENCx(a) \
    temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])

/* Step 4: last round of AES */
#define AESENCLASTx(a) \
    temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])

/* Step 5: store result */
#define STOREx(a) \
    _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)

/* all the MAKE* macros are for automatic explicit unrolling */
#define MAKE4(X) \
    X(0); \
    X(1); \
    X(2); \
    X(3)

#define MAKE8(X) \
    X(0); \
    X(1); \
    X(2); \
    X(3); \
    X(4); \
    X(5); \
    X(6); \
    X(7)

#define COUNTER_INC2(N) (N)[3] += 2

/* create a function of unrolling N; the MAKEN is the unrolling
   macro, defined above. The N in MAKEN must match N, obviously. */
#define FUNC(N, MAKEN) \
    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \
    { \
        const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
        int roundctr; \
        MAKEN(NVDECLx); \
        MAKEN(TEMPDECLx); \
        \
        MAKEN(NVx); \
        MAKEN(TEMPx); \
        for (roundctr = 1; roundctr < 14; roundctr++) { \
            MAKEN(AESENCx); \
        } \
        MAKEN(AESENCLASTx); \
        MAKEN(STOREx); \
    }

FUNC(8, MAKE8)
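/* For reference, FUNC(8, MAKE8) above generates aesni_encrypt8(out, n, rkeys)
 * (illustrative description, not the literal expansion): it loads the 16-byte
 * counter block from n eight times, incrementing the counter word n[3] after
 * each load (the pt shuffle reorders the counter bytes into the big-endian
 * layout GCM expects), then runs the 14 AES-256 rounds on all eight blocks in
 * an interleaved fashion and stores 128 bytes of keystream to out. The
 * interleaving is what hides the aesenc latency mentioned above. */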
/* all the GF(2^128) functions are by the book, meaning this one: */
static inline void
addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
{
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i       A, B, C;
    __m128i       tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
    __m128i       tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp18;
    __m128i       tmp19, tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
    __m128i       tmp28, tmp29, tmp30, tmp31, tmp32, tmp33, tmp34, tmp35, tmp36;

    if (xlen >= 16) {
        A = _mm_loadu_si128((const __m128i *) a);
    } else {
        CRYPTO_ALIGN(16) unsigned char padded[16];
        unsigned int i;

        memset(padded, 0, 16);
        for (i = 0; i < xlen; i++) {
            padded[i] = a[i];
        }
        A = _mm_load_si128((const __m128i *) padded);
    }
    A = _mm_shuffle_epi8(A, rev);
    B = _mm_loadu_si128((const __m128i *) b);
    C = _mm_loadu_si128((const __m128i *) c);
    A = _mm_xor_si128(A, C);
    tmp3  = _mm_clmulepi64_si128(A, B, 0x00);
    tmp4  = _mm_clmulepi64_si128(A, B, 0x10);
    tmp5  = _mm_clmulepi64_si128(A, B, 0x01);
    tmp6  = _mm_clmulepi64_si128(A, B, 0x11);
    tmp10 = _mm_xor_si128(tmp4, tmp5);
    tmp13 = _mm_slli_si128(tmp10, 8);
    tmp11 = _mm_srli_si128(tmp10, 8);
    tmp15 = _mm_xor_si128(tmp3, tmp13);
    tmp17 = _mm_xor_si128(tmp6, tmp11);
    tmp7  = _mm_srli_epi32(tmp15, 31);
    tmp8  = _mm_srli_epi32(tmp17, 31);
    tmp16 = _mm_slli_epi32(tmp15, 1);
    tmp18 = _mm_slli_epi32(tmp17, 1);
    tmp9  = _mm_srli_si128(tmp7, 12);
    tmp22 = _mm_slli_si128(tmp8, 4);
    tmp25 = _mm_slli_si128(tmp7, 4);
    tmp29 = _mm_or_si128(tmp16, tmp25);
    tmp19 = _mm_or_si128(tmp18, tmp22);
    tmp20 = _mm_or_si128(tmp19, tmp9);
    tmp26 = _mm_slli_epi32(tmp29, 31);
    tmp23 = _mm_slli_epi32(tmp29, 30);
    tmp32 = _mm_slli_epi32(tmp29, 25);
    tmp27 = _mm_xor_si128(tmp26, tmp23);
    tmp28 = _mm_xor_si128(tmp27, tmp32);
    tmp24 = _mm_srli_si128(tmp28, 4);
    tmp33 = _mm_slli_si128(tmp28, 12);
    tmp30 = _mm_xor_si128(tmp29, tmp33);
    tmp2  = _mm_srli_epi32(tmp30, 1);
    tmp12 = _mm_srli_epi32(tmp30, 2);
    tmp14 = _mm_srli_epi32(tmp30, 7);
    tmp34 = _mm_xor_si128(tmp2, tmp12);
    tmp35 = _mm_xor_si128(tmp34, tmp14);
    tmp36 = _mm_xor_si128(tmp35, tmp24);
    tmp31 = _mm_xor_si128(tmp30, tmp36);
    tmp21 = _mm_xor_si128(tmp20, tmp31);
    _mm_storeu_si128((__m128i *) c, tmp21);
}

/* pure multiplication, for pre-computing powers of H */
static inline __m128i
mulv(__m128i A, __m128i B)
{
    __m128i tmp3  = _mm_clmulepi64_si128(A, B, 0x00);
    __m128i tmp4  = _mm_clmulepi64_si128(A, B, 0x10);
    __m128i tmp5  = _mm_clmulepi64_si128(A, B, 0x01);
    __m128i tmp6  = _mm_clmulepi64_si128(A, B, 0x11);
    __m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
    __m128i tmp13 = _mm_slli_si128(tmp10, 8);
    __m128i tmp11 = _mm_srli_si128(tmp10, 8);
    __m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
    __m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
    __m128i tmp7  = _mm_srli_epi32(tmp15, 31);
    __m128i tmp8  = _mm_srli_epi32(tmp17, 31);
    __m128i tmp16 = _mm_slli_epi32(tmp15, 1);
    __m128i tmp18 = _mm_slli_epi32(tmp17, 1);
    __m128i tmp9  = _mm_srli_si128(tmp7, 12);
    __m128i tmp22 = _mm_slli_si128(tmp8, 4);
    __m128i tmp25 = _mm_slli_si128(tmp7, 4);
    __m128i tmp29 = _mm_or_si128(tmp16, tmp25);
    __m128i tmp19 = _mm_or_si128(tmp18, tmp22);
    __m128i tmp20 = _mm_or_si128(tmp19, tmp9);
    __m128i tmp26 = _mm_slli_epi32(tmp29, 31);
    __m128i tmp23 = _mm_slli_epi32(tmp29, 30);
    __m128i tmp32 = _mm_slli_epi32(tmp29, 25);
    __m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
    __m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
    __m128i tmp24 = _mm_srli_si128(tmp28, 4);
    __m128i tmp33 = _mm_slli_si128(tmp28, 12);
    __m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
    __m128i tmp2  = _mm_srli_epi32(tmp30, 1);
    __m128i tmp12 = _mm_srli_epi32(tmp30, 2);
    __m128i tmp14 = _mm_srli_epi32(tmp30, 7);
    __m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
    __m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
    __m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
    __m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
    __m128i C     = _mm_xor_si128(tmp20, tmp31);

    return C;
}
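/* Reminder of what addmul()/mulv() implement: GHASH works in GF(2^128) with
 * the reduction polynomial x^128 + x^7 + x^2 + x + 1, and a GHASH update for
 * one 16-byte block X is accum = (accum ^ X) * H. addmul() performs exactly
 * that update on a byte buffer (zero-padding a short final block), while
 * mulv() is the bare multiplication used below to precompute H^2, H^3 and H^4
 * for the aggregated reduction. */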
/* 4 multiply-accumulate at once; again for the Aggregated Reduction Method & sample code.
   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
#define RED_MUL_MID(a) \
    tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
    tmp##a = _mm_xor_si128(tmp##a, H##a); \
    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)

#define MULREDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
do { \
    MAKE4(RED_DECL); \
    __m128i lo, hi; \
    __m128i tmp8, tmp9; \
    __m128i H0 = H0_; \
    __m128i H1 = H1_; \
    __m128i H2 = H2_; \
    __m128i H3 = H3_; \
    __m128i X0 = X0_; \
    __m128i X1 = X1_; \
    __m128i X2 = X2_; \
    __m128i X3 = X3_; \
\
    /* byte-reverse the inputs & xor the first one into the accumulator */ \
\
    MAKE4(RED_SHUFFLE); \
    X3 = _mm_xor_si128(X3, accv); \
\
    /* 4 low H*X (x0*h0) */ \
\
    MAKE4(RED_MUL_LOW); \
    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
    lo = _mm_xor_si128(lo, H2_X2_lo); \
    lo = _mm_xor_si128(lo, H3_X3_lo); \
\
    /* 4 high H*X (x1*h1) */ \
\
    MAKE4(RED_MUL_HIGH); \
    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
    hi = _mm_xor_si128(hi, H2_X2_hi); \
    hi = _mm_xor_si128(hi, H3_X3_hi); \
\
    /* 4 middle H*X, using Karatsuba, i.e. \
       x1*h0+x0*h1 = (x1+x0)*(h1+h0)-x1*h1-x0*h0 \
       we already have all x1y1 & x0y0 (accumulated in hi & lo) \
       (0 is low half and 1 is high half) \
    */ \
    /* permute the high and low 64 bits in H1 & X1, \
       so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
       then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
       and finally multiply \
    */ \
    MAKE4(RED_MUL_MID); \
\
    /* subtracts x1*h1 and x0*h0 */ \
    tmp0 = _mm_xor_si128(tmp0, lo); \
    tmp0 = _mm_xor_si128(tmp0, hi); \
    tmp0 = _mm_xor_si128(tmp1, tmp0); \
    tmp0 = _mm_xor_si128(tmp2, tmp0); \
    tmp0 = _mm_xor_si128(tmp3, tmp0); \
\
    /* reduction */ \
    tmp0B = _mm_slli_si128(tmp0, 8); \
    tmp0 = _mm_srli_si128(tmp0, 8); \
    lo = _mm_xor_si128(tmp0B, lo); \
    hi = _mm_xor_si128(tmp0, hi); \
    tmp3 = lo; \
    tmp2B = hi; \
    tmp3B = _mm_srli_epi32(tmp3, 31); \
    tmp8 = _mm_srli_epi32(tmp2B, 31); \
    tmp3 = _mm_slli_epi32(tmp3, 1); \
    tmp2B = _mm_slli_epi32(tmp2B, 1); \
    tmp9 = _mm_srli_si128(tmp3B, 12); \
    tmp8 = _mm_slli_si128(tmp8, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 4); \
    tmp3 = _mm_or_si128(tmp3, tmp3B); \
    tmp2B = _mm_or_si128(tmp2B, tmp8); \
    tmp2B = _mm_or_si128(tmp2B, tmp9); \
    tmp3B = _mm_slli_epi32(tmp3, 31); \
    tmp8 = _mm_slli_epi32(tmp3, 30); \
    tmp9 = _mm_slli_epi32(tmp3, 25); \
    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
    tmp8 = _mm_srli_si128(tmp3B, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 12); \
    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
    tmp2 = _mm_srli_epi32(tmp3, 1); \
    tmp0B = _mm_srli_epi32(tmp3, 2); \
    tmp1B = _mm_srli_epi32(tmp3, 7); \
    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
    tmp2 = _mm_xor_si128(tmp2, tmp8); \
    tmp3 = _mm_xor_si128(tmp3, tmp2); \
    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
\
    accv = tmp2B; \
} while(0)
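/* MULREDUCE4 therefore performs one aggregated GHASH step over four blocks:
 * the accumulator is folded into X3_ and the result is
 *
 *     accv = ((accv ^ X3_)*H3_) ^ (X2_*H2_) ^ (X1_*H1_) ^ (X0_*H0_)
 *
 * i.e. four carry-less multiplications but only a single reduction modulo
 * x^128 + x^7 + x^2 + x + 1. The callers below pass H, H^2, H^3, H^4 as
 * H0_..H3_ and the blocks newest-first as X0_..X3_, which is the standard
 * aggregated reduction layout. */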
#define XORx(a) \
    temp##a = _mm_xor_si128(temp##a, \
                            _mm_loadu_si128((const __m128i *) (in + a * 16)))

#define LOADx(a) \
    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))

/* full encrypt & checksum 8 blocks at once */
#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    unsigned char       *out = out_; \
    uint32_t            *n = n_; \
    const unsigned char *in = in_; \
    const __m128i        hv = hv_; \
    const __m128i        h2v = h2v_; \
    const __m128i        h3v = h3v_; \
    const __m128i        h4v = h4v_; \
    const __m128i        pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    __m128i              accv_; \
    int                  roundctr; \
\
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv_); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while(0)

/* checksum 8 blocks at once */
#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    const unsigned char *in = in_; \
    const __m128i        hv = hv_; \
    const __m128i        h2v = h2v_; \
    const __m128i        h3v = h3v_; \
    const __m128i        h4v = h4v_; \
    __m128i              accv_; \
\
    MAKE8(LOADx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv_); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while(0)

/* decrypt 8 blocks at once */
#define aesni_decrypt8full(out_, n_, rkeys, in_) \
do { \
    unsigned char       *out = out_; \
    uint32_t            *n = n_; \
    const unsigned char *in = in_; \
    const __m128i        pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    int                  roundctr; \
\
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
} while(0)

int
crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_, const unsigned char *k)
{
    context       *ctx = (context *) ctx_;
    __m128i       *rkeys = ctx->rkeys;
    __m128i        zero = _mm_setzero_si128();
    unsigned char *H = ctx->H;

    COMPILER_ASSERT((sizeof *ctx_) >= (sizeof *ctx));
    aesni_key256_expand(k, rkeys);
    aesni_encrypt1(H, zero, rkeys);

    return 0;
}
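/* crypto_aead_aes256gcm_beforenm() does the per-key work once (AES-256 key
 * schedule plus H = AES_k(0^128)); the *_afternm() functions below reuse that
 * expanded state for any number of messages under the same key. Illustrative
 * usage sketch (caller-provided buffers, error handling omitted):
 *
 *     CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st;
 *     crypto_aead_aes256gcm_beforenm(&st, key);
 *     crypto_aead_aes256gcm_encrypt_afternm(c, &clen, m, mlen,
 *                                           ad, adlen, NULL, nonce, &st);
 */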
int
crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *mac,
                                               unsigned long long *maclen_p,
                                               const unsigned char *m, unsigned long long mlen,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *nsec, const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    const __m128i      rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context     *ctx = (const context *) ctx_;
    const __m128i     *rkeys = ctx->rkeys;
    __m128i            Hv, H2v, H3v, H4v, accv;
    unsigned long long i, j;
    unsigned long long adlen_rnd64 = adlen & ~63ULL;
    unsigned long long mlen_rnd128 = mlen & ~127ULL;
    CRYPTO_ALIGN(16) uint32_t      n2[4];
    CRYPTO_ALIGN(16) unsigned char H[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    memcpy(H, ctx->H, sizeof H);
    if (mlen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
        sodium_misuse(); /* LCOV_EXCL_LINE */
    }
    memcpy(&n2[0], npub, 3 * 4);
    n2[3] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);

    {
        uint64_t x;

        x = _bswap64((uint64_t) (8 * adlen));
        memcpy(&fb[0], &x, sizeof x);
        x = _bswap64((uint64_t) (8 * mlen));
        memcpy(&fb[8], &x, sizeof x);
    }
    /* we store H (and its powers) byte-reversed once and for all */
    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    /* GCM unrolled by 4 (unrolling by 8 doesn't improve when using MULREDUCE4) */
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));

        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    /* GCM remainder loop */
    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;

        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }

/* this only does 8 full blocks, so no fancy bounds checking is necessary */
#define LOOPRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_encrypt8full(c + i, n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, rev); \
        } \
    } while(0)

/* remainder loop, with the slower GCM update to accommodate partial blocks */
#define LOOPRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
            unsigned long long mj = lb; \
\
            aesni_encrypt8(outni, n2, rkeys); \
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            for (j = 0; j < mj; j++) { \
                c[i + j] = m[i + j] ^ outni[j]; \
            } \
            for (j = 0; j < mj; j += 16) { \
                unsigned int bl = 16; \
\
                if (j + (unsigned long long) bl >= mj) { \
                    bl = (unsigned int) (mj - j); \
                } \
                addmul(accum, c + i + j, bl, H); \
            } \
        } \
    } while(0)

    n2[3] &= 0x00ffffff;
    COUNTER_INC2(n2);
    LOOPRND128;
    LOOPRMD128;

    addmul(accum, fb, 16, H);

    for (i = 0; i < 16; ++i) {
        mac[i] = T[i] ^ accum[15 - i];
    }
    if (maclen_p != NULL) {
        *maclen_p = 16;
    }
    return 0;
}

int
crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p,
                                      const unsigned char *m, unsigned long long mlen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *nsec, const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    int ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c, c + mlen, NULL, m, mlen,
                                                             ad, adlen, nsec, npub, ctx_);
    if (clen_p != NULL) {
        *clen_p = mlen + crypto_aead_aes256gcm_ABYTES;
    }
    return ret;
}
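/* Combined mode is simply detached mode with the 16-byte tag written right
 * after the ciphertext (at c + mlen), so a combined ciphertext is always
 * mlen + crypto_aead_aes256gcm_ABYTES bytes long. */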
int
crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
                                               const unsigned char *c, unsigned long long clen,
                                               const unsigned char *mac,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    const __m128i      rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context     *ctx = (const context *) ctx_;
    const __m128i     *rkeys = ctx->rkeys;
    __m128i            Hv, H2v, H3v, H4v, accv;
    unsigned long long i, j;
    unsigned long long adlen_rnd64 = adlen & ~63ULL;
    unsigned long long mlen;
    unsigned long long mlen_rnd128;
    CRYPTO_ALIGN(16) uint32_t      n2[4];
    CRYPTO_ALIGN(16) unsigned char H[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    if (clen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
        sodium_misuse(); /* LCOV_EXCL_LINE */
    }
    mlen = clen;

    memcpy(&n2[0], npub, 3 * 4);
    n2[3] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);

    {
        uint64_t x;

        x = _bswap64((uint64_t) (8 * adlen));
        memcpy(&fb[0], &x, sizeof x);
        x = _bswap64((uint64_t) (8 * mlen));
        memcpy(&fb[8], &x, sizeof x);
    }
    memcpy(H, ctx->H, sizeof H);
    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));

        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;

        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }
    mlen_rnd128 = mlen & ~127ULL;

#define LOOPACCUMDRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v, rev); \
        } \
    } while(0)

#define LOOPDRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_decrypt8full(m + i, n2, rkeys, c + i); \
        } \
    } while(0)

#define LOOPACCUMDRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            unsigned long long mj = lb; \
\
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            for (j = 0; j < mj; j += 16) { \
                unsigned int bl = 16; \
\
                if (j + (unsigned long long) bl >= mj) { \
                    bl = (unsigned int) (mj - j); \
                } \
                addmul(accum, c + i + j, bl, H); \
            } \
        } \
    } while(0)

#define LOOPDRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
            unsigned long long mj = lb; \
\
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            aesni_encrypt8(outni, n2, rkeys); \
            for (j = 0; j < mj; j++) { \
                m[i + j] = c[i + j] ^ outni[j]; \
            } \
        } \
    } while(0)

    n2[3] &= 0x00ffffff;
    COUNTER_INC2(n2);
    LOOPACCUMDRND128;
    LOOPACCUMDRMD128;

    addmul(accum, fb, 16, H);

    {
        unsigned char d = 0;

        for (i = 0; i < 16; i++) {
            d |= (mac[i] ^ (T[i] ^ accum[15 - i]));
        }
        if (d != 0) {
            if (m != NULL) {
                memset(m, 0, mlen);
            }
            return -1;
        }
        if (m == NULL) {
            return 0;
        }
    }
    n2[3] = 0U;
    COUNTER_INC2(n2);
    LOOPDRND128;
    LOOPDRMD128;

    return 0;
}
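/* Note the order of operations above: the tag is first recomputed over the
 * ciphertext and compared against mac[] in constant time (all byte differences
 * are ORed into d), and the keystream pass that produces the plaintext only
 * runs once that comparison has succeeded. On a mismatch the output buffer is
 * zeroed and -1 is returned. */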
int
crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                      unsigned char *nsec,
                                      const unsigned char *c, unsigned long long clen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    unsigned long long mlen = 0ULL;
    int                ret = -1;

    if (clen >= crypto_aead_aes256gcm_ABYTES) {
        ret = crypto_aead_aes256gcm_decrypt_detached_afternm
            (m, nsec, c, clen - crypto_aead_aes256gcm_ABYTES,
             c + clen - crypto_aead_aes256gcm_ABYTES, ad, adlen, npub, ctx_);
    }
    if (mlen_p != NULL) {
        if (ret == 0) {
            mlen = clen - crypto_aead_aes256gcm_ABYTES;
        }
        *mlen_p = mlen;
    }
    return ret;
}

int
crypto_aead_aes256gcm_encrypt_detached(unsigned char *c, unsigned char *mac,
                                       unsigned long long *maclen_p,
                                       const unsigned char *m, unsigned long long mlen,
                                       const unsigned char *ad, unsigned long long adlen,
                                       const unsigned char *nsec, const unsigned char *npub,
                                       const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    return crypto_aead_aes256gcm_encrypt_detached_afternm
        (c, mac, maclen_p, m, mlen, ad, adlen, nsec, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
}

int
crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p,
                              const unsigned char *m, unsigned long long mlen,
                              const unsigned char *ad, unsigned long long adlen,
                              const unsigned char *nsec, const unsigned char *npub,
                              const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);
    ret = crypto_aead_aes256gcm_encrypt_afternm
        (c, clen_p, m, mlen, ad, adlen, nsec, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(&ctx, sizeof ctx);

    return ret;
}

int
crypto_aead_aes256gcm_decrypt_detached(unsigned char *m, unsigned char *nsec,
                                       const unsigned char *c, unsigned long long clen,
                                       const unsigned char *mac,
                                       const unsigned char *ad, unsigned long long adlen,
                                       const unsigned char *npub, const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    return crypto_aead_aes256gcm_decrypt_detached_afternm
        (m, nsec, c, clen, mac, ad, adlen, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
}

int
crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p, unsigned char *nsec,
                              const unsigned char *c, unsigned long long clen,
                              const unsigned char *ad, unsigned long long adlen,
                              const unsigned char *npub, const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);
    ret = crypto_aead_aes256gcm_decrypt_afternm
        (m, mlen_p, nsec, c, clen, ad, adlen, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(&ctx, sizeof ctx);

    return ret;
}

int
crypto_aead_aes256gcm_is_available(void)
{
    return sodium_runtime_has_pclmul() & sodium_runtime_has_aesni();
}

#else

int
crypto_aead_aes256gcm_encrypt_detached(unsigned char *c, unsigned char *mac,
                                       unsigned long long *maclen_p,
                                       const unsigned char *m, unsigned long long mlen,
                                       const unsigned char *ad, unsigned long long adlen,
                                       const unsigned char *nsec, const unsigned char *npub,
                                       const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p,
                              const unsigned char *m, unsigned long long mlen,
                              const unsigned char *ad, unsigned long long adlen,
                              const unsigned char *nsec, const unsigned char *npub,
                              const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_detached(unsigned char *m, unsigned char *nsec,
                                       const unsigned char *c, unsigned long long clen,
                                       const unsigned char *mac,
                                       const unsigned char *ad, unsigned long long adlen,
                                       const unsigned char *npub, const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p, unsigned char *nsec,
                              const unsigned char *c, unsigned long long clen,
                              const unsigned char *ad, unsigned long long adlen,
                              const unsigned char *npub, const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_, const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c, unsigned char *mac,
                                               unsigned long long *maclen_p,
                                               const unsigned char *m, unsigned long long mlen,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *nsec, const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p,
                                      const unsigned char *m, unsigned long long mlen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *nsec, const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}
int
crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
                                               const unsigned char *c, unsigned long long clen,
                                               const unsigned char *mac,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                      unsigned char *nsec,
                                      const unsigned char *c, unsigned long long clen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_is_available(void)
{
    return 0;
}

#endif

size_t
crypto_aead_aes256gcm_keybytes(void)
{
    return crypto_aead_aes256gcm_KEYBYTES;
}

size_t
crypto_aead_aes256gcm_nsecbytes(void)
{
    return crypto_aead_aes256gcm_NSECBYTES;
}

size_t
crypto_aead_aes256gcm_npubbytes(void)
{
    return crypto_aead_aes256gcm_NPUBBYTES;
}

size_t
crypto_aead_aes256gcm_abytes(void)
{
    return crypto_aead_aes256gcm_ABYTES;
}

size_t
crypto_aead_aes256gcm_statebytes(void)
{
    return (sizeof(crypto_aead_aes256gcm_state) + (size_t) 15U) & ~(size_t) 15U;
}

size_t
crypto_aead_aes256gcm_messagebytes_max(void)
{
    return crypto_aead_aes256gcm_MESSAGEBYTES_MAX;
}

void
crypto_aead_aes256gcm_keygen(unsigned char k[crypto_aead_aes256gcm_KEYBYTES])
{
    randombytes_buf(k, crypto_aead_aes256gcm_KEYBYTES);
}
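/* End-to-end usage sketch (illustrative only; m, MLEN, ad and adlen are
 * assumed to be provided by the caller, and error handling is trimmed). The
 * AES-NI/PCLMUL availability check should be done before calling any of the
 * functions in this file:
 *
 *     unsigned char key[crypto_aead_aes256gcm_KEYBYTES];
 *     unsigned char nonce[crypto_aead_aes256gcm_NPUBBYTES];
 *     unsigned char c[MLEN + crypto_aead_aes256gcm_ABYTES];
 *     unsigned long long clen, mlen;
 *
 *     if (sodium_init() < 0 || !crypto_aead_aes256gcm_is_available()) {
 *         abort();
 *     }
 *     crypto_aead_aes256gcm_keygen(key);
 *     randombytes_buf(nonce, sizeof nonce);
 *     crypto_aead_aes256gcm_encrypt(c, &clen, m, MLEN, ad, adlen,
 *                                   NULL, nonce, key);
 *     if (crypto_aead_aes256gcm_decrypt(m, &mlen, NULL, c, clen,
 *                                       ad, adlen, nonce, key) != 0) {
 *         // tag mismatch: message forged or corrupted
 *     }
 */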