Diffstat (limited to 'src/crypto/chacha20poly1305.c')
-rw-r--r--  src/crypto/chacha20poly1305.c | 639
1 file changed, 334 insertions, 305 deletions
diff --git a/src/crypto/chacha20poly1305.c b/src/crypto/chacha20poly1305.c index a9c3bf8..ac033f0 100644 --- a/src/crypto/chacha20poly1305.c +++ b/src/crypto/chacha20poly1305.c @@ -1,6 +1,34 @@ -/* - * Copyright (C) 2015-2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. +/* Copyright (C) 2015-2017 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * Copyright 2015 Martin Willi. + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * - Redistributions of source code must retain copyright notices, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * - Neither the name of the CRYPTOGAMS nor the names of its + * copyright holder and contributors may be used to endorse or + * promote products derived from this software without specific + * prior written permission. + * ALTERNATIVELY, provided that this notice is retained in full, this + * product may be distributed under the terms of the GNU General Public + * License (GPL), in which case the provisions of the GPL apply INSTEAD OF + * those given above. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "chacha20poly1305.h" @@ -15,27 +43,39 @@ #if defined(CONFIG_X86_64) #include <asm/cpufeature.h> #include <asm/processor.h> +asmlinkage void poly1305_init_x86_64(void *ctx, const unsigned char key[16]); +asmlinkage void poly1305_blocks_x86_64(void *ctx, const unsigned char *inp, size_t len, u32 padbit); +asmlinkage void poly1305_emit_x86_64(void *ctx, unsigned char mac[16], const u32 nonce[4]); #ifdef CONFIG_AS_SSSE3 -asmlinkage void hchacha20_asm_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key); -asmlinkage void chacha20_asm_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_asm_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key); +asmlinkage void chacha20_ssse3(unsigned char *out, const unsigned char *in, size_t len, const unsigned int key[8], const unsigned int counter[4]); #endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void chacha20_asm_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); +#ifdef CONFIG_AS_AVX +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[16], const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, size_t len, u32 padbit); #endif -asmlinkage void poly1305_asm_block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks); -asmlinkage void poly1305_asm_2block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); #ifdef CONFIG_AS_AVX2 -asmlinkage void poly1305_asm_4block_avx2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); +asmlinkage void chacha20_avx2(unsigned char *out, const unsigned char *in, size_t len, const unsigned int key[8], const unsigned int counter[4]); +asmlinkage void poly1305_blocks_avx2(void *ctx, const unsigned char *inp, size_t len, u32 padbit); #endif -static bool chacha20poly1305_use_avx2 __read_mostly; +#ifdef CONFIG_AS_AVX512 +asmlinkage void chacha20_avx512(unsigned char *out, const unsigned char *in, size_t len, const unsigned int key[8], const unsigned int counter[4]); +asmlinkage void poly1305_blocks_avx512(void *ctx, const unsigned char *inp, size_t len, u32 padbit); +#endif + static bool chacha20poly1305_use_ssse3 __read_mostly; -static bool chacha20poly1305_use_sse2 __read_mostly; +static bool chacha20poly1305_use_avx __read_mostly; +static bool chacha20poly1305_use_avx2 __read_mostly; +static bool chacha20poly1305_use_avx512 __read_mostly; + void chacha20poly1305_fpu_init(void) { - chacha20poly1305_use_sse2 = boot_cpu_has(X86_FEATURE_XMM2); chacha20poly1305_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3); + chacha20poly1305_use_avx = boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); chacha20poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#ifndef COMPAT_CANNOT_USE_AVX512 + chacha20poly1305_use_avx512 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_ZMM_Hi256, NULL); +#endif } #elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) #include <asm/hwcap.h> @@ -77,21 +117,6 @@ static inline u32 rotl32(u32 v, u8 n) return (v << n) | (v >> (sizeof(v) * 8 - n)); } -static inline u64 mlt(u64 a, u64 b) -{ - return a * b; -} - -static inline u32 sr(u64 v, u_char n) -{ - return v >> n; -} - -static inline u32 and(u32 v, u32 mask) -{ - return v & mask; -} - struct chacha20_ctx { u32 
state[CHACHA20_BLOCK_SIZE / sizeof(u32)]; } __aligned(32); @@ -232,17 +257,13 @@ static void hchacha20_generic(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 static inline void hchacha20(u8 derived_key[CHACHA20POLY1305_KEYLEN], const u8 nonce[16], const u8 key[CHACHA20POLY1305_KEYLEN], bool have_simd) { - if (!have_simd) - goto no_simd; - #if defined(CONFIG_X86_64) && defined(CONFIG_AS_SSSE3) - if (chacha20poly1305_use_ssse3) { - hchacha20_asm_ssse3(derived_key, nonce, key); + if (have_simd && chacha20poly1305_use_ssse3) { + hchacha20_ssse3(derived_key, nonce, key); return; } #endif -no_simd: hchacha20_generic(derived_key, nonce, key); } @@ -266,7 +287,7 @@ static void chacha20_keysetup(struct chacha20_ctx *ctx, const u8 key[CHACHA20_KE ctx->state[15] = le32_to_cpuvp(nonce + 4); } -static void chacha20_crypt(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, unsigned int bytes, bool have_simd) +static void chacha20_crypt(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 bytes, bool have_simd) { u8 buf[CHACHA20_BLOCK_SIZE]; @@ -281,37 +302,23 @@ static void chacha20_crypt(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, uns goto no_simd; #if defined(CONFIG_X86_64) +#ifdef CONFIG_AS_AVX512 + if (chacha20poly1305_use_avx512) { + chacha20_avx512(dst, src, bytes, &ctx->state[4], &ctx->state[12]); + ctx->state[12] += (bytes + 63) / 64; + return; + } +#endif #ifdef CONFIG_AS_AVX2 if (chacha20poly1305_use_avx2) { - while (bytes >= CHACHA20_BLOCK_SIZE * 8) { - chacha20_asm_8block_xor_avx2(ctx->state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE * 8; - src += CHACHA20_BLOCK_SIZE * 8; - dst += CHACHA20_BLOCK_SIZE * 8; - ctx->state[12] += 8; - } + chacha20_avx2(dst, src, bytes, &ctx->state[4], &ctx->state[12]); + ctx->state[12] += (bytes + 63) / 64; + return; } #endif #ifdef CONFIG_AS_SSSE3 - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - chacha20_asm_4block_xor_ssse3(ctx->state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; - ctx->state[12] += 4; - } - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_asm_block_xor_ssse3(ctx->state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - ctx->state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_asm_block_xor_ssse3(ctx->state, buf, buf); - memcpy(dst, buf, bytes); - } + chacha20_ssse3(dst, src, bytes, &ctx->state[4], &ctx->state[12]); + ctx->state[12] += (bytes + 63) / 64; return; #endif #elif IS_ENABLED(CONFIG_KERNEL_MODE_NEON) @@ -352,261 +359,283 @@ no_simd: crypto_xor(dst, buf, bytes); } } +typedef void (*poly1305_blocks_f)(void *ctx, const u8 *inp, size_t len, u32 padbit); +typedef void (*poly1305_emit_f)(void *ctx, u8 mac[16], const u32 nonce[4]); struct poly1305_ctx { - /* key */ - u32 r[5]; - /* finalize key */ - u32 s[4]; - /* accumulator */ + u8 opaque[24 * sizeof(u64)]; + u32 nonce[4]; + u8 data[POLY1305_BLOCK_SIZE]; + size_t num; + struct { + poly1305_blocks_f blocks; + poly1305_emit_f emit; + } func; +} __aligned(8); + +#ifndef CONFIG_X86_64 +struct poly1305_internal { u32 h[5]; - /* partial buffer */ - u8 buf[POLY1305_BLOCK_SIZE]; - /* bytes used in partial buffer */ - unsigned int buflen; - /* derived key u set? */ - bool uset; - /* derived keys r^3, r^4 set? 
*/ - bool wset; - /* derived Poly1305 key r^2 */ - u32 u[5]; - /* derived Poly1305 key r^3 */ - u32 r3[5]; - /* derived Poly1305 key r^4 */ - u32 r4[5]; + u32 r[4]; }; -static void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE]) +static void poly1305_init_generic(void *ctx, const u8 key[16]) { - memset(ctx, 0, sizeof(struct poly1305_ctx)); + struct poly1305_internal *st = (struct poly1305_internal *)ctx; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - ctx->r[0] = (le32_to_cpuvp(key + 0) >> 0) & 0x3ffffff; - ctx->r[1] = (get_unaligned_le32(key + 3) >> 2) & 0x3ffff03; - ctx->r[2] = (get_unaligned_le32(key + 6) >> 4) & 0x3ffc0ff; - ctx->r[3] = (get_unaligned_le32(key + 9) >> 6) & 0x3f03fff; - ctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff; - ctx->s[0] = le32_to_cpuvp(key + 16); - ctx->s[1] = le32_to_cpuvp(key + 20); - ctx->s[2] = le32_to_cpuvp(key + 24); - ctx->s[3] = le32_to_cpuvp(key + 28); + st->r[0] = le32_to_cpuvp(&key[ 0]) & 0x0fffffff; + st->r[1] = le32_to_cpuvp(&key[ 4]) & 0x0ffffffc; + st->r[2] = le32_to_cpuvp(&key[ 8]) & 0x0ffffffc; + st->r[3] = le32_to_cpuvp(&key[12]) & 0x0ffffffc; } -static unsigned int poly1305_generic_blocks(struct poly1305_ctx *ctx, const u8 *src, unsigned int srclen, u32 hibit) +static void +poly1305_blocks_generic(void *ctx, const u8 *inp, size_t len, u32 padbit) { - u32 r0, r1, r2, r3, r4; - u32 s1, s2, s3, s4; - u32 h0, h1, h2, h3, h4; - u64 d0, d1, d2, d3, d4; - - r0 = ctx->r[0]; - r1 = ctx->r[1]; - r2 = ctx->r[2]; - r3 = ctx->r[3]; - r4 = ctx->r[4]; - - s1 = r1 * 5; - s2 = r2 * 5; - s3 = r3 * 5; - s4 = r4 * 5; - - h0 = ctx->h[0]; - h1 = ctx->h[1]; - h2 = ctx->h[2]; - h3 = ctx->h[3]; - h4 = ctx->h[4]; - - while (likely(srclen >= POLY1305_BLOCK_SIZE)) { +#define CONSTANT_TIME_CARRY(a,b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + struct poly1305_internal *st = (struct poly1305_internal *)ctx; + u32 r0, r1, r2, r3; + u32 s1, s2, s3; + u32 h0, h1, h2, h3, h4, c; + u64 d0, d1, d2, d3; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + + s1 = r1 + (r1 >> 2); + s2 = r2 + (r2 >> 2); + s3 = r3 + (r3 >> 2); + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (len >= POLY1305_BLOCK_SIZE) { /* h += m[i] */ - h0 += (le32_to_cpuvp(src + 0) >> 0) & 0x3ffffff; - h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff; - h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff; - h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff; - h4 += (le32_to_cpuvp(src + 12) >> 8) | hibit; - - /* h *= r */ - d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + mlt(h3, s2) + mlt(h4, s1); - d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) + mlt(h3, s3) + mlt(h4, s2); - d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) + mlt(h3, s4) + mlt(h4, s3); - d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) + mlt(h3, r0) + mlt(h4, s4); - d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) + mlt(h3, r1) + mlt(h4, r0); - - /* (partial) h %= p */ - d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff); - d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff); - d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff); - d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff); - h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff); - h1 += h0 >> 26; h0 = h0 & 0x3ffffff; - - src += POLY1305_BLOCK_SIZE; - srclen -= POLY1305_BLOCK_SIZE; + h0 = (u32)(d0 = (u64)h0 + le32_to_cpuvp(inp + 0)); + h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + le32_to_cpuvp(inp + 4)); + h2 = (u32)(d2 = (u64)h2 + (d1 
>> 32) + le32_to_cpuvp(inp + 8)); + h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + le32_to_cpuvp(inp + 12)); + h4 += (u32)(d3 >> 32) + padbit; + + /* h *= r "%" p, where "%" stands for "partial remainder" */ + d0 = ((u64)h0 * r0) + + ((u64)h1 * s3) + + ((u64)h2 * s2) + + ((u64)h3 * s1); + d1 = ((u64)h0 * r1) + + ((u64)h1 * r0) + + ((u64)h2 * s3) + + ((u64)h3 * s2) + + (h4 * s1); + d2 = ((u64)h0 * r2) + + ((u64)h1 * r1) + + ((u64)h2 * r0) + + ((u64)h3 * s3) + + (h4 * s2); + d3 = ((u64)h0 * r3) + + ((u64)h1 * r2) + + ((u64)h2 * r1) + + ((u64)h3 * r0) + + (h4 * s3); + h4 = (h4 * r0); + + /* last reduction step: */ + /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */ + h0 = (u32)d0; + h1 = (u32)(d1 += d0 >> 32); + h2 = (u32)(d2 += d1 >> 32); + h3 = (u32)(d3 += d2 >> 32); + h4 += (u32)(d3 >> 32); + /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */ + c = (h4 >> 2) + (h4 & ~3U); + h4 &= 3; + h0 += c; + h1 += (c = CONSTANT_TIME_CARRY(h0,c)); + h2 += (c = CONSTANT_TIME_CARRY(h1,c)); + h3 += (c = CONSTANT_TIME_CARRY(h2,c)); + h4 += CONSTANT_TIME_CARRY(h3,c); + /* + * Occasional overflows to 3rd bit of h4 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. + */ + + inp += POLY1305_BLOCK_SIZE; + len -= POLY1305_BLOCK_SIZE; } - ctx->h[0] = h0; - ctx->h[1] = h1; - ctx->h[2] = h2; - ctx->h[3] = h3; - ctx->h[4] = h4; - - return srclen; + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; +#undef CONSTANT_TIME_CARRY } -#ifdef CONFIG_X86_64 -static void poly1305_simd_mult(u32 *a, const u32 *b) +static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4]) { - u8 m[POLY1305_BLOCK_SIZE]; - - memset(m, 0, sizeof(m)); - /* The poly1305 block function adds a hi-bit to the accumulator which - * we don't need for key multiplication; compensate for it. 
- */ - a[4] -= 1U << 24; - poly1305_asm_block_sse2(a, m, b, 1); + struct poly1305_internal *st = (struct poly1305_internal *)ctx; + __le32 *omac = (__force __le32 *)mac; + u32 h0, h1, h2, h3, h4; + u32 g0, g1, g2, g3, g4; + u64 t; + u32 mask; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + /* compare to modulus by computing h + -p */ + g0 = (u32)(t = (u64)h0 + 5); + g1 = (u32)(t = (u64)h1 + (t >> 32)); + g2 = (u32)(t = (u64)h2 + (t >> 32)); + g3 = (u32)(t = (u64)h3 + (t >> 32)); + g4 = h4 + (u32)(t >> 32); + + /* if there was carry into 131st bit, h3:h0 = g3:g0 */ + mask = 0 - (g4 >> 2); + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + + /* mac = (h + nonce) % (2^128) */ + h0 = (u32)(t = (u64)h0 + nonce[0]); + h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]); + h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]); + h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]); + + omac[0] = cpu_to_le32(h0); + omac[1] = cpu_to_le32(h1); + omac[2] = cpu_to_le32(h2); + omac[3] = cpu_to_le32(h3); } +#endif /* !CONFIG_X86_64 */ -static unsigned int poly1305_simd_blocks(struct poly1305_ctx *ctx, const u8 *src, unsigned int srclen) +void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE], bool have_simd) { - unsigned int blocks; + ctx->nonce[0] = le32_to_cpuvp(&key[16]); + ctx->nonce[1] = le32_to_cpuvp(&key[20]); + ctx->nonce[2] = le32_to_cpuvp(&key[24]); + ctx->nonce[3] = le32_to_cpuvp(&key[28]); +#ifdef CONFIG_X86_64 + poly1305_init_x86_64(ctx->opaque, key); + ctx->func.blocks = poly1305_blocks_x86_64; + ctx->func.emit = poly1305_emit_x86_64; +#ifdef CONFIG_AS_AVX512 + if(chacha20poly1305_use_avx512 && have_simd) { + ctx->func.blocks = poly1305_blocks_avx512; + ctx->func.emit = poly1305_emit_avx; + } else +#endif #ifdef CONFIG_AS_AVX2 - if (chacha20poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(!ctx->wset)) { - if (!ctx->uset) { - memcpy(ctx->u, ctx->r, sizeof(ctx->u)); - poly1305_simd_mult(ctx->u, ctx->r); - ctx->uset = true; - } - memcpy(ctx->r3, ctx->u, sizeof(ctx->u)); - poly1305_simd_mult(ctx->r3, ctx->r); - memcpy(ctx->r4, ctx->r3, sizeof(ctx->u)); - poly1305_simd_mult(ctx->r4, ctx->r); - ctx->wset = true; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_asm_4block_avx2(ctx->h, src, ctx->r, blocks, ctx->u); - src += POLY1305_BLOCK_SIZE * 4 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; - } + if (chacha20poly1305_use_avx2 && have_simd) { + ctx->func.blocks = poly1305_blocks_avx2; + ctx->func.emit = poly1305_emit_avx; + } else #endif - if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(!ctx->uset)) { - memcpy(ctx->u, ctx->r, sizeof(ctx->u)); - poly1305_simd_mult(ctx->u, ctx->r); - ctx->uset = true; - } - blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_asm_2block_sse2(ctx->h, src, ctx->r, blocks, ctx->u); - src += POLY1305_BLOCK_SIZE * 2 * blocks; - srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; - } - if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_asm_block_sse2(ctx->h, src, ctx->r, 1); - srclen -= POLY1305_BLOCK_SIZE; +#ifdef CONFIG_AS_AVX + if (chacha20poly1305_use_avx && have_simd) { + ctx->func.blocks = poly1305_blocks_avx; + ctx->func.emit = poly1305_emit_avx; } - return srclen; -} #endif +#else + poly1305_init_generic(ctx->opaque, key); +#endif + ctx->num = 0; +} -static void poly1305_update(struct poly1305_ctx *ctx, const u8 *src, unsigned int srclen, 
bool have_simd) +void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, size_t len) { - unsigned int bytes; - - if (unlikely(ctx->buflen)) { - bytes = min(srclen, POLY1305_BLOCK_SIZE - ctx->buflen); - memcpy(ctx->buf + ctx->buflen, src, bytes); - src += bytes; - srclen -= bytes; - ctx->buflen += bytes; - - if (ctx->buflen == POLY1305_BLOCK_SIZE) { #ifdef CONFIG_X86_64 - if (have_simd && chacha20poly1305_use_sse2) - poly1305_simd_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE); - else + const poly1305_blocks_f blocks = ctx->func.blocks; +#else + const poly1305_blocks_f blocks = poly1305_blocks_generic; #endif - poly1305_generic_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE, 1U << 24); - ctx->buflen = 0; + + const size_t num = ctx->num; + size_t rem;; + + if (num) { + rem = POLY1305_BLOCK_SIZE - num; + if (len >= rem) { + memcpy(ctx->data + num, inp, rem); + blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1); + inp += rem; + len -= rem; + } else { + /* Still not enough data to process a block. */ + memcpy(ctx->data + num, inp, len); + ctx->num = num + len; + return; } } - if (likely(srclen >= POLY1305_BLOCK_SIZE)) { -#ifdef CONFIG_X86_64 - if (have_simd && chacha20poly1305_use_sse2) - bytes = poly1305_simd_blocks(ctx, src, srclen); - else -#endif - bytes = poly1305_generic_blocks(ctx, src, srclen, 1U << 24); - src += srclen - bytes; - srclen = bytes; - } + rem = len % POLY1305_BLOCK_SIZE; + len -= rem; - if (unlikely(srclen)) { - ctx->buflen = srclen; - memcpy(ctx->buf, src, srclen); + if (len >= POLY1305_BLOCK_SIZE) { + blocks(ctx->opaque, inp, len, 1); + inp += len; } + + if (rem) + memcpy(ctx->data, inp, rem); + + ctx->num = rem; } -static void poly1305_finish(struct poly1305_ctx *ctx, u8 *dst) +void poly1305_finish(struct poly1305_ctx * ctx, u8 mac[16]) { - __le32 *mac = (__le32 *)dst; - u32 h0, h1, h2, h3, h4; - u32 g0, g1, g2, g3, g4; - u32 mask; - u64 f = 0; +#ifdef CONFIG_X86_64 + poly1305_blocks_f blocks = ctx->func.blocks; + poly1305_emit_f emit = ctx->func.emit; +#else + poly1305_blocks_f blocks = poly1305_blocks_generic; + poly1305_emit_f emit = poly1305_emit_generic; +#endif + size_t num = ctx->num; - if (unlikely(ctx->buflen)) { - ctx->buf[ctx->buflen++] = 1; - memset(ctx->buf + ctx->buflen, 0, POLY1305_BLOCK_SIZE - ctx->buflen); - poly1305_generic_blocks(ctx, ctx->buf, POLY1305_BLOCK_SIZE, 0); + if (num) { + ctx->data[num++] = 1; /* pad bit */ + while (num < POLY1305_BLOCK_SIZE) + ctx->data[num++] = 0; + blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0); } - /* fully carry h */ - h0 = ctx->h[0]; - h1 = ctx->h[1]; - h2 = ctx->h[2]; - h3 = ctx->h[3]; - h4 = ctx->h[4]; - - h2 += (h1 >> 26); h1 = h1 & 0x3ffffff; - h3 += (h2 >> 26); h2 = h2 & 0x3ffffff; - h4 += (h3 >> 26); h3 = h3 & 0x3ffffff; - h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff; - h1 += (h0 >> 26); h0 = h0 & 0x3ffffff; - - /* compute h + -p */ - g0 = h0 + 5; - g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff; - g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff; - g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff; - g4 = h4 + (g3 >> 26) - (1U << 26); g3 &= 0x3ffffff; - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - h0 = (h0 >> 0) | (h1 << 26); - h1 = (h1 >> 6) | (h2 << 20); - h2 = (h2 >> 12) | (h3 << 14); - h3 = (h3 >> 18) | (h4 << 8); - - /* mac = (h + s) % (2^128) */ - 
f = (f >> 32) + h0 + ctx->s[0]; mac[0] = cpu_to_le32(f); - f = (f >> 32) + h1 + ctx->s[1]; mac[1] = cpu_to_le32(f); - f = (f >> 32) + h2 + ctx->s[2]; mac[2] = cpu_to_le32(f); - f = (f >> 32) + h3 + ctx->s[3]; mac[3] = cpu_to_le32(f); + emit(ctx->opaque, mac, ctx->nonce); + + /* zero out the state */ + memzero_explicit(ctx, sizeof(*ctx)); } + static const u8 pad0[16] = { 0 }; static struct crypto_alg chacha20_alg = { @@ -636,22 +665,22 @@ static inline void __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce); chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd); - poly1305_init(&poly1305_state, block0); + poly1305_init(&poly1305_state, block0, have_simd); memzero_explicit(block0, sizeof(block0)); - poly1305_update(&poly1305_state, ad, ad_len, have_simd); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, ad, ad_len); + poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); chacha20_crypt(&chacha20_state, dst, src, src_len, have_simd); - poly1305_update(&poly1305_state, dst, src_len, have_simd); - poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, dst, src_len); + poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf); len = cpu_to_le64(ad_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len)); len = cpu_to_le64(src_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len)); poly1305_finish(&poly1305_state, dst + src_len); @@ -687,11 +716,11 @@ bool chacha20poly1305_encrypt_sg(struct scatterlist *dst, struct scatterlist *sr chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce); chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd); - poly1305_init(&poly1305_state, block0); + poly1305_init(&poly1305_state, block0, have_simd); memzero_explicit(block0, sizeof(block0)); - poly1305_update(&poly1305_state, ad, ad_len, have_simd); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, ad, ad_len); + poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); if (likely(src_len)) { blkcipher_walk_init(&walk, dst, src, src_len); @@ -700,25 +729,25 @@ bool chacha20poly1305_encrypt_sg(struct scatterlist *dst, struct scatterlist *sr size_t chunk_len = rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE); chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, chunk_len, have_simd); - poly1305_update(&poly1305_state, walk.dst.virt.addr, chunk_len, have_simd); + poly1305_update(&poly1305_state, walk.dst.virt.addr, chunk_len); ret = blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE); } if (walk.nbytes) { chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, have_simd); - poly1305_update(&poly1305_state, walk.dst.virt.addr, walk.nbytes, have_simd); + poly1305_update(&poly1305_state, walk.dst.virt.addr, walk.nbytes); ret = blkcipher_walk_done(&chacha20_desc, &walk, 0); } } if (unlikely(ret)) goto err; - poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf); len = cpu_to_le64(ad_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + poly1305_update(&poly1305_state, (u8 *)&len, 
sizeof(len)); len = cpu_to_le64(src_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len)); poly1305_finish(&poly1305_state, mac); scatterwalk_map_and_copy(mac, dst, src_len, sizeof(mac), 1); @@ -749,21 +778,21 @@ static inline bool __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce); chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd); - poly1305_init(&poly1305_state, block0); + poly1305_init(&poly1305_state, block0, have_simd); memzero_explicit(block0, sizeof(block0)); - poly1305_update(&poly1305_state, ad, ad_len, have_simd); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, ad, ad_len); + poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); dst_len = src_len - POLY1305_MAC_SIZE; - poly1305_update(&poly1305_state, src, dst_len, have_simd); - poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, src, dst_len); + poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf); len = cpu_to_le64(ad_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len)); len = cpu_to_le64(dst_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len)); poly1305_finish(&poly1305_state, mac); memzero_explicit(&poly1305_state, sizeof(poly1305_state)); @@ -811,11 +840,11 @@ bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *sr chacha20_keysetup(&chacha20_state, key, (u8 *)&le_nonce); chacha20_crypt(&chacha20_state, block0, block0, sizeof(block0), have_simd); - poly1305_init(&poly1305_state, block0); + poly1305_init(&poly1305_state, block0, have_simd); memzero_explicit(block0, sizeof(block0)); - poly1305_update(&poly1305_state, ad, ad_len, have_simd); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, ad, ad_len); + poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); dst_len = src_len - POLY1305_MAC_SIZE; if (likely(dst_len)) { @@ -824,12 +853,12 @@ bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *sr while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { size_t chunk_len = rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE); - poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len, have_simd); + poly1305_update(&poly1305_state, walk.src.virt.addr, chunk_len); chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, chunk_len, have_simd); ret = blkcipher_walk_done(&chacha20_desc, &walk, walk.nbytes % CHACHA20_BLOCK_SIZE); } if (walk.nbytes) { - poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes, have_simd); + poly1305_update(&poly1305_state, walk.src.virt.addr, walk.nbytes); chacha20_crypt(&chacha20_state, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, have_simd); ret = blkcipher_walk_done(&chacha20_desc, &walk, 0); } @@ -837,13 +866,13 @@ bool chacha20poly1305_decrypt_sg(struct scatterlist *dst, struct scatterlist *sr if (unlikely(ret)) goto err; - poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf, have_simd); + poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf); len = cpu_to_le64(ad_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + 
poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len)); len = cpu_to_le64(dst_len); - poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len), have_simd); + poly1305_update(&poly1305_state, (u8 *)&len, sizeof(len)); poly1305_finish(&poly1305_state, computed_mac); memzero_explicit(&poly1305_state, sizeof(poly1305_state)); |
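
The rewritten ChaCha20 paths above no longer loop over 1-, 4- or 8-block assembly helpers; each SIMD routine (chacha20_ssse3, chacha20_avx2, chacha20_avx512) is handed the whole buffer together with the key and counter words, after which the 32-bit block counter in ctx->state[12] is advanced by (bytes + 63) / 64, i.e. the number of 64-byte blocks generated, counting a trailing partial block as one. A minimal standalone illustration of that counter arithmetic (userspace C, not part of the diff; the helper name is made up):

/*
 * Standalone sketch of the block-counter update used after the one-shot
 * SIMD calls: ctx->state[12] += (bytes + 63) / 64.  Assumes 64-byte
 * ChaCha20 blocks and a 32-bit counter.
 */
#include <stdint.h>
#include <stdio.h>

#define CHACHA20_BLOCK_SIZE 64

static uint32_t chacha20_blocks_consumed(uint32_t bytes)
{
	/* ceiling division: a trailing partial block still consumes a counter slot */
	return (bytes + CHACHA20_BLOCK_SIZE - 1) / CHACHA20_BLOCK_SIZE;
}

int main(void)
{
	uint32_t counter = 0;                           /* plays the role of state[12] */

	counter += chacha20_blocks_consumed(1);         /*  1 byte   -> 1 block  */
	counter += chacha20_blocks_consumed(64);        /* 64 bytes  -> 1 block  */
	counter += chacha20_blocks_consumed(65);        /* 65 bytes  -> 2 blocks */
	printf("counter advanced by %u blocks\n", counter);  /* prints 4 */
	return 0;
}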
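
The generic Poly1305 code is replaced as well: instead of the old base-2^26 limbs with the mlt()/sr()/and() helpers, poly1305_blocks_generic() now works on 32-bit limbs in the OpenSSL-derived style and propagates carries with the branch-free CONSTANT_TIME_CARRY macro. The small self-check below (userspace C, not part of the diff) confirms that CONSTANT_TIME_CARRY(a, b), where a is the wrapped 32-bit sum x + b, evaluates to the carry out of that addition, i.e. 1 exactly when a < b as unsigned values:

/*
 * Standalone check of the carry macro copied from the new generic code.
 * For a = x + b computed modulo 2^32, the macro yields 1 iff the addition
 * wrapped (a < b), using only arithmetic and logic operations.
 */
#include <stdint.h>
#include <stdio.h>

#define CONSTANT_TIME_CARRY(a, b) \
	((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))

int main(void)
{
	const uint32_t vals[] = { 0, 1, 5, 0x7fffffffu, 0x80000000u, 0xfffffffbu, 0xffffffffu };
	unsigned i, j, bad = 0;

	for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		for (j = 0; j < sizeof(vals) / sizeof(vals[0]); j++) {
			uint32_t a = vals[i] + vals[j];             /* wrapping 32-bit sum */
			uint32_t carry = CONSTANT_TIME_CARRY(a, vals[j]);
			uint32_t expected = a < vals[j];            /* wrapped iff sum < addend */

			if (carry != expected)
				bad++;
		}
	}
	printf("%s\n", bad ? "mismatch" : "CONSTANT_TIME_CARRY matches carry-out");
	return 0;
}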
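
Finally, the Poly1305 context no longer stores r, s and the precomputed r^2..r^4 powers directly; it holds an opaque state blob plus blocks/emit function pointers that poly1305_init() points at the x86_64, AVX, AVX2 or AVX-512 assembly (or at the generic C routines on other architectures), and poly1305_update() simply buffers any sub-16-byte tail and hands all whole blocks to the selected backend in one call, which is why the have_simd argument disappears from the call sites above. A minimal userspace sketch of that buffering and dispatch pattern, with a made-up counting backend standing in for the real assembly:

/*
 * Userspace sketch (not the kernel code) of the new poly1305_update()
 * buffering: whole 16-byte blocks go straight to a backend chosen at init
 * time; a short tail is stashed in ctx->data until more input or the final
 * call arrives.  count_blocks() is a hypothetical stand-in backend.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define POLY1305_BLOCK_SIZE 16

typedef void (*poly1305_blocks_f)(void *ctx, const uint8_t *inp, size_t len, uint32_t padbit);

struct poly1305_ctx_sketch {
	uint8_t opaque[24 * sizeof(uint64_t)];  /* backend-owned state */
	uint8_t data[POLY1305_BLOCK_SIZE];      /* partial-block buffer */
	size_t num;                             /* bytes currently buffered */
	poly1305_blocks_f blocks;               /* selected once, at init time */
};

/* Stand-in backend: just counts how many blocks it was asked to process. */
static void count_blocks(void *ctx, const uint8_t *inp, size_t len, uint32_t padbit)
{
	(void)inp; (void)padbit;
	*(size_t *)ctx += len / POLY1305_BLOCK_SIZE;
}

static void poly1305_update_sketch(struct poly1305_ctx_sketch *ctx, const uint8_t *inp, size_t len)
{
	size_t rem, num = ctx->num;

	if (num) {
		rem = POLY1305_BLOCK_SIZE - num;
		if (len < rem) {
			/* still not a full block: extend the buffer and stop */
			memcpy(ctx->data + num, inp, len);
			ctx->num = num + len;
			return;
		}
		/* complete the buffered block and process it with the pad bit set */
		memcpy(ctx->data + num, inp, rem);
		ctx->blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1);
		inp += rem;
		len -= rem;
	}

	/* hand all remaining whole blocks to the backend in one call */
	rem = len % POLY1305_BLOCK_SIZE;
	len -= rem;
	if (len)
		ctx->blocks(ctx->opaque, inp, len, 1);

	/* keep the tail for the next update or for finalization */
	if (rem)
		memcpy(ctx->data, inp + len, rem);
	ctx->num = rem;
}

int main(void)
{
	struct poly1305_ctx_sketch ctx = { .blocks = count_blocks };
	uint8_t msg[64] = { 0 };

	poly1305_update_sketch(&ctx, msg, 7);   /* buffered only          */
	poly1305_update_sketch(&ctx, msg, 25);  /* 7+25 bytes -> 2 blocks */
	poly1305_update_sketch(&ctx, msg, 13);  /* 13 bytes buffered      */
	printf("blocks=%zu buffered=%zu\n", *(size_t *)ctx.opaque, ctx.num);  /* 2 and 13 */
	return 0;
}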