summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Samuel Neves <sneves@dei.uc.pt> 2019-05-08 17:46:42 +0100
committer: Jason A. Donenfeld <Jason@zx2c4.com> 2019-05-29 01:23:24 +0200
commit: 31f9df3abd91b4c4037fdb64c47bca6dae9693eb (patch)
tree: a29a50f1e0d9dca651bce92953d7a54cc3826769
parent: 377c3938c67b9f6515c12678e9a0ade703acf500 (diff)
blake2s: shorten ssse3 loop
This (mostly) preserves the performance of the last commit (as measured on Haswell and *lake), but it drastically reduces code size.

Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
-rw-r--r-- src/crypto/zinc/blake2s/blake2s-x86_64.S 923
1 files changed, 66 insertions, 857 deletions
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64.S b/src/crypto/zinc/blake2s/blake2s-x86_64.S
index 9bb4c83..f1a2f6c 100644
--- a/src/crypto/zinc/blake2s/blake2s-x86_64.S
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64.S
@@ -16,10 +16,23 @@ ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
-#ifdef CONFIG_AS_AVX512
-.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
+.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
+.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
+.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
+.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
+.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
+.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
+.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
+.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
+#ifdef CONFIG_AS_AVX512
+.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
+.align 64
+SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
@@ -43,6 +56,7 @@ ENTRY(blake2s_compress_ssse3)
movdqa ROR328(%rip),%xmm13
movdqu 0x20(%rdi),%xmm14
movq %rcx,%xmm15
+ leaq SIGMA+0xa0(%rip),%r8
jmp .Lbeginofloop
.align 32
.Lbeginofloop:
@@ -52,827 +66,19 @@ ENTRY(blake2s_compress_ssse3)
movdqa IV(%rip),%xmm2
movdqa %xmm14,%xmm3
pxor IV+0x10(%rip),%xmm3
- movl 0x8(%rsi),%r8d
- movl 0x18(%rsi),%r9d
- movl (%rsi),%r10d
- movl 0x10(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0xc(%rsi),%r8d
- movl 0x1c(%rsi),%r9d
- movl 0x4(%rsi),%r10d
- movl 0x14(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x20(%rsi),%r8d
- movl 0x30(%rsi),%r9d
- movl 0x38(%rsi),%r10d
- movl 0x28(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x24(%rsi),%r8d
- movl 0x34(%rsi),%r9d
- movl 0x3c(%rsi),%r10d
- movl 0x2c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x10(%rsi),%r8d
- movl 0x34(%rsi),%r9d
- movl 0x38(%rsi),%r10d
- movl 0x24(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x20(%rsi),%r8d
- movl 0x18(%rsi),%r9d
- movl 0x28(%rsi),%r10d
- movl 0x3c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x4(%rsi),%r8d
- movl 0x2c(%rsi),%r9d
- movl 0x14(%rsi),%r10d
- movl (%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x30(%rsi),%r8d
- movl 0x1c(%rsi),%r9d
- movl 0xc(%rsi),%r10d
- movl 0x8(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x30(%rsi),%r8d
- movl 0x3c(%rsi),%r9d
- movl 0x2c(%rsi),%r10d
- movl 0x14(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl (%rsi),%r8d
- movl 0x34(%rsi),%r9d
- movl 0x20(%rsi),%r10d
- movl 0x8(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x28(%rsi),%r8d
- movl 0x1c(%rsi),%r9d
- movl 0x24(%rsi),%r10d
- movl 0xc(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x38(%rsi),%r8d
- movl 0x4(%rsi),%r9d
- movl 0x10(%rsi),%r10d
- movl 0x18(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0xc(%rsi),%r8d
- movl 0x2c(%rsi),%r9d
- movl 0x1c(%rsi),%r10d
- movl 0x34(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x4(%rsi),%r8d
- movl 0x38(%rsi),%r9d
- movl 0x24(%rsi),%r10d
- movl 0x30(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x8(%rsi),%r8d
- movl 0x10(%rsi),%r9d
- movl 0x3c(%rsi),%r10d
- movl 0x14(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x18(%rsi),%r8d
- movl (%rsi),%r9d
- movl 0x20(%rsi),%r10d
- movl 0x28(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x14(%rsi),%r8d
- movl 0x28(%rsi),%r9d
- movl 0x24(%rsi),%r10d
- movl 0x8(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x1c(%rsi),%r8d
- movl 0x3c(%rsi),%r9d
- movl (%rsi),%r10d
- movl 0x10(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x38(%rsi),%r8d
- movl 0x18(%rsi),%r9d
- movl 0xc(%rsi),%r10d
- movl 0x2c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x4(%rsi),%r8d
- movl 0x20(%rsi),%r9d
- movl 0x34(%rsi),%r10d
- movl 0x30(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x18(%rsi),%r8d
- movl 0x20(%rsi),%r9d
- movl 0x8(%rsi),%r10d
- movl (%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x28(%rsi),%r8d
- movl 0xc(%rsi),%r9d
- movl 0x30(%rsi),%r10d
- movl 0x2c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x10(%rsi),%r8d
- movl 0x3c(%rsi),%r9d
- movl 0x4(%rsi),%r10d
- movl 0x1c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x34(%rsi),%r8d
- movl 0x38(%rsi),%r9d
- movl 0x24(%rsi),%r10d
- movl 0x14(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x4(%rsi),%r8d
- movl 0x10(%rsi),%r9d
- movl 0x30(%rsi),%r10d
- movl 0x38(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x3c(%rsi),%r8d
- movl 0x28(%rsi),%r9d
- movl 0x14(%rsi),%r10d
- movl 0x34(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl (%rsi),%r8d
- movl 0x24(%rsi),%r9d
- movl 0x20(%rsi),%r10d
- movl 0x18(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x1c(%rsi),%r8d
- movl 0x8(%rsi),%r9d
- movl 0x2c(%rsi),%r10d
- movl 0xc(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x1c(%rsi),%r8d
- movl 0xc(%rsi),%r9d
- movl 0x34(%rsi),%r10d
- movl 0x30(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x38(%rsi),%r8d
- movl 0x24(%rsi),%r9d
- movl 0x2c(%rsi),%r10d
- movl 0x4(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x14(%rsi),%r8d
- movl 0x20(%rsi),%r9d
- movl 0x8(%rsi),%r10d
- movl 0x3c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl (%rsi),%r8d
- movl 0x18(%rsi),%r9d
- movl 0x28(%rsi),%r10d
- movl 0x10(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x38(%rsi),%r8d
- movl (%rsi),%r9d
- movl 0x18(%rsi),%r10d
- movl 0x2c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x24(%rsi),%r8d
- movl 0x20(%rsi),%r9d
- movl 0x3c(%rsi),%r10d
- movl 0xc(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movl 0x30(%rsi),%r8d
- movl 0x4(%rsi),%r9d
- movl 0x28(%rsi),%r10d
- movl 0x34(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movl 0x8(%rsi),%r8d
- movl 0x10(%rsi),%r9d
- movl 0x14(%rsi),%r10d
- movl 0x1c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- movl 0x20(%rsi),%r8d
- movl 0x4(%rsi),%r9d
- movl 0x28(%rsi),%r10d
- movl 0x1c(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm4
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm4
+ leaq SIGMA(%rip),%rcx
+.Lroundloop:
+ movzbl (%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0x1(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0x2(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x3(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ punpckldq %xmm5,%xmm4
+ punpckldq %xmm7,%xmm6
+ punpcklqdq %xmm6,%xmm4
paddd %xmm4,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -883,17 +89,17 @@ ENTRY(blake2s_compress_ssse3)
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
- movl 0x10(%rsi),%r8d
- movl 0x14(%rsi),%r9d
- movl 0x8(%rsi),%r10d
- movl 0x18(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm5
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm5
+ movzbl 0x4(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0x5(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x6(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0x7(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ punpckldq %xmm6,%xmm5
+ punpckldq %xmm4,%xmm7
+ punpcklqdq %xmm7,%xmm5
paddd %xmm5,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -907,17 +113,17 @@ ENTRY(blake2s_compress_ssse3)
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
- movl 0x3c(%rsi),%r8d
- movl 0xc(%rsi),%r9d
- movl 0x34(%rsi),%r10d
- movl 0x24(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm6
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm6
+ movzbl 0x8(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x9(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0xa(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0xb(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ punpckldq %xmm7,%xmm6
+ punpckldq %xmm5,%xmm4
+ punpcklqdq %xmm4,%xmm6
paddd %xmm6,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -928,17 +134,17 @@ ENTRY(blake2s_compress_ssse3)
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
- movl 0x2c(%rsi),%r8d
- movl 0x30(%rsi),%r9d
- movl (%rsi),%r10d
- movl 0x38(%rsi),%r11d
- shlq $0x20,%r8
- shlq $0x20,%r9
- orq %r10,%r8
- orq %r11,%r9
- movq %r8,%xmm7
- movq %r9,%xmm8
- punpcklqdq %xmm8,%xmm7
+ movzbl 0xc(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0xd(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0xe(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0xf(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ punpckldq %xmm4,%xmm7
+ punpckldq %xmm6,%xmm5
+ punpcklqdq %xmm5,%xmm7
paddd %xmm7,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -952,6 +158,9 @@ ENTRY(blake2s_compress_ssse3)
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
+ addq $0x10,%rcx
+ cmpq %r8,%rcx
+ jnz .Lroundloop
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
@@ -986,7 +195,7 @@ ENTRY(blake2s_compress_avx512)
vmovdqu (%rsi),%ymm6
vmovdqu 0x20(%rsi),%ymm7
addq $0x40,%rsi
- leaq SIGMA(%rip),%rax
+ leaq SIGMA2(%rip),%rax
movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
addq $0x40,%rax