RTG-1339 Support post-quantum hybrid key exchange

Func spec: https://wiki.cfops.it/x/ZcBKHw
Bas Westerbaan
2022-08-24 14:33:10 +02:00
committed by Devin Carr
parent 3e0ff3a771
commit 11cbff4ff7
171 changed files with 15270 additions and 196 deletions

View File

@@ -0,0 +1,149 @@
// Package keccakf1600 provides two- and four-way parallel Keccak-f[1600] permutations.
//
// Keccak-f[1600] is the permutation underlying several algorithms such as
// Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
// useful in some scenarios, such as in hash-based signatures.
//
// # Limitations
//
// Note that not all architectures support SIMD instructions. This package
// uses AVX2 instructions that are available on some AMD64 systems
// and NEON instructions that are available on some ARM64 systems.
//
// For systems without this support, the package still provides the
// expected functionality by means of a generic, slower implementation.
// The recommendation is to check IsEnabledX4() and IsEnabledX2() beforehand
// to determine whether the current system supports the SIMD implementation.
package keccakf1600
import (
"unsafe"
"github.com/cloudflare/circl/internal/sha3"
"golang.org/x/sys/cpu"
)
// StateX4 contains state for the four-way permutation including the four
// interleaved [25]uint64 buffers. Call Initialize() before use; it initializes
// the state and returns a slice into the interleaved buffer.
type StateX4 struct {
// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
// aligned on 32 bytes for best performance. Thus we leave some headroom
// to be able to move the start of the state.
// 4 x 25 uint64s for the interleaved states and three uint64s headroom
// to fix alignment.
a [103]uint64
// Offset into a that is 32 byte aligned.
offset int
}
// StateX2 contains state for the two-way permutation including the two
// interleaved [25]uint64 buffers. Call Initialize() before use; it initializes
// the state and returns a slice into the interleaved buffer.
type StateX2 struct {
// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
// aligned on 32 bytes for best performance. Thus we leave some headroom
// to be able to move the start of the state.
// 2 x 25 uint64s for the interleaved states and three uint64s headroom
// to fix alignment.
a [53]uint64
// Offset into a that is 32 byte aligned.
offset int
}
// IsEnabledX4 returns true if the architecture supports a four-way SIMD
// implementation provided in this package.
func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
// IsEnabledX2 returns true if the architecture supports a two-way SIMD
// implementation provided in this package.
func IsEnabledX2() bool {
// After Go 1.16 the flag cpu.ARM64.HasSHA3 is no longer exposed.
return false
}
// Initialize initializes the state and returns the buffer on which the four
// permutations will act: a uint64 slice of length 100. The first permutation
// will act on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
func (s *StateX4) Initialize() []uint64 {
rp := unsafe.Pointer(&s.a[0])
// uint64s are always aligned by a multiple of 8. Compute the remainder
// of the address modulo 32 divided by 8.
rem := (int(uintptr(rp)&31) >> 3)
if rem != 0 {
s.offset = 4 - rem
}
// The slice we return will be aligned on a 32-byte boundary.
return s.a[s.offset : s.offset+100]
}
// Initialize initializes the state and returns the buffer on which the two
// permutations will act: a uint64 slice of length 50. The first permutation
// will act on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
func (s *StateX2) Initialize() []uint64 {
rp := unsafe.Pointer(&s.a[0])
// uint64s are always aligned by a multiple of 8. Compute the remainder
// of the address modulo 32 divided by 8.
rem := (int(uintptr(rp)&31) >> 3)
if rem != 0 {
s.offset = 4 - rem
}
// The slice we return will be aligned on a 32-byte boundary.
return s.a[s.offset : s.offset+50]
}
// Permute performs the four parallel Keccak-f[1600] permutations on the
// interleaved slice returned from Initialize().
func (s *StateX4) Permute() {
if IsEnabledX4() {
permuteSIMDx4(s.a[s.offset:])
} else {
permuteScalarX4(s.a[s.offset:]) // A slower generic implementation.
}
}
// Permute performs the two parallel Keccak-f[1600] permutations on the
// interleaved slice returned from Initialize().
func (s *StateX2) Permute() {
if IsEnabledX2() {
permuteSIMDx2(s.a[s.offset:])
} else {
permuteScalarX2(s.a[s.offset:]) // A slower generic implementation.
}
}
func permuteScalarX4(a []uint64) {
var buf [25]uint64
for i := 0; i < 4; i++ {
for j := 0; j < 25; j++ {
buf[j] = a[4*j+i]
}
sha3.KeccakF1600(&buf)
for j := 0; j < 25; j++ {
a[4*j+i] = buf[j]
}
}
}
func permuteScalarX2(a []uint64) {
var buf [25]uint64
for i := 0; i < 2; i++ {
for j := 0; j < 25; j++ {
buf[j] = a[2*j+i]
}
sha3.KeccakF1600(&buf)
for j := 0; j < 25; j++ {
a[2*j+i] = buf[j]
}
}
}
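
For orientation (not part of the diff): a minimal usage sketch of the four-way API defined above. It uses only the exported identifiers from this file (StateX4, Initialize, Permute, IsEnabledX4); the import path is assumed to be CIRCL's usual simd/keccakf1600 location.

package main

import (
	"fmt"

	"github.com/cloudflare/circl/simd/keccakf1600"
)

func main() {
	// The scalar fallback is used transparently when SIMD is unavailable;
	// IsEnabledX4 only reports whether the fast path will be taken.
	fmt.Println("four-way SIMD:", keccakf1600.IsEnabledX4())

	var st keccakf1600.StateX4
	a := st.Initialize() // interleaved buffer of length 100

	// Word j of permutation i lives at a[4*j+i]; seed word 0 of each lane.
	for lane := 0; lane < 4; lane++ {
		a[lane] = uint64(lane + 1)
	}

	st.Permute() // four Keccak-f[1600] permutations in one call

	for lane := 0; lane < 4; lane++ {
		fmt.Printf("lane %d, word 0: %016x\n", lane, a[lane])
	}
}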

View File

@@ -0,0 +1,13 @@
//go:build arm64 && go1.16
// +build arm64,go1.16

package keccakf1600

import "github.com/cloudflare/circl/internal/sha3"

func permuteSIMDx2(state []uint64) { f1600x2ARM(&state[0], &sha3.RC) }

func permuteSIMDx4(state []uint64) { permuteScalarX4(state) }

//go:noescape
func f1600x2ARM(state *uint64, rc *[24]uint64)

View File

@@ -0,0 +1,130 @@
// +build arm64,go1.16

// Taken from https://github.com/bwesterb/armed-keccak

#include "textflag.h"
// func f1600x2ARM(state *uint64, rc *[24]uint64)
TEXT ·f1600x2ARM(SB), NOSPLIT, $0-16
MOVD state+0(FP), R0
MOVD rc+8(FP), R1
MOVD R0, R2
MOVD $24, R3
VLD1.P 64(R0), [ V0.B16, V1.B16, V2.B16, V3.B16]
VLD1.P 64(R0), [ V4.B16, V5.B16, V6.B16, V7.B16]
VLD1.P 64(R0), [ V8.B16, V9.B16, V10.B16, V11.B16]
VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
VLD1.P (R0), [V24.B16]
loop:
// Execute theta but without xorring into the state yet.
VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
VEOR3 V14.B16, V9.B16, V4.B16, V29.B16
VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
VEOR3 V24.B16, V19.B16, V29.B16, V29.B16
// Xor parities from step theta into the state at the same time as
// executing rho and pi.
VRAX1 V26.D2, V29.D2, V30.D2
VRAX1 V29.D2, V27.D2, V29.D2
VRAX1 V27.D2, V25.D2, V27.D2
VRAX1 V25.D2, V28.D2, V25.D2
VRAX1 V28.D2, V26.D2, V28.D2
VEOR V30.B16, V0.B16, V0.B16
VMOV V1.B16, V31.B16
VXAR $20, V27.D2, V6.D2, V1.D2
VXAR $44, V25.D2, V9.D2, V6.D2
VXAR $3 , V28.D2, V22.D2, V9.D2
VXAR $25, V25.D2, V14.D2, V22.D2
VXAR $46, V30.D2, V20.D2, V14.D2
VXAR $2 , V28.D2, V2.D2, V20.D2
VXAR $21, V28.D2, V12.D2, V2.D2
VXAR $39, V29.D2, V13.D2, V12.D2
VXAR $56, V25.D2, V19.D2, V13.D2
VXAR $8 , V29.D2, V23.D2, V19.D2
VXAR $23, V30.D2, V15.D2, V23.D2
VXAR $37, V25.D2, V4.D2, V15.D2
VXAR $50, V25.D2, V24.D2, V4.D2
VXAR $62, V27.D2, V21.D2, V24.D2
VXAR $9 , V29.D2, V8.D2, V21.D2
VXAR $19, V27.D2, V16.D2, V8.D2
VXAR $28, V30.D2, V5.D2, V16.D2
VXAR $36, V29.D2, V3.D2, V5.D2
VXAR $43, V29.D2, V18.D2, V3.D2
VXAR $49, V28.D2, V17.D2, V18.D2
VXAR $54, V27.D2, V11.D2, V17.D2
VXAR $58, V28.D2, V7.D2, V11.D2
VXAR $61, V30.D2, V10.D2, V7.D2
VXAR $63, V27.D2, V31.D2, V10.D2
// Chi
VBCAX V1.B16, V2.B16, V0.B16, V25.B16
VBCAX V2.B16, V3.B16, V1.B16, V26.B16
VBCAX V3.B16, V4.B16, V2.B16, V2.B16
VBCAX V4.B16, V0.B16, V3.B16, V3.B16
VBCAX V0.B16, V1.B16, V4.B16, V4.B16
VMOV V25.B16, V0.B16
VMOV V26.B16, V1.B16
VBCAX V6.B16, V7.B16, V5.B16, V25.B16
VBCAX V7.B16, V8.B16, V6.B16, V26.B16
VBCAX V8.B16, V9.B16, V7.B16, V7.B16
VBCAX V9.B16, V5.B16, V8.B16, V8.B16
VBCAX V5.B16, V6.B16, V9.B16, V9.B16
VMOV V25.B16, V5.B16
VMOV V26.B16, V6.B16
VBCAX V11.B16, V12.B16, V10.B16, V25.B16
VBCAX V12.B16, V13.B16, V11.B16, V26.B16
VBCAX V13.B16, V14.B16, V12.B16, V12.B16
VBCAX V14.B16, V10.B16, V13.B16, V13.B16
VBCAX V10.B16, V11.B16, V14.B16, V14.B16
VMOV V25.B16, V10.B16
VMOV V26.B16, V11.B16
VBCAX V16.B16, V17.B16, V15.B16, V25.B16
VBCAX V17.B16, V18.B16, V16.B16, V26.B16
VBCAX V18.B16, V19.B16, V17.B16, V17.B16
VBCAX V19.B16, V15.B16, V18.B16, V18.B16
VBCAX V15.B16, V16.B16, V19.B16, V19.B16
VMOV V25.B16, V15.B16
VMOV V26.B16, V16.B16
VBCAX V21.B16, V22.B16, V20.B16, V25.B16
VBCAX V22.B16, V23.B16, V21.B16, V26.B16
VBCAX V23.B16, V24.B16, V22.B16, V22.B16
VBCAX V24.B16, V20.B16, V23.B16, V23.B16
VBCAX V20.B16, V21.B16, V24.B16, V24.B16
VMOV V25.B16, V20.B16
VMOV V26.B16, V21.B16
// Iota
VLD1R.P 8(R1), [V25.D2]
VEOR V25.B16, V0.B16, V0.B16
SUBS $1, R3, R3
CBNZ R3, loop
MOVD R2, R0
VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
VST1.P [V24.B16], (R0)
RET
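
For readers following the comments in the assembly above: the VEOR3 block computes the five column parities of step theta, the VRAX1 block combines them into the D values (left-neighbour parity XOR right-neighbour parity rotated by one), and VXAR folds them into the state while performing rho and pi. A plain-Go reference sketch of that theta computation, written against a single non-interleaved state for clarity (the vector code does the same for two states at once, one per 64-bit lane of each register); the package name and function here are hypothetical:

package keccakref

import "math/bits"

// thetaRef applies the theta step of Keccak-f[1600] to one state, with lane
// (x, y) stored at a[x+5*y]. The assembly computes the same c values with
// VEOR3 (three-way XOR) and the same d values with VRAX1 (XOR with a
// rotate-by-one), two states at a time.
func thetaRef(a *[25]uint64) {
	var c [5]uint64
	for x := 0; x < 5; x++ {
		c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]
	}
	for x := 0; x < 5; x++ {
		d := c[(x+4)%5] ^ bits.RotateLeft64(c[(x+1)%5], 1)
		for y := 0; y < 5; y++ {
			a[x+5*y] ^= d
		}
	}
}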

View File

@@ -0,0 +1,7 @@
package keccakf1600

import "github.com/cloudflare/circl/internal/sha3"

func permuteSIMDx4(state []uint64) { f1600x4AVX2(&state[0], &sha3.RC) }

func permuteSIMDx2(state []uint64) { permuteScalarX2(state) }

View File

@@ -0,0 +1,894 @@
// Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT.

// +build amd64

#include "textflag.h"
// func f1600x4AVX2(state *uint64, rc *[24]uint64)
// Requires: AVX, AVX2
TEXT ·f1600x4AVX2(SB), NOSPLIT, $0-16
MOVQ state+0(FP), AX
MOVQ rc+8(FP), CX
MOVQ $0x0000000000000006, DX
loop:
VMOVDQA (AX), Y0
VMOVDQA 32(AX), Y1
VMOVDQA 64(AX), Y2
VMOVDQA 96(AX), Y3
VMOVDQA 128(AX), Y4
VPXOR 160(AX), Y0, Y0
VPXOR 192(AX), Y1, Y1
VPXOR 224(AX), Y2, Y2
VPXOR 256(AX), Y3, Y3
VPXOR 288(AX), Y4, Y4
VPXOR 320(AX), Y0, Y0
VPXOR 352(AX), Y1, Y1
VPXOR 384(AX), Y2, Y2
VPXOR 416(AX), Y3, Y3
VPXOR 448(AX), Y4, Y4
VPXOR 480(AX), Y0, Y0
VPXOR 512(AX), Y1, Y1
VPXOR 544(AX), Y2, Y2
VPXOR 576(AX), Y3, Y3
VPXOR 608(AX), Y4, Y4
VPXOR 640(AX), Y0, Y0
VPXOR 672(AX), Y1, Y1
VPXOR 704(AX), Y2, Y2
VPXOR 736(AX), Y3, Y3
VPXOR 768(AX), Y4, Y4
VPSLLQ $0x01, Y1, Y5
VPSLLQ $0x01, Y2, Y6
VPSLLQ $0x01, Y3, Y7
VPSLLQ $0x01, Y4, Y8
VPSLLQ $0x01, Y0, Y9
VPSRLQ $0x3f, Y1, Y10
VPSRLQ $0x3f, Y2, Y11
VPSRLQ $0x3f, Y3, Y12
VPSRLQ $0x3f, Y4, Y13
VPSRLQ $0x3f, Y0, Y14
VPOR Y5, Y10, Y10
VPOR Y6, Y11, Y11
VPOR Y7, Y12, Y12
VPOR Y8, Y13, Y13
VPOR Y9, Y14, Y14
VPXOR Y10, Y4, Y10
VPXOR Y11, Y0, Y11
VPXOR Y12, Y1, Y12
VPXOR Y13, Y2, Y13
VPXOR Y14, Y3, Y14
VPXOR (AX), Y10, Y0
VPXOR 192(AX), Y11, Y1
VPXOR 384(AX), Y12, Y2
VPXOR 576(AX), Y13, Y3
VPXOR 768(AX), Y14, Y4
VPSLLQ $0x2c, Y1, Y6
VPSLLQ $0x2b, Y2, Y7
VPSLLQ $0x15, Y3, Y8
VPSLLQ $0x0e, Y4, Y9
VPSRLQ $0x14, Y1, Y1
VPSRLQ $0x15, Y2, Y2
VPSRLQ $0x2b, Y3, Y3
VPSRLQ $0x32, Y4, Y4
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VPBROADCASTQ (CX), Y0
VPXOR Y0, Y5, Y5
VMOVDQA Y5, (AX)
VMOVDQA Y6, 192(AX)
VMOVDQA Y7, 384(AX)
VMOVDQA Y8, 576(AX)
VMOVDQA Y9, 768(AX)
VPXOR 96(AX), Y13, Y0
VPXOR 288(AX), Y14, Y1
VPXOR 320(AX), Y10, Y2
VPXOR 512(AX), Y11, Y3
VPXOR 704(AX), Y12, Y4
VPSLLQ $0x1c, Y0, Y5
VPSLLQ $0x14, Y1, Y6
VPSLLQ $0x03, Y2, Y7
VPSLLQ $0x2d, Y3, Y8
VPSLLQ $0x3d, Y4, Y9
VPSRLQ $0x24, Y0, Y0
VPSRLQ $0x2c, Y1, Y1
VPSRLQ $0x3d, Y2, Y2
VPSRLQ $0x13, Y3, Y3
VPSRLQ $0x03, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 320(AX)
VMOVDQA Y6, 512(AX)
VMOVDQA Y7, 704(AX)
VMOVDQA Y8, 96(AX)
VMOVDQA Y9, 288(AX)
VPXOR 32(AX), Y11, Y0
VPXOR 224(AX), Y12, Y1
VPXOR 416(AX), Y13, Y2
VPXOR 608(AX), Y14, Y3
VPXOR 640(AX), Y10, Y4
VPSLLQ $0x01, Y0, Y5
VPSLLQ $0x06, Y1, Y6
VPSLLQ $0x19, Y2, Y7
VPSLLQ $0x08, Y3, Y8
VPSLLQ $0x12, Y4, Y9
VPSRLQ $0x3f, Y0, Y0
VPSRLQ $0x3a, Y1, Y1
VPSRLQ $0x27, Y2, Y2
VPSRLQ $0x38, Y3, Y3
VPSRLQ $0x2e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 640(AX)
VMOVDQA Y6, 32(AX)
VMOVDQA Y7, 224(AX)
VMOVDQA Y8, 416(AX)
VMOVDQA Y9, 608(AX)
VPXOR 128(AX), Y14, Y0
VPXOR 160(AX), Y10, Y1
VPXOR 352(AX), Y11, Y2
VPXOR 544(AX), Y12, Y3
VPXOR 736(AX), Y13, Y4
VPSLLQ $0x1b, Y0, Y5
VPSLLQ $0x24, Y1, Y6
VPSLLQ $0x0a, Y2, Y7
VPSLLQ $0x0f, Y3, Y8
VPSLLQ $0x38, Y4, Y9
VPSRLQ $0x25, Y0, Y0
VPSRLQ $0x1c, Y1, Y1
VPSRLQ $0x36, Y2, Y2
VPSRLQ $0x31, Y3, Y3
VPSRLQ $0x08, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 160(AX)
VMOVDQA Y6, 352(AX)
VMOVDQA Y7, 544(AX)
VMOVDQA Y8, 736(AX)
VMOVDQA Y9, 128(AX)
VPXOR 64(AX), Y12, Y0
VPXOR 256(AX), Y13, Y1
VPXOR 448(AX), Y14, Y2
VPXOR 480(AX), Y10, Y3
VPXOR 672(AX), Y11, Y4
VPSLLQ $0x3e, Y0, Y5
VPSLLQ $0x37, Y1, Y6
VPSLLQ $0x27, Y2, Y7
VPSLLQ $0x29, Y3, Y8
VPSLLQ $0x02, Y4, Y9
VPSRLQ $0x02, Y0, Y0
VPSRLQ $0x09, Y1, Y1
VPSRLQ $0x19, Y2, Y2
VPSRLQ $0x17, Y3, Y3
VPSRLQ $0x3e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 480(AX)
VMOVDQA Y6, 672(AX)
VMOVDQA Y7, 64(AX)
VMOVDQA Y8, 256(AX)
VMOVDQA Y9, 448(AX)
VMOVDQA (AX), Y0
VMOVDQA 32(AX), Y1
VMOVDQA 64(AX), Y2
VMOVDQA 96(AX), Y3
VMOVDQA 128(AX), Y4
VPXOR 160(AX), Y0, Y0
VPXOR 192(AX), Y1, Y1
VPXOR 224(AX), Y2, Y2
VPXOR 256(AX), Y3, Y3
VPXOR 288(AX), Y4, Y4
VPXOR 320(AX), Y0, Y0
VPXOR 352(AX), Y1, Y1
VPXOR 384(AX), Y2, Y2
VPXOR 416(AX), Y3, Y3
VPXOR 448(AX), Y4, Y4
VPXOR 480(AX), Y0, Y0
VPXOR 512(AX), Y1, Y1
VPXOR 544(AX), Y2, Y2
VPXOR 576(AX), Y3, Y3
VPXOR 608(AX), Y4, Y4
VPXOR 640(AX), Y0, Y0
VPXOR 672(AX), Y1, Y1
VPXOR 704(AX), Y2, Y2
VPXOR 736(AX), Y3, Y3
VPXOR 768(AX), Y4, Y4
VPSLLQ $0x01, Y1, Y5
VPSLLQ $0x01, Y2, Y6
VPSLLQ $0x01, Y3, Y7
VPSLLQ $0x01, Y4, Y8
VPSLLQ $0x01, Y0, Y9
VPSRLQ $0x3f, Y1, Y10
VPSRLQ $0x3f, Y2, Y11
VPSRLQ $0x3f, Y3, Y12
VPSRLQ $0x3f, Y4, Y13
VPSRLQ $0x3f, Y0, Y14
VPOR Y5, Y10, Y10
VPOR Y6, Y11, Y11
VPOR Y7, Y12, Y12
VPOR Y8, Y13, Y13
VPOR Y9, Y14, Y14
VPXOR Y10, Y4, Y10
VPXOR Y11, Y0, Y11
VPXOR Y12, Y1, Y12
VPXOR Y13, Y2, Y13
VPXOR Y14, Y3, Y14
VPXOR (AX), Y10, Y0
VPXOR 512(AX), Y11, Y1
VPXOR 224(AX), Y12, Y2
VPXOR 736(AX), Y13, Y3
VPXOR 448(AX), Y14, Y4
VPSLLQ $0x2c, Y1, Y6
VPSLLQ $0x2b, Y2, Y7
VPSLLQ $0x15, Y3, Y8
VPSLLQ $0x0e, Y4, Y9
VPSRLQ $0x14, Y1, Y1
VPSRLQ $0x15, Y2, Y2
VPSRLQ $0x2b, Y3, Y3
VPSRLQ $0x32, Y4, Y4
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VPBROADCASTQ 8(CX), Y0
VPXOR Y0, Y5, Y5
VMOVDQA Y5, (AX)
VMOVDQA Y6, 512(AX)
VMOVDQA Y7, 224(AX)
VMOVDQA Y8, 736(AX)
VMOVDQA Y9, 448(AX)
VPXOR 576(AX), Y13, Y0
VPXOR 288(AX), Y14, Y1
VPXOR 640(AX), Y10, Y2
VPXOR 352(AX), Y11, Y3
VPXOR 64(AX), Y12, Y4
VPSLLQ $0x1c, Y0, Y5
VPSLLQ $0x14, Y1, Y6
VPSLLQ $0x03, Y2, Y7
VPSLLQ $0x2d, Y3, Y8
VPSLLQ $0x3d, Y4, Y9
VPSRLQ $0x24, Y0, Y0
VPSRLQ $0x2c, Y1, Y1
VPSRLQ $0x3d, Y2, Y2
VPSRLQ $0x13, Y3, Y3
VPSRLQ $0x03, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 640(AX)
VMOVDQA Y6, 352(AX)
VMOVDQA Y7, 64(AX)
VMOVDQA Y8, 576(AX)
VMOVDQA Y9, 288(AX)
VPXOR 192(AX), Y11, Y0
VPXOR 704(AX), Y12, Y1
VPXOR 416(AX), Y13, Y2
VPXOR 128(AX), Y14, Y3
VPXOR 480(AX), Y10, Y4
VPSLLQ $0x01, Y0, Y5
VPSLLQ $0x06, Y1, Y6
VPSLLQ $0x19, Y2, Y7
VPSLLQ $0x08, Y3, Y8
VPSLLQ $0x12, Y4, Y9
VPSRLQ $0x3f, Y0, Y0
VPSRLQ $0x3a, Y1, Y1
VPSRLQ $0x27, Y2, Y2
VPSRLQ $0x38, Y3, Y3
VPSRLQ $0x2e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 480(AX)
VMOVDQA Y6, 192(AX)
VMOVDQA Y7, 704(AX)
VMOVDQA Y8, 416(AX)
VMOVDQA Y9, 128(AX)
VPXOR 768(AX), Y14, Y0
VPXOR 320(AX), Y10, Y1
VPXOR 32(AX), Y11, Y2
VPXOR 544(AX), Y12, Y3
VPXOR 256(AX), Y13, Y4
VPSLLQ $0x1b, Y0, Y5
VPSLLQ $0x24, Y1, Y6
VPSLLQ $0x0a, Y2, Y7
VPSLLQ $0x0f, Y3, Y8
VPSLLQ $0x38, Y4, Y9
VPSRLQ $0x25, Y0, Y0
VPSRLQ $0x1c, Y1, Y1
VPSRLQ $0x36, Y2, Y2
VPSRLQ $0x31, Y3, Y3
VPSRLQ $0x08, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 320(AX)
VMOVDQA Y6, 32(AX)
VMOVDQA Y7, 544(AX)
VMOVDQA Y8, 256(AX)
VMOVDQA Y9, 768(AX)
VPXOR 384(AX), Y12, Y0
VPXOR 96(AX), Y13, Y1
VPXOR 608(AX), Y14, Y2
VPXOR 160(AX), Y10, Y3
VPXOR 672(AX), Y11, Y4
VPSLLQ $0x3e, Y0, Y5
VPSLLQ $0x37, Y1, Y6
VPSLLQ $0x27, Y2, Y7
VPSLLQ $0x29, Y3, Y8
VPSLLQ $0x02, Y4, Y9
VPSRLQ $0x02, Y0, Y0
VPSRLQ $0x09, Y1, Y1
VPSRLQ $0x19, Y2, Y2
VPSRLQ $0x17, Y3, Y3
VPSRLQ $0x3e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 160(AX)
VMOVDQA Y6, 672(AX)
VMOVDQA Y7, 384(AX)
VMOVDQA Y8, 96(AX)
VMOVDQA Y9, 608(AX)
VMOVDQA (AX), Y0
VMOVDQA 32(AX), Y1
VMOVDQA 64(AX), Y2
VMOVDQA 96(AX), Y3
VMOVDQA 128(AX), Y4
VPXOR 160(AX), Y0, Y0
VPXOR 192(AX), Y1, Y1
VPXOR 224(AX), Y2, Y2
VPXOR 256(AX), Y3, Y3
VPXOR 288(AX), Y4, Y4
VPXOR 320(AX), Y0, Y0
VPXOR 352(AX), Y1, Y1
VPXOR 384(AX), Y2, Y2
VPXOR 416(AX), Y3, Y3
VPXOR 448(AX), Y4, Y4
VPXOR 480(AX), Y0, Y0
VPXOR 512(AX), Y1, Y1
VPXOR 544(AX), Y2, Y2
VPXOR 576(AX), Y3, Y3
VPXOR 608(AX), Y4, Y4
VPXOR 640(AX), Y0, Y0
VPXOR 672(AX), Y1, Y1
VPXOR 704(AX), Y2, Y2
VPXOR 736(AX), Y3, Y3
VPXOR 768(AX), Y4, Y4
VPSLLQ $0x01, Y1, Y5
VPSLLQ $0x01, Y2, Y6
VPSLLQ $0x01, Y3, Y7
VPSLLQ $0x01, Y4, Y8
VPSLLQ $0x01, Y0, Y9
VPSRLQ $0x3f, Y1, Y10
VPSRLQ $0x3f, Y2, Y11
VPSRLQ $0x3f, Y3, Y12
VPSRLQ $0x3f, Y4, Y13
VPSRLQ $0x3f, Y0, Y14
VPOR Y5, Y10, Y10
VPOR Y6, Y11, Y11
VPOR Y7, Y12, Y12
VPOR Y8, Y13, Y13
VPOR Y9, Y14, Y14
VPXOR Y10, Y4, Y10
VPXOR Y11, Y0, Y11
VPXOR Y12, Y1, Y12
VPXOR Y13, Y2, Y13
VPXOR Y14, Y3, Y14
VPXOR (AX), Y10, Y0
VPXOR 352(AX), Y11, Y1
VPXOR 704(AX), Y12, Y2
VPXOR 256(AX), Y13, Y3
VPXOR 608(AX), Y14, Y4
VPSLLQ $0x2c, Y1, Y6
VPSLLQ $0x2b, Y2, Y7
VPSLLQ $0x15, Y3, Y8
VPSLLQ $0x0e, Y4, Y9
VPSRLQ $0x14, Y1, Y1
VPSRLQ $0x15, Y2, Y2
VPSRLQ $0x2b, Y3, Y3
VPSRLQ $0x32, Y4, Y4
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VPBROADCASTQ 16(CX), Y0
VPXOR Y0, Y5, Y5
VMOVDQA Y5, (AX)
VMOVDQA Y6, 352(AX)
VMOVDQA Y7, 704(AX)
VMOVDQA Y8, 256(AX)
VMOVDQA Y9, 608(AX)
VPXOR 736(AX), Y13, Y0
VPXOR 288(AX), Y14, Y1
VPXOR 480(AX), Y10, Y2
VPXOR 32(AX), Y11, Y3
VPXOR 384(AX), Y12, Y4
VPSLLQ $0x1c, Y0, Y5
VPSLLQ $0x14, Y1, Y6
VPSLLQ $0x03, Y2, Y7
VPSLLQ $0x2d, Y3, Y8
VPSLLQ $0x3d, Y4, Y9
VPSRLQ $0x24, Y0, Y0
VPSRLQ $0x2c, Y1, Y1
VPSRLQ $0x3d, Y2, Y2
VPSRLQ $0x13, Y3, Y3
VPSRLQ $0x03, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 480(AX)
VMOVDQA Y6, 32(AX)
VMOVDQA Y7, 384(AX)
VMOVDQA Y8, 736(AX)
VMOVDQA Y9, 288(AX)
VPXOR 512(AX), Y11, Y0
VPXOR 64(AX), Y12, Y1
VPXOR 416(AX), Y13, Y2
VPXOR 768(AX), Y14, Y3
VPXOR 160(AX), Y10, Y4
VPSLLQ $0x01, Y0, Y5
VPSLLQ $0x06, Y1, Y6
VPSLLQ $0x19, Y2, Y7
VPSLLQ $0x08, Y3, Y8
VPSLLQ $0x12, Y4, Y9
VPSRLQ $0x3f, Y0, Y0
VPSRLQ $0x3a, Y1, Y1
VPSRLQ $0x27, Y2, Y2
VPSRLQ $0x38, Y3, Y3
VPSRLQ $0x2e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 160(AX)
VMOVDQA Y6, 512(AX)
VMOVDQA Y7, 64(AX)
VMOVDQA Y8, 416(AX)
VMOVDQA Y9, 768(AX)
VPXOR 448(AX), Y14, Y0
VPXOR 640(AX), Y10, Y1
VPXOR 192(AX), Y11, Y2
VPXOR 544(AX), Y12, Y3
VPXOR 96(AX), Y13, Y4
VPSLLQ $0x1b, Y0, Y5
VPSLLQ $0x24, Y1, Y6
VPSLLQ $0x0a, Y2, Y7
VPSLLQ $0x0f, Y3, Y8
VPSLLQ $0x38, Y4, Y9
VPSRLQ $0x25, Y0, Y0
VPSRLQ $0x1c, Y1, Y1
VPSRLQ $0x36, Y2, Y2
VPSRLQ $0x31, Y3, Y3
VPSRLQ $0x08, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 640(AX)
VMOVDQA Y6, 192(AX)
VMOVDQA Y7, 544(AX)
VMOVDQA Y8, 96(AX)
VMOVDQA Y9, 448(AX)
VPXOR 224(AX), Y12, Y0
VPXOR 576(AX), Y13, Y1
VPXOR 128(AX), Y14, Y2
VPXOR 320(AX), Y10, Y3
VPXOR 672(AX), Y11, Y4
VPSLLQ $0x3e, Y0, Y5
VPSLLQ $0x37, Y1, Y6
VPSLLQ $0x27, Y2, Y7
VPSLLQ $0x29, Y3, Y8
VPSLLQ $0x02, Y4, Y9
VPSRLQ $0x02, Y0, Y0
VPSRLQ $0x09, Y1, Y1
VPSRLQ $0x19, Y2, Y2
VPSRLQ $0x17, Y3, Y3
VPSRLQ $0x3e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 320(AX)
VMOVDQA Y6, 672(AX)
VMOVDQA Y7, 224(AX)
VMOVDQA Y8, 576(AX)
VMOVDQA Y9, 128(AX)
VMOVDQA (AX), Y0
VMOVDQA 32(AX), Y1
VMOVDQA 64(AX), Y2
VMOVDQA 96(AX), Y3
VMOVDQA 128(AX), Y4
VPXOR 160(AX), Y0, Y0
VPXOR 192(AX), Y1, Y1
VPXOR 224(AX), Y2, Y2
VPXOR 256(AX), Y3, Y3
VPXOR 288(AX), Y4, Y4
VPXOR 320(AX), Y0, Y0
VPXOR 352(AX), Y1, Y1
VPXOR 384(AX), Y2, Y2
VPXOR 416(AX), Y3, Y3
VPXOR 448(AX), Y4, Y4
VPXOR 480(AX), Y0, Y0
VPXOR 512(AX), Y1, Y1
VPXOR 544(AX), Y2, Y2
VPXOR 576(AX), Y3, Y3
VPXOR 608(AX), Y4, Y4
VPXOR 640(AX), Y0, Y0
VPXOR 672(AX), Y1, Y1
VPXOR 704(AX), Y2, Y2
VPXOR 736(AX), Y3, Y3
VPXOR 768(AX), Y4, Y4
VPSLLQ $0x01, Y1, Y5
VPSLLQ $0x01, Y2, Y6
VPSLLQ $0x01, Y3, Y7
VPSLLQ $0x01, Y4, Y8
VPSLLQ $0x01, Y0, Y9
VPSRLQ $0x3f, Y1, Y10
VPSRLQ $0x3f, Y2, Y11
VPSRLQ $0x3f, Y3, Y12
VPSRLQ $0x3f, Y4, Y13
VPSRLQ $0x3f, Y0, Y14
VPOR Y5, Y10, Y10
VPOR Y6, Y11, Y11
VPOR Y7, Y12, Y12
VPOR Y8, Y13, Y13
VPOR Y9, Y14, Y14
VPXOR Y10, Y4, Y10
VPXOR Y11, Y0, Y11
VPXOR Y12, Y1, Y12
VPXOR Y13, Y2, Y13
VPXOR Y14, Y3, Y14
VPXOR (AX), Y10, Y0
VPXOR 32(AX), Y11, Y1
VPXOR 64(AX), Y12, Y2
VPXOR 96(AX), Y13, Y3
VPXOR 128(AX), Y14, Y4
VPSLLQ $0x2c, Y1, Y6
VPSLLQ $0x2b, Y2, Y7
VPSLLQ $0x15, Y3, Y8
VPSLLQ $0x0e, Y4, Y9
VPSRLQ $0x14, Y1, Y1
VPSRLQ $0x15, Y2, Y2
VPSRLQ $0x2b, Y3, Y3
VPSRLQ $0x32, Y4, Y4
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VPBROADCASTQ 24(CX), Y0
VPXOR Y0, Y5, Y5
VMOVDQA Y5, (AX)
VMOVDQA Y6, 32(AX)
VMOVDQA Y7, 64(AX)
VMOVDQA Y8, 96(AX)
VMOVDQA Y9, 128(AX)
VPXOR 256(AX), Y13, Y0
VPXOR 288(AX), Y14, Y1
VPXOR 160(AX), Y10, Y2
VPXOR 192(AX), Y11, Y3
VPXOR 224(AX), Y12, Y4
VPSLLQ $0x1c, Y0, Y5
VPSLLQ $0x14, Y1, Y6
VPSLLQ $0x03, Y2, Y7
VPSLLQ $0x2d, Y3, Y8
VPSLLQ $0x3d, Y4, Y9
VPSRLQ $0x24, Y0, Y0
VPSRLQ $0x2c, Y1, Y1
VPSRLQ $0x3d, Y2, Y2
VPSRLQ $0x13, Y3, Y3
VPSRLQ $0x03, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 160(AX)
VMOVDQA Y6, 192(AX)
VMOVDQA Y7, 224(AX)
VMOVDQA Y8, 256(AX)
VMOVDQA Y9, 288(AX)
VPXOR 352(AX), Y11, Y0
VPXOR 384(AX), Y12, Y1
VPXOR 416(AX), Y13, Y2
VPXOR 448(AX), Y14, Y3
VPXOR 320(AX), Y10, Y4
VPSLLQ $0x01, Y0, Y5
VPSLLQ $0x06, Y1, Y6
VPSLLQ $0x19, Y2, Y7
VPSLLQ $0x08, Y3, Y8
VPSLLQ $0x12, Y4, Y9
VPSRLQ $0x3f, Y0, Y0
VPSRLQ $0x3a, Y1, Y1
VPSRLQ $0x27, Y2, Y2
VPSRLQ $0x38, Y3, Y3
VPSRLQ $0x2e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 320(AX)
VMOVDQA Y6, 352(AX)
VMOVDQA Y7, 384(AX)
VMOVDQA Y8, 416(AX)
VMOVDQA Y9, 448(AX)
VPXOR 608(AX), Y14, Y0
VPXOR 480(AX), Y10, Y1
VPXOR 512(AX), Y11, Y2
VPXOR 544(AX), Y12, Y3
VPXOR 576(AX), Y13, Y4
VPSLLQ $0x1b, Y0, Y5
VPSLLQ $0x24, Y1, Y6
VPSLLQ $0x0a, Y2, Y7
VPSLLQ $0x0f, Y3, Y8
VPSLLQ $0x38, Y4, Y9
VPSRLQ $0x25, Y0, Y0
VPSRLQ $0x1c, Y1, Y1
VPSRLQ $0x36, Y2, Y2
VPSRLQ $0x31, Y3, Y3
VPSRLQ $0x08, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 480(AX)
VMOVDQA Y6, 512(AX)
VMOVDQA Y7, 544(AX)
VMOVDQA Y8, 576(AX)
VMOVDQA Y9, 608(AX)
VPXOR 704(AX), Y12, Y0
VPXOR 736(AX), Y13, Y1
VPXOR 768(AX), Y14, Y2
VPXOR 640(AX), Y10, Y3
VPXOR 672(AX), Y11, Y4
VPSLLQ $0x3e, Y0, Y5
VPSLLQ $0x37, Y1, Y6
VPSLLQ $0x27, Y2, Y7
VPSLLQ $0x29, Y3, Y8
VPSLLQ $0x02, Y4, Y9
VPSRLQ $0x02, Y0, Y0
VPSRLQ $0x09, Y1, Y1
VPSRLQ $0x19, Y2, Y2
VPSRLQ $0x17, Y3, Y3
VPSRLQ $0x3e, Y4, Y4
VPOR Y5, Y0, Y0
VPOR Y6, Y1, Y1
VPOR Y7, Y2, Y2
VPOR Y8, Y3, Y3
VPOR Y9, Y4, Y4
VPANDN Y2, Y1, Y5
VPANDN Y3, Y2, Y6
VPANDN Y4, Y3, Y7
VPANDN Y0, Y4, Y8
VPANDN Y1, Y0, Y9
VPXOR Y0, Y5, Y5
VPXOR Y1, Y6, Y6
VPXOR Y2, Y7, Y7
VPXOR Y3, Y8, Y8
VPXOR Y4, Y9, Y9
VMOVDQA Y5, 640(AX)
VMOVDQA Y6, 672(AX)
VMOVDQA Y7, 704(AX)
VMOVDQA Y8, 736(AX)
VMOVDQA Y9, 768(AX)
ADDQ $0x20, CX
SUBQ $0x00000001, DX
JNZ loop
RET
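
A note on the pattern that repeats throughout the generated code above: the outer loop runs six times with four rounds unrolled per iteration (CX advances 0x20, i.e. four round constants, each pass), giving the 24 rounds of Keccak-f[1600]. Because AVX2 has no packed 64-bit rotate, every rho rotation is emitted as a VPSLLQ/VPSRLQ pair joined by VPOR, e.g. $0x2c and $0x14 for the rotation by 44. A scalar sketch of that building block (hypothetical helper; math/bits.RotateLeft64 does the same in standard Go):

package keccakref

// rotl64 rotates x left by n bits, 0 < n < 64. The VPSLLQ $n / VPSRLQ $(64-n)
// / VPOR triplets in the assembly above compute exactly this, four lanes at a
// time; AVX-512 would allow a single VPROLQ instead.
func rotl64(x uint64, n uint) uint64 {
	return x<<n | x>>(64-n)
}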

View File

@@ -0,0 +1,9 @@
// Code generated by command: go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600. DO NOT EDIT.

//go:build amd64
// +build amd64

package keccakf1600

//go:noescape
func f1600x4AVX2(state *uint64, rc *[24]uint64)

View File

@@ -0,0 +1,8 @@
//go:build (!amd64 && !arm64) || (arm64 && !go1.16)
// +build !amd64,!arm64 arm64,!go1.16

package keccakf1600

func permuteSIMDx2(state []uint64) { permuteScalarX2(state) }

func permuteSIMDx4(state []uint64) { permuteScalarX4(state) }
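
Whichever backend the build tags select, the interleaved semantics must match the scalar path. A test-style sketch (hypothetical; it would have to live inside the keccakf1600 package because it calls the unexported permuteScalarX4) that checks the dispatched Permute against the generic fallback:

package keccakf1600

import "testing"

// TestPermuteX4MatchesScalar runs the dispatched four-way Permute (SIMD when
// available, scalar otherwise) and the generic permuteScalarX4 on identical
// interleaved inputs and compares the results word by word.
func TestPermuteX4MatchesScalar(t *testing.T) {
	var st StateX4
	a := st.Initialize()

	ref := make([]uint64, 100)
	for i := range a {
		a[i] = uint64(i) * 0x9e3779b97f4a7c15 // arbitrary deterministic input
		ref[i] = a[i]
	}

	st.Permute()
	permuteScalarX4(ref)

	for i := range a {
		if a[i] != ref[i] {
			t.Fatalf("mismatch at word %d: %016x != %016x", i, a[i], ref[i])
		}
	}
}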