RTG-1339 Support post-quantum hybrid key exchange

Func spec: https://wiki.cfops.it/x/ZcBKHw
Authored by Bas Westerbaan on 2022-08-24 14:33:10 +02:00
Committed by Devin Carr
parent 3e0ff3a771
commit 11cbff4ff7
171 changed files with 15270 additions and 196 deletions


@@ -0,0 +1,302 @@
//go:build amd64
// +build amd64
package common
import (
"golang.org/x/sys/cpu"
)
// ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also
// the values int16(zeta * 62209) for each zeta, which is used in
// Montgomery reduction. There is some duplication and reordering as
// compared to Zetas to make it more convenient for use with AVX2.
var ZetasAVX2 = [...]int16{
// layer 1: int16(Zetas[1]*62209) and Zetas[1]
31499, 2571,
// layer 2
//
// int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3]
14746, 2970, 788, 1812,
// layer 3, like layer 2.
13525, 1493, -12402, 1422, 28191, 287, -16694, 202,
0, 0, // padding
// layer 4. offset: 1*16
//
// The precomputed multiplication and zetas are grouped by 16 at a
// time as used in the set of butterflies, etc.
-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
622, 622, 622, 622, 622, 622, 622, 622,
-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
182, 182, 182, 182, 182, 182, 182, 182,
10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
962, 962, 962, 962, 962, 962, 962, 962,
2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
// layer 5. offset: 9*16
-5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364,
-26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057,
573, 573, 573, 573, 2004, 2004, 2004, 2004,
264, 264, 264, 264, 383, 383, 383, 383,
5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102,
21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241,
2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458,
1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199,
-28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313,
-10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800,
2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017,
732, 732, 732, 732, 608, 608, 608, 608,
18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859,
26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162,
1787, 1787, 1787, 1787, 411, 411, 411, 411,
3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758,
// layer 6. offset: 17*16
-5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967,
-23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081,
1223, 1223, 652, 652, 2777, 2777, 1015, 1015,
2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785,
-12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441,
9135, 9135, -649, -649, -25986, -25986, 27837, 27837,
516, 516, 3321, 3321, 3009, 3009, 2663, 2663,
1711, 1711, 2167, 2167, 126, 126, 1469, 1469,
19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898,
-28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250,
2476, 2476, 3239, 3239, 3058, 3058, 830, 830,
107, 107, 1908, 1908, 3082, 3082, 2378, 2378,
13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756,
16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914,
2931, 2931, 961, 961, 1821, 1821, 2604, 2604,
448, 448, 2264, 2264, 677, 677, 2054, 2054,
// layer 7. offset: 25*16
-334, 11182, -11477, 13387, -32226, -14233, 20494, -21655,
-27738, 13131, 945, -4586, -14882, 23093, 6182, 5493,
2226, 430, 555, 843, 2078, 871, 1550, 105,
422, 587, 177, 3094, 3038, 2869, 1574, 1653,
32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639,
-18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618,
3083, 778, 1159, 3182, 2552, 1483, 2727, 1119,
1739, 644, 2457, 349, 418, 329, 3173, 3254,
-31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384,
-20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470,
817, 1097, 603, 610, 1322, 2044, 1864, 384,
2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442,
31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132,
2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
3221, 3021, 996, 991, 958, 1869, 1522, 1628,
// layer 1 inverse
23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637,
-17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336,
1628, 1522, 1869, 958, 991, 996, 3021, 3221,
478, 2459, 2475, 1819, 794, 2051, 1799, 2144,
14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926,
-32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183,
1670, 2142, 220, 2455, 1994, 1218, 3193, 2114,
384, 1864, 2044, 1322, 610, 603, 1097, 817,
-12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485,
12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011,
3254, 3173, 329, 418, 349, 2457, 644, 1739,
1119, 2727, 1483, 2552, 3182, 1159, 778, 3083,
5493, 6182, 23093, -14882, -4586, 945, 13131, -27738,
-21655, 20494, -14233, -32226, 13387, -11477, 11182, -334,
1653, 1574, 2869, 3038, 3094, 177, 587, 422,
105, 1550, 871, 2078, 843, 555, 430, 2226,
// layer 2 inverse
-17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832,
-12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427,
2054, 2054, 677, 677, 2264, 2264, 448, 448,
2604, 2604, 1821, 1821, 961, 961, 2931, 2931,
18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309,
-8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884,
2378, 2378, 3082, 3082, 1908, 1908, 107, 107,
830, 830, 3058, 3058, 3239, 3239, 2476, 2476,
27837, 27837, -25986, -25986, -649, -649, 9135, 9135,
-12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796,
1469, 1469, 126, 126, 2167, 2167, 1711, 1711,
2663, 2663, 3009, 3009, 3321, 3321, 516, 516,
25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564,
30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689,
1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036,
1015, 1015, 2777, 2777, 652, 652, 1223, 1223,
// layer 3 inverse
-16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676,
8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427,
1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124,
411, 411, 411, 411, 1787, 1787, 1787, 1787,
8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532,
24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072,
608, 608, 608, 608, 732, 732, 732, 732,
1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648,
-26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439,
-1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572,
3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727,
1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500,
-29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360,
17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827,
383, 383, 383, 383, 264, 264, 264, 264,
2004, 2004, 2004, 2004, 573, 573, 573, 573,
// layer 4 inverse
31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
962, 962, 962, 962, 962, 962, 962, 962,
-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
182, 182, 182, 182, 182, 182, 182, 182,
1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
622, 622, 622, 622, 622, 622, 622, 622,
3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
// layer 5 inverse
-16694, 202, 28191, 287, -12402, 1422, 13525, 1493,
// layer 6 inverse
788, 1812, 14746, 2970,
// layer 7 inverse
31499, 2571,
}
// Sets p to a + b. Does not normalize coefficients.
func (p *Poly) Add(a, b *Poly) {
if cpu.X86.HasAVX2 {
addAVX2(
(*[N]int16)(p),
(*[N]int16)(a),
(*[N]int16)(b),
)
} else {
p.addGeneric(a, b)
}
}
// Sets p to a - b. Does not normalize coefficients.
func (p *Poly) Sub(a, b *Poly) {
if cpu.X86.HasAVX2 {
subAVX2(
(*[N]int16)(p),
(*[N]int16)(a),
(*[N]int16)(b),
)
} else {
p.subGeneric(a, b)
}
}
// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤7q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) NTT() {
if cpu.X86.HasAVX2 {
nttAVX2((*[N]int16)(p))
} else {
p.nttGeneric()
}
}
// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) InvNTT() {
if cpu.X86.HasAVX2 {
invNttAVX2((*[N]int16)(p))
} else {
p.invNTTGeneric()
}
}
// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
// Montgomery form. Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
// tangled order as well.
func (p *Poly) MulHat(a, b *Poly) {
if cpu.X86.HasAVX2 {
mulHatAVX2(
(*[N]int16)(p),
(*[N]int16)(a),
(*[N]int16)(b),
)
} else {
p.mulHatGeneric(a, b)
}
}
// Puts p into the right form to be used with (among others) InvNTT().
func (p *Poly) Tangle() {
if cpu.X86.HasAVX2 {
tangleAVX2((*[N]int16)(p))
}
// When AVX2 is not available, we use the standard order.
}
// Puts p back into standard form.
func (p *Poly) Detangle() {
if cpu.X86.HasAVX2 {
detangleAVX2((*[N]int16)(p))
}
// When AVX2 is not available, we use the standard order.
}
// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) BarrettReduce() {
if cpu.X86.HasAVX2 {
barrettReduceAVX2((*[N]int16)(p))
} else {
p.barrettReduceGeneric()
}
}
// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) Normalize() {
if cpu.X86.HasAVX2 {
normalizeAVX2((*[N]int16)(p))
} else {
p.normalizeGeneric()
}
}

File diff suppressed because it is too large.


@@ -0,0 +1,74 @@
package common
// Given -2¹⁵ q ≤ x < 2¹⁵ q, returns -q < y < q with x 2⁻¹⁶ = y (mod q).
func montReduce(x int32) int16 {
// This is Montgomery reduction with R=2¹⁶.
//
// Note gcd(2¹⁶, q) = 1 as q is prime. Write q' := 62209 = q⁻¹ mod R.
// First we compute
//
// m := ((x mod R) q') mod R
// = x q' mod R
// = int16(x q')
// = int16(int32(x) * int32(q'))
//
// Note that x q' might be as big as 2³² and could overflow the int32
// multiplication in the last line. However for any int32s a and b,
// we have int32(int64(a)*int64(b)) = int32(a*b) and so the result is ok.
m := int16(x * 62209)
// Note that x - m q is divisible by R; indeed modulo R we have
//
// x - m q ≡ x - x q' q ≡ x - x q⁻¹ q ≡ x - x = 0.
//
// We return y := (x - m q) / R. Note that y is indeed correct as
// modulo q we have
//
// y ≡ x R⁻¹ - m q R⁻¹ = x R⁻¹
//
// and as both -2¹⁵ q ≤ m q, x < 2¹⁵ q, we have
// -2¹⁶ q < x - m q < 2¹⁶ q and so -q < (x - m q) / R < q as desired.
return int16(uint32(x-int32(m)*int32(Q)) >> 16)
}
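A quick exhaustive check makes these bounds concrete. The following sketch is hypothetical (not part of this commit); it would live in a _test.go file in this package, since montReduce is unexported:
package common
import "testing"
// TestMontReduceSketch is a hypothetical exhaustive check: for every
// -2¹⁵ q ≤ x < 2¹⁵ q it verifies -q < montReduce(x) < q and
// x ≡ montReduce(x)·2¹⁶ (mod q).
func TestMontReduceSketch(t *testing.T) {
	q := int64(Q)
	for x := -q << 15; x < q<<15; x++ {
		y := int64(montReduce(int32(x)))
		if y <= -q || y >= q {
			t.Fatalf("montReduce(%d) = %d: out of bounds", x, y)
		}
		if (x-(y<<16))%q != 0 {
			t.Fatalf("montReduce(%d) = %d: wrong residue class", x, y)
		}
	}
}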
// Given any x, returns x R mod q where R=2¹⁶.
func toMont(x int16) int16 {
// Note |1353 x| ≤ 1353 2¹⁵ ≤ 13318 q ≤ 2¹⁵ q and so we're within
// the bounds of montReduce.
return montReduce(int32(x) * 1353) // 1353 = R² mod q.
}
// Given any x, returns 0 ≤ y ≤ q with x = y (mod q).
//
// Beware: we might have barrettReduce(x) = q ≠ 0 for some x. In fact,
// this happens if and only if x = -nq for some positive integer n.
func barrettReduce(x int16) int16 {
// This is standard Barrett reduction.
//
// For any x we have x mod q = x - ⌊x/q⌋ q. We will use 20159/2²⁶ as
// an approximation of 1/q. Note that 0 ≤ 20159/2²⁶ - 1/q ≤ 0.135/2²⁶
// and so | x 20159/2²⁶ - x/q | ≤ 2⁻¹⁰ for |x| ≤ 2¹⁶. For all x
// not a multiple of q, the number x/q is further than 1/q from any integer
// and so ⌊x 20159/2²⁶⌋ = ⌊x/q⌋. If x is a multiple of q and x is positive,
// then x 20159/2²⁶ is larger than x/q so ⌊x 20159/2²⁶⌋ = ⌊x/q⌋ as well.
// Finally, if x is a negative multiple of q, then ⌊x 20159/2²⁶⌋ = ⌊x/q⌋-1.
// Thus
//
//                          ⎧ q        if x = -nq for a positive integer n
//    x - ⌊x 20159/2²⁶⌋ q = ⎨
//                          ⎩ x mod q  otherwise
//
// To actually compute this, note that
//
//    ⌊x 20159/2²⁶⌋ = (20159 x) >> 26.
return x - int16((int32(x)*20159)>>26)*Q
}
// Returns x if x < q and x - q otherwise. Assumes x ≥ -29439.
func csubq(x int16) int16 {
x -= Q // no overflow due to assumption x ≥ -29439.
// If x is non-negative, then x >> 15 = 0. If x is negative,
// then uint16(x >> 15) = 2¹⁶-1. So this will add back in q
// if x was smaller than q.
x += (x >> 15) & Q
return x
}
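Chaining the two gives the full normalization used by Poly.Normalize; a hypothetical in-package sketch checking the combination over the whole int16 range:
package common
import "testing"
// TestScalarNormalizeSketch is a hypothetical check that
// csubq(barrettReduce(x)) maps any int16 into {0, …, q-1} while
// preserving the residue class mod q.
func TestScalarNormalizeSketch(t *testing.T) {
	for x := -32768; x <= 32767; x++ {
		got := int64(csubq(barrettReduce(int16(x))))
		want := ((int64(x) % int64(Q)) + int64(Q)) % int64(Q)
		if got != want {
			t.Fatalf("normalizing %d: got %d, want %d", x, got, want)
		}
	}
}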


@@ -0,0 +1,77 @@
//go:build !amd64
// +build !amd64
package common
// Sets p to a + b. Does not normalize coefficients.
func (p *Poly) Add(a, b *Poly) {
p.addGeneric(a, b)
}
// Sets p to a - b. Does not normalize coefficients.
func (p *Poly) Sub(a, b *Poly) {
p.subGeneric(a, b)
}
// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤7q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) NTT() {
p.nttGeneric()
}
// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) InvNTT() {
p.invNTTGeneric()
}
// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
// Montgomery form. Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
// tangled order as well.
func (p *Poly) MulHat(a, b *Poly) {
p.mulHatGeneric(a, b)
}
// Puts p into the right form to be used with (among others) InvNTT().
func (p *Poly) Tangle() {
// In the generic implementation there is no advantage to using a
// different order, so we use the standard order everywhere.
}
// Puts p back into standard form.
func (p *Poly) Detangle() {
// In the generic implementation there is no advantage to using a
// different order, so we use the standard order everywhere.
}
// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) BarrettReduce() {
p.barrettReduceGeneric()
}
// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) Normalize() {
p.normalizeGeneric()
}


@@ -0,0 +1,193 @@
package common
// Zetas lists precomputed powers of the primitive root of unity in
// Montgomery representation used for the NTT:
//
// Zetas[i] = ζᵇʳᵛ⁽ⁱ⁾ R mod q
//
// where ζ = 17, brv(i) is the bitreversal of a 7-bit number and R=2¹⁶ mod q.
//
// The following Python code generates the Zetas arrays:
//
// q = 13*2**8 + 1; zeta = 17
// R = 2**16 % q # Montgomery const.
// def brv(x): return int(''.join(reversed(bin(x)[2:].zfill(7))),2)
// print([(pow(zeta, brv(i), q)*R)%q for i in range(128)])
var Zetas = [128]int16{
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182,
962, 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199,
2648, 1017, 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015,
2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126,
1469, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821,
2604, 448, 2264, 677, 2054, 2226, 430, 555, 843, 2078, 871, 1550,
105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, 3083, 778, 1159,
3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173,
3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218,
1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475,
2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628,
}
// InvNTTReductions keeps track of which coefficients to apply Barrett
// reduction to in Poly.InvNTT().
//
// Generated lazily: once a butterfly is computed which is about to
// overflow the int16, the largest coefficient is reduced. If that is
// not enough, the other coefficient is reduced as well.
//
// This is actually optimal, as proven in https://eprint.iacr.org/2020/1377.pdf
var InvNTTReductions = [...]int{
-1, // after layer 1
-1, // after layer 2
16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240,
241, -1, // after layer 3
0, 1, 32, 33, 34, 35, 64, 65, 96, 97, 98, 99, 128, 129, 160, 161, 162, 163,
192, 193, 224, 225, 226, 227, -1, // after layer 4
2, 3, 66, 67, 68, 69, 70, 71, 130, 131, 194, 195, 196, 197, 198,
199, -1, // after layer 5
4, 5, 6, 7, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, -1, // after layer 6
-1, // after layer 7
}
// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤7q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) nttGeneric() {
// Note that ℤ_q does not have a primitive 512ᵗʰ root of unity (as 512
// does not divide into q-1) and so we cannot do a regular NTT. ℤ_q
// does have a primitive 256ᵗʰ root of unity, the smallest of which
// is ζ := 17.
//
// Recall that our base ring R := ℤ_q[x] / (x²⁵⁶ + 1). The polynomial
// x²⁵⁶+1 will not split completely (as its roots would be 512ᵗʰ roots
// of unity.) However, it does split almost (using ζ¹²⁸ = -1):
//
// x²⁵⁶ + 1 = (x²)¹²⁸ - ζ¹²⁸
// = ((x²)⁶⁴ - ζ⁶⁴)((x²)⁶⁴ + ζ⁶⁴)
// = ((x²)³² - ζ³²)((x²)³² + ζ³²)((x²)³² - ζ⁹⁶)((x²)³² + ζ⁹⁶)
// ⋮
// = (x² - ζ)(x² + ζ)(x² - ζ⁶⁵)(x² + ζ⁶⁵) … (x² + ζ¹²⁷)
//
// Note that the powers of ζ that appear (from the second line down) are
// in binary
//
// 0100000 1100000
// 0010000 1010000 0110000 1110000
// 0001000 1001000 0101000 1101000 0011000 1011000 0111000 1111000
// …
//
// That is: brv(2), brv(3), brv(4), …, where brv(x) denotes the 7-bit
// bitreversal of x. These powers of ζ are given by the Zetas array.
//
// The polynomials x² ± ζⁱ are irreducible and coprime, hence by
// the Chinese Remainder Theorem we know
//
// ℤ_q[x]/(x²⁵⁶+1) → ℤ_q[x]/(x²-ζ) × … × ℤ_q[x]/(x²+ζ¹²⁷)
//
// given by a ↦ ( a mod x²-ζ, …, a mod x²+ζ¹²⁷ )
// is an isomorphism, which is the "NTT". It can be efficiently computed by
//
//
// a ↦ ( a mod (x²)⁶⁴ - ζ⁶⁴, a mod (x²)⁶⁴ + ζ⁶⁴ )
// ↦ ( a mod (x²)³² - ζ³², a mod (x²)³² + ζ³²,
// a mod (x²)³² - ζ⁹⁶, a mod (x²)³² + ζ⁹⁶ )
//
// et cetera
//
// If N were 8 then this can be pictured in the following diagram:
//
// https://cnx.org/resources/17ee4dfe517a6adda05377b25a00bf6e6c93c334/File0026.png
//
// Each cross is a Cooley-Tukey butterfly: it's the map
//
// (a, b) ↦ (a + ζb, a - ζb)
//
// for the appropriate power ζ for that column and row group.
k := 0 // Index into Zetas
// l runs effectively over the columns in the diagram above; it is half the
// height of a row group, i.e. the number of butterflies in each row group.
// In the diagram above it would be 4, 2, 1.
for l := N / 2; l > 1; l >>= 1 {
// On the nᵗʰ iteration of the l-loop, the absolute values of the
// coefficients are bounded by nq.
// offset effectively loops over the row groups in this column; it is
// the first row in the row group.
for offset := 0; offset < N-l; offset += 2 * l {
k++
zeta := int32(Zetas[k])
// j loops over each butterfly in the row group.
for j := offset; j < offset+l; j++ {
t := montReduce(zeta * int32(p[j+l]))
p[j+l] = p[j] - t
p[j] += t
}
}
}
}
// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) invNTTGeneric() {
k := 127 // Index into Zetas
r := -1 // Index into InvNTTReductions.
// We basically do the opposite of NTT, but postpone dividing by 2 in the
// inverse of the Cooley-Tukey butterfly and accumulate that into a big
// division by 2⁷ at the end. See the comments in the NTT() function.
for l := 2; l < N; l <<= 1 {
for offset := 0; offset < N-l; offset += 2 * l {
// As we're inverting, we need powers of ζ⁻¹ (instead of ζ).
// To be precise, we need ζᵇʳᵛ⁽ᵏ⁾⁻¹²⁸. However, as ζ⁻¹²⁸ = -1,
// we can use the existing Zetas table instead of
// keeping a separate InvZetas table as in Dilithium.
minZeta := int32(Zetas[k])
k--
for j := offset; j < offset+l; j++ {
// Gentleman-Sande butterfly: (a, b) ↦ (a + b, ζ(a-b))
t := p[j+l] - p[j]
p[j] += p[j+l]
p[j+l] = montReduce(minZeta * int32(t))
// Note that if we had |a| < αq and |b| < βq before the
// butterfly, then now we have |a| < (α+β)q and |b| < q.
}
}
// We let the InvNTTReductions instruct us which coefficients to
// Barrett reduce. See TestInvNTTReductions, which tests whether
// there is an overflow.
for {
r++
i := InvNTTReductions[r]
if i < 0 {
break
}
p[i] = barrettReduce(p[i])
}
}
for j := 0; j < N; j++ {
// Note 1441 = (128)⁻¹ R². The coefficients are bounded by 9q, so
// as 1441 * 9 ≈ 2¹⁴ < 2¹⁵, we're within the required bounds
// for montReduce().
p[j] = montReduce(1441 * int32(p[j]))
}
}
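Taking the two contracts above together, the round trip InvNTT(NTT(p)) should act as multiplication by R, i.e. agree with ToMont. A hypothetical sketch of that check, written directly against the generic implementations:
package common
import "testing"
// TestNTTRoundTripSketch is a hypothetical check: by the comments above,
// NTT preserves the form of its input and InvNTT multiplies by R, so
// invNTT(ntt(p)) should equal p·R, i.e. ToMont(p), after normalization.
func TestNTTRoundTripSketch(t *testing.T) {
	var p, want Poly
	for i := 0; i < N; i++ {
		p[i] = int16((i*i + 7*i + 3) % int(Q)) // arbitrary normalized input
	}
	want = p
	want.ToMont()
	want.normalizeGeneric()
	p.nttGeneric()           // output is bounded by 7q in absolute value, ...
	p.barrettReduceGeneric() // ... so reduce to meet invNTT's ≤q assumption
	p.invNTTGeneric()
	p.normalizeGeneric()
	if p != want {
		t.Fatal("invNTT∘NTT does not match ToMont")
	}
}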


@@ -0,0 +1,22 @@
package common
import (
"github.com/cloudflare/circl/pke/kyber/internal/common/params"
)
const (
// Q is the parameter q ≡ 3329 = 2¹¹ + 2¹⁰ + 2⁸ + 1.
Q = params.Q
// N is the parameter N: the length of the polynomials
N = params.N
// PolySize is the size of a packed polynomial.
PolySize = params.PolySize
// PlaintextSize is the size of the plaintext
PlaintextSize = params.PlaintextSize
// Eta2 is the parameter η₂
Eta2 = params.Eta2
)


@@ -0,0 +1,21 @@
package params
// We put these parameters in a separate package so that the Go code,
// such as asm/src.go, that generates assembler can import it.
const (
// Q is the parameter q ≡ 3329 = 2¹¹ + 2¹⁰ + 2⁸ + 1.
Q int16 = 3329
// N is the parameter N: the length of the polynomials
N int = 256
// PolySize is the size of a packed polynomial.
PolySize int = 384
// PlaintextSize is the size of the plaintext
PlaintextSize = 32
// Eta2 is the parameter η₂
Eta2 = 2
)


@@ -0,0 +1,324 @@
package common
// An element of our base ring R, which consists of polynomials over ℤ_q
// modulo the equation Xᴺ = -1, where q=3329 and N=256.
//
// This type is also used to store NTT-transformed polynomials,
// see Poly.NTT().
//
// Coefficients aren't always reduced. See Normalize().
type Poly [N]int16
// Sets p to a + b. Does not normalize coefficients.
func (p *Poly) addGeneric(a, b *Poly) {
for i := 0; i < N; i++ {
p[i] = a[i] + b[i]
}
}
// Sets p to a - b. Does not normalize coefficients.
func (p *Poly) subGeneric(a, b *Poly) {
for i := 0; i < N; i++ {
p[i] = a[i] - b[i]
}
}
// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) barrettReduceGeneric() {
for i := 0; i < N; i++ {
p[i] = barrettReduce(p[i])
}
}
// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) normalizeGeneric() {
for i := 0; i < N; i++ {
p[i] = csubq(barrettReduce(p[i]))
}
}
// Multiplies p in-place by the Montgomery factor 2¹⁶.
//
// Coefficients of p can be arbitrary. Resulting coefficients are bounded
// in absolute value by q.
func (p *Poly) ToMont() {
for i := 0; i < N; i++ {
p[i] = toMont(p[i])
}
}
// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
// Montgomery form. Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
// tangled order as well.
func (p *Poly) mulHatGeneric(a, b *Poly) {
// Recall from the discussion in NTT(), that a transformed polynomial is
// an element of ℤ_q[x]/(x²-ζ) × … × ℤ_q[x]/(x²+ζ¹²⁷);
// that is: 128 degree-one polynomials instead of simply 256 elements
// from ℤ_q as in the regular NTT. So instead of pointwise multiplication,
// we multiply the 128 pairs of degree-one polynomials modulo the
// right equation:
//
// (a₁ + a₂x)(b₁ + b₂x) = a₁b₁ + a₂b₂ζ' + (a₁b₂ + a₂b₁)x,
//
// where ζ' is the appropriate power of ζ.
k := 64
for i := 0; i < N; i += 4 {
zeta := int32(Zetas[k])
k++
p0 := montReduce(int32(a[i+1]) * int32(b[i+1]))
p0 = montReduce(int32(p0) * zeta)
p0 += montReduce(int32(a[i]) * int32(b[i]))
p1 := montReduce(int32(a[i]) * int32(b[i+1]))
p1 += montReduce(int32(a[i+1]) * int32(b[i]))
p[i] = p0
p[i+1] = p1
p2 := montReduce(int32(a[i+3]) * int32(b[i+3]))
p2 = -montReduce(int32(p2) * zeta)
p2 += montReduce(int32(a[i+2]) * int32(b[i+2]))
p3 := montReduce(int32(a[i+2]) * int32(b[i+3]))
p3 += montReduce(int32(a[i+3]) * int32(b[i+2]))
p[i+2] = p2
p[i+3] = p3
}
}
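With inputs in regular (non-Montgomery) form, the montReduce calls above leave a single factor R⁻¹ on every coefficient, which the factor R from InvNTT cancels, so NTT → MulHat → InvNTT computes the plain product in R. A hypothetical sketch checking this against a schoolbook negacyclic product, on the generic code path:
package common
import "testing"
// TestMulHatSchoolbookSketch is a hypothetical check that the NTT-based
// product matches the schoolbook product in ℤ_q[x]/(x²⁵⁶+1), where
// xⁱ⁺ʲ wraps around to -xⁱ⁺ʲ⁻²⁵⁶.
func TestMulHatSchoolbookSketch(t *testing.T) {
	var a, b Poly
	for i := 0; i < N; i++ {
		a[i] = int16(i % 9)
		b[i] = int16((5*i + 1) % 11)
	}
	var want Poly
	for i := 0; i < N; i++ {
		for j := 0; j < N; j++ {
			prod := int64(a[i]) * int64(b[j])
			k := i + j
			if k >= N {
				k -= N
				prod = -prod // x²⁵⁶ = -1 in the base ring
			}
			s := (int64(want[k]) + prod) % int64(Q)
			want[k] = int16((s + int64(Q)) % int64(Q))
		}
	}
	a.nttGeneric()
	a.barrettReduceGeneric() // keep pairwise products well below 2¹⁵q
	b.nttGeneric()
	b.barrettReduceGeneric()
	var c Poly
	c.mulHatGeneric(&a, &b)  // leaves a factor R⁻¹: a and b are in regular form
	c.barrettReduceGeneric() // meet invNTT's ≤q input assumption
	c.invNTTGeneric()        // multiplies by R, cancelling the R⁻¹
	c.normalizeGeneric()
	if c != want {
		t.Fatal("NTT-based product disagrees with schoolbook product")
	}
}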
// Packs p into buf. buf should be of length PolySize.
//
// Assumes p is normalized (and not just Barrett reduced) and "tangled",
// see Tangle().
func (p *Poly) Pack(buf []byte) {
q := *p
q.Detangle()
for i := 0; i < 128; i++ {
t0 := q[2*i]
t1 := q[2*i+1]
buf[3*i] = byte(t0)
buf[3*i+1] = byte(t0>>8) | byte(t1<<4)
buf[3*i+2] = byte(t1 >> 4)
}
}
// Unpacks p from buf.
//
// buf should be of length PolySize. p will be "tangled", see Detangle().
//
// p will not be normalized; instead 0 ≤ p[i] < 4096.
func (p *Poly) Unpack(buf []byte) {
for i := 0; i < 128; i++ {
p[2*i] = int16(buf[3*i]) | ((int16(buf[3*i+1]) << 8) & 0xfff)
p[2*i+1] = int16(buf[3*i+1]>>4) | (int16(buf[3*i+2]) << 4)
}
p.Tangle()
}
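Each coefficient occupies exactly 12 bits and q < 2¹², so this packing is lossless on normalized polynomials; a hypothetical round-trip sketch:
package common
import "testing"
// TestPackUnpackSketch is a hypothetical check that Unpack inverts Pack
// on normalized polynomials.
func TestPackUnpackSketch(t *testing.T) {
	var p Poly
	for i := 0; i < N; i++ {
		p[i] = int16((13*i + 1) % int(Q)) // normalized: 0 ≤ p[i] < q
	}
	var buf [PolySize]byte
	p.Pack(buf[:])
	var q2 Poly
	q2.Unpack(buf[:])
	if q2 != p {
		t.Fatal("Pack/Unpack round trip failed")
	}
}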
// Sets p to Decompress_q(m, 1).
//
// p will be normalized. m has to be of length PlaintextSize.
func (p *Poly) DecompressMessage(m []byte) {
// Decompress_q(x, 1) = ⌈xq/2⌋ = ⌊xq/2+½⌋ = (xq+1) >> 1 and so
// Decompress_q(0, 1) = 0 and Decompress_q(1, 1) = (q+1)/2.
for i := 0; i < 32; i++ {
for j := 0; j < 8; j++ {
bit := (m[i] >> uint(j)) & 1
// Set coefficient to either 0 or (q+1)/2 depending on the bit.
p[8*i+j] = -int16(bit) & ((Q + 1) / 2)
}
}
}
// Writes Compress_q(p, 1) to m.
//
// Assumes p is normalized. m has to be of length at least PlaintextSize.
func (p *Poly) CompressMessageTo(m []byte) {
// Compress_q(x, 1) is 1 on {833, …, 2496} and zero elsewhere.
for i := 0; i < 32; i++ {
m[i] = 0
for j := 0; j < 8; j++ {
x := 1664 - p[8*i+j]
// With the previous substitution, we want to return 1 if
// and only if x is in {-832, …, 831}.
x = (x >> 15) ^ x
// Note (x >> 15) ^ x = x if x ≥ 0 and -x-1 otherwise. Thus now we
// want to return 1 iff x ≤ 831, ie. x - 832 < 0.
x -= 832
m[i] |= ((byte(x >> 15)) & 1) << uint(j)
}
}
}
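The two message maps invert each other: DecompressMessage produces only 0 and (q+1)/2 = 1665, and 1665 lies inside {833, …, 2496} while 0 does not. A hypothetical sketch:
package common
import "testing"
// TestMessageRoundTripSketch is a hypothetical check that
// CompressMessageTo inverts DecompressMessage, since
// Decompress_q(1, 1) = 1665 compresses back to 1 and
// Decompress_q(0, 1) = 0 compresses back to 0.
func TestMessageRoundTripSketch(t *testing.T) {
	m := make([]byte, PlaintextSize)
	for i := range m {
		m[i] = byte(37*i + 1) // arbitrary bit pattern
	}
	var p Poly
	p.DecompressMessage(m) // p is normalized afterwards
	m2 := make([]byte, PlaintextSize)
	p.CompressMessageTo(m2)
	for i := range m {
		if m2[i] != m[i] {
			t.Fatalf("byte %d: got %08b, want %08b", i, m2[i], m[i])
		}
	}
}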
// Sets p to Decompress_q(m, d).
//
// Assumes d is in {4, 5, 10, 11}. p will be normalized.
func (p *Poly) Decompress(m []byte, d int) {
// Decompress_q(x, d) = ⌈(q/2ᵈ)x⌋
// = ⌊(q/2ᵈ)x+½⌋
// = ⌊(qx + 2ᵈ⁻¹)/2ᵈ⌋
// = (qx + (1<<(d-1))) >> d
switch d {
case 4:
for i := 0; i < N/2; i++ {
p[2*i] = int16(((1 << 3) +
uint32(m[i]&15)*uint32(Q)) >> 4)
p[2*i+1] = int16(((1 << 3) +
uint32(m[i]>>4)*uint32(Q)) >> 4)
}
case 5:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
t[0] = uint16(m[idx])
t[1] = (uint16(m[idx]) >> 5) | (uint16(m[idx+1] << 3))
t[2] = uint16(m[idx+1]) >> 2
t[3] = (uint16(m[idx+1]) >> 7) | (uint16(m[idx+2] << 1))
t[4] = (uint16(m[idx+2]) >> 4) | (uint16(m[idx+3] << 4))
t[5] = uint16(m[idx+3]) >> 1
t[6] = (uint16(m[idx+3]) >> 6) | (uint16(m[idx+4] << 2))
t[7] = uint16(m[idx+4]) >> 3
for j := 0; j < 8; j++ {
p[8*i+j] = int16(((1 << 4) +
uint32(t[j]&((1<<5)-1))*uint32(Q)) >> 5)
}
idx += 5
}
case 10:
var t [4]uint16
idx := 0
for i := 0; i < N/4; i++ {
t[0] = uint16(m[idx]) | (uint16(m[idx+1]) << 8)
t[1] = (uint16(m[idx+1]) >> 2) | (uint16(m[idx+2]) << 6)
t[2] = (uint16(m[idx+2]) >> 4) | (uint16(m[idx+3]) << 4)
t[3] = (uint16(m[idx+3]) >> 6) | (uint16(m[idx+4]) << 2)
for j := 0; j < 4; j++ {
p[4*i+j] = int16(((1 << 9) +
uint32(t[j]&((1<<10)-1))*uint32(Q)) >> 10)
}
idx += 5
}
case 11:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
t[0] = uint16(m[idx]) | (uint16(m[idx+1]) << 8)
t[1] = (uint16(m[idx+1]) >> 3) | (uint16(m[idx+2]) << 5)
t[2] = (uint16(m[idx+2]) >> 6) | (uint16(m[idx+3]) << 2) | (uint16(m[idx+4]) << 10)
t[3] = (uint16(m[idx+4]) >> 1) | (uint16(m[idx+5]) << 7)
t[4] = (uint16(m[idx+5]) >> 4) | (uint16(m[idx+6]) << 4)
t[5] = (uint16(m[idx+6]) >> 7) | (uint16(m[idx+7]) << 1) | (uint16(m[idx+8]) << 9)
t[6] = (uint16(m[idx+8]) >> 2) | (uint16(m[idx+9]) << 6)
t[7] = (uint16(m[idx+9]) >> 5) | (uint16(m[idx+10]) << 3)
for j := 0; j < 8; j++ {
p[8*i+j] = int16(((1 << 10) +
uint32(t[j]&((1<<11)-1))*uint32(Q)) >> 11)
}
idx += 11
}
default:
panic("unsupported d")
}
}
// Writes Compress_q(p, d) to m.
//
// Assumes p is normalized and d is in {4, 5, 10, 11}.
func (p *Poly) CompressTo(m []byte, d int) {
// Compress_q(x, d) = ⌈(2ᵈ/q)x⌋ mod⁺ 2ᵈ
// = ⌊(2ᵈ/q)x+½⌋ mod⁺ 2ᵈ
// = ⌊((x << d) + q/2) / q⌋ mod⁺ 2ᵈ
// = DIV((x << d) + q/2, q) & ((1<<d) - 1)
switch d {
case 4:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
for j := 0; j < 8; j++ {
t[j] = uint16(((uint32(p[8*i+j])<<4)+uint32(Q)/2)/
uint32(Q)) & ((1 << 4) - 1)
}
m[idx] = byte(t[0]) | byte(t[1]<<4)
m[idx+1] = byte(t[2]) | byte(t[3]<<4)
m[idx+2] = byte(t[4]) | byte(t[5]<<4)
m[idx+3] = byte(t[6]) | byte(t[7]<<4)
idx += 4
}
case 5:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
for j := 0; j < 8; j++ {
t[j] = uint16(((uint32(p[8*i+j])<<5)+uint32(Q)/2)/
uint32(Q)) & ((1 << 5) - 1)
}
m[idx] = byte(t[0]) | byte(t[1]<<5)
m[idx+1] = byte(t[1]>>3) | byte(t[2]<<2) | byte(t[3]<<7)
m[idx+2] = byte(t[3]>>1) | byte(t[4]<<4)
m[idx+3] = byte(t[4]>>4) | byte(t[5]<<1) | byte(t[6]<<6)
m[idx+4] = byte(t[6]>>2) | byte(t[7]<<3)
idx += 5
}
case 10:
var t [4]uint16
idx := 0
for i := 0; i < N/4; i++ {
for j := 0; j < 4; j++ {
t[j] = uint16(((uint32(p[4*i+j])<<10)+uint32(Q)/2)/
uint32(Q)) & ((1 << 10) - 1)
}
m[idx] = byte(t[0])
m[idx+1] = byte(t[0]>>8) | byte(t[1]<<2)
m[idx+2] = byte(t[1]>>6) | byte(t[2]<<4)
m[idx+3] = byte(t[2]>>4) | byte(t[3]<<6)
m[idx+4] = byte(t[3] >> 2)
idx += 5
}
case 11:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
for j := 0; j < 8; j++ {
t[j] = uint16(((uint32(p[8*i+j])<<11)+uint32(Q)/2)/
uint32(Q)) & ((1 << 11) - 1)
}
m[idx] = byte(t[0])
m[idx+1] = byte(t[0]>>8) | byte(t[1]<<3)
m[idx+2] = byte(t[1]>>5) | byte(t[2]<<6)
m[idx+3] = byte(t[2] >> 2)
m[idx+4] = byte(t[2]>>10) | byte(t[3]<<1)
m[idx+5] = byte(t[3]>>7) | byte(t[4]<<4)
m[idx+6] = byte(t[4]>>4) | byte(t[5]<<7)
m[idx+7] = byte(t[5] >> 1)
m[idx+8] = byte(t[5]>>9) | byte(t[6]<<2)
m[idx+9] = byte(t[6]>>6) | byte(t[7]<<5)
m[idx+10] = byte(t[7] >> 3)
idx += 11
}
default:
panic("unsupported d")
}
}
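In the other direction Compress_q(Decompress_q(y, d), d) = y for all of these d: decompression introduces an error of at most ½, compression scales that down by 2ᵈ/q, and 2ᵈ⁻¹/q < ½ even for d = 11. A hypothetical sketch:
package common
import "testing"
// TestCompressRoundTripSketch is a hypothetical check that decompressing
// packed d-bit values and compressing them again is the identity, for
// each supported d.
func TestCompressRoundTripSketch(t *testing.T) {
	for _, d := range []int{4, 5, 10, 11} {
		m := make([]byte, N*d/8)
		for i := range m {
			m[i] = byte(89*i + 17) // arbitrary packed d-bit values
		}
		var p Poly
		p.Decompress(m, d) // p is normalized afterwards
		m2 := make([]byte, len(m))
		p.CompressTo(m2, d)
		for i := range m {
			if m2[i] != m[i] {
				t.Fatalf("d=%d byte %d: got %#x, want %#x", d, i, m2[i], m[i])
			}
		}
	}
}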


@@ -0,0 +1,236 @@
package common
import (
"encoding/binary"
"github.com/cloudflare/circl/internal/sha3"
"github.com/cloudflare/circl/simd/keccakf1600"
)
// DeriveX4Available indicates whether the system supports the quick fourway
// sampling variants like PolyDeriveUniformX4.
var DeriveX4Available = keccakf1600.IsEnabledX4()
// Samples p from a centered binomial distribution with given η.
//
// Essentially CBD_η(PRF(seed, nonce)) from the specification.
func (p *Poly) DeriveNoise(seed []byte, nonce uint8, eta int) {
switch eta {
case 2:
p.DeriveNoise2(seed, nonce)
case 3:
p.DeriveNoise3(seed, nonce)
default:
panic("unsupported eta")
}
}
// Sample p from a centered binomial distribution with n=6 and p=½ - that is:
// coefficients are in {-3, -2, -1, 0, 1, 2, 3} with probabilities {1/64, 3/32,
// 15/64, 5/16, 15/64, 3/32, 1/64}.
func (p *Poly) DeriveNoise3(seed []byte, nonce uint8) {
keySuffix := [1]byte{nonce}
h := sha3.NewShake256()
_, _ = h.Write(seed[:])
_, _ = h.Write(keySuffix[:])
// The distribution at hand is exactly the same as that
// of (a₁ + a₂ + a₃) - (b₁ + b₂ + b₃) where aᵢ, bᵢ ~ U(1). Thus we need
// 6 bits per coefficient, i.e. 192 bytes of input entropy.
// We add two extra zero bytes in the buffer to be able to read 8 bytes
// at the same time (while using only 6).
var buf [192 + 2]byte
_, _ = h.Read(buf[:192])
for i := 0; i < 32; i++ {
// t is interpreted as a₁ + 2a₂ + 4a₃ + 8b₁ + 16b₂ + ….
t := binary.LittleEndian.Uint64(buf[6*i:])
d := t & 0x249249249249 // a₁ + 8b₁ + …
d += (t >> 1) & 0x249249249249 // a₁ + a₂ + 8(b₁ + b₂) + …
d += (t >> 2) & 0x249249249249 // a₁ + a₂ + a₃ + 8(b₁ + b₂ + b₃) + …
for j := 0; j < 8; j++ {
a := int16(d) & 0x7 // a₁ + a₂ + a₃
d >>= 3
b := int16(d) & 0x7 // b₁ + b₂ + b₃
d >>= 3
p[8*i+j] = a - b
}
}
}
// Sample p from a centered binomial distribution with n=4 and p=½ - that is:
// coefficients are in {-2, -1, 0, 1, 2} with probabilities {1/16, 1/4,
// 3/8, 1/4, 1/16}.
func (p *Poly) DeriveNoise2(seed []byte, nonce uint8) {
keySuffix := [1]byte{nonce}
h := sha3.NewShake256()
_, _ = h.Write(seed[:])
_, _ = h.Write(keySuffix[:])
// The distribution at hand is exactly the same as that
// of (a + a') - (b + b') where a, a', b, b' ~ U(1). Thus we need 4 bits
// per coefficient, i.e. 128 bytes of input entropy.
var buf [128]byte
_, _ = h.Read(buf[:])
for i := 0; i < 16; i++ {
// t is interpreted as a + 2a' + 4b + 8b' + ….
t := binary.LittleEndian.Uint64(buf[8*i:])
d := t & 0x5555555555555555 // a + 4b + …
d += (t >> 1) & 0x5555555555555555 // a+a' + 4(b + b') + …
for j := 0; j < 16; j++ {
a := int16(d) & 0x3
d >>= 2
b := int16(d) & 0x3
d >>= 2
p[16*i+j] = a - b
}
}
}
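The masking trick is easy to cross-check against straightforward bit counting; a hypothetical sketch for the η=2 case (the η=3 trick with mask 0x249249249249 works the same way, three bits per half):
package common
import (
	"math/bits"
	"testing"
)
// TestCBD2MaskSketch is a hypothetical check of the mask trick in
// DeriveNoise2: for each 4-bit group a + 2a' + 4b + 8b' of t, the two
// 2-bit fields of d hold a + a' and b + b', i.e. the popcounts of the
// two bit pairs.
func TestCBD2MaskSketch(t *testing.T) {
	for n := uint64(0); n < 1<<16; n++ {
		tt := n * 0x9e3779b97f4a7c15 // cheap pseudo-random 64-bit values
		d := tt & 0x5555555555555555
		d += (tt >> 1) & 0x5555555555555555
		for j := 0; j < 16; j++ {
			group := (tt >> uint(4*j)) & 0xf
			a := (d >> uint(4*j)) & 0x3
			b := (d >> uint(4*j+2)) & 0x3
			if a != uint64(bits.OnesCount64(group&0x3)) ||
				b != uint64(bits.OnesCount64(group&0xc)) {
				t.Fatalf("mask trick disagrees for t=%#x, group %d", tt, j)
			}
		}
	}
}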
// For each i, sample ps[i] uniformly from the given seed for coordinates
// xs[i] and ys[i]. ps[i] may be nil and is ignored in that case.
//
// Can only be called when DeriveX4Available is true.
func PolyDeriveUniformX4(ps [4]*Poly, seed *[32]byte, xs, ys [4]uint8) {
var perm keccakf1600.StateX4
state := perm.Initialize()
// Absorb the seed in the four states
for i := 0; i < 4; i++ {
v := binary.LittleEndian.Uint64(seed[8*i : 8*(i+1)])
for j := 0; j < 4; j++ {
state[i*4+j] = v
}
}
// Absorb the coordinates, the SHAKE128 domain separator (0b1111), the
// start of the padding (0b…001) and the end of the padding 0b100….
// Recall that the rate of SHAKE128 is 168; ie. 21 uint64s.
for j := 0; j < 4; j++ {
state[4*4+j] = uint64(xs[j]) | (uint64(ys[j]) << 8) | (0x1f << 16)
state[20*4+j] = 0x80 << 56
}
var idx [4]int // indices into ps
for j := 0; j < 4; j++ {
if ps[j] == nil {
idx[j] = N // mark nil polynomials as completed
}
}
done := false
for !done {
// Applies Keccak-f[1600] to the state to get the next 21 uint64s of each of
// the four SHAKE128 streams.
perm.Permute()
done = true
PolyLoop:
for j := 0; j < 4; j++ {
if idx[j] == N {
continue
}
for i := 0; i < 7; i++ {
var t [16]uint16
v1 := state[i*3*4+j]
v2 := state[(i*3+1)*4+j]
v3 := state[(i*3+2)*4+j]
t[0] = uint16(v1) & 0xfff
t[1] = uint16(v1>>12) & 0xfff
t[2] = uint16(v1>>24) & 0xfff
t[3] = uint16(v1>>36) & 0xfff
t[4] = uint16(v1>>48) & 0xfff
t[5] = uint16((v1>>60)|(v2<<4)) & 0xfff
t[6] = uint16(v2>>8) & 0xfff
t[7] = uint16(v2>>20) & 0xfff
t[8] = uint16(v2>>32) & 0xfff
t[9] = uint16(v2>>44) & 0xfff
t[10] = uint16((v2>>56)|(v3<<8)) & 0xfff
t[11] = uint16(v3>>4) & 0xfff
t[12] = uint16(v3>>16) & 0xfff
t[13] = uint16(v3>>28) & 0xfff
t[14] = uint16(v3>>40) & 0xfff
t[15] = uint16(v3>>52) & 0xfff
for k := 0; k < 16; k++ {
if t[k] < uint16(Q) {
ps[j][idx[j]] = int16(t[k])
idx[j]++
if idx[j] == N {
continue PolyLoop
}
}
}
}
done = false
}
}
for i := 0; i < 4; i++ {
if ps[i] != nil {
ps[i].Tangle()
}
}
}
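Callers are expected to test DeriveX4Available and fall back to the scalar sampler; a hypothetical dispatch helper showing the intended pattern (the name deriveUniformBatch is mine, not part of this commit):
package common
// deriveUniformBatch (hypothetical helper): samples four polynomials
// uniformly for the given coordinates, using the fourway sampler when
// the platform supports it and four scalar calls otherwise.
func deriveUniformBatch(ps *[4]Poly, seed *[32]byte, xs, ys [4]uint8) {
	if DeriveX4Available {
		PolyDeriveUniformX4(
			[4]*Poly{&ps[0], &ps[1], &ps[2], &ps[3]},
			seed, xs, ys,
		)
		return
	}
	for i := 0; i < 4; i++ {
		ps[i].DeriveUniform(seed, xs[i], ys[i])
	}
}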
// Sample p uniformly from the given seed and x and y coordinates.
//
// Coefficients are reduced and will be in "tangled" order. See Tangle().
func (p *Poly) DeriveUniform(seed *[32]byte, x, y uint8) {
var seedSuffix [2]byte
var buf [168]byte // rate of SHAKE-128
seedSuffix[0] = x
seedSuffix[1] = y
h := sha3.NewShake128()
_, _ = h.Write(seed[:])
_, _ = h.Write(seedSuffix[:])
i := 0
for {
_, _ = h.Read(buf[:])
for j := 0; j < 168; j += 3 {
t1 := (uint16(buf[j]) | (uint16(buf[j+1]) << 8)) & 0xfff
t2 := (uint16(buf[j+1]>>4) | (uint16(buf[j+2]) << 4)) & 0xfff
if t1 < uint16(Q) {
p[i] = int16(t1)
i++
if i == N {
break
}
}
if t2 < uint16(Q) {
p[i] = int16(t2)
i++
if i == N {
break
}
}
}
if i == N {
break
}
}
p.Tangle()
}


@@ -0,0 +1,33 @@
// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.
//go:build amd64
// +build amd64
package common
//go:noescape
func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
//go:noescape
func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
//go:noescape
func nttAVX2(p *[256]int16)
//go:noescape
func invNttAVX2(p *[256]int16)
//go:noescape
func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
//go:noescape
func detangleAVX2(p *[256]int16)
//go:noescape
func tangleAVX2(p *[256]int16)
//go:noescape
func barrettReduceAVX2(p *[256]int16)
//go:noescape
func normalizeAVX2(p *[256]int16)