RTG-1339 Support post-quantum hybrid key exchange

Func spec: https://wiki.cfops.it/x/ZcBKHw
Authored by Bas Westerbaan on 2022-08-24 14:33:10 +02:00
Committed by Devin Carr
parent 3e0ff3a771
commit 11cbff4ff7
171 changed files with 15270 additions and 196 deletions


@@ -0,0 +1,302 @@
//go:build amd64
// +build amd64
package common
import (
"golang.org/x/sys/cpu"
)
// ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also
// the values int16(zeta * 62209) for each zeta, which is used in
// Montgomery reduction. There is some duplication and reordering as
// compared to Zetas to make it more convenient for use with AVX2.
var ZetasAVX2 = [...]int16{
// layer 1: int16(Zetas[1]*62209) and Zetas[1]
31499, 2571,
// layer 2
//
// int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3]
14746, 2970, 788, 1812,
// layer 3, like layer 2.
13525, 1493, -12402, 1422, 28191, 287, -16694, 202,
0, 0, // padding
// layer 4. offset: 1*16
//
// The precomputed multiplication and zetas are grouped by 16 at a
// time as used in the set of butterflies, etc.
-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
622, 622, 622, 622, 622, 622, 622, 622,
-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
182, 182, 182, 182, 182, 182, 182, 182,
10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
962, 962, 962, 962, 962, 962, 962, 962,
2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
// layer 5. offset: 9*16
-5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364,
-26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057,
573, 573, 573, 573, 2004, 2004, 2004, 2004,
264, 264, 264, 264, 383, 383, 383, 383,
5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102,
21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241,
2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458,
1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199,
-28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313,
-10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800,
2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017,
732, 732, 732, 732, 608, 608, 608, 608,
18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859,
26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162,
1787, 1787, 1787, 1787, 411, 411, 411, 411,
3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758,
// layer 6. offset: 17*16
-5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967,
-23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081,
1223, 1223, 652, 652, 2777, 2777, 1015, 1015,
2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785,
-12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441,
9135, 9135, -649, -649, -25986, -25986, 27837, 27837,
516, 516, 3321, 3321, 3009, 3009, 2663, 2663,
1711, 1711, 2167, 2167, 126, 126, 1469, 1469,
19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898,
-28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250,
2476, 2476, 3239, 3239, 3058, 3058, 830, 830,
107, 107, 1908, 1908, 3082, 3082, 2378, 2378,
13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756,
16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914,
2931, 2931, 961, 961, 1821, 1821, 2604, 2604,
448, 448, 2264, 2264, 677, 677, 2054, 2054,
// layer 7. offset: 25*16
-334, 11182, -11477, 13387, -32226, -14233, 20494, -21655,
-27738, 13131, 945, -4586, -14882, 23093, 6182, 5493,
2226, 430, 555, 843, 2078, 871, 1550, 105,
422, 587, 177, 3094, 3038, 2869, 1574, 1653,
32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639,
-18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618,
3083, 778, 1159, 3182, 2552, 1483, 2727, 1119,
1739, 644, 2457, 349, 418, 329, 3173, 3254,
-31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384,
-20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470,
817, 1097, 603, 610, 1322, 2044, 1864, 384,
2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442,
31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132,
2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
3221, 3021, 996, 991, 958, 1869, 1522, 1628,
// layer 1 inverse
23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637,
-17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336,
1628, 1522, 1869, 958, 991, 996, 3021, 3221,
478, 2459, 2475, 1819, 794, 2051, 1799, 2144,
14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926,
-32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183,
1670, 2142, 220, 2455, 1994, 1218, 3193, 2114,
384, 1864, 2044, 1322, 610, 603, 1097, 817,
-12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485,
12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011,
3254, 3173, 329, 418, 349, 2457, 644, 1739,
1119, 2727, 1483, 2552, 3182, 1159, 778, 3083,
5493, 6182, 23093, -14882, -4586, 945, 13131, -27738,
-21655, 20494, -14233, -32226, 13387, -11477, 11182, -334,
1653, 1574, 2869, 3038, 3094, 177, 587, 422,
105, 1550, 871, 2078, 843, 555, 430, 2226,
// layer 2 inverse
-17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832,
-12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427,
2054, 2054, 677, 677, 2264, 2264, 448, 448,
2604, 2604, 1821, 1821, 961, 961, 2931, 2931,
18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309,
-8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884,
2378, 2378, 3082, 3082, 1908, 1908, 107, 107,
830, 830, 3058, 3058, 3239, 3239, 2476, 2476,
27837, 27837, -25986, -25986, -649, -649, 9135, 9135,
-12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796,
1469, 1469, 126, 126, 2167, 2167, 1711, 1711,
2663, 2663, 3009, 3009, 3321, 3321, 516, 516,
25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564,
30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689,
1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036,
1015, 1015, 2777, 2777, 652, 652, 1223, 1223,
// layer 3 inverse
-16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676,
8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427,
1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124,
411, 411, 411, 411, 1787, 1787, 1787, 1787,
8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532,
24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072,
608, 608, 608, 608, 732, 732, 732, 732,
1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648,
-26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439,
-1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572,
3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727,
1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500,
-29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360,
17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827,
383, 383, 383, 383, 264, 264, 264, 264,
2004, 2004, 2004, 2004, 573, 573, 573, 573,
// layer 4 inverse
31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
962, 962, 962, 962, 962, 962, 962, 962,
-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
182, 182, 182, 182, 182, 182, 182, 182,
1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
622, 622, 622, 622, 622, 622, 622, 622,
3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
// layer 5 inverse
-16694, 202, 28191, 287, -12402, 1422, 13525, 1493,
// layer 6 inverse
788, 1812, 14746, 2970,
// layer 7 inverse
31499, 2571,
}
// Sets p to a + b. Does not normalize coefficients.
func (p *Poly) Add(a, b *Poly) {
if cpu.X86.HasAVX2 {
addAVX2(
(*[N]int16)(p),
(*[N]int16)(a),
(*[N]int16)(b),
)
} else {
p.addGeneric(a, b)
}
}
// Sets p to a - b. Does not normalize coefficients.
func (p *Poly) Sub(a, b *Poly) {
if cpu.X86.HasAVX2 {
subAVX2(
(*[N]int16)(p),
(*[N]int16)(a),
(*[N]int16)(b),
)
} else {
p.subGeneric(a, b)
}
}
// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤7q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) NTT() {
if cpu.X86.HasAVX2 {
nttAVX2((*[N]int16)(p))
} else {
p.nttGeneric()
}
}
// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) InvNTT() {
if cpu.X86.HasAVX2 {
invNttAVX2((*[N]int16)(p))
} else {
p.invNTTGeneric()
}
}
// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
// Montgomery form. Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
// tangled order as well.
func (p *Poly) MulHat(a, b *Poly) {
if cpu.X86.HasAVX2 {
mulHatAVX2(
(*[N]int16)(p),
(*[N]int16)(a),
(*[N]int16)(b),
)
} else {
p.mulHatGeneric(a, b)
}
}
// Puts p into the right form to be used with (among others) InvNTT().
func (p *Poly) Tangle() {
if cpu.X86.HasAVX2 {
tangleAVX2((*[N]int16)(p))
}
// When AVX2 is not available, we use the standard order.
}
// Puts p back into standard form.
func (p *Poly) Detangle() {
if cpu.X86.HasAVX2 {
detangleAVX2((*[N]int16)(p))
}
// When AVX2 is not available, we use the standard order.
}
// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) BarrettReduce() {
if cpu.X86.HasAVX2 {
barrettReduceAVX2((*[N]int16)(p))
} else {
p.barrettReduceGeneric()
}
}
// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) Normalize() {
if cpu.X86.HasAVX2 {
normalizeAVX2((*[N]int16)(p))
} else {
p.normalizeGeneric()
}
}

File diff suppressed because it is too large.


@@ -0,0 +1,74 @@
package common
// Given -2¹⁵ q ≤ x < 2¹⁵ q, returns -q < y < q with x 2⁻¹⁶ = y (mod q).
func montReduce(x int32) int16 {
// This is Montgomery reduction with R=2¹⁶.
//
// Note gcd(2¹⁶, q) = 1 as q is prime. Write q' := 62209 = q⁻¹ mod R.
// First we compute
//
// m := ((x mod R) q') mod R
// = x q' mod R
// = int16(x q')
// = int16(int32(x) * int32(q'))
//
// Note that x q' might be as big as 2³² and could overflow the int32
// multiplication in the last line. However for any int32s a and b,
// we have int32(int64(a)*int64(b)) = int32(a*b) and so the result is ok.
m := int16(x * 62209)
// Note that x - m q is divisible by R; indeed modulo R we have
//
// x - m q ≡ x - x q' q ≡ x - x q⁻¹ q ≡ x - x = 0.
//
// We return y := (x - m q) / R. Note that y is indeed correct as
// modulo q we have
//
// y ≡ x R⁻¹ - m q R⁻¹ = x R⁻¹
//
// and as both -2¹⁵ q ≤ m q, x < 2¹⁵ q, we have
// -2¹⁶ q < x - m q < 2¹⁶ q and so -q < (x - m q) / R < q as desired.
return int16(uint32(x-int32(m)*int32(Q)) >> 16)
}
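A quick exhaustive check makes these bounds concrete. The following sketch is hypothetical (not part of this commit); it would live in a _test.go file in this package, since montReduce is unexported:
package common
import "testing"
// TestMontReduceSketch is a hypothetical exhaustive check: for every
// -2¹⁵ q ≤ x < 2¹⁵ q it verifies -q < montReduce(x) < q and
// x ≡ montReduce(x)·2¹⁶ (mod q).
func TestMontReduceSketch(t *testing.T) {
	q := int64(Q)
	for x := -q << 15; x < q<<15; x++ {
		y := int64(montReduce(int32(x)))
		if y <= -q || y >= q {
			t.Fatalf("montReduce(%d) = %d: out of bounds", x, y)
		}
		if (x-(y<<16))%q != 0 {
			t.Fatalf("montReduce(%d) = %d: wrong residue class", x, y)
		}
	}
}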
// Given any x, returns x R mod q where R=2¹⁶.
func toMont(x int16) int16 {
// Note |1353 x| ≤ 1353 2¹⁵ ≤ 13318 q ≤ 2¹⁵ q and so we're within
// the bounds of montReduce.
return montReduce(int32(x) * 1353) // 1353 = R² mod q.
}
// Given any x, returns 0 ≤ y ≤ q with x = y (mod q).
//
// Beware: we might have barrettReduce(x) = q ≠ 0 for some x. In fact,
// this happens if and only if x = -nq for some positive integer n.
func barrettReduce(x int16) int16 {
// This is standard Barrett reduction.
//
// For any x we have x mod q = x - ⌊x/q⌋ q. We will use 20159/2²⁶ as
// an approximation of 1/q. Note that 0 ≤ 20159/2²⁶ - 1/q ≤ 0.135/2²⁶
// and so | x 20159/2²⁶ - x/q | ≤ 2⁻¹⁰ for |x| ≤ 2¹⁶. For all x
// not a multiple of q, the number x/q is further than 1/q from any integer
// and so ⌊x 20159/2²⁶⌋ = ⌊x/q⌋. If x is a multiple of q and x is positive,
// then x 20159/2²⁶ is larger than x/q so ⌊x 20159/2²⁶⌋ = ⌊x/q⌋ as well.
// Finally, if x is a negative multiple of q, then ⌊x 20159/2²⁶⌋ = ⌊x/q⌋-1.
// Thus
//
//                          ⎧ q        if x = -nq for a positive integer n
//    x - ⌊x 20159/2²⁶⌋ q = ⎨
//                          ⎩ x mod q  otherwise
//
// To actually compute this, note that
//
//    ⌊x 20159/2²⁶⌋ = (20159 x) >> 26.
return x - int16((int32(x)*20159)>>26)*Q
}
// Returns x if x < q and x - q otherwise. Assumes x ≥ -29439.
func csubq(x int16) int16 {
x -= Q // no overflow due to assumption x ≥ -29439.
// If x is non-negative, then x >> 15 = 0. If x is negative,
// then uint16(x >> 15) = 2¹⁶-1. So this will add back in q
// if x was smaller than q.
x += (x >> 15) & Q
return x
}
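Chaining the two gives the full normalization used by Poly.Normalize; a hypothetical in-package sketch checking the combination over the whole int16 range:
package common
import "testing"
// TestScalarNormalizeSketch is a hypothetical check that
// csubq(barrettReduce(x)) maps any int16 into {0, …, q-1} while
// preserving the residue class mod q.
func TestScalarNormalizeSketch(t *testing.T) {
	for x := -32768; x <= 32767; x++ {
		got := int64(csubq(barrettReduce(int16(x))))
		want := ((int64(x) % int64(Q)) + int64(Q)) % int64(Q)
		if got != want {
			t.Fatalf("normalizing %d: got %d, want %d", x, got, want)
		}
	}
}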


@@ -0,0 +1,77 @@
//go:build !amd64
// +build !amd64
package common
// Sets p to a + b. Does not normalize coefficients.
func (p *Poly) Add(a, b *Poly) {
p.addGeneric(a, b)
}
// Sets p to a - b. Does not normalize coefficients.
func (p *Poly) Sub(a, b *Poly) {
p.subGeneric(a, b)
}
// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤7q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) NTT() {
p.nttGeneric()
}
// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) InvNTT() {
p.invNTTGeneric()
}
// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
// Montgomery form. Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
// tangled order as well.
func (p *Poly) MulHat(a, b *Poly) {
p.mulHatGeneric(a, b)
}
// Puts p into the right form to be used with (among others) InvNTT().
func (p *Poly) Tangle() {
// In the generic implementation there is no advantage to using a
// different order, so we use the standard order everywhere.
}
// Puts p back into standard form.
func (p *Poly) Detangle() {
// In the generic implementation there is no advantage to using a
// different order, so we use the standard order everywhere.
}
// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) BarrettReduce() {
p.barrettReduceGeneric()
}
// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) Normalize() {
p.normalizeGeneric()
}


@@ -0,0 +1,193 @@
package common
// Zetas lists precomputed powers of the primitive root of unity in
// Montgomery representation used for the NTT:
//
// Zetas[i] = ζᵇʳᵛ⁽ⁱ⁾ R mod q
//
// where ζ = 17, brv(i) is the bitreversal of a 7-bit number and R=2¹⁶ mod q.
//
// The following Python code generates the Zetas arrays:
//
// q = 13*2**8 + 1; zeta = 17
// R = 2**16 % q # Montgomery const.
// def brv(x): return int(''.join(reversed(bin(x)[2:].zfill(7))),2)
// print([(pow(zeta, brv(i), q)*R)%q for i in range(128)])
var Zetas = [128]int16{
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182,
962, 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199,
2648, 1017, 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015,
2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126,
1469, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821,
2604, 448, 2264, 677, 2054, 2226, 430, 555, 843, 2078, 871, 1550,
105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, 3083, 778, 1159,
3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173,
3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218,
1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475,
2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628,
}
// InvNTTReductions keeps track of which coefficients to apply Barrett
// reduction to in Poly.InvNTT().
//
// Generated lazily: once a butterfly is computed which is about to
// overflow the int16, the largest coefficient is reduced. If that is
// not enough, the other coefficient is reduced as well.
//
// This is actually optimal, as proven in https://eprint.iacr.org/2020/1377.pdf
var InvNTTReductions = [...]int{
-1, // after layer 1
-1, // after layer 2
16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240,
241, -1, // after layer 3
0, 1, 32, 33, 34, 35, 64, 65, 96, 97, 98, 99, 128, 129, 160, 161, 162, 163,
192, 193, 224, 225, 226, 227, -1, // after layer 4
2, 3, 66, 67, 68, 69, 70, 71, 130, 131, 194, 195, 196, 197, 198,
199, -1, // after layer 5
4, 5, 6, 7, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, -1, // after layer 6
-1, // after layer 7
}
// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤7q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) nttGeneric() {
// Note that ℤ_q does not have a primitive 512ᵗʰ root of unity (as 512
// does not divide into q-1) and so we cannot do a regular NTT. ℤ_q
// does have a primitive 256ᵗʰ root of unity, the smallest of which
// is ζ := 17.
//
// Recall that our base ring R := ℤ_q[x] / (x²⁵⁶ + 1). The polynomial
// x²⁵⁶+1 will not split completely (as its roots would be 512ᵗʰ roots
// of unity.) However, it does split almost (using ζ¹²⁸ = -1):
//
// x²⁵⁶ + 1 = (x²)¹²⁸ - ζ¹²⁸
// = ((x²)⁶⁴ - ζ⁶⁴)((x²)⁶⁴ + ζ⁶⁴)
// = ((x²)³² - ζ³²)((x²)³² + ζ³²)((x²)³² - ζ⁹⁶)((x²)³² + ζ⁹⁶)
// ⋮
// = (x² - ζ)(x² + ζ)(x² - ζ⁶⁵)(x² + ζ⁶⁵) … (x² + ζ¹²⁷)
//
// Note that the powers of ζ that appear (from the second line down) are
// in binary
//
// 0100000 1100000
// 0010000 1010000 0110000 1110000
// 0001000 1001000 0101000 1101000 0011000 1011000 0111000 1111000
// …
//
// That is: brv(2), brv(3), brv(4), …, where brv(x) denotes the 7-bit
// bitreversal of x. These powers of ζ are given by the Zetas array.
//
// The polynomials x² ± ζⁱ are irreducible and coprime, hence by
// the Chinese Remainder Theorem we know
//
// ℤ_q[x]/(x²⁵⁶+1) → ℤ_q[x]/(x²-ζ) × … × ℤ_q[x]/(x²+ζ¹²⁷)
//
// given by a ↦ ( a mod x²-ζ, …, a mod x²+ζ¹²⁷ )
// is an isomorphism, which is the "NTT". It can be efficiently computed by
//
//
// a ↦ ( a mod (x²)⁶⁴ - ζ⁶⁴, a mod (x²)⁶⁴ + ζ⁶⁴ )
// ↦ ( a mod (x²)³² - ζ³², a mod (x²)³² + ζ³²,
// a mod (x²)³² - ζ⁹⁶, a mod (x²)³² + ζ⁹⁶ )
//
// et cetera
//
// If N were 8 then this can be pictured in the following diagram:
//
// https://cnx.org/resources/17ee4dfe517a6adda05377b25a00bf6e6c93c334/File0026.png
//
// Each cross is a Cooley-Tukey butterfly: it's the map
//
// (a, b) ↦ (a + ζb, a - ζb)
//
// for the appropriate power ζ for that column and row group.
k := 0 // Index into Zetas
// l runs effectively over the columns in the diagram above; it is half the
// height of a row group, i.e. the number of butterflies in each row group.
// In the diagram above it would be 4, 2, 1.
for l := N / 2; l > 1; l >>= 1 {
// On the nᵗʰ iteration of the l-loop, the absolute values of the
// coefficients are bounded by nq.
// offset effectively loops over the row groups in this column; it is
// the first row in the row group.
for offset := 0; offset < N-l; offset += 2 * l {
k++
zeta := int32(Zetas[k])
// j loops over each butterfly in the row group.
for j := offset; j < offset+l; j++ {
t := montReduce(zeta * int32(p[j+l]))
p[j+l] = p[j] - t
p[j] += t
}
}
}
}
// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) invNTTGeneric() {
k := 127 // Index into Zetas
r := -1 // Index into InvNTTReductions.
// We basically do the opposite of NTT, but postpone dividing by 2 in the
// inverse of the Cooley-Tukey butterfly and accumulate that into a big
// division by 2⁷ at the end. See the comments in the NTT() function.
for l := 2; l < N; l <<= 1 {
for offset := 0; offset < N-l; offset += 2 * l {
// As we're inverting, we need powers of ζ⁻¹ (instead of ζ).
// To be precise, we need ζᵇʳᵛ⁽ᵏ⁾⁻¹²⁸. However, as ζ⁻¹²⁸ = -1,
// we can use the existing Zetas table instead of
// keeping a separate InvZetas table as in Dilithium.
minZeta := int32(Zetas[k])
k--
for j := offset; j < offset+l; j++ {
// Gentleman-Sande butterfly: (a, b) ↦ (a + b, ζ(a-b))
t := p[j+l] - p[j]
p[j] += p[j+l]
p[j+l] = montReduce(minZeta * int32(t))
// Note that if we had |a| < αq and |b| < βq before the
// butterfly, then now we have |a| < (α+β)q and |b| < q.
}
}
// We let the InvNTTReductions instruct us which coefficients to
// Barrett reduce. See TestInvNTTReductions, which tests whether
// there is an overflow.
for {
r++
i := InvNTTReductions[r]
if i < 0 {
break
}
p[i] = barrettReduce(p[i])
}
}
for j := 0; j < N; j++ {
// Note 1441 = (128)⁻¹ R². The coefficients are bounded by 9q, so
// as 1441 * 9 ≈ 2¹⁴ < 2¹⁵, we're within the required bounds
// for montReduce().
p[j] = montReduce(1441 * int32(p[j]))
}
}
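Taking the two contracts above together, the round trip InvNTT(NTT(p)) should act as multiplication by R, i.e. agree with ToMont. A hypothetical sketch of that check, written directly against the generic implementations:
package common
import "testing"
// TestNTTRoundTripSketch is a hypothetical check: by the comments above,
// NTT preserves the form of its input and InvNTT multiplies by R, so
// invNTT(ntt(p)) should equal p·R, i.e. ToMont(p), after normalization.
func TestNTTRoundTripSketch(t *testing.T) {
	var p, want Poly
	for i := 0; i < N; i++ {
		p[i] = int16((i*i + 7*i + 3) % int(Q)) // arbitrary normalized input
	}
	want = p
	want.ToMont()
	want.normalizeGeneric()
	p.nttGeneric()           // output is bounded by 7q in absolute value, ...
	p.barrettReduceGeneric() // ... so reduce to meet invNTT's ≤q assumption
	p.invNTTGeneric()
	p.normalizeGeneric()
	if p != want {
		t.Fatal("invNTT∘NTT does not match ToMont")
	}
}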


@@ -0,0 +1,22 @@
package common
import (
"github.com/cloudflare/circl/pke/kyber/internal/common/params"
)
const (
// Q is the parameter q ≡ 3329 = 2¹¹ + 2¹⁰ + 2⁸ + 1.
Q = params.Q
// N is the parameter N: the length of the polynomials
N = params.N
// PolySize is the size of a packed polynomial.
PolySize = params.PolySize
// PlaintextSize is the size of the plaintext
PlaintextSize = params.PlaintextSize
// Eta2 is the parameter η₂
Eta2 = params.Eta2
)


@@ -0,0 +1,21 @@
package params
// We put these parameters in a separate package so that the Go code,
// such as asm/src.go, that generates assembler can import it.
const (
// Q is the parameter q ≡ 3329 = 2¹¹ + 2¹⁰ + 2⁸ + 1.
Q int16 = 3329
// N is the parameter N: the length of the polynomials
N int = 256
// PolySize is the size of a packed polynomial.
PolySize int = 384
// PlaintextSize is the size of the plaintext
PlaintextSize = 32
// Eta2 is the parameter η₂
Eta2 = 2
)


@@ -0,0 +1,324 @@
package common
// An element of our base ring R, which consists of polynomials over ℤ_q
// modulo the equation Xᴺ = -1, where q=3329 and N=256.
//
// This type is also used to store NTT-transformed polynomials,
// see Poly.NTT().
//
// Coefficients aren't always reduced. See Normalize().
type Poly [N]int16
// Sets p to a + b. Does not normalize coefficients.
func (p *Poly) addGeneric(a, b *Poly) {
for i := 0; i < N; i++ {
p[i] = a[i] + b[i]
}
}
// Sets p to a - b. Does not normalize coefficients.
func (p *Poly) subGeneric(a, b *Poly) {
for i := 0; i < N; i++ {
p[i] = a[i] - b[i]
}
}
// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) barrettReduceGeneric() {
for i := 0; i < N; i++ {
p[i] = barrettReduce(p[i])
}
}
// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) normalizeGeneric() {
for i := 0; i < N; i++ {
p[i] = csubq(barrettReduce(p[i]))
}
}
// Multiplies p in-place by the Montgomery factor 2¹⁶.
//
// Coefficients of p can be arbitrary. Resulting coefficients are bounded
// in absolute value by q.
func (p *Poly) ToMont() {
for i := 0; i < N; i++ {
p[i] = toMont(p[i])
}
}
// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
// Montgomery form. Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
// tangled order as well.
func (p *Poly) mulHatGeneric(a, b *Poly) {
// Recall from the discussion in NTT(), that a transformed polynomial is
// an element of ℤ_q[x]/(x²-ζ) × … × ℤ_q[x]/(x²+ζ¹²⁷);
// that is: 128 degree-one polynomials instead of simply 256 elements
// from ℤ_q as in the regular NTT. So instead of pointwise multiplication,
// we multiply the 128 pairs of degree-one polynomials modulo the
// right equation:
//
// (a₁ + a₂x)(b₁ + b₂x) = a₁b₁ + a₂b₂ζ' + (a₁b₂ + a₂b₁)x,
//
// where ζ' is the appropriate power of ζ.
k := 64
for i := 0; i < N; i += 4 {
zeta := int32(Zetas[k])
k++
p0 := montReduce(int32(a[i+1]) * int32(b[i+1]))
p0 = montReduce(int32(p0) * zeta)
p0 += montReduce(int32(a[i]) * int32(b[i]))
p1 := montReduce(int32(a[i]) * int32(b[i+1]))
p1 += montReduce(int32(a[i+1]) * int32(b[i]))
p[i] = p0
p[i+1] = p1
p2 := montReduce(int32(a[i+3]) * int32(b[i+3]))
p2 = -montReduce(int32(p2) * zeta)
p2 += montReduce(int32(a[i+2]) * int32(b[i+2]))
p3 := montReduce(int32(a[i+2]) * int32(b[i+3]))
p3 += montReduce(int32(a[i+3]) * int32(b[i+2]))
p[i+2] = p2
p[i+3] = p3
}
}
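With inputs in regular (non-Montgomery) form, the montReduce calls above leave a single factor R⁻¹ on every coefficient, which the factor R from InvNTT cancels, so NTT → MulHat → InvNTT computes the plain product in R. A hypothetical sketch checking this against a schoolbook negacyclic product, on the generic code path:
package common
import "testing"
// TestMulHatSchoolbookSketch is a hypothetical check that the NTT-based
// product matches the schoolbook product in ℤ_q[x]/(x²⁵⁶+1), where
// xⁱ⁺ʲ wraps around to -xⁱ⁺ʲ⁻²⁵⁶.
func TestMulHatSchoolbookSketch(t *testing.T) {
	var a, b Poly
	for i := 0; i < N; i++ {
		a[i] = int16(i % 9)
		b[i] = int16((5*i + 1) % 11)
	}
	var want Poly
	for i := 0; i < N; i++ {
		for j := 0; j < N; j++ {
			prod := int64(a[i]) * int64(b[j])
			k := i + j
			if k >= N {
				k -= N
				prod = -prod // x²⁵⁶ = -1 in the base ring
			}
			s := (int64(want[k]) + prod) % int64(Q)
			want[k] = int16((s + int64(Q)) % int64(Q))
		}
	}
	a.nttGeneric()
	a.barrettReduceGeneric() // keep pairwise products well below 2¹⁵q
	b.nttGeneric()
	b.barrettReduceGeneric()
	var c Poly
	c.mulHatGeneric(&a, &b)  // leaves a factor R⁻¹: a and b are in regular form
	c.barrettReduceGeneric() // meet invNTT's ≤q input assumption
	c.invNTTGeneric()        // multiplies by R, cancelling the R⁻¹
	c.normalizeGeneric()
	if c != want {
		t.Fatal("NTT-based product disagrees with schoolbook product")
	}
}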
// Packs p into buf. buf should be of length PolySize.
//
// Assumes p is normalized (and not just Barrett reduced) and "tangled",
// see Tangle().
func (p *Poly) Pack(buf []byte) {
q := *p
q.Detangle()
for i := 0; i < 128; i++ {
t0 := q[2*i]
t1 := q[2*i+1]
buf[3*i] = byte(t0)
buf[3*i+1] = byte(t0>>8) | byte(t1<<4)
buf[3*i+2] = byte(t1 >> 4)
}
}
// Unpacks p from buf.
//
// buf should be of length PolySize. p will be "tangled", see Detangle().
//
// p will not be normalized; instead 0 ≤ p[i] < 4096.
func (p *Poly) Unpack(buf []byte) {
for i := 0; i < 128; i++ {
p[2*i] = int16(buf[3*i]) | ((int16(buf[3*i+1]) << 8) & 0xfff)
p[2*i+1] = int16(buf[3*i+1]>>4) | (int16(buf[3*i+2]) << 4)
}
p.Tangle()
}
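Each coefficient occupies exactly 12 bits and q < 2¹², so this packing is lossless on normalized polynomials; a hypothetical round-trip sketch:
package common
import "testing"
// TestPackUnpackSketch is a hypothetical check that Unpack inverts Pack
// on normalized polynomials.
func TestPackUnpackSketch(t *testing.T) {
	var p Poly
	for i := 0; i < N; i++ {
		p[i] = int16((13*i + 1) % int(Q)) // normalized: 0 ≤ p[i] < q
	}
	var buf [PolySize]byte
	p.Pack(buf[:])
	var q2 Poly
	q2.Unpack(buf[:])
	if q2 != p {
		t.Fatal("Pack/Unpack round trip failed")
	}
}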
// Sets p to Decompress_q(m, 1).
//
// p will be normalized. m has to be of length PlaintextSize.
func (p *Poly) DecompressMessage(m []byte) {
// Decompress_q(x, 1) = ⌈xq/2⌋ = ⌊xq/2+½⌋ = (xq+1) >> 1 and so
// Decompress_q(0, 1) = 0 and Decompress_q(1, 1) = (q+1)/2.
for i := 0; i < 32; i++ {
for j := 0; j < 8; j++ {
bit := (m[i] >> uint(j)) & 1
// Set coefficient to either 0 or (q+1)/2 depending on the bit.
p[8*i+j] = -int16(bit) & ((Q + 1) / 2)
}
}
}
// Writes Compress_q(p, 1) to m.
//
// Assumes p is normalized. m has to be of length at least PlaintextSize.
func (p *Poly) CompressMessageTo(m []byte) {
// Compress_q(x, 1) is 1 on {833, …, 2496} and zero elsewhere.
for i := 0; i < 32; i++ {
m[i] = 0
for j := 0; j < 8; j++ {
x := 1664 - p[8*i+j]
// With the previous substitution, we want to return 1 if
// and only if x is in {-832, …, 831}.
x = (x >> 15) ^ x
// Note (x >> 15) ^ x = x if x ≥ 0 and -x-1 otherwise. Thus now we
// want to return 1 iff x ≤ 831, ie. x - 832 < 0.
x -= 832
m[i] |= ((byte(x >> 15)) & 1) << uint(j)
}
}
}
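The two message maps invert each other: DecompressMessage produces only 0 and (q+1)/2 = 1665, and 1665 lies inside {833, …, 2496} while 0 does not. A hypothetical sketch:
package common
import "testing"
// TestMessageRoundTripSketch is a hypothetical check that
// CompressMessageTo inverts DecompressMessage, since
// Decompress_q(1, 1) = 1665 compresses back to 1 and
// Decompress_q(0, 1) = 0 compresses back to 0.
func TestMessageRoundTripSketch(t *testing.T) {
	m := make([]byte, PlaintextSize)
	for i := range m {
		m[i] = byte(37*i + 1) // arbitrary bit pattern
	}
	var p Poly
	p.DecompressMessage(m) // p is normalized afterwards
	m2 := make([]byte, PlaintextSize)
	p.CompressMessageTo(m2)
	for i := range m {
		if m2[i] != m[i] {
			t.Fatalf("byte %d: got %08b, want %08b", i, m2[i], m[i])
		}
	}
}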
// Sets p to Decompress_q(m, d).
//
// Assumes d is in {4, 5, 10, 11}. p will be normalized.
func (p *Poly) Decompress(m []byte, d int) {
// Decompress_q(x, d) = ⌈(q/2ᵈ)x⌋
// = ⌊(q/2ᵈ)x+½⌋
// = ⌊(qx + 2ᵈ⁻¹)/2ᵈ⌋
// = (qx + (1<<(d-1))) >> d
switch d {
case 4:
for i := 0; i < N/2; i++ {
p[2*i] = int16(((1 << 3) +
uint32(m[i]&15)*uint32(Q)) >> 4)
p[2*i+1] = int16(((1 << 3) +
uint32(m[i]>>4)*uint32(Q)) >> 4)
}
case 5:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
t[0] = uint16(m[idx])
t[1] = (uint16(m[idx]) >> 5) | (uint16(m[idx+1] << 3))
t[2] = uint16(m[idx+1]) >> 2
t[3] = (uint16(m[idx+1]) >> 7) | (uint16(m[idx+2] << 1))
t[4] = (uint16(m[idx+2]) >> 4) | (uint16(m[idx+3] << 4))
t[5] = uint16(m[idx+3]) >> 1
t[6] = (uint16(m[idx+3]) >> 6) | (uint16(m[idx+4] << 2))
t[7] = uint16(m[idx+4]) >> 3
for j := 0; j < 8; j++ {
p[8*i+j] = int16(((1 << 4) +
uint32(t[j]&((1<<5)-1))*uint32(Q)) >> 5)
}
idx += 5
}
case 10:
var t [4]uint16
idx := 0
for i := 0; i < N/4; i++ {
t[0] = uint16(m[idx]) | (uint16(m[idx+1]) << 8)
t[1] = (uint16(m[idx+1]) >> 2) | (uint16(m[idx+2]) << 6)
t[2] = (uint16(m[idx+2]) >> 4) | (uint16(m[idx+3]) << 4)
t[3] = (uint16(m[idx+3]) >> 6) | (uint16(m[idx+4]) << 2)
for j := 0; j < 4; j++ {
p[4*i+j] = int16(((1 << 9) +
uint32(t[j]&((1<<10)-1))*uint32(Q)) >> 10)
}
idx += 5
}
case 11:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
t[0] = uint16(m[idx]) | (uint16(m[idx+1]) << 8)
t[1] = (uint16(m[idx+1]) >> 3) | (uint16(m[idx+2]) << 5)
t[2] = (uint16(m[idx+2]) >> 6) | (uint16(m[idx+3]) << 2) | (uint16(m[idx+4]) << 10)
t[3] = (uint16(m[idx+4]) >> 1) | (uint16(m[idx+5]) << 7)
t[4] = (uint16(m[idx+5]) >> 4) | (uint16(m[idx+6]) << 4)
t[5] = (uint16(m[idx+6]) >> 7) | (uint16(m[idx+7]) << 1) | (uint16(m[idx+8]) << 9)
t[6] = (uint16(m[idx+8]) >> 2) | (uint16(m[idx+9]) << 6)
t[7] = (uint16(m[idx+9]) >> 5) | (uint16(m[idx+10]) << 3)
for j := 0; j < 8; j++ {
p[8*i+j] = int16(((1 << 10) +
uint32(t[j]&((1<<11)-1))*uint32(Q)) >> 11)
}
idx += 11
}
default:
panic("unsupported d")
}
}
// Writes Compress_q(p, d) to m.
//
// Assumes p is normalized and d is in {4, 5, 10, 11}.
func (p *Poly) CompressTo(m []byte, d int) {
// Compress_q(x, d) = ⌈(2ᵈ/q)x⌋ mod⁺ 2ᵈ
// = ⌊(2ᵈ/q)x+½⌋ mod⁺ 2ᵈ
// = ⌊((x << d) + q/2) / q⌋ mod⁺ 2ᵈ
// = DIV((x << d) + q/2, q) & ((1<<d) - 1)
switch d {
case 4:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
for j := 0; j < 8; j++ {
t[j] = uint16(((uint32(p[8*i+j])<<4)+uint32(Q)/2)/
uint32(Q)) & ((1 << 4) - 1)
}
m[idx] = byte(t[0]) | byte(t[1]<<4)
m[idx+1] = byte(t[2]) | byte(t[3]<<4)
m[idx+2] = byte(t[4]) | byte(t[5]<<4)
m[idx+3] = byte(t[6]) | byte(t[7]<<4)
idx += 4
}
case 5:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
for j := 0; j < 8; j++ {
t[j] = uint16(((uint32(p[8*i+j])<<5)+uint32(Q)/2)/
uint32(Q)) & ((1 << 5) - 1)
}
m[idx] = byte(t[0]) | byte(t[1]<<5)
m[idx+1] = byte(t[1]>>3) | byte(t[2]<<2) | byte(t[3]<<7)
m[idx+2] = byte(t[3]>>1) | byte(t[4]<<4)
m[idx+3] = byte(t[4]>>4) | byte(t[5]<<1) | byte(t[6]<<6)
m[idx+4] = byte(t[6]>>2) | byte(t[7]<<3)
idx += 5
}
case 10:
var t [4]uint16
idx := 0
for i := 0; i < N/4; i++ {
for j := 0; j < 4; j++ {
t[j] = uint16(((uint32(p[4*i+j])<<10)+uint32(Q)/2)/
uint32(Q)) & ((1 << 10) - 1)
}
m[idx] = byte(t[0])
m[idx+1] = byte(t[0]>>8) | byte(t[1]<<2)
m[idx+2] = byte(t[1]>>6) | byte(t[2]<<4)
m[idx+3] = byte(t[2]>>4) | byte(t[3]<<6)
m[idx+4] = byte(t[3] >> 2)
idx += 5
}
case 11:
var t [8]uint16
idx := 0
for i := 0; i < N/8; i++ {
for j := 0; j < 8; j++ {
t[j] = uint16(((uint32(p[8*i+j])<<11)+uint32(Q)/2)/
uint32(Q)) & ((1 << 11) - 1)
}
m[idx] = byte(t[0])
m[idx+1] = byte(t[0]>>8) | byte(t[1]<<3)
m[idx+2] = byte(t[1]>>5) | byte(t[2]<<6)
m[idx+3] = byte(t[2] >> 2)
m[idx+4] = byte(t[2]>>10) | byte(t[3]<<1)
m[idx+5] = byte(t[3]>>7) | byte(t[4]<<4)
m[idx+6] = byte(t[4]>>4) | byte(t[5]<<7)
m[idx+7] = byte(t[5] >> 1)
m[idx+8] = byte(t[5]>>9) | byte(t[6]<<2)
m[idx+9] = byte(t[6]>>6) | byte(t[7]<<5)
m[idx+10] = byte(t[7] >> 3)
idx += 11
}
default:
panic("unsupported d")
}
}
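In the other direction Compress_q(Decompress_q(y, d), d) = y for all of these d: decompression introduces an error of at most ½, compression scales that down by 2ᵈ/q, and 2ᵈ⁻¹/q < ½ even for d = 11. A hypothetical sketch:
package common
import "testing"
// TestCompressRoundTripSketch is a hypothetical check that decompressing
// packed d-bit values and compressing them again is the identity, for
// each supported d.
func TestCompressRoundTripSketch(t *testing.T) {
	for _, d := range []int{4, 5, 10, 11} {
		m := make([]byte, N*d/8)
		for i := range m {
			m[i] = byte(89*i + 17) // arbitrary packed d-bit values
		}
		var p Poly
		p.Decompress(m, d) // p is normalized afterwards
		m2 := make([]byte, len(m))
		p.CompressTo(m2, d)
		for i := range m {
			if m2[i] != m[i] {
				t.Fatalf("d=%d byte %d: got %#x, want %#x", d, i, m2[i], m[i])
			}
		}
	}
}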


@@ -0,0 +1,236 @@
package common
import (
"encoding/binary"
"github.com/cloudflare/circl/internal/sha3"
"github.com/cloudflare/circl/simd/keccakf1600"
)
// DeriveX4Available indicates whether the system supports the quick fourway
// sampling variants like PolyDeriveUniformX4.
var DeriveX4Available = keccakf1600.IsEnabledX4()
// Samples p from a centered binomial distribution with given η.
//
// Essentially CBD_η(PRF(seed, nonce)) from the specification.
func (p *Poly) DeriveNoise(seed []byte, nonce uint8, eta int) {
switch eta {
case 2:
p.DeriveNoise2(seed, nonce)
case 3:
p.DeriveNoise3(seed, nonce)
default:
panic("unsupported eta")
}
}
// Sample p from a centered binomial distribution with n=6 and p=½ - that is:
// coefficients are in {-3, -2, -1, 0, 1, 2, 3} with probabilities {1/64, 3/32,
// 15/64, 5/16, 15/64, 3/32, 1/64}.
func (p *Poly) DeriveNoise3(seed []byte, nonce uint8) {
keySuffix := [1]byte{nonce}
h := sha3.NewShake256()
_, _ = h.Write(seed[:])
_, _ = h.Write(keySuffix[:])
// The distribution at hand is exactly the same as that
// of (a₁ + a₂ + a₃) - (b₁ + b₂ + b₃) where aᵢ, bᵢ ~ U(1). Thus we need
// 6 bits per coefficient, i.e. 192 bytes of input entropy.
// We add two extra zero bytes in the buffer to be able to read 8 bytes
// at the same time (while using only 6).
var buf [192 + 2]byte
_, _ = h.Read(buf[:192])
for i := 0; i < 32; i++ {
// t is interpreted as a₁ + 2a₂ + 4a₃ + 8b₁ + 16b₂ + ….
t := binary.LittleEndian.Uint64(buf[6*i:])
d := t & 0x249249249249 // a₁ + 8b₁ + …
d += (t >> 1) & 0x249249249249 // a₁ + a₂ + 8(b₁ + b₂) + …
d += (t >> 2) & 0x249249249249 // a₁ + a₂ + a₃ + 8(b₁ + b₂ + b₃) + …
for j := 0; j < 8; j++ {
a := int16(d) & 0x7 // a₁ + a₂ + a₃
d >>= 3
b := int16(d) & 0x7 // b₁ + b₂ + b₃
d >>= 3
p[8*i+j] = a - b
}
}
}
// Sample p from a centered binomial distribution with n=4 and p=½ - that is:
// coefficients are in {-2, -1, 0, 1, 2} with probabilities {1/16, 1/4,
// 3/8, 1/4, 1/16}.
func (p *Poly) DeriveNoise2(seed []byte, nonce uint8) {
keySuffix := [1]byte{nonce}
h := sha3.NewShake256()
_, _ = h.Write(seed[:])
_, _ = h.Write(keySuffix[:])
// The distribution at hand is exactly the same as that
// of (a + a') - (b + b') where a, a', b, b' ~ U(1). Thus we need 4 bits
// per coefficient, i.e. 128 bytes of input entropy.
var buf [128]byte
_, _ = h.Read(buf[:])
for i := 0; i < 16; i++ {
// t is interpreted as a + 2a' + 4b + 8b' + ….
t := binary.LittleEndian.Uint64(buf[8*i:])
d := t & 0x5555555555555555 // a + 4b + …
d += (t >> 1) & 0x5555555555555555 // a+a' + 4(b + b') + …
for j := 0; j < 16; j++ {
a := int16(d) & 0x3
d >>= 2
b := int16(d) & 0x3
d >>= 2
p[16*i+j] = a - b
}
}
}
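The masking trick is easy to cross-check against straightforward bit counting; a hypothetical sketch for the η=2 case (the η=3 trick with mask 0x249249249249 works the same way, three bits per half):
package common
import (
	"math/bits"
	"testing"
)
// TestCBD2MaskSketch is a hypothetical check of the mask trick in
// DeriveNoise2: for each 4-bit group a + 2a' + 4b + 8b' of t, the two
// 2-bit fields of d hold a + a' and b + b', i.e. the popcounts of the
// two bit pairs.
func TestCBD2MaskSketch(t *testing.T) {
	for n := uint64(0); n < 1<<16; n++ {
		tt := n * 0x9e3779b97f4a7c15 // cheap pseudo-random 64-bit values
		d := tt & 0x5555555555555555
		d += (tt >> 1) & 0x5555555555555555
		for j := 0; j < 16; j++ {
			group := (tt >> uint(4*j)) & 0xf
			a := (d >> uint(4*j)) & 0x3
			b := (d >> uint(4*j+2)) & 0x3
			if a != uint64(bits.OnesCount64(group&0x3)) ||
				b != uint64(bits.OnesCount64(group&0xc)) {
				t.Fatalf("mask trick disagrees for t=%#x, group %d", tt, j)
			}
		}
	}
}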
// For each i, sample ps[i] uniformly from the given seed for coordinates
// xs[i] and ys[i]. ps[i] may be nil and is ignored in that case.
//
// Can only be called when DeriveX4Available is true.
func PolyDeriveUniformX4(ps [4]*Poly, seed *[32]byte, xs, ys [4]uint8) {
var perm keccakf1600.StateX4
state := perm.Initialize()
// Absorb the seed in the four states
for i := 0; i < 4; i++ {
v := binary.LittleEndian.Uint64(seed[8*i : 8*(i+1)])
for j := 0; j < 4; j++ {
state[i*4+j] = v
}
}
// Absorb the coordinates, the SHAKE128 domain separator (0b1111), the
// start of the padding (0b…001) and the end of the padding 0b100….
// Recall that the rate of SHAKE128 is 168; ie. 21 uint64s.
for j := 0; j < 4; j++ {
state[4*4+j] = uint64(xs[j]) | (uint64(ys[j]) << 8) | (0x1f << 16)
state[20*4+j] = 0x80 << 56
}
var idx [4]int // indices into ps
for j := 0; j < 4; j++ {
if ps[j] == nil {
idx[j] = N // mark nil polynomials as completed
}
}
done := false
for !done {
// Applies Keccak-f[1600] to the state to get the next 21 uint64s of each of
// the four SHAKE128 streams.
perm.Permute()
done = true
PolyLoop:
for j := 0; j < 4; j++ {
if idx[j] == N {
continue
}
for i := 0; i < 7; i++ {
var t [16]uint16
v1 := state[i*3*4+j]
v2 := state[(i*3+1)*4+j]
v3 := state[(i*3+2)*4+j]
t[0] = uint16(v1) & 0xfff
t[1] = uint16(v1>>12) & 0xfff
t[2] = uint16(v1>>24) & 0xfff
t[3] = uint16(v1>>36) & 0xfff
t[4] = uint16(v1>>48) & 0xfff
t[5] = uint16((v1>>60)|(v2<<4)) & 0xfff
t[6] = uint16(v2>>8) & 0xfff
t[7] = uint16(v2>>20) & 0xfff
t[8] = uint16(v2>>32) & 0xfff
t[9] = uint16(v2>>44) & 0xfff
t[10] = uint16((v2>>56)|(v3<<8)) & 0xfff
t[11] = uint16(v3>>4) & 0xfff
t[12] = uint16(v3>>16) & 0xfff
t[13] = uint16(v3>>28) & 0xfff
t[14] = uint16(v3>>40) & 0xfff
t[15] = uint16(v3>>52) & 0xfff
for k := 0; k < 16; k++ {
if t[k] < uint16(Q) {
ps[j][idx[j]] = int16(t[k])
idx[j]++
if idx[j] == N {
continue PolyLoop
}
}
}
}
done = false
}
}
for i := 0; i < 4; i++ {
if ps[i] != nil {
ps[i].Tangle()
}
}
}
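Callers are expected to test DeriveX4Available and fall back to the scalar sampler; a hypothetical dispatch helper showing the intended pattern (the name deriveUniformBatch is mine, not part of this commit):
package common
// deriveUniformBatch (hypothetical helper): samples four polynomials
// uniformly for the given coordinates, using the fourway sampler when
// the platform supports it and four scalar calls otherwise.
func deriveUniformBatch(ps *[4]Poly, seed *[32]byte, xs, ys [4]uint8) {
	if DeriveX4Available {
		PolyDeriveUniformX4(
			[4]*Poly{&ps[0], &ps[1], &ps[2], &ps[3]},
			seed, xs, ys,
		)
		return
	}
	for i := 0; i < 4; i++ {
		ps[i].DeriveUniform(seed, xs[i], ys[i])
	}
}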
// Sample p uniformly from the given seed and x and y coordinates.
//
// Coefficients are reduced and will be in "tangled" order. See Tangle().
func (p *Poly) DeriveUniform(seed *[32]byte, x, y uint8) {
var seedSuffix [2]byte
var buf [168]byte // rate of SHAKE-128
seedSuffix[0] = x
seedSuffix[1] = y
h := sha3.NewShake128()
_, _ = h.Write(seed[:])
_, _ = h.Write(seedSuffix[:])
i := 0
for {
_, _ = h.Read(buf[:])
for j := 0; j < 168; j += 3 {
t1 := (uint16(buf[j]) | (uint16(buf[j+1]) << 8)) & 0xfff
t2 := (uint16(buf[j+1]>>4) | (uint16(buf[j+2]) << 4)) & 0xfff
if t1 < uint16(Q) {
p[i] = int16(t1)
i++
if i == N {
break
}
}
if t2 < uint16(Q) {
p[i] = int16(t2)
i++
if i == N {
break
}
}
}
if i == N {
break
}
}
p.Tangle()
}


@@ -0,0 +1,33 @@
// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.
//go:build amd64
// +build amd64
package common
//go:noescape
func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
//go:noescape
func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
//go:noescape
func nttAVX2(p *[256]int16)
//go:noescape
func invNttAVX2(p *[256]int16)
//go:noescape
func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
//go:noescape
func detangleAVX2(p *[256]int16)
//go:noescape
func tangleAVX2(p *[256]int16)
//go:noescape
func barrettReduceAVX2(p *[256]int16)
//go:noescape
func normalizeAVX2(p *[256]int16)