core/stdarch/crates/core_arch/src/x86/
gfni.rs

1//! Galois Field New Instructions (GFNI)
2//!
3//! The intrinsics here correspond to those in the `immintrin.h` C header.
4//!
5//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
6//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
7//!
8//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
9
10use crate::core_arch::simd::i8x16;
11use crate::core_arch::simd::i8x32;
12use crate::core_arch::simd::i8x64;
13use crate::core_arch::x86::__m128i;
14use crate::core_arch::x86::__m256i;
15use crate::core_arch::x86::__m512i;
16use crate::core_arch::x86::__mmask16;
17use crate::core_arch::x86::__mmask32;
18use crate::core_arch::x86::__mmask64;
19use crate::intrinsics::simd::simd_select_bitmask;
20use crate::mem::transmute;
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
25#[allow(improper_ctypes)]
26unsafe extern "C" {
27    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
28    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
29    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
30    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
31    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
32    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
33    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
34    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
35    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
36    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
37    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
38    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
39    #[link_name = "llvm.x86.vgf2p8mulb.512"]
40    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
41    #[link_name = "llvm.x86.vgf2p8mulb.256"]
42    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
43    #[link_name = "llvm.x86.vgf2p8mulb.128"]
44    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
45}
46
// LLVM requires AVX512BW for a lot of these instructions, see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
// However, our tests also require the target feature list to match Intel's,
// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant
// AVX512F requirement (for now).
// Also see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
// for the forcing of GFNI, BW and, optionally, the VL extension.
55
56/// Performs a multiplication in GF(2^8) on the packed bytes.
57/// The field is in polynomial representation with the reduction polynomial
58///  x^8 + x^4 + x^3 + x + 1.
59///
60/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
61#[inline]
62#[target_feature(enable = "gfni,avx512f")]
63#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
64#[cfg_attr(test, assert_instr(vgf2p8mulb))]
65pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
66    transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
67}
68
69/// Performs a multiplication in GF(2^8) on the packed bytes.
70/// The field is in polynomial representation with the reduction polynomial
71///  x^8 + x^4 + x^3 + x + 1.
72///
73/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
74/// Otherwise the computation result is written into the result.
75///
76/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
77#[inline]
78#[target_feature(enable = "gfni,avx512bw,avx512f")]
79#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
80#[cfg_attr(test, assert_instr(vgf2p8mulb))]
81pub unsafe fn _mm512_mask_gf2p8mul_epi8(
82    src: __m512i,
83    k: __mmask64,
84    a: __m512i,
85    b: __m512i,
86) -> __m512i {
87    transmute(simd_select_bitmask(
88        k,
89        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
90        src.as_i8x64(),
91    ))
92}
93
94/// Performs a multiplication in GF(2^8) on the packed bytes.
95/// The field is in polynomial representation with the reduction polynomial
96///  x^8 + x^4 + x^3 + x + 1.
97///
98/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
99/// Otherwise the computation result is written into the result.
100///
101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
102#[inline]
103#[target_feature(enable = "gfni,avx512bw,avx512f")]
104#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
105#[cfg_attr(test, assert_instr(vgf2p8mulb))]
106pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
107    let zero = i8x64::ZERO;
108    transmute(simd_select_bitmask(
109        k,
110        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
111        zero,
112    ))
113}
114
115/// Performs a multiplication in GF(2^8) on the packed bytes.
116/// The field is in polynomial representation with the reduction polynomial
117///  x^8 + x^4 + x^3 + x + 1.
118///
119/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
120#[inline]
121#[target_feature(enable = "gfni,avx")]
122#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
123#[cfg_attr(test, assert_instr(vgf2p8mulb))]
124pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
125    transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
126}
127
128/// Performs a multiplication in GF(2^8) on the packed bytes.
129/// The field is in polynomial representation with the reduction polynomial
130///  x^8 + x^4 + x^3 + x + 1.
131///
132/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
133/// Otherwise the computation result is written into the result.
134///
135/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
136#[inline]
137#[target_feature(enable = "gfni,avx512bw,avx512vl")]
138#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
139#[cfg_attr(test, assert_instr(vgf2p8mulb))]
140pub unsafe fn _mm256_mask_gf2p8mul_epi8(
141    src: __m256i,
142    k: __mmask32,
143    a: __m256i,
144    b: __m256i,
145) -> __m256i {
146    transmute(simd_select_bitmask(
147        k,
148        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
149        src.as_i8x32(),
150    ))
151}
152
153/// Performs a multiplication in GF(2^8) on the packed bytes.
154/// The field is in polynomial representation with the reduction polynomial
155///  x^8 + x^4 + x^3 + x + 1.
156///
157/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
158/// Otherwise the computation result is written into the result.
159///
160/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
161#[inline]
162#[target_feature(enable = "gfni,avx512bw,avx512vl")]
163#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
164#[cfg_attr(test, assert_instr(vgf2p8mulb))]
165pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
166    let zero = i8x32::ZERO;
167    transmute(simd_select_bitmask(
168        k,
169        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
170        zero,
171    ))
172}
173
174/// Performs a multiplication in GF(2^8) on the packed bytes.
175/// The field is in polynomial representation with the reduction polynomial
176///  x^8 + x^4 + x^3 + x + 1.
177///
178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
179#[inline]
180#[target_feature(enable = "gfni")]
181#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
182#[cfg_attr(test, assert_instr(gf2p8mulb))]
183pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
184    transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
185}
186
187/// Performs a multiplication in GF(2^8) on the packed bytes.
188/// The field is in polynomial representation with the reduction polynomial
189///  x^8 + x^4 + x^3 + x + 1.
190///
191/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
192/// Otherwise the computation result is written into the result.
193///
194/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
195#[inline]
196#[target_feature(enable = "gfni,avx512bw,avx512vl")]
197#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
198#[cfg_attr(test, assert_instr(vgf2p8mulb))]
199pub unsafe fn _mm_mask_gf2p8mul_epi8(
200    src: __m128i,
201    k: __mmask16,
202    a: __m128i,
203    b: __m128i,
204) -> __m128i {
205    transmute(simd_select_bitmask(
206        k,
207        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
208        src.as_i8x16(),
209    ))
210}
211
212/// Performs a multiplication in GF(2^8) on the packed bytes.
213/// The field is in polynomial representation with the reduction polynomial
214///  x^8 + x^4 + x^3 + x + 1.
215///
216/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
217/// Otherwise the computation result is written into the result.
218///
219/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
220#[inline]
221#[target_feature(enable = "gfni,avx512bw,avx512vl")]
222#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
223#[cfg_attr(test, assert_instr(vgf2p8mulb))]
224pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
225    let zero = i8x16::ZERO;
226    transmute(simd_select_bitmask(
227        k,
228        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
229        zero,
230    ))
231}
232
233/// Performs an affine transformation on the packed bytes in x.
234/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
235/// and b being a constant 8-bit immediate value.
236/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
237///
238/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8)
239#[inline]
240#[target_feature(enable = "gfni,avx512f")]
241#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
242#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
243#[rustc_legacy_const_generics(2)]
244pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
245    static_assert_uimm_bits!(B, 8);
246    let b = B as u8;
247    let x = x.as_i8x64();
248    let a = a.as_i8x64();
249    let r = vgf2p8affineqb_512(x, a, b);
250    transmute(r)
251}
252
253/// Performs an affine transformation on the packed bytes in x.
254/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
255/// and b being a constant 8-bit immediate value.
256/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
257///
258/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
259/// Otherwise the computation result is written into the result.
260///
261/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8)
262#[inline]
263#[target_feature(enable = "gfni,avx512bw,avx512f")]
264#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
265#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
266#[rustc_legacy_const_generics(3)]
267pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
268    k: __mmask64,
269    x: __m512i,
270    a: __m512i,
271) -> __m512i {
272    static_assert_uimm_bits!(B, 8);
273    let b = B as u8;
274    let zero = i8x64::ZERO;
275    let x = x.as_i8x64();
276    let a = a.as_i8x64();
277    let r = vgf2p8affineqb_512(x, a, b);
278    transmute(simd_select_bitmask(k, r, zero))
279}
280
281/// Performs an affine transformation on the packed bytes in x.
282/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
283/// and b being a constant 8-bit immediate value.
284/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
285///
286/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
287/// Otherwise the computation result is written into the result.
288///
289/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8)
290#[inline]
291#[target_feature(enable = "gfni,avx512bw,avx512f")]
292#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
293#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
294#[rustc_legacy_const_generics(4)]
295pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
296    src: __m512i,
297    k: __mmask64,
298    x: __m512i,
299    a: __m512i,
300) -> __m512i {
301    static_assert_uimm_bits!(B, 8);
302    let b = B as u8;
303    let x = x.as_i8x64();
304    let a = a.as_i8x64();
305    let r = vgf2p8affineqb_512(x, a, b);
306    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
307}
308
309/// Performs an affine transformation on the packed bytes in x.
310/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
311/// and b being a constant 8-bit immediate value.
312/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
313///
314/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8)
315#[inline]
316#[target_feature(enable = "gfni,avx")]
317#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
318#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
319#[rustc_legacy_const_generics(2)]
320pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
321    static_assert_uimm_bits!(B, 8);
322    let b = B as u8;
323    let x = x.as_i8x32();
324    let a = a.as_i8x32();
325    let r = vgf2p8affineqb_256(x, a, b);
326    transmute(r)
327}
328
329/// Performs an affine transformation on the packed bytes in x.
330/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
331/// and b being a constant 8-bit immediate value.
332/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
333///
334/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
335/// Otherwise the computation result is written into the result.
336///
337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8)
338#[inline]
339#[target_feature(enable = "gfni,avx512bw,avx512vl")]
340#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
341#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
342#[rustc_legacy_const_generics(3)]
343pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
344    k: __mmask32,
345    x: __m256i,
346    a: __m256i,
347) -> __m256i {
348    static_assert_uimm_bits!(B, 8);
349    let b = B as u8;
350    let zero = i8x32::ZERO;
351    let x = x.as_i8x32();
352    let a = a.as_i8x32();
353    let r = vgf2p8affineqb_256(x, a, b);
354    transmute(simd_select_bitmask(k, r, zero))
355}
356
357/// Performs an affine transformation on the packed bytes in x.
358/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
359/// and b being a constant 8-bit immediate value.
360/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
361///
362/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
363/// Otherwise the computation result is written into the result.
364///
365/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8)
366#[inline]
367#[target_feature(enable = "gfni,avx512bw,avx512vl")]
368#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
369#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
370#[rustc_legacy_const_generics(4)]
371pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
372    src: __m256i,
373    k: __mmask32,
374    x: __m256i,
375    a: __m256i,
376) -> __m256i {
377    static_assert_uimm_bits!(B, 8);
378    let b = B as u8;
379    let x = x.as_i8x32();
380    let a = a.as_i8x32();
381    let r = vgf2p8affineqb_256(x, a, b);
382    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
383}
384
385/// Performs an affine transformation on the packed bytes in x.
386/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
387/// and b being a constant 8-bit immediate value.
388/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
389///
390/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8)
391#[inline]
392#[target_feature(enable = "gfni")]
393#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
394#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
395#[rustc_legacy_const_generics(2)]
396pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
397    static_assert_uimm_bits!(B, 8);
398    let b = B as u8;
399    let x = x.as_i8x16();
400    let a = a.as_i8x16();
401    let r = vgf2p8affineqb_128(x, a, b);
402    transmute(r)
403}
404
405/// Performs an affine transformation on the packed bytes in x.
406/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
407/// and b being a constant 8-bit immediate value.
408/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
409///
410/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
411/// Otherwise the computation result is written into the result.
412///
413/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8)
414#[inline]
415#[target_feature(enable = "gfni,avx512bw,avx512vl")]
416#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
417#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
418#[rustc_legacy_const_generics(3)]
419pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
420    k: __mmask16,
421    x: __m128i,
422    a: __m128i,
423) -> __m128i {
424    static_assert_uimm_bits!(B, 8);
425    let b = B as u8;
426    let zero = i8x16::ZERO;
427    let x = x.as_i8x16();
428    let a = a.as_i8x16();
429    let r = vgf2p8affineqb_128(x, a, b);
430    transmute(simd_select_bitmask(k, r, zero))
431}
432
433/// Performs an affine transformation on the packed bytes in x.
434/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
435/// and b being a constant 8-bit immediate value.
436/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
437///
438/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
439/// Otherwise the computation result is written into the result.
440///
441/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8)
442#[inline]
443#[target_feature(enable = "gfni,avx512bw,avx512vl")]
444#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
445#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
446#[rustc_legacy_const_generics(4)]
447pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
448    src: __m128i,
449    k: __mmask16,
450    x: __m128i,
451    a: __m128i,
452) -> __m128i {
453    static_assert_uimm_bits!(B, 8);
454    let b = B as u8;
455    let x = x.as_i8x16();
456    let a = a.as_i8x16();
457    let r = vgf2p8affineqb_128(x, a, b);
458    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
459}
460
461/// Performs an affine transformation on the inverted packed bytes in x.
462/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
463/// and b being a constant 8-bit immediate value.
464/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
465/// The inverse of 0 is 0.
466/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
467///
468/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
469#[inline]
470#[target_feature(enable = "gfni,avx512f")]
471#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
472#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
473#[rustc_legacy_const_generics(2)]
474pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
475    static_assert_uimm_bits!(B, 8);
476    let b = B as u8;
477    let x = x.as_i8x64();
478    let a = a.as_i8x64();
479    let r = vgf2p8affineinvqb_512(x, a, b);
480    transmute(r)
481}
482
483/// Performs an affine transformation on the inverted packed bytes in x.
484/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
485/// and b being a constant 8-bit immediate value.
486/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
487/// The inverse of 0 is 0.
488/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
489///
490/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
491/// Otherwise the computation result is written into the result.
492///
493/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
494#[inline]
495#[target_feature(enable = "gfni,avx512bw,avx512f")]
496#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
497#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
498#[rustc_legacy_const_generics(3)]
499pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
500    k: __mmask64,
501    x: __m512i,
502    a: __m512i,
503) -> __m512i {
504    static_assert_uimm_bits!(B, 8);
505    let b = B as u8;
506    let zero = i8x64::ZERO;
507    let x = x.as_i8x64();
508    let a = a.as_i8x64();
509    let r = vgf2p8affineinvqb_512(x, a, b);
510    transmute(simd_select_bitmask(k, r, zero))
511}
512
513/// Performs an affine transformation on the inverted packed bytes in x.
514/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
515/// and b being a constant 8-bit immediate value.
516/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
517/// The inverse of 0 is 0.
518/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
519///
520/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
521/// Otherwise the computation result is written into the result.
522///
523/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
524#[inline]
525#[target_feature(enable = "gfni,avx512bw,avx512f")]
526#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
527#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
528#[rustc_legacy_const_generics(4)]
529pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
530    src: __m512i,
531    k: __mmask64,
532    x: __m512i,
533    a: __m512i,
534) -> __m512i {
535    static_assert_uimm_bits!(B, 8);
536    let b = B as u8;
537    let x = x.as_i8x64();
538    let a = a.as_i8x64();
539    let r = vgf2p8affineinvqb_512(x, a, b);
540    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
541}
542
543/// Performs an affine transformation on the inverted packed bytes in x.
544/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
545/// and b being a constant 8-bit immediate value.
546/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
547/// The inverse of 0 is 0.
548/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
549///
550/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
551#[inline]
552#[target_feature(enable = "gfni,avx")]
553#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
554#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
555#[rustc_legacy_const_generics(2)]
556pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
557    static_assert_uimm_bits!(B, 8);
558    let b = B as u8;
559    let x = x.as_i8x32();
560    let a = a.as_i8x32();
561    let r = vgf2p8affineinvqb_256(x, a, b);
562    transmute(r)
563}
564
565/// Performs an affine transformation on the inverted packed bytes in x.
566/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
567/// and b being a constant 8-bit immediate value.
568/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
569/// The inverse of 0 is 0.
570/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
571///
572/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
573/// Otherwise the computation result is written into the result.
574///
575/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
576#[inline]
577#[target_feature(enable = "gfni,avx512bw,avx512vl")]
578#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
579#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
580#[rustc_legacy_const_generics(3)]
581pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
582    k: __mmask32,
583    x: __m256i,
584    a: __m256i,
585) -> __m256i {
586    static_assert_uimm_bits!(B, 8);
587    let b = B as u8;
588    let zero = i8x32::ZERO;
589    let x = x.as_i8x32();
590    let a = a.as_i8x32();
591    let r = vgf2p8affineinvqb_256(x, a, b);
592    transmute(simd_select_bitmask(k, r, zero))
593}
594
595/// Performs an affine transformation on the inverted packed bytes in x.
596/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
597/// and b being a constant 8-bit immediate value.
598/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
599/// The inverse of 0 is 0.
600/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
601///
602/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
603/// Otherwise the computation result is written into the result.
604///
605/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
606#[inline]
607#[target_feature(enable = "gfni,avx512bw,avx512vl")]
608#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
609#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
610#[rustc_legacy_const_generics(4)]
611pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
612    src: __m256i,
613    k: __mmask32,
614    x: __m256i,
615    a: __m256i,
616) -> __m256i {
617    static_assert_uimm_bits!(B, 8);
618    let b = B as u8;
619    let x = x.as_i8x32();
620    let a = a.as_i8x32();
621    let r = vgf2p8affineinvqb_256(x, a, b);
622    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
623}
624
625/// Performs an affine transformation on the inverted packed bytes in x.
626/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
627/// and b being a constant 8-bit immediate value.
628/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
629/// The inverse of 0 is 0.
630/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
631///
632/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
633#[inline]
634#[target_feature(enable = "gfni")]
635#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
636#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))]
637#[rustc_legacy_const_generics(2)]
638pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
639    static_assert_uimm_bits!(B, 8);
640    let b = B as u8;
641    let x = x.as_i8x16();
642    let a = a.as_i8x16();
643    let r = vgf2p8affineinvqb_128(x, a, b);
644    transmute(r)
645}
646
647/// Performs an affine transformation on the inverted packed bytes in x.
648/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
649/// and b being a constant 8-bit immediate value.
650/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
651/// The inverse of 0 is 0.
652/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
653///
654/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
655/// Otherwise the computation result is written into the result.
656///
657/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
658#[inline]
659#[target_feature(enable = "gfni,avx512bw,avx512vl")]
660#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
661#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
662#[rustc_legacy_const_generics(3)]
663pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
664    k: __mmask16,
665    x: __m128i,
666    a: __m128i,
667) -> __m128i {
668    static_assert_uimm_bits!(B, 8);
669    let b = B as u8;
670    let zero = i8x16::ZERO;
671    let x = x.as_i8x16();
672    let a = a.as_i8x16();
673    let r = vgf2p8affineinvqb_128(x, a, b);
674    transmute(simd_select_bitmask(k, r, zero))
675}
676
677/// Performs an affine transformation on the inverted packed bytes in x.
678/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
679/// and b being a constant 8-bit immediate value.
680/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
681/// The inverse of 0 is 0.
682/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
683///
684/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
685/// Otherwise the computation result is written into the result.
686///
687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
688#[inline]
689#[target_feature(enable = "gfni,avx512bw,avx512vl")]
690#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
691#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
692#[rustc_legacy_const_generics(4)]
693pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
694    src: __m128i,
695    k: __mmask16,
696    x: __m128i,
697    a: __m128i,
698) -> __m128i {
699    static_assert_uimm_bits!(B, 8);
700    let b = B as u8;
701    let x = x.as_i8x16();
702    let a = a.as_i8x16();
703    let r = vgf2p8affineinvqb_128(x, a, b);
704    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
705}
706
707#[cfg(test)]
708mod tests {
709    // The constants in the tests below are just bit patterns. They should not
710    // be interpreted as integers; signedness does not make sense for them, but
711    // __mXXXi happens to be defined in terms of signed integers.
712    #![allow(overflowing_literals)]
713
714    use core::hint::black_box;
715    use core::intrinsics::size_of;
716    use stdarch_test::simd_test;
717
718    use crate::core_arch::x86::*;
719
720    fn mulbyte(left: u8, right: u8) -> u8 {
721        // this implementation follows the description in
722        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
723        const REDUCTION_POLYNOMIAL: u16 = 0x11b;
724        let left: u16 = left.into();
725        let right: u16 = right.into();
726        let mut carryless_product: u16 = 0;
727
728        // Carryless multiplication
729        for i in 0..8 {
730            if ((left >> i) & 0x01) != 0 {
731                carryless_product ^= right << i;
732            }
733        }
734
735        // reduction, adding in "0" where appropriate to clear out high bits
736        // note that REDUCTION_POLYNOMIAL is zero in this context
737        for i in (8..=14).rev() {
738            if ((carryless_product >> i) & 0x01) != 0 {
739                carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
740            }
741        }
742
743        carryless_product as u8
744    }
745
746    const NUM_TEST_WORDS_512: usize = 4;
747    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
748    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
749    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
750    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
751    const NUM_BYTES: usize = 256;
752    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
753    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
754    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
755
756    fn parity(input: u8) -> u8 {
757        let mut accumulator = 0;
758        for i in 0..8 {
759            accumulator ^= (input >> i) & 0x01;
760        }
761        accumulator
762    }
763
764    fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
765        // this implementation follows the description in
766        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
767        let mut accumulator = 0;
768
769        for bit in 0..8 {
770            accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
771        }
772
773        accumulator ^ b
774    }
775
776    fn generate_affine_mul_test_data(
777        immediate: u8,
778    ) -> (
779        [u64; NUM_TEST_WORDS_64],
780        [u8; NUM_TEST_ENTRIES],
781        [u8; NUM_TEST_ENTRIES],
782    ) {
783        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
784        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
785        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
786
787        for i in 0..NUM_TEST_WORDS_64 {
788            left[i] = (i as u64) * 103 * 101;
789            for j in 0..8 {
790                let j64 = j as u64;
791                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
792                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
793            }
794        }
795
796        (left, right, result)
797    }
798
799    fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
800        let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
801        let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];
802
803        for i in 0..NUM_BYTES {
804            input[i] = (i % 256) as u8;
805            result[i] = if i == 0 { 0 } else { 1 };
806        }
807
808        (input, result)
809    }
810
    // AES substitution box, indexed by the input byte. The affine-inverse tests use it as the
    // expected output when every byte value is run through the inversion followed by the AES
    // affine step (AES_S_BOX_MATRIX with the immediate 0x63).
    const AES_S_BOX: [u8; NUM_BYTES] = [
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
        0x16,
    ];
831
832    fn generate_byte_mul_test_data() -> (
833        [u8; NUM_TEST_ENTRIES],
834        [u8; NUM_TEST_ENTRIES],
835        [u8; NUM_TEST_ENTRIES],
836    ) {
837        let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
838        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
839        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
840
841        for i in 0..NUM_TEST_ENTRIES {
842            left[i] = (i % 256) as u8;
843            right[i] = left[i].wrapping_mul(101);
844            result[i] = mulbyte(left[i], right[i]);
845        }
846
847        (left, right, result)
848    }
849
850    #[target_feature(enable = "sse2")]
851    #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
852    unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
853        let byte_offset = word_index * 16 / size_of::<T>();
854        let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
855        _mm_loadu_si128(black_box(pointer))
856    }
857
858    #[target_feature(enable = "avx")]
859    #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
860    unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
861        let byte_offset = word_index * 32 / size_of::<T>();
862        let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
863        _mm256_loadu_si256(black_box(pointer))
864    }
865
866    #[target_feature(enable = "avx512f")]
867    #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
868    unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
869        let byte_offset = word_index * 64 / size_of::<T>();
870        let pointer = data.as_ptr().add(byte_offset) as *const _;
871        _mm512_loadu_si512(black_box(pointer))
872    }
873
874    #[simd_test(enable = "gfni,avx512f")]
875    unsafe fn test_mm512_gf2p8mul_epi8() {
876        let (left, right, expected) = generate_byte_mul_test_data();
877
878        for i in 0..NUM_TEST_WORDS_512 {
879            let left = load_m512i_word(&left, i);
880            let right = load_m512i_word(&right, i);
881            let expected = load_m512i_word(&expected, i);
882            let result = _mm512_gf2p8mul_epi8(left, right);
883            assert_eq_m512i(result, expected);
884        }
885    }
886
887    #[simd_test(enable = "gfni,avx512bw")]
888    unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
889        let (left, right, _expected) = generate_byte_mul_test_data();
890
891        for i in 0..NUM_TEST_WORDS_512 {
892            let left = load_m512i_word(&left, i);
893            let right = load_m512i_word(&right, i);
894            let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
895            assert_eq_m512i(result_zero, _mm512_setzero_si512());
896            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
897            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
898            let expected_result = _mm512_gf2p8mul_epi8(left, right);
899            let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
900            let expected_masked =
901                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
902            assert_eq_m512i(result_masked, expected_masked);
903        }
904    }
905
906    #[simd_test(enable = "gfni,avx512bw")]
907    unsafe fn test_mm512_mask_gf2p8mul_epi8() {
908        let (left, right, _expected) = generate_byte_mul_test_data();
909
910        for i in 0..NUM_TEST_WORDS_512 {
911            let left = load_m512i_word(&left, i);
912            let right = load_m512i_word(&right, i);
913            let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
914            assert_eq_m512i(result_left, left);
915            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
916            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
917            let expected_result = _mm512_gf2p8mul_epi8(left, right);
918            let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
919            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
920            assert_eq_m512i(result_masked, expected_masked);
921        }
922    }
923
924    #[simd_test(enable = "gfni,avx")]
925    unsafe fn test_mm256_gf2p8mul_epi8() {
926        let (left, right, expected) = generate_byte_mul_test_data();
927
928        for i in 0..NUM_TEST_WORDS_256 {
929            let left = load_m256i_word(&left, i);
930            let right = load_m256i_word(&right, i);
931            let expected = load_m256i_word(&expected, i);
932            let result = _mm256_gf2p8mul_epi8(left, right);
933            assert_eq_m256i(result, expected);
934        }
935    }
936
937    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
938    unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
939        let (left, right, _expected) = generate_byte_mul_test_data();
940
941        for i in 0..NUM_TEST_WORDS_256 {
942            let left = load_m256i_word(&left, i);
943            let right = load_m256i_word(&right, i);
944            let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
945            assert_eq_m256i(result_zero, _mm256_setzero_si256());
946            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
947            const MASK_WORDS: i32 = 0b01_10_11_00;
948            let expected_result = _mm256_gf2p8mul_epi8(left, right);
949            let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
950            let expected_masked =
951                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
952            assert_eq_m256i(result_masked, expected_masked);
953        }
954    }
955
956    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
957    unsafe fn test_mm256_mask_gf2p8mul_epi8() {
958        let (left, right, _expected) = generate_byte_mul_test_data();
959
960        for i in 0..NUM_TEST_WORDS_256 {
961            let left = load_m256i_word(&left, i);
962            let right = load_m256i_word(&right, i);
963            let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
964            assert_eq_m256i(result_left, left);
965            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
966            const MASK_WORDS: i32 = 0b01_10_11_00;
967            let expected_result = _mm256_gf2p8mul_epi8(left, right);
968            let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
969            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
970            assert_eq_m256i(result_masked, expected_masked);
971        }
972    }
973
974    #[simd_test(enable = "gfni")]
975    unsafe fn test_mm_gf2p8mul_epi8() {
976        let (left, right, expected) = generate_byte_mul_test_data();
977
978        for i in 0..NUM_TEST_WORDS_128 {
979            let left = load_m128i_word(&left, i);
980            let right = load_m128i_word(&right, i);
981            let expected = load_m128i_word(&expected, i);
982            let result = _mm_gf2p8mul_epi8(left, right);
983            assert_eq_m128i(result, expected);
984        }
985    }
986
987    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
988    unsafe fn test_mm_maskz_gf2p8mul_epi8() {
989        let (left, right, _expected) = generate_byte_mul_test_data();
990
991        for i in 0..NUM_TEST_WORDS_128 {
992            let left = load_m128i_word(&left, i);
993            let right = load_m128i_word(&right, i);
994            let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
995            assert_eq_m128i(result_zero, _mm_setzero_si128());
996            let mask_bytes: __mmask16 = 0x0F_F0;
997            const MASK_WORDS: i32 = 0b01_10;
998            let expected_result = _mm_gf2p8mul_epi8(left, right);
999            let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
1000            let expected_masked =
1001                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1002            assert_eq_m128i(result_masked, expected_masked);
1003        }
1004    }
1005
1006    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1007    unsafe fn test_mm_mask_gf2p8mul_epi8() {
1008        let (left, right, _expected) = generate_byte_mul_test_data();
1009
1010        for i in 0..NUM_TEST_WORDS_128 {
1011            let left = load_m128i_word(&left, i);
1012            let right = load_m128i_word(&right, i);
1013            let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
1014            assert_eq_m128i(result_left, left);
1015            let mask_bytes: __mmask16 = 0x0F_F0;
1016            const MASK_WORDS: i32 = 0b01_10;
1017            let expected_result = _mm_gf2p8mul_epi8(left, right);
1018            let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
1019            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1020            assert_eq_m128i(result_masked, expected_masked);
1021        }
1022    }
1023
1024    #[simd_test(enable = "gfni,avx512f")]
1025    unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
1026        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1027        const IDENTITY_BYTE: i32 = 0;
1028        let constant: i64 = 0;
1029        const CONSTANT_BYTE: i32 = 0x63;
1030        let identity = _mm512_set1_epi64(identity);
1031        let constant = _mm512_set1_epi64(constant);
1032        let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);
1033
1034        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1035        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1036
1037        for i in 0..NUM_TEST_WORDS_512 {
1038            let data = load_m512i_word(&bytes, i);
1039            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1040            assert_eq_m512i(result, data);
1041            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1042            assert_eq_m512i(result, constant_reference);
1043            let data = load_m512i_word(&more_bytes, i);
1044            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1045            assert_eq_m512i(result, data);
1046            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1047            assert_eq_m512i(result, constant_reference);
1048
1049            let matrix = load_m512i_word(&matrices, i);
1050            let vector = load_m512i_word(&vectors, i);
1051            let reference = load_m512i_word(&references, i);
1052
1053            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1054            assert_eq_m512i(result, reference);
1055        }
1056    }
1057
1058    #[simd_test(enable = "gfni,avx512bw")]
1059    unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
1060        const CONSTANT_BYTE: i32 = 0x63;
1061        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1062
1063        for i in 0..NUM_TEST_WORDS_512 {
1064            let matrix = load_m512i_word(&matrices, i);
1065            let vector = load_m512i_word(&vectors, i);
1066            let result_zero =
1067                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1068            assert_eq_m512i(result_zero, _mm512_setzero_si512());
1069            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1070            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1071            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1072            let result_masked =
1073                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1074            let expected_masked =
1075                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1076            assert_eq_m512i(result_masked, expected_masked);
1077        }
1078    }
1079
1080    #[simd_test(enable = "gfni,avx512bw")]
1081    unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
1082        const CONSTANT_BYTE: i32 = 0x63;
1083        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1084
1085        for i in 0..NUM_TEST_WORDS_512 {
1086            let left = load_m512i_word(&vectors, i);
1087            let right = load_m512i_word(&matrices, i);
1088            let result_left =
1089                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1090            assert_eq_m512i(result_left, left);
1091            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1092            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1093            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1094            let result_masked =
1095                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1096            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1097            assert_eq_m512i(result_masked, expected_masked);
1098        }
1099    }
1100
1101    #[simd_test(enable = "gfni,avx")]
1102    unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
1103        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1104        const IDENTITY_BYTE: i32 = 0;
1105        let constant: i64 = 0;
1106        const CONSTANT_BYTE: i32 = 0x63;
1107        let identity = _mm256_set1_epi64x(identity);
1108        let constant = _mm256_set1_epi64x(constant);
1109        let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);
1110
1111        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1112        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1113
1114        for i in 0..NUM_TEST_WORDS_256 {
1115            let data = load_m256i_word(&bytes, i);
1116            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1117            assert_eq_m256i(result, data);
1118            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1119            assert_eq_m256i(result, constant_reference);
1120            let data = load_m256i_word(&more_bytes, i);
1121            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1122            assert_eq_m256i(result, data);
1123            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1124            assert_eq_m256i(result, constant_reference);
1125
1126            let matrix = load_m256i_word(&matrices, i);
1127            let vector = load_m256i_word(&vectors, i);
1128            let reference = load_m256i_word(&references, i);
1129
1130            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1131            assert_eq_m256i(result, reference);
1132        }
1133    }
1134
1135    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1136    unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
1137        const CONSTANT_BYTE: i32 = 0x63;
1138        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1139
1140        for i in 0..NUM_TEST_WORDS_256 {
1141            let matrix = load_m256i_word(&matrices, i);
1142            let vector = load_m256i_word(&vectors, i);
1143            let result_zero =
1144                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1145            assert_eq_m256i(result_zero, _mm256_setzero_si256());
1146            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1147            const MASK_WORDS: i32 = 0b11_01_10_00;
1148            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1149            let result_masked =
1150                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1151            let expected_masked =
1152                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1153            assert_eq_m256i(result_masked, expected_masked);
1154        }
1155    }
1156
1157    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1158    unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
1159        const CONSTANT_BYTE: i32 = 0x63;
1160        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1161
1162        for i in 0..NUM_TEST_WORDS_256 {
1163            let left = load_m256i_word(&vectors, i);
1164            let right = load_m256i_word(&matrices, i);
1165            let result_left =
1166                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1167            assert_eq_m256i(result_left, left);
1168            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1169            const MASK_WORDS: i32 = 0b11_01_10_00;
1170            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1171            let result_masked =
1172                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1173            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1174            assert_eq_m256i(result_masked, expected_masked);
1175        }
1176    }
1177
1178    #[simd_test(enable = "gfni")]
1179    unsafe fn test_mm_gf2p8affine_epi64_epi8() {
1180        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1181        const IDENTITY_BYTE: i32 = 0;
1182        let constant: i64 = 0;
1183        const CONSTANT_BYTE: i32 = 0x63;
1184        let identity = _mm_set1_epi64x(identity);
1185        let constant = _mm_set1_epi64x(constant);
1186        let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);
1187
1188        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1189        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1190
1191        for i in 0..NUM_TEST_WORDS_128 {
1192            let data = load_m128i_word(&bytes, i);
1193            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1194            assert_eq_m128i(result, data);
1195            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1196            assert_eq_m128i(result, constant_reference);
1197            let data = load_m128i_word(&more_bytes, i);
1198            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1199            assert_eq_m128i(result, data);
1200            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1201            assert_eq_m128i(result, constant_reference);
1202
1203            let matrix = load_m128i_word(&matrices, i);
1204            let vector = load_m128i_word(&vectors, i);
1205            let reference = load_m128i_word(&references, i);
1206
1207            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1208            assert_eq_m128i(result, reference);
1209        }
1210    }
1211
1212    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1213    unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
1214        const CONSTANT_BYTE: i32 = 0x63;
1215        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1216
1217        for i in 0..NUM_TEST_WORDS_128 {
1218            let matrix = load_m128i_word(&matrices, i);
1219            let vector = load_m128i_word(&vectors, i);
1220            let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1221            assert_eq_m128i(result_zero, _mm_setzero_si128());
1222            let mask_bytes: __mmask16 = 0x0F_F0;
1223            const MASK_WORDS: i32 = 0b01_10;
1224            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1225            let result_masked =
1226                _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1227            let expected_masked =
1228                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1229            assert_eq_m128i(result_masked, expected_masked);
1230        }
1231    }
1232
1233    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1234    unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
1235        const CONSTANT_BYTE: i32 = 0x63;
1236        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1237
1238        for i in 0..NUM_TEST_WORDS_128 {
1239            let left = load_m128i_word(&vectors, i);
1240            let right = load_m128i_word(&matrices, i);
1241            let result_left =
1242                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1243            assert_eq_m128i(result_left, left);
1244            let mask_bytes: __mmask16 = 0x0F_F0;
1245            const MASK_WORDS: i32 = 0b01_10;
1246            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1247            let result_masked =
1248                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1249            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1250            assert_eq_m128i(result_masked, expected_masked);
1251        }
1252    }
1253
1254    #[simd_test(enable = "gfni,avx512f")]
1255    unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
1256        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1257        const IDENTITY_BYTE: i32 = 0;
1258        const CONSTANT_BYTE: i32 = 0x63;
1259        let identity = _mm512_set1_epi64(identity);
1260
1261        // validate inversion
1262        let (inputs, results) = generate_inv_tests_data();
1263
1264        for i in 0..NUM_BYTES_WORDS_512 {
1265            let input = load_m512i_word(&inputs, i);
1266            let reference = load_m512i_word(&results, i);
1267            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1268            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
1269            assert_eq_m512i(remultiplied, reference);
1270        }
1271
1272        // validate subsequent affine operation
1273        let (matrices, vectors, _affine_expected) =
1274            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1275
1276        for i in 0..NUM_TEST_WORDS_512 {
1277            let vector = load_m512i_word(&vectors, i);
1278            let matrix = load_m512i_word(&matrices, i);
1279
1280            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1281            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1282            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1283            assert_eq_m512i(result, reference);
1284        }
1285
1286        // validate everything by virtue of checking against the AES SBox
1287        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1288        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);
1289
1290        for i in 0..NUM_BYTES_WORDS_512 {
1291            let reference = load_m512i_word(&AES_S_BOX, i);
1292            let input = load_m512i_word(&inputs, i);
1293            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1294            assert_eq_m512i(result, reference);
1295        }
1296    }
1297
1298    #[simd_test(enable = "gfni,avx512bw")]
1299    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
1300        const CONSTANT_BYTE: i32 = 0x63;
1301        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1302
1303        for i in 0..NUM_TEST_WORDS_512 {
1304            let matrix = load_m512i_word(&matrices, i);
1305            let vector = load_m512i_word(&vectors, i);
1306            let result_zero =
1307                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1308            assert_eq_m512i(result_zero, _mm512_setzero_si512());
1309            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1310            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1311            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1312            let result_masked =
1313                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1314            let expected_masked =
1315                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1316            assert_eq_m512i(result_masked, expected_masked);
1317        }
1318    }
1319
1320    #[simd_test(enable = "gfni,avx512bw")]
1321    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
1322        const CONSTANT_BYTE: i32 = 0x63;
1323        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1324
1325        for i in 0..NUM_TEST_WORDS_512 {
1326            let left = load_m512i_word(&vectors, i);
1327            let right = load_m512i_word(&matrices, i);
1328            let result_left =
1329                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1330            assert_eq_m512i(result_left, left);
1331            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1332            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1333            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1334            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1335                left, mask_bytes, left, right,
1336            );
1337            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1338            assert_eq_m512i(result_masked, expected_masked);
1339        }
1340    }
1341
1342    #[simd_test(enable = "gfni,avx")]
1343    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
1344        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1345        const IDENTITY_BYTE: i32 = 0;
1346        const CONSTANT_BYTE: i32 = 0x63;
1347        let identity = _mm256_set1_epi64x(identity);
1348
1349        // validate inversion
1350        let (inputs, results) = generate_inv_tests_data();
1351
1352        for i in 0..NUM_BYTES_WORDS_256 {
1353            let input = load_m256i_word(&inputs, i);
1354            let reference = load_m256i_word(&results, i);
1355            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1356            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
1357            assert_eq_m256i(remultiplied, reference);
1358        }
1359
1360        // validate subsequent affine operation
1361        let (matrices, vectors, _affine_expected) =
1362            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1363
1364        for i in 0..NUM_TEST_WORDS_256 {
1365            let vector = load_m256i_word(&vectors, i);
1366            let matrix = load_m256i_word(&matrices, i);
1367
1368            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1369            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1370            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1371            assert_eq_m256i(result, reference);
1372        }
1373
1374        // validate everything by virtue of checking against the AES SBox
1375        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1376        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);
1377
1378        for i in 0..NUM_BYTES_WORDS_256 {
1379            let reference = load_m256i_word(&AES_S_BOX, i);
1380            let input = load_m256i_word(&inputs, i);
1381            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1382            assert_eq_m256i(result, reference);
1383        }
1384    }
1385
1386    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1387    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
1388        const CONSTANT_BYTE: i32 = 0x63;
1389        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1390
1391        for i in 0..NUM_TEST_WORDS_256 {
1392            let matrix = load_m256i_word(&matrices, i);
1393            let vector = load_m256i_word(&vectors, i);
1394            let result_zero =
1395                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1396            assert_eq_m256i(result_zero, _mm256_setzero_si256());
1397            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1398            const MASK_WORDS: i32 = 0b11_01_10_00;
1399            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1400            let result_masked =
1401                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1402            let expected_masked =
1403                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1404            assert_eq_m256i(result_masked, expected_masked);
1405        }
1406    }
1407
1408    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1409    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
1410        const CONSTANT_BYTE: i32 = 0x63;
1411        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1412
1413        for i in 0..NUM_TEST_WORDS_256 {
1414            let left = load_m256i_word(&vectors, i);
1415            let right = load_m256i_word(&matrices, i);
1416            let result_left =
1417                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1418            assert_eq_m256i(result_left, left);
1419            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1420            const MASK_WORDS: i32 = 0b11_01_10_00;
1421            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1422            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1423                left, mask_bytes, left, right,
1424            );
1425            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1426            assert_eq_m256i(result_masked, expected_masked);
1427        }
1428    }
1429
1430    #[simd_test(enable = "gfni")]
1431    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
1432        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1433        const IDENTITY_BYTE: i32 = 0;
1434        const CONSTANT_BYTE: i32 = 0x63;
1435        let identity = _mm_set1_epi64x(identity);
1436
1437        // validate inversion
1438        let (inputs, results) = generate_inv_tests_data();
1439
1440        for i in 0..NUM_BYTES_WORDS_128 {
1441            let input = load_m128i_word(&inputs, i);
1442            let reference = load_m128i_word(&results, i);
1443            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1444            let remultiplied = _mm_gf2p8mul_epi8(result, input);
1445            assert_eq_m128i(remultiplied, reference);
1446        }
1447
1448        // validate subsequent affine operation
1449        let (matrices, vectors, _affine_expected) =
1450            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1451
1452        for i in 0..NUM_TEST_WORDS_128 {
1453            let vector = load_m128i_word(&vectors, i);
1454            let matrix = load_m128i_word(&matrices, i);
1455
1456            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1457            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1458            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1459            assert_eq_m128i(result, reference);
1460        }
1461
1462        // validate everything by virtue of checking against the AES SBox
1463        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1464        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);
1465
1466        for i in 0..NUM_BYTES_WORDS_128 {
1467            let reference = load_m128i_word(&AES_S_BOX, i);
1468            let input = load_m128i_word(&inputs, i);
1469            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1470            assert_eq_m128i(result, reference);
1471        }
1472    }
1473
1474    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1475    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
1476        const CONSTANT_BYTE: i32 = 0x63;
1477        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1478
1479        for i in 0..NUM_TEST_WORDS_128 {
1480            let matrix = load_m128i_word(&matrices, i);
1481            let vector = load_m128i_word(&vectors, i);
1482            let result_zero =
1483                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1484            assert_eq_m128i(result_zero, _mm_setzero_si128());
1485            let mask_bytes: __mmask16 = 0x0F_F0;
1486            const MASK_WORDS: i32 = 0b01_10;
1487            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1488            let result_masked =
1489                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1490            let expected_masked =
1491                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1492            assert_eq_m128i(result_masked, expected_masked);
1493        }
1494    }
1495
1496    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1497    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
1498        const CONSTANT_BYTE: i32 = 0x63;
1499        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1500
1501        for i in 0..NUM_TEST_WORDS_128 {
1502            let left = load_m128i_word(&vectors, i);
1503            let right = load_m128i_word(&matrices, i);
1504            let result_left =
1505                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1506            assert_eq_m128i(result_left, left);
1507            let mask_bytes: __mmask16 = 0x0F_F0;
1508            const MASK_WORDS: i32 = 0b01_10;
1509            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1510            let result_masked =
1511                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1512            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1513            assert_eq_m128i(result_masked, expected_masked);
1514        }
1515    }
1516}