core/stdarch/crates/core_arch/src/x86/
gfni.rs

1//! Galois Field New Instructions (GFNI)
2//!
3//! The intrinsics here correspond to those in the `immintrin.h` C header.
4//!
5//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
6//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
7//!
8//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
9
10use crate::core_arch::simd::i8x16;
11use crate::core_arch::simd::i8x32;
12use crate::core_arch::simd::i8x64;
13use crate::core_arch::x86::__m128i;
14use crate::core_arch::x86::__m256i;
15use crate::core_arch::x86::__m512i;
16use crate::core_arch::x86::__mmask16;
17use crate::core_arch::x86::__mmask32;
18use crate::core_arch::x86::__mmask64;
19use crate::intrinsics::simd::simd_select_bitmask;
20use crate::mem::transmute;
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
25#[allow(improper_ctypes)]
26unsafe extern "C" {
27    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
28    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
29    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
30    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
31    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
32    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
33    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
34    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
35    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
36    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
37    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
38    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
39    #[link_name = "llvm.x86.vgf2p8mulb.512"]
40    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
41    #[link_name = "llvm.x86.vgf2p8mulb.256"]
42    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
43    #[link_name = "llvm.x86.vgf2p8mulb.128"]
44    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
45}
46
47// LLVM requires AVX512BW for a lot of these instructions, see
48// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
49// however our tests also require the target feature list to match Intel's
50// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F
51// requirement (for now)
52// also see
53// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
54// for forcing GFNI, BW and optionally VL extension
55
56/// Performs a multiplication in GF(2^8) on the packed bytes.
57/// The field is in polynomial representation with the reduction polynomial
58///  x^8 + x^4 + x^3 + x + 1.
59///
60/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
61#[inline]
62#[target_feature(enable = "gfni,avx512f")]
63#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64#[cfg_attr(test, assert_instr(vgf2p8mulb))]
65pub fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
66    unsafe { transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64())) }
67}
68
69/// Performs a multiplication in GF(2^8) on the packed bytes.
70/// The field is in polynomial representation with the reduction polynomial
71///  x^8 + x^4 + x^3 + x + 1.
72///
73/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
74/// Otherwise the computation result is written into the result.
75///
76/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
77#[inline]
78#[target_feature(enable = "gfni,avx512bw,avx512f")]
79#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
80#[cfg_attr(test, assert_instr(vgf2p8mulb))]
81pub fn _mm512_mask_gf2p8mul_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
82    unsafe {
83        transmute(simd_select_bitmask(
84            k,
85            vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
86            src.as_i8x64(),
87        ))
88    }
89}
90
91/// Performs a multiplication in GF(2^8) on the packed bytes.
92/// The field is in polynomial representation with the reduction polynomial
93///  x^8 + x^4 + x^3 + x + 1.
94///
95/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
96/// Otherwise the computation result is written into the result.
97///
98/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
99#[inline]
100#[target_feature(enable = "gfni,avx512bw,avx512f")]
101#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
102#[cfg_attr(test, assert_instr(vgf2p8mulb))]
103pub fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
104    let zero = i8x64::ZERO;
105    unsafe {
106        transmute(simd_select_bitmask(
107            k,
108            vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
109            zero,
110        ))
111    }
112}
113
114/// Performs a multiplication in GF(2^8) on the packed bytes.
115/// The field is in polynomial representation with the reduction polynomial
116///  x^8 + x^4 + x^3 + x + 1.
117///
118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
119#[inline]
120#[target_feature(enable = "gfni,avx")]
121#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
122#[cfg_attr(test, assert_instr(vgf2p8mulb))]
123pub fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
124    unsafe { transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32())) }
125}
126
127/// Performs a multiplication in GF(2^8) on the packed bytes.
128/// The field is in polynomial representation with the reduction polynomial
129///  x^8 + x^4 + x^3 + x + 1.
130///
131/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
132/// Otherwise the computation result is written into the result.
133///
134/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
135#[inline]
136#[target_feature(enable = "gfni,avx512bw,avx512vl")]
137#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
138#[cfg_attr(test, assert_instr(vgf2p8mulb))]
139pub fn _mm256_mask_gf2p8mul_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
140    unsafe {
141        transmute(simd_select_bitmask(
142            k,
143            vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
144            src.as_i8x32(),
145        ))
146    }
147}
148
149/// Performs a multiplication in GF(2^8) on the packed bytes.
150/// The field is in polynomial representation with the reduction polynomial
151///  x^8 + x^4 + x^3 + x + 1.
152///
153/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
154/// Otherwise the computation result is written into the result.
155///
156/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
157#[inline]
158#[target_feature(enable = "gfni,avx512bw,avx512vl")]
159#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
160#[cfg_attr(test, assert_instr(vgf2p8mulb))]
161pub fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
162    let zero = i8x32::ZERO;
163    unsafe {
164        transmute(simd_select_bitmask(
165            k,
166            vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
167            zero,
168        ))
169    }
170}
171
172/// Performs a multiplication in GF(2^8) on the packed bytes.
173/// The field is in polynomial representation with the reduction polynomial
174///  x^8 + x^4 + x^3 + x + 1.
175///
176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
177#[inline]
178#[target_feature(enable = "gfni")]
179#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
180#[cfg_attr(test, assert_instr(gf2p8mulb))]
181pub fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
182    unsafe { transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16())) }
183}
184
185/// Performs a multiplication in GF(2^8) on the packed bytes.
186/// The field is in polynomial representation with the reduction polynomial
187///  x^8 + x^4 + x^3 + x + 1.
188///
189/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
190/// Otherwise the computation result is written into the result.
191///
192/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
193#[inline]
194#[target_feature(enable = "gfni,avx512bw,avx512vl")]
195#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
196#[cfg_attr(test, assert_instr(vgf2p8mulb))]
197pub fn _mm_mask_gf2p8mul_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
198    unsafe {
199        transmute(simd_select_bitmask(
200            k,
201            vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
202            src.as_i8x16(),
203        ))
204    }
205}
206
207/// Performs a multiplication in GF(2^8) on the packed bytes.
208/// The field is in polynomial representation with the reduction polynomial
209///  x^8 + x^4 + x^3 + x + 1.
210///
211/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
212/// Otherwise the computation result is written into the result.
213///
214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
215#[inline]
216#[target_feature(enable = "gfni,avx512bw,avx512vl")]
217#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
218#[cfg_attr(test, assert_instr(vgf2p8mulb))]
219pub fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
220    unsafe {
221        let zero = i8x16::ZERO;
222        transmute(simd_select_bitmask(
223            k,
224            vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
225            zero,
226        ))
227    }
228}
229
230/// Performs an affine transformation on the packed bytes in x.
231/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
232/// and b being a constant 8-bit immediate value.
233/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
234///
235/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8)
236#[inline]
237#[target_feature(enable = "gfni,avx512f")]
238#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
239#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
240#[rustc_legacy_const_generics(2)]
241pub fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
242    static_assert_uimm_bits!(B, 8);
243    let b = B as u8;
244    let x = x.as_i8x64();
245    let a = a.as_i8x64();
246    unsafe {
247        let r = vgf2p8affineqb_512(x, a, b);
248        transmute(r)
249    }
250}
251
252/// Performs an affine transformation on the packed bytes in x.
253/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
254/// and b being a constant 8-bit immediate value.
255/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
256///
257/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
258/// Otherwise the computation result is written into the result.
259///
260/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8)
261#[inline]
262#[target_feature(enable = "gfni,avx512bw,avx512f")]
263#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
264#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
265#[rustc_legacy_const_generics(3)]
266pub fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
267    k: __mmask64,
268    x: __m512i,
269    a: __m512i,
270) -> __m512i {
271    static_assert_uimm_bits!(B, 8);
272    let b = B as u8;
273    let zero = i8x64::ZERO;
274    let x = x.as_i8x64();
275    let a = a.as_i8x64();
276    unsafe {
277        let r = vgf2p8affineqb_512(x, a, b);
278        transmute(simd_select_bitmask(k, r, zero))
279    }
280}
281
282/// Performs an affine transformation on the packed bytes in x.
283/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
284/// and b being a constant 8-bit immediate value.
285/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
286///
287/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
288/// Otherwise the computation result is written into the result.
289///
290/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8)
291#[inline]
292#[target_feature(enable = "gfni,avx512bw,avx512f")]
293#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
294#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
295#[rustc_legacy_const_generics(4)]
296pub fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
297    src: __m512i,
298    k: __mmask64,
299    x: __m512i,
300    a: __m512i,
301) -> __m512i {
302    static_assert_uimm_bits!(B, 8);
303    let b = B as u8;
304    let x = x.as_i8x64();
305    let a = a.as_i8x64();
306    unsafe {
307        let r = vgf2p8affineqb_512(x, a, b);
308        transmute(simd_select_bitmask(k, r, src.as_i8x64()))
309    }
310}
311
312/// Performs an affine transformation on the packed bytes in x.
313/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
314/// and b being a constant 8-bit immediate value.
315/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
316///
317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8)
318#[inline]
319#[target_feature(enable = "gfni,avx")]
320#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
321#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
322#[rustc_legacy_const_generics(2)]
323pub fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
324    static_assert_uimm_bits!(B, 8);
325    let b = B as u8;
326    let x = x.as_i8x32();
327    let a = a.as_i8x32();
328    unsafe {
329        let r = vgf2p8affineqb_256(x, a, b);
330        transmute(r)
331    }
332}
333
334/// Performs an affine transformation on the packed bytes in x.
335/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
336/// and b being a constant 8-bit immediate value.
337/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
338///
339/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
340/// Otherwise the computation result is written into the result.
341///
342/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8)
343#[inline]
344#[target_feature(enable = "gfni,avx512bw,avx512vl")]
345#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
346#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
347#[rustc_legacy_const_generics(3)]
348pub fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
349    k: __mmask32,
350    x: __m256i,
351    a: __m256i,
352) -> __m256i {
353    static_assert_uimm_bits!(B, 8);
354    let b = B as u8;
355    let zero = i8x32::ZERO;
356    let x = x.as_i8x32();
357    let a = a.as_i8x32();
358    unsafe {
359        let r = vgf2p8affineqb_256(x, a, b);
360        transmute(simd_select_bitmask(k, r, zero))
361    }
362}
363
364/// Performs an affine transformation on the packed bytes in x.
365/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
366/// and b being a constant 8-bit immediate value.
367/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
368///
369/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
370/// Otherwise the computation result is written into the result.
371///
372/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8)
373#[inline]
374#[target_feature(enable = "gfni,avx512bw,avx512vl")]
375#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
376#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
377#[rustc_legacy_const_generics(4)]
378pub fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
379    src: __m256i,
380    k: __mmask32,
381    x: __m256i,
382    a: __m256i,
383) -> __m256i {
384    static_assert_uimm_bits!(B, 8);
385    let b = B as u8;
386    let x = x.as_i8x32();
387    let a = a.as_i8x32();
388    unsafe {
389        let r = vgf2p8affineqb_256(x, a, b);
390        transmute(simd_select_bitmask(k, r, src.as_i8x32()))
391    }
392}
393
394/// Performs an affine transformation on the packed bytes in x.
395/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
396/// and b being a constant 8-bit immediate value.
397/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
398///
399/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8)
400#[inline]
401#[target_feature(enable = "gfni")]
402#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
403#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
404#[rustc_legacy_const_generics(2)]
405pub fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
406    static_assert_uimm_bits!(B, 8);
407    let b = B as u8;
408    let x = x.as_i8x16();
409    let a = a.as_i8x16();
410    unsafe {
411        let r = vgf2p8affineqb_128(x, a, b);
412        transmute(r)
413    }
414}
415
416/// Performs an affine transformation on the packed bytes in x.
417/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
418/// and b being a constant 8-bit immediate value.
419/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
420///
421/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
422/// Otherwise the computation result is written into the result.
423///
424/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8)
425#[inline]
426#[target_feature(enable = "gfni,avx512bw,avx512vl")]
427#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
428#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
429#[rustc_legacy_const_generics(3)]
430pub fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
431    k: __mmask16,
432    x: __m128i,
433    a: __m128i,
434) -> __m128i {
435    static_assert_uimm_bits!(B, 8);
436    let b = B as u8;
437    let zero = i8x16::ZERO;
438    let x = x.as_i8x16();
439    let a = a.as_i8x16();
440    unsafe {
441        let r = vgf2p8affineqb_128(x, a, b);
442        transmute(simd_select_bitmask(k, r, zero))
443    }
444}
445
446/// Performs an affine transformation on the packed bytes in x.
447/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
448/// and b being a constant 8-bit immediate value.
449/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
450///
451/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
452/// Otherwise the computation result is written into the result.
453///
454/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8)
455#[inline]
456#[target_feature(enable = "gfni,avx512bw,avx512vl")]
457#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
458#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
459#[rustc_legacy_const_generics(4)]
460pub fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
461    src: __m128i,
462    k: __mmask16,
463    x: __m128i,
464    a: __m128i,
465) -> __m128i {
466    static_assert_uimm_bits!(B, 8);
467    let b = B as u8;
468    let x = x.as_i8x16();
469    let a = a.as_i8x16();
470    unsafe {
471        let r = vgf2p8affineqb_128(x, a, b);
472        transmute(simd_select_bitmask(k, r, src.as_i8x16()))
473    }
474}
475
476/// Performs an affine transformation on the inverted packed bytes in x.
477/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
478/// and b being a constant 8-bit immediate value.
479/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
480/// The inverse of 0 is 0.
481/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
482///
483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
484#[inline]
485#[target_feature(enable = "gfni,avx512f")]
486#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
487#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
488#[rustc_legacy_const_generics(2)]
489pub fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
490    static_assert_uimm_bits!(B, 8);
491    let b = B as u8;
492    let x = x.as_i8x64();
493    let a = a.as_i8x64();
494    unsafe {
495        let r = vgf2p8affineinvqb_512(x, a, b);
496        transmute(r)
497    }
498}
499
500/// Performs an affine transformation on the inverted packed bytes in x.
501/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
502/// and b being a constant 8-bit immediate value.
503/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
504/// The inverse of 0 is 0.
505/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
506///
507/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
508/// Otherwise the computation result is written into the result.
509///
510/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
511#[inline]
512#[target_feature(enable = "gfni,avx512bw,avx512f")]
513#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
514#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
515#[rustc_legacy_const_generics(3)]
516pub fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
517    k: __mmask64,
518    x: __m512i,
519    a: __m512i,
520) -> __m512i {
521    static_assert_uimm_bits!(B, 8);
522    let b = B as u8;
523    let zero = i8x64::ZERO;
524    let x = x.as_i8x64();
525    let a = a.as_i8x64();
526    unsafe {
527        let r = vgf2p8affineinvqb_512(x, a, b);
528        transmute(simd_select_bitmask(k, r, zero))
529    }
530}
531
532/// Performs an affine transformation on the inverted packed bytes in x.
533/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
534/// and b being a constant 8-bit immediate value.
535/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
536/// The inverse of 0 is 0.
537/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
538///
539/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
540/// Otherwise the computation result is written into the result.
541///
542/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
543#[inline]
544#[target_feature(enable = "gfni,avx512bw,avx512f")]
545#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
546#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
547#[rustc_legacy_const_generics(4)]
548pub fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
549    src: __m512i,
550    k: __mmask64,
551    x: __m512i,
552    a: __m512i,
553) -> __m512i {
554    static_assert_uimm_bits!(B, 8);
555    let b = B as u8;
556    let x = x.as_i8x64();
557    let a = a.as_i8x64();
558    unsafe {
559        let r = vgf2p8affineinvqb_512(x, a, b);
560        transmute(simd_select_bitmask(k, r, src.as_i8x64()))
561    }
562}
563
564/// Performs an affine transformation on the inverted packed bytes in x.
565/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
566/// and b being a constant 8-bit immediate value.
567/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
568/// The inverse of 0 is 0.
569/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
570///
571/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
572#[inline]
573#[target_feature(enable = "gfni,avx")]
574#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
575#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
576#[rustc_legacy_const_generics(2)]
577pub fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
578    static_assert_uimm_bits!(B, 8);
579    let b = B as u8;
580    let x = x.as_i8x32();
581    let a = a.as_i8x32();
582    unsafe {
583        let r = vgf2p8affineinvqb_256(x, a, b);
584        transmute(r)
585    }
586}
587
588/// Performs an affine transformation on the inverted packed bytes in x.
589/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
590/// and b being a constant 8-bit immediate value.
591/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
592/// The inverse of 0 is 0.
593/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
594///
595/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
596/// Otherwise the computation result is written into the result.
597///
598/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
599#[inline]
600#[target_feature(enable = "gfni,avx512bw,avx512vl")]
601#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
602#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
603#[rustc_legacy_const_generics(3)]
604pub fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
605    k: __mmask32,
606    x: __m256i,
607    a: __m256i,
608) -> __m256i {
609    static_assert_uimm_bits!(B, 8);
610    let b = B as u8;
611    let zero = i8x32::ZERO;
612    let x = x.as_i8x32();
613    let a = a.as_i8x32();
614    unsafe {
615        let r = vgf2p8affineinvqb_256(x, a, b);
616        transmute(simd_select_bitmask(k, r, zero))
617    }
618}
619
620/// Performs an affine transformation on the inverted packed bytes in x.
621/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
622/// and b being a constant 8-bit immediate value.
623/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
624/// The inverse of 0 is 0.
625/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
626///
627/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
628/// Otherwise the computation result is written into the result.
629///
630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
631#[inline]
632#[target_feature(enable = "gfni,avx512bw,avx512vl")]
633#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
634#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
635#[rustc_legacy_const_generics(4)]
636pub fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
637    src: __m256i,
638    k: __mmask32,
639    x: __m256i,
640    a: __m256i,
641) -> __m256i {
642    static_assert_uimm_bits!(B, 8);
643    let b = B as u8;
644    let x = x.as_i8x32();
645    let a = a.as_i8x32();
646    unsafe {
647        let r = vgf2p8affineinvqb_256(x, a, b);
648        transmute(simd_select_bitmask(k, r, src.as_i8x32()))
649    }
650}
651
652/// Performs an affine transformation on the inverted packed bytes in x.
653/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
654/// and b being a constant 8-bit immediate value.
655/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
656/// The inverse of 0 is 0.
657/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
658///
659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
660#[inline]
661#[target_feature(enable = "gfni")]
662#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
663#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))]
664#[rustc_legacy_const_generics(2)]
665pub fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
666    static_assert_uimm_bits!(B, 8);
667    let b = B as u8;
668    let x = x.as_i8x16();
669    let a = a.as_i8x16();
670    unsafe {
671        let r = vgf2p8affineinvqb_128(x, a, b);
672        transmute(r)
673    }
674}
675
676/// Performs an affine transformation on the inverted packed bytes in x.
677/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
678/// and b being a constant 8-bit immediate value.
679/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
680/// The inverse of 0 is 0.
681/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
682///
683/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
684/// Otherwise the computation result is written into the result.
685///
686/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
687#[inline]
688#[target_feature(enable = "gfni,avx512bw,avx512vl")]
689#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
690#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
691#[rustc_legacy_const_generics(3)]
692pub fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
693    k: __mmask16,
694    x: __m128i,
695    a: __m128i,
696) -> __m128i {
697    static_assert_uimm_bits!(B, 8);
698    let b = B as u8;
699    let zero = i8x16::ZERO;
700    let x = x.as_i8x16();
701    let a = a.as_i8x16();
702    unsafe {
703        let r = vgf2p8affineinvqb_128(x, a, b);
704        transmute(simd_select_bitmask(k, r, zero))
705    }
706}
707
708/// Performs an affine transformation on the inverted packed bytes in x.
709/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
710/// and b being a constant 8-bit immediate value.
711/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
712/// The inverse of 0 is 0.
713/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
714///
715/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
716/// Otherwise the computation result is written into the result.
717///
718/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
719#[inline]
720#[target_feature(enable = "gfni,avx512bw,avx512vl")]
721#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
722#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
723#[rustc_legacy_const_generics(4)]
724pub fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
725    src: __m128i,
726    k: __mmask16,
727    x: __m128i,
728    a: __m128i,
729) -> __m128i {
730    static_assert_uimm_bits!(B, 8);
731    let b = B as u8;
732    let x = x.as_i8x16();
733    let a = a.as_i8x16();
734    unsafe {
735        let r = vgf2p8affineinvqb_128(x, a, b);
736        transmute(simd_select_bitmask(k, r, src.as_i8x16()))
737    }
738}
739
740#[cfg(test)]
741mod tests {
742    // The constants in the tests below are just bit patterns. They should not
743    // be interpreted as integers; signedness does not make sense for them, but
744    // __mXXXi happens to be defined in terms of signed integers.
745    #![allow(overflowing_literals)]
746
747    use core::hint::black_box;
748    use core::intrinsics::size_of;
749    use stdarch_test::simd_test;
750
751    use crate::core_arch::x86::*;
752
753    fn mulbyte(left: u8, right: u8) -> u8 {
754        // this implementation follows the description in
755        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
756        const REDUCTION_POLYNOMIAL: u16 = 0x11b;
757        let left: u16 = left.into();
758        let right: u16 = right.into();
759        let mut carryless_product: u16 = 0;
760
761        // Carryless multiplication
762        for i in 0..8 {
763            if ((left >> i) & 0x01) != 0 {
764                carryless_product ^= right << i;
765            }
766        }
767
768        // reduction, adding in "0" where appropriate to clear out high bits
769        // note that REDUCTION_POLYNOMIAL is zero in this context
770        for i in (8..=14).rev() {
771            if ((carryless_product >> i) & 0x01) != 0 {
772                carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
773            }
774        }
775
776        carryless_product as u8
777    }
778
779    const NUM_TEST_WORDS_512: usize = 4;
780    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
781    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
782    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
783    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
784    const NUM_BYTES: usize = 256;
785    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
786    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
787    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
788
789    fn parity(input: u8) -> u8 {
790        let mut accumulator = 0;
791        for i in 0..8 {
792            accumulator ^= (input >> i) & 0x01;
793        }
794        accumulator
795    }
796
797    fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
798        // this implementation follows the description in
799        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
800        let mut accumulator = 0;
801
802        for bit in 0..8 {
803            accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
804        }
805
806        accumulator ^ b
807    }
808
809    fn generate_affine_mul_test_data(
810        immediate: u8,
811    ) -> (
812        [u64; NUM_TEST_WORDS_64],
813        [u8; NUM_TEST_ENTRIES],
814        [u8; NUM_TEST_ENTRIES],
815    ) {
816        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
817        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
818        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
819
820        for i in 0..NUM_TEST_WORDS_64 {
821            left[i] = (i as u64) * 103 * 101;
822            for j in 0..8 {
823                let j64 = j as u64;
824                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
825                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
826            }
827        }
828
829        (left, right, result)
830    }
831
832    fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
833        let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
834        let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];
835
836        for i in 0..NUM_BYTES {
837            input[i] = (i % 256) as u8;
838            result[i] = if i == 0 { 0 } else { 1 };
839        }
840
841        (input, result)
842    }
843
844    const AES_S_BOX: [u8; NUM_BYTES] = [
845        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
846        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
847        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
848        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
849        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
850        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
851        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
852        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
853        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
854        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
855        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
856        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
857        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
858        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
859        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
860        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
861        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
862        0x16,
863    ];
864
865    fn generate_byte_mul_test_data() -> (
866        [u8; NUM_TEST_ENTRIES],
867        [u8; NUM_TEST_ENTRIES],
868        [u8; NUM_TEST_ENTRIES],
869    ) {
870        let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
871        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
872        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
873
874        for i in 0..NUM_TEST_ENTRIES {
875            left[i] = (i % 256) as u8;
876            right[i] = left[i].wrapping_mul(101);
877            result[i] = mulbyte(left[i], right[i]);
878        }
879
880        (left, right, result)
881    }
882
883    #[target_feature(enable = "sse2")]
884    #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
885    unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
886        let byte_offset = word_index * 16 / size_of::<T>();
887        let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
888        _mm_loadu_si128(black_box(pointer))
889    }
890
891    #[target_feature(enable = "avx")]
892    #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
893    unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
894        let byte_offset = word_index * 32 / size_of::<T>();
895        let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
896        _mm256_loadu_si256(black_box(pointer))
897    }
898
899    #[target_feature(enable = "avx512f")]
900    #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
901    unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
902        let byte_offset = word_index * 64 / size_of::<T>();
903        let pointer = data.as_ptr().add(byte_offset) as *const _;
904        _mm512_loadu_si512(black_box(pointer))
905    }
906
907    #[simd_test(enable = "gfni,avx512f")]
908    unsafe fn test_mm512_gf2p8mul_epi8() {
909        let (left, right, expected) = generate_byte_mul_test_data();
910
911        for i in 0..NUM_TEST_WORDS_512 {
912            let left = load_m512i_word(&left, i);
913            let right = load_m512i_word(&right, i);
914            let expected = load_m512i_word(&expected, i);
915            let result = _mm512_gf2p8mul_epi8(left, right);
916            assert_eq_m512i(result, expected);
917        }
918    }
919
920    #[simd_test(enable = "gfni,avx512bw")]
921    unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
922        let (left, right, _expected) = generate_byte_mul_test_data();
923
924        for i in 0..NUM_TEST_WORDS_512 {
925            let left = load_m512i_word(&left, i);
926            let right = load_m512i_word(&right, i);
927            let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
928            assert_eq_m512i(result_zero, _mm512_setzero_si512());
929            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
930            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
931            let expected_result = _mm512_gf2p8mul_epi8(left, right);
932            let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
933            let expected_masked =
934                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
935            assert_eq_m512i(result_masked, expected_masked);
936        }
937    }
938
939    #[simd_test(enable = "gfni,avx512bw")]
940    unsafe fn test_mm512_mask_gf2p8mul_epi8() {
941        let (left, right, _expected) = generate_byte_mul_test_data();
942
943        for i in 0..NUM_TEST_WORDS_512 {
944            let left = load_m512i_word(&left, i);
945            let right = load_m512i_word(&right, i);
946            let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
947            assert_eq_m512i(result_left, left);
948            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
949            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
950            let expected_result = _mm512_gf2p8mul_epi8(left, right);
951            let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
952            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
953            assert_eq_m512i(result_masked, expected_masked);
954        }
955    }
956
957    #[simd_test(enable = "gfni,avx")]
958    unsafe fn test_mm256_gf2p8mul_epi8() {
959        let (left, right, expected) = generate_byte_mul_test_data();
960
961        for i in 0..NUM_TEST_WORDS_256 {
962            let left = load_m256i_word(&left, i);
963            let right = load_m256i_word(&right, i);
964            let expected = load_m256i_word(&expected, i);
965            let result = _mm256_gf2p8mul_epi8(left, right);
966            assert_eq_m256i(result, expected);
967        }
968    }
969
970    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
971    unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
972        let (left, right, _expected) = generate_byte_mul_test_data();
973
974        for i in 0..NUM_TEST_WORDS_256 {
975            let left = load_m256i_word(&left, i);
976            let right = load_m256i_word(&right, i);
977            let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
978            assert_eq_m256i(result_zero, _mm256_setzero_si256());
979            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
980            const MASK_WORDS: i32 = 0b01_10_11_00;
981            let expected_result = _mm256_gf2p8mul_epi8(left, right);
982            let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
983            let expected_masked =
984                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
985            assert_eq_m256i(result_masked, expected_masked);
986        }
987    }
988
989    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
990    unsafe fn test_mm256_mask_gf2p8mul_epi8() {
991        let (left, right, _expected) = generate_byte_mul_test_data();
992
993        for i in 0..NUM_TEST_WORDS_256 {
994            let left = load_m256i_word(&left, i);
995            let right = load_m256i_word(&right, i);
996            let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
997            assert_eq_m256i(result_left, left);
998            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
999            const MASK_WORDS: i32 = 0b01_10_11_00;
1000            let expected_result = _mm256_gf2p8mul_epi8(left, right);
1001            let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
1002            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1003            assert_eq_m256i(result_masked, expected_masked);
1004        }
1005    }
1006
1007    #[simd_test(enable = "gfni")]
1008    unsafe fn test_mm_gf2p8mul_epi8() {
1009        let (left, right, expected) = generate_byte_mul_test_data();
1010
1011        for i in 0..NUM_TEST_WORDS_128 {
1012            let left = load_m128i_word(&left, i);
1013            let right = load_m128i_word(&right, i);
1014            let expected = load_m128i_word(&expected, i);
1015            let result = _mm_gf2p8mul_epi8(left, right);
1016            assert_eq_m128i(result, expected);
1017        }
1018    }
1019
1020    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1021    unsafe fn test_mm_maskz_gf2p8mul_epi8() {
1022        let (left, right, _expected) = generate_byte_mul_test_data();
1023
1024        for i in 0..NUM_TEST_WORDS_128 {
1025            let left = load_m128i_word(&left, i);
1026            let right = load_m128i_word(&right, i);
1027            let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
1028            assert_eq_m128i(result_zero, _mm_setzero_si128());
1029            let mask_bytes: __mmask16 = 0x0F_F0;
1030            const MASK_WORDS: i32 = 0b01_10;
1031            let expected_result = _mm_gf2p8mul_epi8(left, right);
1032            let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
1033            let expected_masked =
1034                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1035            assert_eq_m128i(result_masked, expected_masked);
1036        }
1037    }
1038
1039    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1040    unsafe fn test_mm_mask_gf2p8mul_epi8() {
1041        let (left, right, _expected) = generate_byte_mul_test_data();
1042
1043        for i in 0..NUM_TEST_WORDS_128 {
1044            let left = load_m128i_word(&left, i);
1045            let right = load_m128i_word(&right, i);
1046            let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
1047            assert_eq_m128i(result_left, left);
1048            let mask_bytes: __mmask16 = 0x0F_F0;
1049            const MASK_WORDS: i32 = 0b01_10;
1050            let expected_result = _mm_gf2p8mul_epi8(left, right);
1051            let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
1052            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1053            assert_eq_m128i(result_masked, expected_masked);
1054        }
1055    }
1056
1057    #[simd_test(enable = "gfni,avx512f")]
1058    unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
1059        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1060        const IDENTITY_BYTE: i32 = 0;
1061        let constant: i64 = 0;
1062        const CONSTANT_BYTE: i32 = 0x63;
1063        let identity = _mm512_set1_epi64(identity);
1064        let constant = _mm512_set1_epi64(constant);
1065        let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);
1066
1067        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1068        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1069
1070        for i in 0..NUM_TEST_WORDS_512 {
1071            let data = load_m512i_word(&bytes, i);
1072            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1073            assert_eq_m512i(result, data);
1074            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1075            assert_eq_m512i(result, constant_reference);
1076            let data = load_m512i_word(&more_bytes, i);
1077            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1078            assert_eq_m512i(result, data);
1079            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1080            assert_eq_m512i(result, constant_reference);
1081
1082            let matrix = load_m512i_word(&matrices, i);
1083            let vector = load_m512i_word(&vectors, i);
1084            let reference = load_m512i_word(&references, i);
1085
1086            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1087            assert_eq_m512i(result, reference);
1088        }
1089    }
1090
1091    #[simd_test(enable = "gfni,avx512bw")]
1092    unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
1093        const CONSTANT_BYTE: i32 = 0x63;
1094        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1095
1096        for i in 0..NUM_TEST_WORDS_512 {
1097            let matrix = load_m512i_word(&matrices, i);
1098            let vector = load_m512i_word(&vectors, i);
1099            let result_zero =
1100                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1101            assert_eq_m512i(result_zero, _mm512_setzero_si512());
1102            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1103            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1104            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1105            let result_masked =
1106                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1107            let expected_masked =
1108                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1109            assert_eq_m512i(result_masked, expected_masked);
1110        }
1111    }
1112
1113    #[simd_test(enable = "gfni,avx512bw")]
1114    unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
1115        const CONSTANT_BYTE: i32 = 0x63;
1116        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1117
1118        for i in 0..NUM_TEST_WORDS_512 {
1119            let left = load_m512i_word(&vectors, i);
1120            let right = load_m512i_word(&matrices, i);
1121            let result_left =
1122                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1123            assert_eq_m512i(result_left, left);
1124            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1125            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1126            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1127            let result_masked =
1128                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1129            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1130            assert_eq_m512i(result_masked, expected_masked);
1131        }
1132    }
1133
1134    #[simd_test(enable = "gfni,avx")]
1135    unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
1136        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1137        const IDENTITY_BYTE: i32 = 0;
1138        let constant: i64 = 0;
1139        const CONSTANT_BYTE: i32 = 0x63;
1140        let identity = _mm256_set1_epi64x(identity);
1141        let constant = _mm256_set1_epi64x(constant);
1142        let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);
1143
1144        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1145        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1146
1147        for i in 0..NUM_TEST_WORDS_256 {
1148            let data = load_m256i_word(&bytes, i);
1149            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1150            assert_eq_m256i(result, data);
1151            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1152            assert_eq_m256i(result, constant_reference);
1153            let data = load_m256i_word(&more_bytes, i);
1154            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1155            assert_eq_m256i(result, data);
1156            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1157            assert_eq_m256i(result, constant_reference);
1158
1159            let matrix = load_m256i_word(&matrices, i);
1160            let vector = load_m256i_word(&vectors, i);
1161            let reference = load_m256i_word(&references, i);
1162
1163            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1164            assert_eq_m256i(result, reference);
1165        }
1166    }
1167
1168    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1169    unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
1170        const CONSTANT_BYTE: i32 = 0x63;
1171        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1172
1173        for i in 0..NUM_TEST_WORDS_256 {
1174            let matrix = load_m256i_word(&matrices, i);
1175            let vector = load_m256i_word(&vectors, i);
1176            let result_zero =
1177                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1178            assert_eq_m256i(result_zero, _mm256_setzero_si256());
1179            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1180            const MASK_WORDS: i32 = 0b11_01_10_00;
1181            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1182            let result_masked =
1183                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1184            let expected_masked =
1185                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1186            assert_eq_m256i(result_masked, expected_masked);
1187        }
1188    }
1189
1190    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1191    unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
1192        const CONSTANT_BYTE: i32 = 0x63;
1193        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1194
1195        for i in 0..NUM_TEST_WORDS_256 {
1196            let left = load_m256i_word(&vectors, i);
1197            let right = load_m256i_word(&matrices, i);
1198            let result_left =
1199                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1200            assert_eq_m256i(result_left, left);
1201            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1202            const MASK_WORDS: i32 = 0b11_01_10_00;
1203            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1204            let result_masked =
1205                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1206            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1207            assert_eq_m256i(result_masked, expected_masked);
1208        }
1209    }
1210
1211    #[simd_test(enable = "gfni")]
1212    unsafe fn test_mm_gf2p8affine_epi64_epi8() {
1213        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1214        const IDENTITY_BYTE: i32 = 0;
1215        let constant: i64 = 0;
1216        const CONSTANT_BYTE: i32 = 0x63;
1217        let identity = _mm_set1_epi64x(identity);
1218        let constant = _mm_set1_epi64x(constant);
1219        let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);
1220
1221        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1222        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1223
1224        for i in 0..NUM_TEST_WORDS_128 {
1225            let data = load_m128i_word(&bytes, i);
1226            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1227            assert_eq_m128i(result, data);
1228            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1229            assert_eq_m128i(result, constant_reference);
1230            let data = load_m128i_word(&more_bytes, i);
1231            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1232            assert_eq_m128i(result, data);
1233            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1234            assert_eq_m128i(result, constant_reference);
1235
1236            let matrix = load_m128i_word(&matrices, i);
1237            let vector = load_m128i_word(&vectors, i);
1238            let reference = load_m128i_word(&references, i);
1239
1240            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1241            assert_eq_m128i(result, reference);
1242        }
1243    }
1244
1245    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1246    unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
1247        const CONSTANT_BYTE: i32 = 0x63;
1248        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1249
1250        for i in 0..NUM_TEST_WORDS_128 {
1251            let matrix = load_m128i_word(&matrices, i);
1252            let vector = load_m128i_word(&vectors, i);
1253            let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1254            assert_eq_m128i(result_zero, _mm_setzero_si128());
1255            let mask_bytes: __mmask16 = 0x0F_F0;
1256            const MASK_WORDS: i32 = 0b01_10;
1257            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1258            let result_masked =
1259                _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1260            let expected_masked =
1261                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1262            assert_eq_m128i(result_masked, expected_masked);
1263        }
1264    }
1265
1266    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1267    unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
1268        const CONSTANT_BYTE: i32 = 0x63;
1269        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1270
1271        for i in 0..NUM_TEST_WORDS_128 {
1272            let left = load_m128i_word(&vectors, i);
1273            let right = load_m128i_word(&matrices, i);
1274            let result_left =
1275                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1276            assert_eq_m128i(result_left, left);
1277            let mask_bytes: __mmask16 = 0x0F_F0;
1278            const MASK_WORDS: i32 = 0b01_10;
1279            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1280            let result_masked =
1281                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1282            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1283            assert_eq_m128i(result_masked, expected_masked);
1284        }
1285    }
1286
1287    #[simd_test(enable = "gfni,avx512f")]
1288    unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
1289        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1290        const IDENTITY_BYTE: i32 = 0;
1291        const CONSTANT_BYTE: i32 = 0x63;
1292        let identity = _mm512_set1_epi64(identity);
1293
1294        // validate inversion
1295        let (inputs, results) = generate_inv_tests_data();
1296
1297        for i in 0..NUM_BYTES_WORDS_512 {
1298            let input = load_m512i_word(&inputs, i);
1299            let reference = load_m512i_word(&results, i);
1300            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1301            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
1302            assert_eq_m512i(remultiplied, reference);
1303        }
1304
1305        // validate subsequent affine operation
1306        let (matrices, vectors, _affine_expected) =
1307            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1308
1309        for i in 0..NUM_TEST_WORDS_512 {
1310            let vector = load_m512i_word(&vectors, i);
1311            let matrix = load_m512i_word(&matrices, i);
1312
1313            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1314            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1315            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1316            assert_eq_m512i(result, reference);
1317        }
1318
1319        // validate everything by virtue of checking against the AES SBox
1320        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1321        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);
1322
1323        for i in 0..NUM_BYTES_WORDS_512 {
1324            let reference = load_m512i_word(&AES_S_BOX, i);
1325            let input = load_m512i_word(&inputs, i);
1326            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1327            assert_eq_m512i(result, reference);
1328        }
1329    }
1330
1331    #[simd_test(enable = "gfni,avx512bw")]
1332    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
1333        const CONSTANT_BYTE: i32 = 0x63;
1334        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1335
1336        for i in 0..NUM_TEST_WORDS_512 {
1337            let matrix = load_m512i_word(&matrices, i);
1338            let vector = load_m512i_word(&vectors, i);
1339            let result_zero =
1340                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1341            assert_eq_m512i(result_zero, _mm512_setzero_si512());
1342            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1343            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1344            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1345            let result_masked =
1346                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1347            let expected_masked =
1348                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1349            assert_eq_m512i(result_masked, expected_masked);
1350        }
1351    }
1352
1353    #[simd_test(enable = "gfni,avx512bw")]
1354    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
1355        const CONSTANT_BYTE: i32 = 0x63;
1356        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1357
1358        for i in 0..NUM_TEST_WORDS_512 {
1359            let left = load_m512i_word(&vectors, i);
1360            let right = load_m512i_word(&matrices, i);
1361            let result_left =
1362                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1363            assert_eq_m512i(result_left, left);
1364            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1365            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1366            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1367            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1368                left, mask_bytes, left, right,
1369            );
1370            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1371            assert_eq_m512i(result_masked, expected_masked);
1372        }
1373    }
1374
1375    #[simd_test(enable = "gfni,avx")]
1376    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
1377        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1378        const IDENTITY_BYTE: i32 = 0;
1379        const CONSTANT_BYTE: i32 = 0x63;
1380        let identity = _mm256_set1_epi64x(identity);
1381
1382        // validate inversion
1383        let (inputs, results) = generate_inv_tests_data();
1384
1385        for i in 0..NUM_BYTES_WORDS_256 {
1386            let input = load_m256i_word(&inputs, i);
1387            let reference = load_m256i_word(&results, i);
1388            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1389            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
1390            assert_eq_m256i(remultiplied, reference);
1391        }
1392
1393        // validate subsequent affine operation
1394        let (matrices, vectors, _affine_expected) =
1395            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1396
1397        for i in 0..NUM_TEST_WORDS_256 {
1398            let vector = load_m256i_word(&vectors, i);
1399            let matrix = load_m256i_word(&matrices, i);
1400
1401            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1402            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1403            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1404            assert_eq_m256i(result, reference);
1405        }
1406
1407        // validate everything by virtue of checking against the AES SBox
1408        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1409        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);
1410
1411        for i in 0..NUM_BYTES_WORDS_256 {
1412            let reference = load_m256i_word(&AES_S_BOX, i);
1413            let input = load_m256i_word(&inputs, i);
1414            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1415            assert_eq_m256i(result, reference);
1416        }
1417    }
1418
1419    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1420    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
1421        const CONSTANT_BYTE: i32 = 0x63;
1422        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1423
1424        for i in 0..NUM_TEST_WORDS_256 {
1425            let matrix = load_m256i_word(&matrices, i);
1426            let vector = load_m256i_word(&vectors, i);
1427            let result_zero =
1428                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1429            assert_eq_m256i(result_zero, _mm256_setzero_si256());
1430            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1431            const MASK_WORDS: i32 = 0b11_01_10_00;
1432            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1433            let result_masked =
1434                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1435            let expected_masked =
1436                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1437            assert_eq_m256i(result_masked, expected_masked);
1438        }
1439    }
1440
1441    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1442    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
1443        const CONSTANT_BYTE: i32 = 0x63;
1444        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1445
1446        for i in 0..NUM_TEST_WORDS_256 {
1447            let left = load_m256i_word(&vectors, i);
1448            let right = load_m256i_word(&matrices, i);
1449            let result_left =
1450                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1451            assert_eq_m256i(result_left, left);
1452            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1453            const MASK_WORDS: i32 = 0b11_01_10_00;
1454            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1455            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1456                left, mask_bytes, left, right,
1457            );
1458            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1459            assert_eq_m256i(result_masked, expected_masked);
1460        }
1461    }
1462
1463    #[simd_test(enable = "gfni")]
1464    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
1465        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1466        const IDENTITY_BYTE: i32 = 0;
1467        const CONSTANT_BYTE: i32 = 0x63;
1468        let identity = _mm_set1_epi64x(identity);
1469
1470        // validate inversion
1471        let (inputs, results) = generate_inv_tests_data();
1472
1473        for i in 0..NUM_BYTES_WORDS_128 {
1474            let input = load_m128i_word(&inputs, i);
1475            let reference = load_m128i_word(&results, i);
1476            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1477            let remultiplied = _mm_gf2p8mul_epi8(result, input);
1478            assert_eq_m128i(remultiplied, reference);
1479        }
1480
1481        // validate subsequent affine operation
1482        let (matrices, vectors, _affine_expected) =
1483            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1484
1485        for i in 0..NUM_TEST_WORDS_128 {
1486            let vector = load_m128i_word(&vectors, i);
1487            let matrix = load_m128i_word(&matrices, i);
1488
1489            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1490            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1491            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1492            assert_eq_m128i(result, reference);
1493        }
1494
1495        // validate everything by virtue of checking against the AES SBox
1496        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1497        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);
1498
1499        for i in 0..NUM_BYTES_WORDS_128 {
1500            let reference = load_m128i_word(&AES_S_BOX, i);
1501            let input = load_m128i_word(&inputs, i);
1502            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1503            assert_eq_m128i(result, reference);
1504        }
1505    }
1506
1507    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1508    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
1509        const CONSTANT_BYTE: i32 = 0x63;
1510        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1511
1512        for i in 0..NUM_TEST_WORDS_128 {
1513            let matrix = load_m128i_word(&matrices, i);
1514            let vector = load_m128i_word(&vectors, i);
1515            let result_zero =
1516                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1517            assert_eq_m128i(result_zero, _mm_setzero_si128());
1518            let mask_bytes: __mmask16 = 0x0F_F0;
1519            const MASK_WORDS: i32 = 0b01_10;
1520            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1521            let result_masked =
1522                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1523            let expected_masked =
1524                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1525            assert_eq_m128i(result_masked, expected_masked);
1526        }
1527    }
1528
1529    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1530    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
1531        const CONSTANT_BYTE: i32 = 0x63;
1532        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1533
1534        for i in 0..NUM_TEST_WORDS_128 {
1535            let left = load_m128i_word(&vectors, i);
1536            let right = load_m128i_word(&matrices, i);
1537            let result_left =
1538                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1539            assert_eq_m128i(result_left, left);
1540            let mask_bytes: __mmask16 = 0x0F_F0;
1541            const MASK_WORDS: i32 = 0b01_10;
1542            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1543            let result_masked =
1544                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1545            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1546            assert_eq_m128i(result_masked, expected_masked);
1547        }
1548    }
1549}