// core/stdarch/crates/core_arch/src/x86/gfni.rs

1//! Galois Field New Instructions (GFNI)
2//!
3//! The intrinsics here correspond to those in the `immintrin.h` C header.
4//!
5//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
6//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
7//!
8//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
9
10use crate::core_arch::simd::i8x16;
11use crate::core_arch::simd::i8x32;
12use crate::core_arch::simd::i8x64;
13use crate::core_arch::x86::__m128i;
14use crate::core_arch::x86::__m256i;
15use crate::core_arch::x86::__m512i;
16use crate::core_arch::x86::__mmask16;
17use crate::core_arch::x86::__mmask32;
18use crate::core_arch::x86::__mmask64;
19use crate::intrinsics::simd::simd_select_bitmask;
20use crate::mem::transmute;
21
22#[cfg(test)]
23use stdarch_test::assert_instr;
24
// Raw LLVM intrinsics backing the GFNI instruction family. The signatures and
// link names must match LLVM's builtin declarations exactly; all wrappers below
// funnel into these.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // VGF2P8AFFINEINVQB: affine transform of the GF(2^8) multiplicative
    // inverse of each byte (512/256/128-bit forms).
    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    // VGF2P8AFFINEQB: affine transform a*x + b of each byte
    // (512/256/128-bit forms); `imm8` is the constant term b.
    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    // VGF2P8MULB: byte-wise GF(2^8) multiplication (512/256/128-bit forms).
    #[link_name = "llvm.x86.vgf2p8mulb.512"]
    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8mulb.256"]
    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8mulb.128"]
    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
}
46
47// LLVM requires AVX512BW for a lot of these instructions, see
48// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
49// however our tests also require the target feature list to match Intel's
50// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F
51// requirement (for now)
52// also see
53// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
54// for forcing GFNI, BW and optionally VL extension
55
56/// Performs a multiplication in GF(2^8) on the packed bytes.
57/// The field is in polynomial representation with the reduction polynomial
58///  x^8 + x^4 + x^3 + x + 1.
59///
60/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
61#[inline]
62#[target_feature(enable = "gfni,avx512f")]
63#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
64#[cfg_attr(test, assert_instr(vgf2p8mulb))]
65pub fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
66    unsafe { transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64())) }
67}
68
69/// Performs a multiplication in GF(2^8) on the packed bytes.
70/// The field is in polynomial representation with the reduction polynomial
71///  x^8 + x^4 + x^3 + x + 1.
72///
73/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
74/// Otherwise the computation result is written into the result.
75///
76/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
77#[inline]
78#[target_feature(enable = "gfni,avx512bw,avx512f")]
79#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
80#[cfg_attr(test, assert_instr(vgf2p8mulb))]
81pub fn _mm512_mask_gf2p8mul_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
82    unsafe {
83        transmute(simd_select_bitmask(
84            k,
85            vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
86            src.as_i8x64(),
87        ))
88    }
89}
90
91/// Performs a multiplication in GF(2^8) on the packed bytes.
92/// The field is in polynomial representation with the reduction polynomial
93///  x^8 + x^4 + x^3 + x + 1.
94///
95/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
96/// Otherwise the computation result is written into the result.
97///
98/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
99#[inline]
100#[target_feature(enable = "gfni,avx512bw,avx512f")]
101#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
102#[cfg_attr(test, assert_instr(vgf2p8mulb))]
103pub fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
104    let zero = i8x64::ZERO;
105    unsafe {
106        transmute(simd_select_bitmask(
107            k,
108            vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
109            zero,
110        ))
111    }
112}
113
114/// Performs a multiplication in GF(2^8) on the packed bytes.
115/// The field is in polynomial representation with the reduction polynomial
116///  x^8 + x^4 + x^3 + x + 1.
117///
118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
119#[inline]
120#[target_feature(enable = "gfni,avx")]
121#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
122#[cfg_attr(test, assert_instr(vgf2p8mulb))]
123pub fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
124    unsafe { transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32())) }
125}
126
127/// Performs a multiplication in GF(2^8) on the packed bytes.
128/// The field is in polynomial representation with the reduction polynomial
129///  x^8 + x^4 + x^3 + x + 1.
130///
131/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
132/// Otherwise the computation result is written into the result.
133///
134/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
135#[inline]
136#[target_feature(enable = "gfni,avx512bw,avx512vl")]
137#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
138#[cfg_attr(test, assert_instr(vgf2p8mulb))]
139pub fn _mm256_mask_gf2p8mul_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
140    unsafe {
141        transmute(simd_select_bitmask(
142            k,
143            vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
144            src.as_i8x32(),
145        ))
146    }
147}
148
149/// Performs a multiplication in GF(2^8) on the packed bytes.
150/// The field is in polynomial representation with the reduction polynomial
151///  x^8 + x^4 + x^3 + x + 1.
152///
153/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
154/// Otherwise the computation result is written into the result.
155///
156/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
157#[inline]
158#[target_feature(enable = "gfni,avx512bw,avx512vl")]
159#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
160#[cfg_attr(test, assert_instr(vgf2p8mulb))]
161pub fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
162    let zero = i8x32::ZERO;
163    unsafe {
164        transmute(simd_select_bitmask(
165            k,
166            vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
167            zero,
168        ))
169    }
170}
171
172/// Performs a multiplication in GF(2^8) on the packed bytes.
173/// The field is in polynomial representation with the reduction polynomial
174///  x^8 + x^4 + x^3 + x + 1.
175///
176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
177#[inline]
178#[target_feature(enable = "gfni")]
179#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
180#[cfg_attr(test, assert_instr(gf2p8mulb))]
181pub fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
182    unsafe { transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16())) }
183}
184
185/// Performs a multiplication in GF(2^8) on the packed bytes.
186/// The field is in polynomial representation with the reduction polynomial
187///  x^8 + x^4 + x^3 + x + 1.
188///
189/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
190/// Otherwise the computation result is written into the result.
191///
192/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
193#[inline]
194#[target_feature(enable = "gfni,avx512bw,avx512vl")]
195#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
196#[cfg_attr(test, assert_instr(vgf2p8mulb))]
197pub fn _mm_mask_gf2p8mul_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
198    unsafe {
199        transmute(simd_select_bitmask(
200            k,
201            vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
202            src.as_i8x16(),
203        ))
204    }
205}
206
207/// Performs a multiplication in GF(2^8) on the packed bytes.
208/// The field is in polynomial representation with the reduction polynomial
209///  x^8 + x^4 + x^3 + x + 1.
210///
211/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
212/// Otherwise the computation result is written into the result.
213///
214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
215#[inline]
216#[target_feature(enable = "gfni,avx512bw,avx512vl")]
217#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
218#[cfg_attr(test, assert_instr(vgf2p8mulb))]
219pub fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
220    unsafe {
221        let zero = i8x16::ZERO;
222        transmute(simd_select_bitmask(
223            k,
224            vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
225            zero,
226        ))
227    }
228}
229
230/// Performs an affine transformation on the packed bytes in x.
231/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
232/// and b being a constant 8-bit immediate value.
233/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
234///
235/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8)
236#[inline]
237#[target_feature(enable = "gfni,avx512f")]
238#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
239#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
240#[rustc_legacy_const_generics(2)]
241pub fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
242    static_assert_uimm_bits!(B, 8);
243    let b = B as u8;
244    let x = x.as_i8x64();
245    let a = a.as_i8x64();
246    unsafe {
247        let r = vgf2p8affineqb_512(x, a, b);
248        transmute(r)
249    }
250}
251
252/// Performs an affine transformation on the packed bytes in x.
253/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
254/// and b being a constant 8-bit immediate value.
255/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
256///
257/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
258/// Otherwise the computation result is written into the result.
259///
260/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8)
261#[inline]
262#[target_feature(enable = "gfni,avx512bw,avx512f")]
263#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
264#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
265#[rustc_legacy_const_generics(3)]
266pub fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
267    k: __mmask64,
268    x: __m512i,
269    a: __m512i,
270) -> __m512i {
271    static_assert_uimm_bits!(B, 8);
272    let b = B as u8;
273    let zero = i8x64::ZERO;
274    let x = x.as_i8x64();
275    let a = a.as_i8x64();
276    unsafe {
277        let r = vgf2p8affineqb_512(x, a, b);
278        transmute(simd_select_bitmask(k, r, zero))
279    }
280}
281
282/// Performs an affine transformation on the packed bytes in x.
283/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
284/// and b being a constant 8-bit immediate value.
285/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
286///
287/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
288/// Otherwise the computation result is written into the result.
289///
290/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8)
291#[inline]
292#[target_feature(enable = "gfni,avx512bw,avx512f")]
293#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
294#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
295#[rustc_legacy_const_generics(4)]
296pub fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
297    src: __m512i,
298    k: __mmask64,
299    x: __m512i,
300    a: __m512i,
301) -> __m512i {
302    static_assert_uimm_bits!(B, 8);
303    let b = B as u8;
304    let x = x.as_i8x64();
305    let a = a.as_i8x64();
306    unsafe {
307        let r = vgf2p8affineqb_512(x, a, b);
308        transmute(simd_select_bitmask(k, r, src.as_i8x64()))
309    }
310}
311
312/// Performs an affine transformation on the packed bytes in x.
313/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
314/// and b being a constant 8-bit immediate value.
315/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
316///
317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8)
318#[inline]
319#[target_feature(enable = "gfni,avx")]
320#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
321#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
322#[rustc_legacy_const_generics(2)]
323pub fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
324    static_assert_uimm_bits!(B, 8);
325    let b = B as u8;
326    let x = x.as_i8x32();
327    let a = a.as_i8x32();
328    unsafe {
329        let r = vgf2p8affineqb_256(x, a, b);
330        transmute(r)
331    }
332}
333
334/// Performs an affine transformation on the packed bytes in x.
335/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
336/// and b being a constant 8-bit immediate value.
337/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
338///
339/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
340/// Otherwise the computation result is written into the result.
341///
342/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8)
343#[inline]
344#[target_feature(enable = "gfni,avx512bw,avx512vl")]
345#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
346#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
347#[rustc_legacy_const_generics(3)]
348pub fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
349    k: __mmask32,
350    x: __m256i,
351    a: __m256i,
352) -> __m256i {
353    static_assert_uimm_bits!(B, 8);
354    let b = B as u8;
355    let zero = i8x32::ZERO;
356    let x = x.as_i8x32();
357    let a = a.as_i8x32();
358    unsafe {
359        let r = vgf2p8affineqb_256(x, a, b);
360        transmute(simd_select_bitmask(k, r, zero))
361    }
362}
363
364/// Performs an affine transformation on the packed bytes in x.
365/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
366/// and b being a constant 8-bit immediate value.
367/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
368///
369/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
370/// Otherwise the computation result is written into the result.
371///
372/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8)
373#[inline]
374#[target_feature(enable = "gfni,avx512bw,avx512vl")]
375#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
376#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
377#[rustc_legacy_const_generics(4)]
378pub fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
379    src: __m256i,
380    k: __mmask32,
381    x: __m256i,
382    a: __m256i,
383) -> __m256i {
384    static_assert_uimm_bits!(B, 8);
385    let b = B as u8;
386    let x = x.as_i8x32();
387    let a = a.as_i8x32();
388    unsafe {
389        let r = vgf2p8affineqb_256(x, a, b);
390        transmute(simd_select_bitmask(k, r, src.as_i8x32()))
391    }
392}
393
394/// Performs an affine transformation on the packed bytes in x.
395/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
396/// and b being a constant 8-bit immediate value.
397/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
398///
399/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8)
400#[inline]
401#[target_feature(enable = "gfni")]
402#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
403#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
404#[rustc_legacy_const_generics(2)]
405pub fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
406    static_assert_uimm_bits!(B, 8);
407    let b = B as u8;
408    let x = x.as_i8x16();
409    let a = a.as_i8x16();
410    unsafe {
411        let r = vgf2p8affineqb_128(x, a, b);
412        transmute(r)
413    }
414}
415
416/// Performs an affine transformation on the packed bytes in x.
417/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
418/// and b being a constant 8-bit immediate value.
419/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
420///
421/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
422/// Otherwise the computation result is written into the result.
423///
424/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8)
425#[inline]
426#[target_feature(enable = "gfni,avx512bw,avx512vl")]
427#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
428#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
429#[rustc_legacy_const_generics(3)]
430pub fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
431    k: __mmask16,
432    x: __m128i,
433    a: __m128i,
434) -> __m128i {
435    static_assert_uimm_bits!(B, 8);
436    let b = B as u8;
437    let zero = i8x16::ZERO;
438    let x = x.as_i8x16();
439    let a = a.as_i8x16();
440    unsafe {
441        let r = vgf2p8affineqb_128(x, a, b);
442        transmute(simd_select_bitmask(k, r, zero))
443    }
444}
445
446/// Performs an affine transformation on the packed bytes in x.
447/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
448/// and b being a constant 8-bit immediate value.
449/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
450///
451/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
452/// Otherwise the computation result is written into the result.
453///
454/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8)
455#[inline]
456#[target_feature(enable = "gfni,avx512bw,avx512vl")]
457#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
458#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
459#[rustc_legacy_const_generics(4)]
460pub fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
461    src: __m128i,
462    k: __mmask16,
463    x: __m128i,
464    a: __m128i,
465) -> __m128i {
466    static_assert_uimm_bits!(B, 8);
467    let b = B as u8;
468    let x = x.as_i8x16();
469    let a = a.as_i8x16();
470    unsafe {
471        let r = vgf2p8affineqb_128(x, a, b);
472        transmute(simd_select_bitmask(k, r, src.as_i8x16()))
473    }
474}
475
476/// Performs an affine transformation on the inverted packed bytes in x.
477/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
478/// and b being a constant 8-bit immediate value.
479/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
480/// The inverse of 0 is 0.
481/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
482///
483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
484#[inline]
485#[target_feature(enable = "gfni,avx512f")]
486#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
487#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
488#[rustc_legacy_const_generics(2)]
489pub fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
490    static_assert_uimm_bits!(B, 8);
491    let b = B as u8;
492    let x = x.as_i8x64();
493    let a = a.as_i8x64();
494    unsafe {
495        let r = vgf2p8affineinvqb_512(x, a, b);
496        transmute(r)
497    }
498}
499
500/// Performs an affine transformation on the inverted packed bytes in x.
501/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
502/// and b being a constant 8-bit immediate value.
503/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
504/// The inverse of 0 is 0.
505/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
506///
507/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
508/// Otherwise the computation result is written into the result.
509///
510/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
511#[inline]
512#[target_feature(enable = "gfni,avx512bw,avx512f")]
513#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
514#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
515#[rustc_legacy_const_generics(3)]
516pub fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
517    k: __mmask64,
518    x: __m512i,
519    a: __m512i,
520) -> __m512i {
521    static_assert_uimm_bits!(B, 8);
522    let b = B as u8;
523    let zero = i8x64::ZERO;
524    let x = x.as_i8x64();
525    let a = a.as_i8x64();
526    unsafe {
527        let r = vgf2p8affineinvqb_512(x, a, b);
528        transmute(simd_select_bitmask(k, r, zero))
529    }
530}
531
532/// Performs an affine transformation on the inverted packed bytes in x.
533/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
534/// and b being a constant 8-bit immediate value.
535/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
536/// The inverse of 0 is 0.
537/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
538///
539/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
540/// Otherwise the computation result is written into the result.
541///
542/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
543#[inline]
544#[target_feature(enable = "gfni,avx512bw,avx512f")]
545#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
546#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
547#[rustc_legacy_const_generics(4)]
548pub fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
549    src: __m512i,
550    k: __mmask64,
551    x: __m512i,
552    a: __m512i,
553) -> __m512i {
554    static_assert_uimm_bits!(B, 8);
555    let b = B as u8;
556    let x = x.as_i8x64();
557    let a = a.as_i8x64();
558    unsafe {
559        let r = vgf2p8affineinvqb_512(x, a, b);
560        transmute(simd_select_bitmask(k, r, src.as_i8x64()))
561    }
562}
563
564/// Performs an affine transformation on the inverted packed bytes in x.
565/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
566/// and b being a constant 8-bit immediate value.
567/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
568/// The inverse of 0 is 0.
569/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
570///
571/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
572#[inline]
573#[target_feature(enable = "gfni,avx")]
574#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
575#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
576#[rustc_legacy_const_generics(2)]
577pub fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
578    static_assert_uimm_bits!(B, 8);
579    let b = B as u8;
580    let x = x.as_i8x32();
581    let a = a.as_i8x32();
582    unsafe {
583        let r = vgf2p8affineinvqb_256(x, a, b);
584        transmute(r)
585    }
586}
587
588/// Performs an affine transformation on the inverted packed bytes in x.
589/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
590/// and b being a constant 8-bit immediate value.
591/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
592/// The inverse of 0 is 0.
593/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
594///
595/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
596/// Otherwise the computation result is written into the result.
597///
598/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
599#[inline]
600#[target_feature(enable = "gfni,avx512bw,avx512vl")]
601#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
602#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
603#[rustc_legacy_const_generics(3)]
604pub fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
605    k: __mmask32,
606    x: __m256i,
607    a: __m256i,
608) -> __m256i {
609    static_assert_uimm_bits!(B, 8);
610    let b = B as u8;
611    let zero = i8x32::ZERO;
612    let x = x.as_i8x32();
613    let a = a.as_i8x32();
614    unsafe {
615        let r = vgf2p8affineinvqb_256(x, a, b);
616        transmute(simd_select_bitmask(k, r, zero))
617    }
618}
619
620/// Performs an affine transformation on the inverted packed bytes in x.
621/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
622/// and b being a constant 8-bit immediate value.
623/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
624/// The inverse of 0 is 0.
625/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
626///
627/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
628/// Otherwise the computation result is written into the result.
629///
630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
631#[inline]
632#[target_feature(enable = "gfni,avx512bw,avx512vl")]
633#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
634#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
635#[rustc_legacy_const_generics(4)]
636pub fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
637    src: __m256i,
638    k: __mmask32,
639    x: __m256i,
640    a: __m256i,
641) -> __m256i {
642    static_assert_uimm_bits!(B, 8);
643    let b = B as u8;
644    let x = x.as_i8x32();
645    let a = a.as_i8x32();
646    unsafe {
647        let r = vgf2p8affineinvqb_256(x, a, b);
648        transmute(simd_select_bitmask(k, r, src.as_i8x32()))
649    }
650}
651
652/// Performs an affine transformation on the inverted packed bytes in x.
653/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
654/// and b being a constant 8-bit immediate value.
655/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
656/// The inverse of 0 is 0.
657/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
658///
659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
660#[inline]
661#[target_feature(enable = "gfni")]
662#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
663#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))]
664#[rustc_legacy_const_generics(2)]
665pub fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
666    static_assert_uimm_bits!(B, 8);
667    let b = B as u8;
668    let x = x.as_i8x16();
669    let a = a.as_i8x16();
670    unsafe {
671        let r = vgf2p8affineinvqb_128(x, a, b);
672        transmute(r)
673    }
674}
675
676/// Performs an affine transformation on the inverted packed bytes in x.
677/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
678/// and b being a constant 8-bit immediate value.
679/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
680/// The inverse of 0 is 0.
681/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
682///
683/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
684/// Otherwise the computation result is written into the result.
685///
686/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
687#[inline]
688#[target_feature(enable = "gfni,avx512bw,avx512vl")]
689#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
690#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
691#[rustc_legacy_const_generics(3)]
692pub fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
693    k: __mmask16,
694    x: __m128i,
695    a: __m128i,
696) -> __m128i {
697    static_assert_uimm_bits!(B, 8);
698    let b = B as u8;
699    let zero = i8x16::ZERO;
700    let x = x.as_i8x16();
701    let a = a.as_i8x16();
702    unsafe {
703        let r = vgf2p8affineinvqb_128(x, a, b);
704        transmute(simd_select_bitmask(k, r, zero))
705    }
706}
707
/// Performs an affine transformation on the inverted packed bytes in x.
/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    // The affine constant byte `b` must fit in 8 bits.
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    unsafe {
        let r = vgf2p8affineinvqb_128(x, a, b);
        // Keep computed lanes where `k` is set; copy from `src` elsewhere.
        transmute(simd_select_bitmask(k, r, src.as_i8x16()))
    }
}
739
740#[cfg(test)]
741mod tests {
742    // The constants in the tests below are just bit patterns. They should not
743    // be interpreted as integers; signedness does not make sense for them, but
744    // __mXXXi happens to be defined in terms of signed integers.
745    #![allow(overflowing_literals)]
746
747    use core::hint::black_box;
748    use stdarch_test::simd_test;
749
750    use crate::core_arch::x86::*;
751
752    fn mulbyte(left: u8, right: u8) -> u8 {
753        // this implementation follows the description in
754        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
755        const REDUCTION_POLYNOMIAL: u16 = 0x11b;
756        let left: u16 = left.into();
757        let right: u16 = right.into();
758        let mut carryless_product: u16 = 0;
759
760        // Carryless multiplication
761        for i in 0..8 {
762            if ((left >> i) & 0x01) != 0 {
763                carryless_product ^= right << i;
764            }
765        }
766
767        // reduction, adding in "0" where appropriate to clear out high bits
768        // note that REDUCTION_POLYNOMIAL is zero in this context
769        for i in (8..=14).rev() {
770            if ((carryless_product >> i) & 0x01) != 0 {
771                carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
772            }
773        }
774
775        carryless_product as u8
776    }
777
    // Test-set sizes, expressed as the number of SIMD words of each width
    // needed to cover the same NUM_TEST_ENTRIES bytes of data.
    const NUM_TEST_WORDS_512: usize = 4;
    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
    // Total bytes in the generated test vectors: 4 * 64 = 256.
    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
    // One pass over every possible byte value, split per SIMD word width.
    const NUM_BYTES: usize = 256;
    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
787
788    fn parity(input: u8) -> u8 {
789        let mut accumulator = 0;
790        for i in 0..8 {
791            accumulator ^= (input >> i) & 0x01;
792        }
793        accumulator
794    }
795
796    fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
797        // this implementation follows the description in
798        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
799        let mut accumulator = 0;
800
801        for bit in 0..8 {
802            accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
803        }
804
805        accumulator ^ b
806    }
807
808    fn generate_affine_mul_test_data(
809        immediate: u8,
810    ) -> (
811        [u64; NUM_TEST_WORDS_64],
812        [u8; NUM_TEST_ENTRIES],
813        [u8; NUM_TEST_ENTRIES],
814    ) {
815        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
816        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
817        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
818
819        for i in 0..NUM_TEST_WORDS_64 {
820            left[i] = (i as u64) * 103 * 101;
821            for j in 0..8 {
822                let j64 = j as u64;
823                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
824                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
825            }
826        }
827
828        (left, right, result)
829    }
830
831    fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
832        let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
833        let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];
834
835        for i in 0..NUM_BYTES {
836            input[i] = (i % 256) as u8;
837            result[i] = if i == 0 { 0 } else { 1 };
838        }
839
840        (input, result)
841    }
842
    // The standard AES S-box (FIPS 197), indexed by input byte. Used as an
    // end-to-end reference: the S-box equals the GF(2^8) inverse followed by
    // a fixed affine transform, which is exactly what gf2p8affineinv computes.
    const AES_S_BOX: [u8; NUM_BYTES] = [
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
        0x16,
    ];
863
864    fn generate_byte_mul_test_data() -> (
865        [u8; NUM_TEST_ENTRIES],
866        [u8; NUM_TEST_ENTRIES],
867        [u8; NUM_TEST_ENTRIES],
868    ) {
869        let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
870        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
871        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
872
873        for i in 0..NUM_TEST_ENTRIES {
874            left[i] = (i % 256) as u8;
875            right[i] = left[i].wrapping_mul(101);
876            result[i] = mulbyte(left[i], right[i]);
877        }
878
879        (left, right, result)
880    }
881
    // Loads the `word_index`-th 16-byte word of `data`, viewed as raw bytes.
    // `black_box` prevents the compiler from seeing through the load.
    #[target_feature(enable = "sse2")]
    unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
        let pointer = data.as_ptr().byte_add(word_index * 16) as *const __m128i;
        _mm_loadu_si128(black_box(pointer))
    }
887
    // Loads the `word_index`-th 32-byte word of `data`, viewed as raw bytes.
    // `black_box` prevents the compiler from seeing through the load.
    #[target_feature(enable = "avx")]
    unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
        let pointer = data.as_ptr().byte_add(word_index * 32) as *const __m256i;
        _mm256_loadu_si256(black_box(pointer))
    }
893
    // Loads the `word_index`-th 64-byte word of `data`, viewed as raw bytes.
    // `black_box` prevents the compiler from seeing through the load.
    #[target_feature(enable = "avx512f")]
    unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
        let pointer = data.as_ptr().byte_add(word_index * 64) as *const __m512i;
        _mm512_loadu_si512(black_box(pointer))
    }
899
    // Unmasked 512-bit GF(2^8) multiply, checked against the scalar model.
    #[simd_test(enable = "gfni,avx512f")]
    unsafe fn test_mm512_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let expected = load_m512i_word(&expected, i);
            let result = _mm512_gf2p8mul_epi8(left, right);
            assert_eq_m512i(result, expected);
        }
    }
912
    // Zero-masked 512-bit GF(2^8) multiply: an all-zero mask yields zero, and
    // a partial mask matches an epi32 blend of zero with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            // mask_words selects the same lanes as mask_bytes: each blend bit
            // stands for one 32-bit word, i.e. four byte-mask bits.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
931
    // Merge-masked 512-bit GF(2^8) multiply: an all-zero mask passes through
    // `src`, and a partial mask matches an epi32 blend of src and the result.
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            // Each mask_words bit covers one 32-bit word = four mask_bytes bits.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
949
    // Unmasked 256-bit GF(2^8) multiply, checked against the scalar model.
    #[simd_test(enable = "gfni,avx")]
    unsafe fn test_mm256_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let expected = load_m256i_word(&expected, i);
            let result = _mm256_gf2p8mul_epi8(left, right);
            assert_eq_m256i(result, expected);
        }
    }
962
    // Zero-masked 256-bit GF(2^8) multiply, compared against an epi32 blend
    // of zero with the unmasked result over equivalent lane selections.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            // Each MASK_WORDS bit covers one 32-bit word = four mask_bytes bits.
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
981
    // Merge-masked 256-bit GF(2^8) multiply, compared against an epi32 blend
    // of `src` with the unmasked result over equivalent lane selections.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
999
    // Unmasked 128-bit GF(2^8) multiply, checked against the scalar model.
    #[simd_test(enable = "gfni")]
    unsafe fn test_mm_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let expected = load_m128i_word(&expected, i);
            let result = _mm_gf2p8mul_epi8(left, right);
            assert_eq_m128i(result, expected);
        }
    }
1012
    // Zero-masked 128-bit GF(2^8) multiply, compared against an epi32 blend
    // of zero with the unmasked result over equivalent lane selections.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            // Each MASK_WORDS bit covers one 32-bit word = four mask_bytes bits.
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1031
    // Merge-masked 128-bit GF(2^8) multiply, compared against an epi32 blend
    // of `src` with the unmasked result over equivalent lane selections.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1049
    // 512-bit affine transform: the identity matrix with b=0 must pass data
    // through, the zero matrix must yield the constant byte, and arbitrary
    // matrices must match the scalar reference implementation.
    #[simd_test(enable = "gfni,avx512f")]
    unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
        // Identity bit matrix: row i selects exactly input bit (7 - i).
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);
        let constant = _mm512_set1_epi64(constant);
        let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let data = load_m512i_word(&bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);
            let data = load_m512i_word(&more_bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);

            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let reference = load_m512i_word(&references, i);

            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }
    }
1083
    // Zero-masked 512-bit affine transform: zero mask yields zero; a partial
    // mask matches an epi32 blend of zero with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            // Each mask_words bit covers one 32-bit word = four mask_bytes bits.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1105
    // Merge-masked 512-bit affine transform: zero mask passes `src` through;
    // a partial mask matches an epi32 blend of src with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1126
    // 256-bit affine transform: identity matrix passes data through, zero
    // matrix yields the constant byte, and arbitrary matrices match the
    // scalar reference implementation.
    #[simd_test(enable = "gfni,avx")]
    unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
        // Identity bit matrix: row i selects exactly input bit (7 - i).
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);
        let constant = _mm256_set1_epi64x(constant);
        let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let data = load_m256i_word(&bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);
            let data = load_m256i_word(&more_bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);

            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let reference = load_m256i_word(&references, i);

            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }
    }
1160
    // Zero-masked 256-bit affine transform: zero mask yields zero; a partial
    // mask matches an epi32 blend of zero with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let result_zero =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            // Each MASK_WORDS bit covers one 32-bit word = four mask_bytes bits.
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
1182
    // Merge-masked 256-bit affine transform: zero mask passes `src` through;
    // a partial mask matches an epi32 blend of src with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            let result_left =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
1203
    // 128-bit affine transform: identity matrix passes data through, zero
    // matrix yields the constant byte, and arbitrary matrices match the
    // scalar reference implementation.
    #[simd_test(enable = "gfni")]
    unsafe fn test_mm_gf2p8affine_epi64_epi8() {
        // Identity bit matrix: row i selects exactly input bit (7 - i).
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);
        let constant = _mm_set1_epi64x(constant);
        let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let data = load_m128i_word(&bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);
            let data = load_m128i_word(&more_bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);

            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let reference = load_m128i_word(&references, i);

            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }
    }
1237
    // Zero-masked 128-bit affine transform: zero mask yields zero; a partial
    // mask matches an epi32 blend of zero with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            // Each MASK_WORDS bit covers one 32-bit word = four mask_bytes bits.
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1258
    // Merge-masked 128-bit affine transform: zero mask passes `src` through;
    // a partial mask matches an epi32 blend of src with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            let result_left =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1279
    // 512-bit inverse-affine transform, validated three ways: the computed
    // inverse must multiply back to 1 (or 0 for input 0), composing with an
    // affine transform must equal the fused intrinsic, and the AES S-box
    // matrix/constant must reproduce the S-box table exactly.
    #[simd_test(enable = "gfni,avx512f")]
    unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
        // Identity bit matrix: row i selects exactly input bit (7 - i).
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_512 {
            let input = load_m512i_word(&inputs, i);
            let reference = load_m512i_word(&results, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            // x * inv(x) should be 1 for every nonzero x, and 0 for x == 0.
            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
            assert_eq_m512i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let vector = load_m512i_word(&vectors, i);
            let matrix = load_m512i_word(&matrices, i);

            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_512 {
            let reference = load_m512i_word(&AES_S_BOX, i);
            let input = load_m512i_word(&inputs, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m512i(result, reference);
        }
    }
1323
    // Zero-masked 512-bit inverse-affine transform: zero mask yields zero; a
    // partial mask matches an epi32 blend of zero with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            // Each mask_words bit covers one 32-bit word = four mask_bytes bits.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1345
    // Merge-masked 512-bit inverse-affine transform: zero mask passes `src`
    // through; a partial mask matches an epi32 blend with the unmasked result.
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1367
    #[simd_test(enable = "gfni,avx")]
    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
        // 8x8 bit matrix encoding the identity transform (one bit set per
        // matrix row), so the affine step becomes a no-op and only the field
        // inversion performed by the intrinsic is exercised.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_256 {
            let input = load_m256i_word(&inputs, i);
            let reference = load_m256i_word(&results, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            // Multiply inv(x) back by x and compare with the precomputed
            // product table. NOTE(review): presumably 1 for nonzero bytes and
            // 0 for the zero byte — confirm in generate_inv_tests_data.
            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
            assert_eq_m256i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let vector = load_m256i_word(&vectors, i);
            let matrix = load_m256i_word(&matrices, i);

            // affineinv(x, M) must equal affine(inv(x), M): obtain inv(x) via
            // the identity matrix, then apply the affine transform separately.
            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        // (the AES S-box is field inversion followed by this affine matrix
        // with constant 0x63, which CONSTANT_BYTE supplies above).
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_256 {
            let reference = load_m256i_word(&AES_S_BOX, i);
            let input = load_m256i_word(&inputs, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m256i(result, reference);
        }
    }
1411
1412    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1413    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
1414        const CONSTANT_BYTE: i32 = 0x63;
1415        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1416
1417        for i in 0..NUM_TEST_WORDS_256 {
1418            let matrix = load_m256i_word(&matrices, i);
1419            let vector = load_m256i_word(&vectors, i);
1420            let result_zero =
1421                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1422            assert_eq_m256i(result_zero, _mm256_setzero_si256());
1423            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1424            const MASK_WORDS: i32 = 0b11_01_10_00;
1425            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1426            let result_masked =
1427                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1428            let expected_masked =
1429                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1430            assert_eq_m256i(result_masked, expected_masked);
1431        }
1432    }
1433
1434    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1435    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
1436        const CONSTANT_BYTE: i32 = 0x63;
1437        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1438
1439        for i in 0..NUM_TEST_WORDS_256 {
1440            let left = load_m256i_word(&vectors, i);
1441            let right = load_m256i_word(&matrices, i);
1442            let result_left =
1443                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1444            assert_eq_m256i(result_left, left);
1445            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1446            const MASK_WORDS: i32 = 0b11_01_10_00;
1447            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1448            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1449                left, mask_bytes, left, right,
1450            );
1451            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1452            assert_eq_m256i(result_masked, expected_masked);
1453        }
1454    }
1455
    #[simd_test(enable = "gfni")]
    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
        // 8x8 bit matrix encoding the identity transform (one bit set per
        // matrix row), so the affine step becomes a no-op and only the field
        // inversion performed by the intrinsic is exercised.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_128 {
            let input = load_m128i_word(&inputs, i);
            let reference = load_m128i_word(&results, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            // Multiply inv(x) back by x and compare with the precomputed
            // product table. NOTE(review): presumably 1 for nonzero bytes and
            // 0 for the zero byte — confirm in generate_inv_tests_data.
            let remultiplied = _mm_gf2p8mul_epi8(result, input);
            assert_eq_m128i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let vector = load_m128i_word(&vectors, i);
            let matrix = load_m128i_word(&matrices, i);

            // affineinv(x, M) must equal affine(inv(x), M): obtain inv(x) via
            // the identity matrix, then apply the affine transform separately.
            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        // (the AES S-box is field inversion followed by this affine matrix
        // with constant 0x63, which CONSTANT_BYTE supplies above).
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_128 {
            let reference = load_m128i_word(&AES_S_BOX, i);
            let input = load_m128i_word(&inputs, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m128i(result, reference);
        }
    }
1499
1500    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1501    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
1502        const CONSTANT_BYTE: i32 = 0x63;
1503        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1504
1505        for i in 0..NUM_TEST_WORDS_128 {
1506            let matrix = load_m128i_word(&matrices, i);
1507            let vector = load_m128i_word(&vectors, i);
1508            let result_zero =
1509                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1510            assert_eq_m128i(result_zero, _mm_setzero_si128());
1511            let mask_bytes: __mmask16 = 0x0F_F0;
1512            const MASK_WORDS: i32 = 0b01_10;
1513            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1514            let result_masked =
1515                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1516            let expected_masked =
1517                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1518            assert_eq_m128i(result_masked, expected_masked);
1519        }
1520    }
1521
1522    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1523    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
1524        const CONSTANT_BYTE: i32 = 0x63;
1525        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1526
1527        for i in 0..NUM_TEST_WORDS_128 {
1528            let left = load_m128i_word(&vectors, i);
1529            let right = load_m128i_word(&matrices, i);
1530            let result_left =
1531                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1532            assert_eq_m128i(result_left, left);
1533            let mask_bytes: __mmask16 = 0x0F_F0;
1534            const MASK_WORDS: i32 = 0b01_10;
1535            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1536            let result_masked =
1537                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1538            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1539            assert_eq_m128i(result_masked, expected_masked);
1540        }
1541    }
1542}