// core/stdarch/crates/core_arch/src/x86/gfni.rs

//! Galois Field New Instructions (GFNI)
//!
//! The intrinsics here correspond to those in the `immintrin.h` C header.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
10use crate::core_arch::simd::i8x16;
11use crate::core_arch::simd::i8x32;
12use crate::core_arch::simd::i8x64;
13use crate::core_arch::x86::__m128i;
14use crate::core_arch::x86::__m256i;
15use crate::core_arch::x86::__m512i;
16use crate::core_arch::x86::__mmask16;
17use crate::core_arch::x86::__mmask32;
18use crate::core_arch::x86::__mmask64;
19use crate::core_arch::x86::m128iExt;
20use crate::core_arch::x86::m256iExt;
21use crate::core_arch::x86::m512iExt;
22use crate::intrinsics::simd::simd_select_bitmask;
23use crate::mem::transmute;
24
25#[cfg(test)]
26use stdarch_test::assert_instr;
27
// Raw LLVM intrinsics backing the GFNI instructions. The numeric suffix on
// each name is the vector width in bits (128/256/512). `improper_ctypes` is
// allowed because SIMD vector types are not FFI-safe in the usual sense, but
// these are compiler intrinsics, not real C functions.
#[allow(improper_ctypes)]
extern "C" {
    // Affine transform of the GF(2^8) inverse: a * inv(x) + imm8 per byte.
    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    // Affine transform: a * x + imm8 per byte, a is an 8x8 bit matrix.
    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    // Byte-wise multiplication in GF(2^8), reduction poly x^8+x^4+x^3+x+1.
    #[link_name = "llvm.x86.vgf2p8mulb.512"]
    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8mulb.256"]
    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8mulb.128"]
    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
}
49
// LLVM requires AVX512BW for a lot of these instructions, see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
// however our tests also require the target feature list to match Intel's
// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F
// requirement (for now)
// also see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
// for forcing GFNI, BW and optionally VL extension
58
59/// Performs a multiplication in GF(2^8) on the packed bytes.
60/// The field is in polynomial representation with the reduction polynomial
61///  x^8 + x^4 + x^3 + x + 1.
62///
63/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
64#[inline]
65#[target_feature(enable = "gfni,avx512f")]
66#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
67#[cfg_attr(test, assert_instr(vgf2p8mulb))]
68pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
69    transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
70}
71
72/// Performs a multiplication in GF(2^8) on the packed bytes.
73/// The field is in polynomial representation with the reduction polynomial
74///  x^8 + x^4 + x^3 + x + 1.
75///
76/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
77/// Otherwise the computation result is written into the result.
78///
79/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
80#[inline]
81#[target_feature(enable = "gfni,avx512bw,avx512f")]
82#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
83#[cfg_attr(test, assert_instr(vgf2p8mulb))]
84pub unsafe fn _mm512_mask_gf2p8mul_epi8(
85    src: __m512i,
86    k: __mmask64,
87    a: __m512i,
88    b: __m512i,
89) -> __m512i {
90    transmute(simd_select_bitmask(
91        k,
92        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
93        src.as_i8x64(),
94    ))
95}
96
97/// Performs a multiplication in GF(2^8) on the packed bytes.
98/// The field is in polynomial representation with the reduction polynomial
99///  x^8 + x^4 + x^3 + x + 1.
100///
101/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
102/// Otherwise the computation result is written into the result.
103///
104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
105#[inline]
106#[target_feature(enable = "gfni,avx512bw,avx512f")]
107#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
108#[cfg_attr(test, assert_instr(vgf2p8mulb))]
109pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
110    let zero = i8x64::ZERO;
111    transmute(simd_select_bitmask(
112        k,
113        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
114        zero,
115    ))
116}
117
118/// Performs a multiplication in GF(2^8) on the packed bytes.
119/// The field is in polynomial representation with the reduction polynomial
120///  x^8 + x^4 + x^3 + x + 1.
121///
122/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
123#[inline]
124#[target_feature(enable = "gfni,avx")]
125#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
126#[cfg_attr(test, assert_instr(vgf2p8mulb))]
127pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
128    transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
129}
130
131/// Performs a multiplication in GF(2^8) on the packed bytes.
132/// The field is in polynomial representation with the reduction polynomial
133///  x^8 + x^4 + x^3 + x + 1.
134///
135/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
136/// Otherwise the computation result is written into the result.
137///
138/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
139#[inline]
140#[target_feature(enable = "gfni,avx512bw,avx512vl")]
141#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
142#[cfg_attr(test, assert_instr(vgf2p8mulb))]
143pub unsafe fn _mm256_mask_gf2p8mul_epi8(
144    src: __m256i,
145    k: __mmask32,
146    a: __m256i,
147    b: __m256i,
148) -> __m256i {
149    transmute(simd_select_bitmask(
150        k,
151        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
152        src.as_i8x32(),
153    ))
154}
155
156/// Performs a multiplication in GF(2^8) on the packed bytes.
157/// The field is in polynomial representation with the reduction polynomial
158///  x^8 + x^4 + x^3 + x + 1.
159///
160/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
161/// Otherwise the computation result is written into the result.
162///
163/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
164#[inline]
165#[target_feature(enable = "gfni,avx512bw,avx512vl")]
166#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
167#[cfg_attr(test, assert_instr(vgf2p8mulb))]
168pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
169    let zero = i8x32::ZERO;
170    transmute(simd_select_bitmask(
171        k,
172        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
173        zero,
174    ))
175}
176
177/// Performs a multiplication in GF(2^8) on the packed bytes.
178/// The field is in polynomial representation with the reduction polynomial
179///  x^8 + x^4 + x^3 + x + 1.
180///
181/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
182#[inline]
183#[target_feature(enable = "gfni")]
184#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
185#[cfg_attr(test, assert_instr(gf2p8mulb))]
186pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
187    transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
188}
189
190/// Performs a multiplication in GF(2^8) on the packed bytes.
191/// The field is in polynomial representation with the reduction polynomial
192///  x^8 + x^4 + x^3 + x + 1.
193///
194/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
195/// Otherwise the computation result is written into the result.
196///
197/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
198#[inline]
199#[target_feature(enable = "gfni,avx512bw,avx512vl")]
200#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
201#[cfg_attr(test, assert_instr(vgf2p8mulb))]
202pub unsafe fn _mm_mask_gf2p8mul_epi8(
203    src: __m128i,
204    k: __mmask16,
205    a: __m128i,
206    b: __m128i,
207) -> __m128i {
208    transmute(simd_select_bitmask(
209        k,
210        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
211        src.as_i8x16(),
212    ))
213}
214
215/// Performs a multiplication in GF(2^8) on the packed bytes.
216/// The field is in polynomial representation with the reduction polynomial
217///  x^8 + x^4 + x^3 + x + 1.
218///
219/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
220/// Otherwise the computation result is written into the result.
221///
222/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
223#[inline]
224#[target_feature(enable = "gfni,avx512bw,avx512vl")]
225#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
226#[cfg_attr(test, assert_instr(vgf2p8mulb))]
227pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
228    let zero = i8x16::ZERO;
229    transmute(simd_select_bitmask(
230        k,
231        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
232        zero,
233    ))
234}
235
236/// Performs an affine transformation on the packed bytes in x.
237/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
238/// and b being a constant 8-bit immediate value.
239/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
240///
241/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8)
242#[inline]
243#[target_feature(enable = "gfni,avx512f")]
244#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
245#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
246#[rustc_legacy_const_generics(2)]
247pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
248    static_assert_uimm_bits!(B, 8);
249    let b = B as u8;
250    let x = x.as_i8x64();
251    let a = a.as_i8x64();
252    let r = vgf2p8affineqb_512(x, a, b);
253    transmute(r)
254}
255
256/// Performs an affine transformation on the packed bytes in x.
257/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
258/// and b being a constant 8-bit immediate value.
259/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
260///
261/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
262/// Otherwise the computation result is written into the result.
263///
264/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8)
265#[inline]
266#[target_feature(enable = "gfni,avx512bw,avx512f")]
267#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
268#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
269#[rustc_legacy_const_generics(3)]
270pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
271    k: __mmask64,
272    x: __m512i,
273    a: __m512i,
274) -> __m512i {
275    static_assert_uimm_bits!(B, 8);
276    let b = B as u8;
277    let zero = i8x64::ZERO;
278    let x = x.as_i8x64();
279    let a = a.as_i8x64();
280    let r = vgf2p8affineqb_512(x, a, b);
281    transmute(simd_select_bitmask(k, r, zero))
282}
283
284/// Performs an affine transformation on the packed bytes in x.
285/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
286/// and b being a constant 8-bit immediate value.
287/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
288///
289/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
290/// Otherwise the computation result is written into the result.
291///
292/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8)
293#[inline]
294#[target_feature(enable = "gfni,avx512bw,avx512f")]
295#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
296#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
297#[rustc_legacy_const_generics(4)]
298pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
299    src: __m512i,
300    k: __mmask64,
301    x: __m512i,
302    a: __m512i,
303) -> __m512i {
304    static_assert_uimm_bits!(B, 8);
305    let b = B as u8;
306    let x = x.as_i8x64();
307    let a = a.as_i8x64();
308    let r = vgf2p8affineqb_512(x, a, b);
309    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
310}
311
312/// Performs an affine transformation on the packed bytes in x.
313/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
314/// and b being a constant 8-bit immediate value.
315/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
316///
317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8)
318#[inline]
319#[target_feature(enable = "gfni,avx")]
320#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
321#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
322#[rustc_legacy_const_generics(2)]
323pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
324    static_assert_uimm_bits!(B, 8);
325    let b = B as u8;
326    let x = x.as_i8x32();
327    let a = a.as_i8x32();
328    let r = vgf2p8affineqb_256(x, a, b);
329    transmute(r)
330}
331
332/// Performs an affine transformation on the packed bytes in x.
333/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
334/// and b being a constant 8-bit immediate value.
335/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
336///
337/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
338/// Otherwise the computation result is written into the result.
339///
340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8)
341#[inline]
342#[target_feature(enable = "gfni,avx512bw,avx512vl")]
343#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
344#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
345#[rustc_legacy_const_generics(3)]
346pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
347    k: __mmask32,
348    x: __m256i,
349    a: __m256i,
350) -> __m256i {
351    static_assert_uimm_bits!(B, 8);
352    let b = B as u8;
353    let zero = i8x32::ZERO;
354    let x = x.as_i8x32();
355    let a = a.as_i8x32();
356    let r = vgf2p8affineqb_256(x, a, b);
357    transmute(simd_select_bitmask(k, r, zero))
358}
359
360/// Performs an affine transformation on the packed bytes in x.
361/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
362/// and b being a constant 8-bit immediate value.
363/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
364///
365/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
366/// Otherwise the computation result is written into the result.
367///
368/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8)
369#[inline]
370#[target_feature(enable = "gfni,avx512bw,avx512vl")]
371#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
372#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
373#[rustc_legacy_const_generics(4)]
374pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
375    src: __m256i,
376    k: __mmask32,
377    x: __m256i,
378    a: __m256i,
379) -> __m256i {
380    static_assert_uimm_bits!(B, 8);
381    let b = B as u8;
382    let x = x.as_i8x32();
383    let a = a.as_i8x32();
384    let r = vgf2p8affineqb_256(x, a, b);
385    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
386}
387
388/// Performs an affine transformation on the packed bytes in x.
389/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
390/// and b being a constant 8-bit immediate value.
391/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
392///
393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8)
394#[inline]
395#[target_feature(enable = "gfni")]
396#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
397#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
398#[rustc_legacy_const_generics(2)]
399pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
400    static_assert_uimm_bits!(B, 8);
401    let b = B as u8;
402    let x = x.as_i8x16();
403    let a = a.as_i8x16();
404    let r = vgf2p8affineqb_128(x, a, b);
405    transmute(r)
406}
407
408/// Performs an affine transformation on the packed bytes in x.
409/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
410/// and b being a constant 8-bit immediate value.
411/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
412///
413/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
414/// Otherwise the computation result is written into the result.
415///
416/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8)
417#[inline]
418#[target_feature(enable = "gfni,avx512bw,avx512vl")]
419#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
420#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
421#[rustc_legacy_const_generics(3)]
422pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
423    k: __mmask16,
424    x: __m128i,
425    a: __m128i,
426) -> __m128i {
427    static_assert_uimm_bits!(B, 8);
428    let b = B as u8;
429    let zero = i8x16::ZERO;
430    let x = x.as_i8x16();
431    let a = a.as_i8x16();
432    let r = vgf2p8affineqb_128(x, a, b);
433    transmute(simd_select_bitmask(k, r, zero))
434}
435
436/// Performs an affine transformation on the packed bytes in x.
437/// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
438/// and b being a constant 8-bit immediate value.
439/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
440///
441/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
442/// Otherwise the computation result is written into the result.
443///
444/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8)
445#[inline]
446#[target_feature(enable = "gfni,avx512bw,avx512vl")]
447#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
448#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
449#[rustc_legacy_const_generics(4)]
450pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
451    src: __m128i,
452    k: __mmask16,
453    x: __m128i,
454    a: __m128i,
455) -> __m128i {
456    static_assert_uimm_bits!(B, 8);
457    let b = B as u8;
458    let x = x.as_i8x16();
459    let a = a.as_i8x16();
460    let r = vgf2p8affineqb_128(x, a, b);
461    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
462}
463
464/// Performs an affine transformation on the inverted packed bytes in x.
465/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
466/// and b being a constant 8-bit immediate value.
467/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
468/// The inverse of 0 is 0.
469/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
470///
471/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
472#[inline]
473#[target_feature(enable = "gfni,avx512f")]
474#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
475#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
476#[rustc_legacy_const_generics(2)]
477pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
478    static_assert_uimm_bits!(B, 8);
479    let b = B as u8;
480    let x = x.as_i8x64();
481    let a = a.as_i8x64();
482    let r = vgf2p8affineinvqb_512(x, a, b);
483    transmute(r)
484}
485
486/// Performs an affine transformation on the inverted packed bytes in x.
487/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
488/// and b being a constant 8-bit immediate value.
489/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
490/// The inverse of 0 is 0.
491/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
492///
493/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
494/// Otherwise the computation result is written into the result.
495///
496/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
497#[inline]
498#[target_feature(enable = "gfni,avx512bw,avx512f")]
499#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
500#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
501#[rustc_legacy_const_generics(3)]
502pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
503    k: __mmask64,
504    x: __m512i,
505    a: __m512i,
506) -> __m512i {
507    static_assert_uimm_bits!(B, 8);
508    let b = B as u8;
509    let zero = i8x64::ZERO;
510    let x = x.as_i8x64();
511    let a = a.as_i8x64();
512    let r = vgf2p8affineinvqb_512(x, a, b);
513    transmute(simd_select_bitmask(k, r, zero))
514}
515
516/// Performs an affine transformation on the inverted packed bytes in x.
517/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
518/// and b being a constant 8-bit immediate value.
519/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
520/// The inverse of 0 is 0.
521/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
522///
523/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
524/// Otherwise the computation result is written into the result.
525///
526/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
527#[inline]
528#[target_feature(enable = "gfni,avx512bw,avx512f")]
529#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
530#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
531#[rustc_legacy_const_generics(4)]
532pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
533    src: __m512i,
534    k: __mmask64,
535    x: __m512i,
536    a: __m512i,
537) -> __m512i {
538    static_assert_uimm_bits!(B, 8);
539    let b = B as u8;
540    let x = x.as_i8x64();
541    let a = a.as_i8x64();
542    let r = vgf2p8affineinvqb_512(x, a, b);
543    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
544}
545
546/// Performs an affine transformation on the inverted packed bytes in x.
547/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
548/// and b being a constant 8-bit immediate value.
549/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
550/// The inverse of 0 is 0.
551/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
552///
553/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
554#[inline]
555#[target_feature(enable = "gfni,avx")]
556#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
557#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
558#[rustc_legacy_const_generics(2)]
559pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
560    static_assert_uimm_bits!(B, 8);
561    let b = B as u8;
562    let x = x.as_i8x32();
563    let a = a.as_i8x32();
564    let r = vgf2p8affineinvqb_256(x, a, b);
565    transmute(r)
566}
567
568/// Performs an affine transformation on the inverted packed bytes in x.
569/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
570/// and b being a constant 8-bit immediate value.
571/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
572/// The inverse of 0 is 0.
573/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
574///
575/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
576/// Otherwise the computation result is written into the result.
577///
578/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
579#[inline]
580#[target_feature(enable = "gfni,avx512bw,avx512vl")]
581#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
582#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
583#[rustc_legacy_const_generics(3)]
584pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
585    k: __mmask32,
586    x: __m256i,
587    a: __m256i,
588) -> __m256i {
589    static_assert_uimm_bits!(B, 8);
590    let b = B as u8;
591    let zero = i8x32::ZERO;
592    let x = x.as_i8x32();
593    let a = a.as_i8x32();
594    let r = vgf2p8affineinvqb_256(x, a, b);
595    transmute(simd_select_bitmask(k, r, zero))
596}
597
598/// Performs an affine transformation on the inverted packed bytes in x.
599/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
600/// and b being a constant 8-bit immediate value.
601/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
602/// The inverse of 0 is 0.
603/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
604///
605/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
606/// Otherwise the computation result is written into the result.
607///
608/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
609#[inline]
610#[target_feature(enable = "gfni,avx512bw,avx512vl")]
611#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
612#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
613#[rustc_legacy_const_generics(4)]
614pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
615    src: __m256i,
616    k: __mmask32,
617    x: __m256i,
618    a: __m256i,
619) -> __m256i {
620    static_assert_uimm_bits!(B, 8);
621    let b = B as u8;
622    let x = x.as_i8x32();
623    let a = a.as_i8x32();
624    let r = vgf2p8affineinvqb_256(x, a, b);
625    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
626}
627
628/// Performs an affine transformation on the inverted packed bytes in x.
629/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
630/// and b being a constant 8-bit immediate value.
631/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
632/// The inverse of 0 is 0.
633/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
634///
635/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
636#[inline]
637#[target_feature(enable = "gfni")]
638#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
639#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))]
640#[rustc_legacy_const_generics(2)]
641pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
642    static_assert_uimm_bits!(B, 8);
643    let b = B as u8;
644    let x = x.as_i8x16();
645    let a = a.as_i8x16();
646    let r = vgf2p8affineinvqb_128(x, a, b);
647    transmute(r)
648}
649
650/// Performs an affine transformation on the inverted packed bytes in x.
651/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
652/// and b being a constant 8-bit immediate value.
653/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
654/// The inverse of 0 is 0.
655/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
656///
657/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
658/// Otherwise the computation result is written into the result.
659///
660/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
661#[inline]
662#[target_feature(enable = "gfni,avx512bw,avx512vl")]
663#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
664#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
665#[rustc_legacy_const_generics(3)]
666pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
667    k: __mmask16,
668    x: __m128i,
669    a: __m128i,
670) -> __m128i {
671    static_assert_uimm_bits!(B, 8);
672    let b = B as u8;
673    let zero = i8x16::ZERO;
674    let x = x.as_i8x16();
675    let a = a.as_i8x16();
676    let r = vgf2p8affineinvqb_128(x, a, b);
677    transmute(simd_select_bitmask(k, r, zero))
678}
679
680/// Performs an affine transformation on the inverted packed bytes in x.
681/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
682/// and b being a constant 8-bit immediate value.
683/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
684/// The inverse of 0 is 0.
685/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
686///
687/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
688/// Otherwise the computation result is written into the result.
689///
690/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
691#[inline]
692#[target_feature(enable = "gfni,avx512bw,avx512vl")]
693#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
694#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
695#[rustc_legacy_const_generics(4)]
696pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
697    src: __m128i,
698    k: __mmask16,
699    x: __m128i,
700    a: __m128i,
701) -> __m128i {
702    static_assert_uimm_bits!(B, 8);
703    let b = B as u8;
704    let x = x.as_i8x16();
705    let a = a.as_i8x16();
706    let r = vgf2p8affineinvqb_128(x, a, b);
707    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
708}
709
710#[cfg(test)]
711mod tests {
712    // The constants in the tests below are just bit patterns. They should not
713    // be interpreted as integers; signedness does not make sense for them, but
714    // __mXXXi happens to be defined in terms of signed integers.
715    #![allow(overflowing_literals)]
716
717    use core::hint::black_box;
718    use core::intrinsics::size_of;
719    use stdarch_test::simd_test;
720
721    use crate::core_arch::x86::*;
722
723    fn mulbyte(left: u8, right: u8) -> u8 {
724        // this implementation follows the description in
725        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
726        const REDUCTION_POLYNOMIAL: u16 = 0x11b;
727        let left: u16 = left.into();
728        let right: u16 = right.into();
729        let mut carryless_product: u16 = 0;
730
731        // Carryless multiplication
732        for i in 0..8 {
733            if ((left >> i) & 0x01) != 0 {
734                carryless_product ^= right << i;
735            }
736        }
737
738        // reduction, adding in "0" where appropriate to clear out high bits
739        // note that REDUCTION_POLYNOMIAL is zero in this context
740        for i in (8..=14).rev() {
741            if ((carryless_product >> i) & 0x01) != 0 {
742                carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
743            }
744        }
745
746        carryless_product as u8
747    }
748
    // Test-set sizing. Everything derives from NUM_TEST_WORDS_512 so that the
    // 512-, 256- and 128-bit tests all walk the same underlying data:
    // 4 x 512-bit = 8 x 256-bit = 16 x 128-bit = 32 x 64-bit words.
    const NUM_TEST_WORDS_512: usize = 4;
    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
    // Total number of byte entries in the multiply/affine arrays: 4 * 64 = 256.
    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
    // The inversion/S-box tests cover every possible byte value exactly once.
    const NUM_BYTES: usize = 256;
    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
758
759    fn parity(input: u8) -> u8 {
760        let mut accumulator = 0;
761        for i in 0..8 {
762            accumulator ^= (input >> i) & 0x01;
763        }
764        accumulator
765    }
766
767    fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
768        // this implementation follows the description in
769        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
770        let mut accumulator = 0;
771
772        for bit in 0..8 {
773            accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
774        }
775
776        accumulator ^ b
777    }
778
779    fn generate_affine_mul_test_data(
780        immediate: u8,
781    ) -> (
782        [u64; NUM_TEST_WORDS_64],
783        [u8; NUM_TEST_ENTRIES],
784        [u8; NUM_TEST_ENTRIES],
785    ) {
786        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
787        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
788        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
789
790        for i in 0..NUM_TEST_WORDS_64 {
791            left[i] = (i as u64) * 103 * 101;
792            for j in 0..8 {
793                let j64 = j as u64;
794                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
795                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
796            }
797        }
798
799        (left, right, result)
800    }
801
802    fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
803        let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
804        let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];
805
806        for i in 0..NUM_BYTES {
807            input[i] = (i % 256) as u8;
808            result[i] = if i == 0 { 0 } else { 1 };
809        }
810
811        (input, result)
812    }
813
    // The AES S-box, indexed by input byte. Used as an end-to-end reference:
    // the affineinv tests below reproduce it via the GF2P8AFFINEINVQB matrix
    // 0xF1_E3_C7_8F_1F_3E_7C_F8 and constant byte 0x63.
    const AES_S_BOX: [u8; NUM_BYTES] = [
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
        0x16,
    ];
834
835    fn generate_byte_mul_test_data() -> (
836        [u8; NUM_TEST_ENTRIES],
837        [u8; NUM_TEST_ENTRIES],
838        [u8; NUM_TEST_ENTRIES],
839    ) {
840        let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
841        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
842        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
843
844        for i in 0..NUM_TEST_ENTRIES {
845            left[i] = (i % 256) as u8;
846            right[i] = left[i].wrapping_mul(101);
847            result[i] = mulbyte(left[i], right[i]);
848        }
849
850        (left, right, result)
851    }
852
853    #[target_feature(enable = "sse2")]
854    #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
855    unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
856        let byte_offset = word_index * 16 / size_of::<T>();
857        let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
858        _mm_loadu_si128(black_box(pointer))
859    }
860
861    #[target_feature(enable = "avx")]
862    #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
863    unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
864        let byte_offset = word_index * 32 / size_of::<T>();
865        let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
866        _mm256_loadu_si256(black_box(pointer))
867    }
868
869    #[target_feature(enable = "avx512f")]
870    #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
871    unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
872        let byte_offset = word_index * 64 / size_of::<T>();
873        let pointer = data.as_ptr().add(byte_offset) as *const i32;
874        _mm512_loadu_si512(black_box(pointer))
875    }
876
    #[simd_test(enable = "gfni,avx512f")]
    unsafe fn test_mm512_gf2p8mul_epi8() {
        // Compare the 512-bit GF(2^8) multiply against the scalar `mulbyte` model.
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let expected = load_m512i_word(&expected, i);
            let result = _mm512_gf2p8mul_epi8(left, right);
            assert_eq_m512i(result, expected);
        }
    }
889
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
        // Zero-masking: masked-off bytes must be zeroed, selected bytes must
        // equal the unmasked result.
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            // An all-zero mask zeroes the whole result.
            let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            // The byte mask selects whole 32-bit lanes (each hex digit covers
            // 4 bytes), so the expectation can be built with a dword blend.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
908
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8mul_epi8() {
        // Merge-masking: masked-off bytes must be copied from src (here `left`).
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            // An all-zero mask returns src unchanged.
            let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
926
    #[simd_test(enable = "gfni,avx")]
    unsafe fn test_mm256_gf2p8mul_epi8() {
        // Compare the 256-bit GF(2^8) multiply against the scalar `mulbyte` model.
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let expected = load_m256i_word(&expected, i);
            let result = _mm256_gf2p8mul_epi8(left, right);
            assert_eq_m256i(result, expected);
        }
    }
939
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
        // Zero-masking for the 256-bit multiply.
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            // An all-zero mask zeroes the whole result.
            let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            // Byte mask aligned to 32-bit lanes (one hex digit per lane) so a
            // dword blend models it.
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
958
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8mul_epi8() {
        // Merge-masking for the 256-bit multiply: masked-off bytes keep src.
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            // An all-zero mask returns src unchanged.
            let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
976
    #[simd_test(enable = "gfni")]
    unsafe fn test_mm_gf2p8mul_epi8() {
        // Compare the 128-bit GF(2^8) multiply against the scalar `mulbyte` model.
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let expected = load_m128i_word(&expected, i);
            let result = _mm_gf2p8mul_epi8(left, right);
            assert_eq_m128i(result, expected);
        }
    }
989
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8mul_epi8() {
        // Zero-masking for the 128-bit multiply.
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            // An all-zero mask zeroes the whole result.
            let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            // Byte mask aligned to 32-bit lanes (one hex digit per lane) so a
            // dword blend models it.
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1008
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8mul_epi8() {
        // Merge-masking for the 128-bit multiply: masked-off bytes keep src.
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            // An all-zero mask returns src unchanged.
            let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1026
    #[simd_test(enable = "gfni,avx512f")]
    unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
        // Identity bit matrix (row k = byte k, little-endian) with immediate 0
        // must return the input unchanged; the all-zero matrix with immediate
        // 0x63 must produce 0x63 in every byte.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);
        let constant = _mm512_set1_epi64(constant);
        let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let data = load_m512i_word(&bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);
            let data = load_m512i_word(&more_bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);

            // General case: compare against the scalar affine model.
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let reference = load_m512i_word(&references, i);

            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }
    }
1060
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
        // Zero-masking for the 512-bit affine transform.
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            // An all-zero mask zeroes the whole result.
            let result_zero =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1082
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
        // Merge-masking for the 512-bit affine transform: masked-off bytes
        // keep src (here `left`).
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            // An all-zero mask returns src unchanged.
            let result_left =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1103
    #[simd_test(enable = "gfni,avx")]
    unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
        // Identity bit matrix with immediate 0 must return the input; the
        // zero matrix with immediate 0x63 must produce 0x63 everywhere.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);
        let constant = _mm256_set1_epi64x(constant);
        let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let data = load_m256i_word(&bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);
            let data = load_m256i_word(&more_bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);

            // General case: compare against the scalar affine model.
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let reference = load_m256i_word(&references, i);

            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }
    }
1137
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
        // Zero-masking for the 256-bit affine transform.
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            // An all-zero mask zeroes the whole result.
            let result_zero =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
1159
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
        // Merge-masking for the 256-bit affine transform: masked-off bytes
        // keep src (here `left`).
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            // An all-zero mask returns src unchanged.
            let result_left =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
1180
    #[simd_test(enable = "gfni")]
    unsafe fn test_mm_gf2p8affine_epi64_epi8() {
        // Identity bit matrix with immediate 0 must return the input; the
        // zero matrix with immediate 0x63 must produce 0x63 everywhere.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);
        let constant = _mm_set1_epi64x(constant);
        let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let data = load_m128i_word(&bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);
            let data = load_m128i_word(&more_bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);

            // General case: compare against the scalar affine model.
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let reference = load_m128i_word(&references, i);

            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }
    }
1214
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
        // Zero-masking for the 128-bit affine transform.
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            // An all-zero mask zeroes the whole result.
            let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1235
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
        // Merge-masking for the 128-bit affine transform: masked-off bytes
        // keep src (here `left`).
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            // An all-zero mask returns src unchanged.
            let result_left =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1256
    #[simd_test(enable = "gfni,avx512f")]
    unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);

        // validate inversion
        // With the identity matrix and immediate 0 the intrinsic computes
        // inv(x); x * inv(x) must be 1 for every byte except 0 (inv(0) == 0).
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_512 {
            let input = load_m512i_word(&inputs, i);
            let reference = load_m512i_word(&results, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
            assert_eq_m512i(remultiplied, reference);
        }

        // validate subsequent affine operation
        // affineinv(x, m, b) must equal affine(inv(x), m, b).
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let vector = load_m512i_word(&vectors, i);
            let matrix = load_m512i_word(&matrices, i);

            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        // (S-box matrix with constant 0x63 reproduces the AES S-box exactly).
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_512 {
            let reference = load_m512i_word(&AES_S_BOX, i);
            let input = load_m512i_word(&inputs, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m512i(result, reference);
        }
    }
1300
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
        // Zero-masking for the 512-bit affine-inverse transform.
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            // An all-zero mask zeroes the whole result.
            let result_zero =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1322
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
        // Merge-masking for the 512-bit affine-inverse transform: masked-off
        // bytes keep src (here `left`).
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            // An all-zero mask returns src unchanged.
            let result_left =
                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            // Byte mask aligned to 32-bit lanes so a dword blend models it.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1344
1345    #[simd_test(enable = "gfni,avx")]
1346    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
1347        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1348        const IDENTITY_BYTE: i32 = 0;
1349        const CONSTANT_BYTE: i32 = 0x63;
1350        let identity = _mm256_set1_epi64x(identity);
1351
1352        // validate inversion
1353        let (inputs, results) = generate_inv_tests_data();
1354
1355        for i in 0..NUM_BYTES_WORDS_256 {
1356            let input = load_m256i_word(&inputs, i);
1357            let reference = load_m256i_word(&results, i);
1358            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1359            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
1360            assert_eq_m256i(remultiplied, reference);
1361        }
1362
1363        // validate subsequent affine operation
1364        let (matrices, vectors, _affine_expected) =
1365            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1366
1367        for i in 0..NUM_TEST_WORDS_256 {
1368            let vector = load_m256i_word(&vectors, i);
1369            let matrix = load_m256i_word(&matrices, i);
1370
1371            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1372            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1373            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1374            assert_eq_m256i(result, reference);
1375        }
1376
1377        // validate everything by virtue of checking against the AES SBox
1378        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1379        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);
1380
1381        for i in 0..NUM_BYTES_WORDS_256 {
1382            let reference = load_m256i_word(&AES_S_BOX, i);
1383            let input = load_m256i_word(&inputs, i);
1384            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1385            assert_eq_m256i(result, reference);
1386        }
1387    }
1388
1389    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1390    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
1391        const CONSTANT_BYTE: i32 = 0x63;
1392        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1393
1394        for i in 0..NUM_TEST_WORDS_256 {
1395            let matrix = load_m256i_word(&matrices, i);
1396            let vector = load_m256i_word(&vectors, i);
1397            let result_zero =
1398                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1399            assert_eq_m256i(result_zero, _mm256_setzero_si256());
1400            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1401            const MASK_WORDS: i32 = 0b11_01_10_00;
1402            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1403            let result_masked =
1404                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1405            let expected_masked =
1406                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1407            assert_eq_m256i(result_masked, expected_masked);
1408        }
1409    }
1410
1411    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1412    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
1413        const CONSTANT_BYTE: i32 = 0x63;
1414        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1415
1416        for i in 0..NUM_TEST_WORDS_256 {
1417            let left = load_m256i_word(&vectors, i);
1418            let right = load_m256i_word(&matrices, i);
1419            let result_left =
1420                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1421            assert_eq_m256i(result_left, left);
1422            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1423            const MASK_WORDS: i32 = 0b11_01_10_00;
1424            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1425            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1426                left, mask_bytes, left, right,
1427            );
1428            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1429            assert_eq_m256i(result_masked, expected_masked);
1430        }
1431    }
1432
1433    #[simd_test(enable = "gfni")]
1434    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
1435        let identity: i64 = 0x01_02_04_08_10_20_40_80;
1436        const IDENTITY_BYTE: i32 = 0;
1437        const CONSTANT_BYTE: i32 = 0x63;
1438        let identity = _mm_set1_epi64x(identity);
1439
1440        // validate inversion
1441        let (inputs, results) = generate_inv_tests_data();
1442
1443        for i in 0..NUM_BYTES_WORDS_128 {
1444            let input = load_m128i_word(&inputs, i);
1445            let reference = load_m128i_word(&results, i);
1446            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1447            let remultiplied = _mm_gf2p8mul_epi8(result, input);
1448            assert_eq_m128i(remultiplied, reference);
1449        }
1450
1451        // validate subsequent affine operation
1452        let (matrices, vectors, _affine_expected) =
1453            generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1454
1455        for i in 0..NUM_TEST_WORDS_128 {
1456            let vector = load_m128i_word(&vectors, i);
1457            let matrix = load_m128i_word(&matrices, i);
1458
1459            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1460            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1461            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1462            assert_eq_m128i(result, reference);
1463        }
1464
1465        // validate everything by virtue of checking against the AES SBox
1466        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1467        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);
1468
1469        for i in 0..NUM_BYTES_WORDS_128 {
1470            let reference = load_m128i_word(&AES_S_BOX, i);
1471            let input = load_m128i_word(&inputs, i);
1472            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1473            assert_eq_m128i(result, reference);
1474        }
1475    }
1476
1477    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1478    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
1479        const CONSTANT_BYTE: i32 = 0x63;
1480        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1481
1482        for i in 0..NUM_TEST_WORDS_128 {
1483            let matrix = load_m128i_word(&matrices, i);
1484            let vector = load_m128i_word(&vectors, i);
1485            let result_zero =
1486                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1487            assert_eq_m128i(result_zero, _mm_setzero_si128());
1488            let mask_bytes: __mmask16 = 0x0F_F0;
1489            const MASK_WORDS: i32 = 0b01_10;
1490            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1491            let result_masked =
1492                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1493            let expected_masked =
1494                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1495            assert_eq_m128i(result_masked, expected_masked);
1496        }
1497    }
1498
1499    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1500    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
1501        const CONSTANT_BYTE: i32 = 0x63;
1502        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1503
1504        for i in 0..NUM_TEST_WORDS_128 {
1505            let left = load_m128i_word(&vectors, i);
1506            let right = load_m128i_word(&matrices, i);
1507            let result_left =
1508                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1509            assert_eq_m128i(result_left, left);
1510            let mask_bytes: __mmask16 = 0x0F_F0;
1511            const MASK_WORDS: i32 = 0b01_10;
1512            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1513            let result_masked =
1514                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1515            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1516            assert_eq_m128i(result_masked, expected_masked);
1517        }
1518    }
1519}