Skip to main content

core/stdarch/crates/core_arch/src/x86/
avx512bf16.rs

1//! [AVX512BF16 intrinsics].
2//!
3//! [AVX512BF16 intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769&avx512techs=AVX512_BF16
4
5use crate::core_arch::{simd::*, x86::*};
6use crate::intrinsics::simd::*;
7
8#[cfg(test)]
9use stdarch_test::assert_instr;
10
// Raw LLVM intrinsics backing the AVX512-BF16 wrappers in this module. At this boundary
// BF16 lanes travel as 16-bit integer vectors (`i16xN`); the public wrappers reinterpret
// them as the `__m*bh` types.
// NOTE(review): `improper_ctypes` is presumably silenced because the SIMD vector types in
// these signatures are not regarded as FFI-safe by the lint — confirm.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Two-source f32 -> BF16 conversions: two f32 vectors in, one BF16 vector with twice
    // as many lanes out (128-, 256- and 512-bit forms).
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.128"]
    fn cvtne2ps2bf16(a: f32x4, b: f32x4) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.256"]
    fn cvtne2ps2bf16_256(a: f32x8, b: f32x8) -> i16x16;
    #[link_name = "llvm.x86.avx512bf16.cvtne2ps2bf16.512"]
    fn cvtne2ps2bf16_512(a: f32x16, b: f32x16) -> i16x32;
    // Single-source f32 -> BF16 conversions. Only the 128-bit form is declared through the
    // masked LLVM intrinsic (`mask.cvtneps2bf16.128`), so it takes `src` and `k` directly;
    // the 256- and 512-bit forms are unmasked.
    #[link_name = "llvm.x86.avx512bf16.mask.cvtneps2bf16.128"]
    fn cvtneps2bf16_128(a: f32x4, src: i16x8, k: __mmask8) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.256"]
    fn cvtneps2bf16_256(a: f32x8) -> i16x8;
    #[link_name = "llvm.x86.avx512bf16.cvtneps2bf16.512"]
    fn cvtneps2bf16_512(a: f32x16) -> i16x16;
    // BF16 dot-product with f32 accumulation: `a` is the f32 accumulator, `b` and `c` are
    // the BF16 operand vectors.
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.128"]
    fn dpbf16ps(a: f32x4, b: i16x8, c: i16x8) -> f32x4;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.256"]
    fn dpbf16ps_256(a: f32x8, b: i16x16, c: i16x16) -> f32x8;
    #[link_name = "llvm.x86.avx512bf16.dpbf16ps.512"]
    fn dpbf16ps_512(a: f32x16, b: i16x32, c: i16x32) -> f32x16;
}
32
33/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors
34/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
35/// 128-bit wide vector.
36/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh)
37#[inline]
38#[target_feature(enable = "avx512bf16,avx512vl")]
39#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
40#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
41pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
42    unsafe { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) }
43}
44
45/// Convert packed single-precision (32-bit) floating-point elements in two vectors
46/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
47/// in single vector dst using writemask k (elements are copied from src when the
48/// corresponding mask bit is not set).
49/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh)
50#[inline]
51#[target_feature(enable = "avx512bf16,avx512vl")]
52#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
53#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
54pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
55    unsafe {
56        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
57        transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
58    }
59}
60
61/// Convert packed single-precision (32-bit) floating-point elements in two vectors
62/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
63/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
64/// mask bit is not set).
65/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh)
66#[inline]
67#[target_feature(enable = "avx512bf16,avx512vl")]
68#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
69#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
70pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
71    unsafe {
72        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
73        transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
74    }
75}
76
77/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
78/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
79/// 256-bit wide vector.
80/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh)
81#[inline]
82#[target_feature(enable = "avx512bf16,avx512vl")]
83#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
84#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
85pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
86    unsafe { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) }
87}
88
89/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
90/// to packed BF16 (16-bit) floating-point elements and store the results in single vector
91/// dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
92/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh)
93#[inline]
94#[target_feature(enable = "avx512bf16,avx512vl")]
95#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
96#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
97pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh {
98    unsafe {
99        let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
100        transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
101    }
102}
103
104/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
105/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector
106/// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh)
108#[inline]
109#[target_feature(enable = "avx512bf16,avx512vl")]
110#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
111#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
112pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
113    unsafe {
114        let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
115        transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
116    }
117}
118
119/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
120/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
121/// 512-bit wide vector.
122/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh)
123#[inline]
124#[target_feature(enable = "avx512bf16,avx512f")]
125#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
126#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
127pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
128    unsafe { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) }
129}
130
131/// Convert packed single-precision (32-bit) floating-point elements in two vectors
132/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
133/// in single vector dst using writemask k (elements are copied from src when the
134/// corresponding mask bit is not set).
135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh)
136#[inline]
137#[target_feature(enable = "avx512bf16,avx512f")]
138#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
139#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
140pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh {
141    unsafe {
142        let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
143        transmute(simd_select_bitmask(k, cvt, src.as_u16x32()))
144    }
145}
146
147/// Convert packed single-precision (32-bit) floating-point elements in two vectors
148/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
149/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
150/// mask bit is not set).
151/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh)
152#[inline]
153#[target_feature(enable = "avx512bf16,avx512f")]
154#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
155#[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
156pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
157    unsafe {
158        let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
159        transmute(simd_select_bitmask(k, cvt, u16x32::ZERO))
160    }
161}
162
163/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
164/// floating-point elements, and store the results in dst.
165/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh)
166#[inline]
167#[target_feature(enable = "avx512bf16,avx512vl")]
168#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
169#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
170pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
171    unsafe { transmute(cvtneps2bf16_256(a.as_f32x8())) }
172}
173
174/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
175/// floating-point elements, and store the results in dst using writemask k
176/// (elements are copied from src when the corresponding mask bit is not set).
177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh)
178#[inline]
179#[target_feature(enable = "avx512bf16,avx512vl")]
180#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
181#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
182pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
183    unsafe {
184        let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
185        transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
186    }
187}
188
189/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
190/// floating-point elements, and store the results in dst using zeromask k
191/// (elements are zeroed out when the corresponding mask bit is not set).
192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh)
193#[inline]
194#[target_feature(enable = "avx512bf16,avx512vl")]
195#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
196#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
197pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
198    unsafe {
199        let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
200        transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
201    }
202}
203
204/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
205/// floating-point elements, and store the results in dst.
206/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh)
207#[inline]
208#[target_feature(enable = "avx512bf16,avx512f")]
209#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
210#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
211pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
212    unsafe { transmute(cvtneps2bf16_512(a.as_f32x16())) }
213}
214
215/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
216/// floating-point elements, and store the results in dst using writemask k
217/// (elements are copied from src when the corresponding mask bit is not set).
218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh)
219#[inline]
220#[target_feature(enable = "avx512bf16,avx512f")]
221#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
222#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
223pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
224    unsafe {
225        let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
226        transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
227    }
228}
229
230/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
231/// floating-point elements, and store the results in dst using zeromask k
232/// (elements are zeroed out when the corresponding mask bit is not set).
233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh)
234#[inline]
235#[target_feature(enable = "avx512bf16,avx512f")]
236#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
237#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
238pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
239    unsafe {
240        let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
241        transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
242    }
243}
244
245/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
246/// accumulating the intermediate single-precision (32-bit) floating-point elements
247/// with elements in src, and store the results in dst.
248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps)
249#[inline]
250#[target_feature(enable = "avx512bf16,avx512vl")]
251#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
252#[cfg_attr(test, assert_instr("vdpbf16ps"))]
253pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
254    unsafe { transmute(dpbf16ps(src.as_f32x4(), a.as_i16x8(), b.as_i16x8())) }
255}
256
257/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
258/// accumulating the intermediate single-precision (32-bit) floating-point elements
259/// with elements in src, and store the results in dst using writemask k
260/// (elements are copied from src when the corresponding mask bit is not set).
261/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps)
262#[inline]
263#[target_feature(enable = "avx512bf16,avx512vl")]
264#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
265#[cfg_attr(test, assert_instr("vdpbf16ps"))]
266pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
267    unsafe {
268        let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
269        transmute(simd_select_bitmask(k, rst, src.as_f32x4()))
270    }
271}
272
273/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
274/// accumulating the intermediate single-precision (32-bit) floating-point elements
275/// with elements in src, and store the results in dst using zeromask k
276/// (elements are zeroed out when the corresponding mask bit is not set).
277/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps)
278#[inline]
279#[target_feature(enable = "avx512bf16,avx512vl")]
280#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
281#[cfg_attr(test, assert_instr("vdpbf16ps"))]
282pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
283    unsafe {
284        let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
285        let zero = _mm_set1_ps(0.0_f32).as_f32x4();
286        transmute(simd_select_bitmask(k, rst, zero))
287    }
288}
289
290/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
291/// accumulating the intermediate single-precision (32-bit) floating-point elements
292/// with elements in src, and store the results in dst.
293/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps)
294#[inline]
295#[target_feature(enable = "avx512bf16,avx512vl")]
296#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
297#[cfg_attr(test, assert_instr("vdpbf16ps"))]
298pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
299    unsafe { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i16x16(), b.as_i16x16())) }
300}
301
302/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
303/// accumulating the intermediate single-precision (32-bit) floating-point elements
304/// with elements in src, and store the results in dst using writemask k
305/// (elements are copied from src when the corresponding mask bit is not set).
306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps)
307#[inline]
308#[target_feature(enable = "avx512bf16,avx512vl")]
309#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
310#[cfg_attr(test, assert_instr("vdpbf16ps"))]
311pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
312    unsafe {
313        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
314        transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
315    }
316}
317
318/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
319/// accumulating the intermediate single-precision (32-bit) floating-point elements
320/// with elements in src, and store the results in dst using zeromask k
321/// (elements are zeroed out when the corresponding mask bit is not set).
322/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps)
323#[inline]
324#[target_feature(enable = "avx512bf16,avx512vl")]
325#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
326#[cfg_attr(test, assert_instr("vdpbf16ps"))]
327pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
328    unsafe {
329        let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
330        transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
331    }
332}
333
334/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
335/// accumulating the intermediate single-precision (32-bit) floating-point elements
336/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit)
337/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit)
338/// floating-point elements with elements in src, and store the results in dst.
339/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps)
340#[inline]
341#[target_feature(enable = "avx512bf16,avx512f")]
342#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
343#[cfg_attr(test, assert_instr("vdpbf16ps"))]
344pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
345    unsafe { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i16x32(), b.as_i16x32())) }
346}
347
348/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
349/// accumulating the intermediate single-precision (32-bit) floating-point elements
350/// with elements in src, and store the results in dst using writemask k
351/// (elements are copied from src when the corresponding mask bit is not set).
352/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps)
353#[inline]
354#[target_feature(enable = "avx512bf16,avx512f")]
355#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
356#[cfg_attr(test, assert_instr("vdpbf16ps"))]
357pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
358    unsafe {
359        let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
360        transmute(simd_select_bitmask(k, rst, src.as_f32x16()))
361    }
362}
363
364/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
365/// accumulating the intermediate single-precision (32-bit) floating-point elements
366/// with elements in src, and store the results in dst using zeromask k
367/// (elements are zeroed out when the corresponding mask bit is not set).
368/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps)
369#[inline]
370#[target_feature(enable = "avx512bf16,avx512f")]
371#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
372#[cfg_attr(test, assert_instr("vdpbf16ps"))]
373pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
374    unsafe {
375        let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
376        transmute(simd_select_bitmask(k, rst, f32x16::ZERO))
377    }
378}
379
380/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
381/// floating-point elements, and store the results in dst.
382///
383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpbh_ps)
384#[inline]
385#[target_feature(enable = "avx512bf16,avx512f")]
386#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
387pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 {
388    unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) }
389}
390
391/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
392/// floating-point elements, and store the results in dst using writemask k (elements are copied
393/// from src when the corresponding mask bit is not set).
394///
395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpbh_ps)
396#[inline]
397#[target_feature(enable = "avx512bf16,avx512f")]
398#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
399pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 {
400    unsafe {
401        let cvt = _mm512_cvtpbh_ps(a);
402        transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16()))
403    }
404}
405
406/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
407/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
408/// when the corresponding mask bit is not set).
409///
410/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpbh_ps)
411#[inline]
412#[target_feature(enable = "avx512bf16,avx512f")]
413#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
414pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
415    unsafe {
416        let cvt = _mm512_cvtpbh_ps(a);
417        transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO))
418    }
419}
420
421/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
422/// floating-point elements, and store the results in dst.
423///
424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpbh_ps)
425#[inline]
426#[target_feature(enable = "avx512bf16,avx512vl")]
427#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
428pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 {
429    unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) }
430}
431
432/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
433/// floating-point elements, and store the results in dst using writemask k (elements are copied
434/// from src when the corresponding mask bit is not set).
435///
436/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpbh_ps)
437#[inline]
438#[target_feature(enable = "avx512bf16,avx512vl")]
439#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
440pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 {
441    unsafe {
442        let cvt = _mm256_cvtpbh_ps(a);
443        transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8()))
444    }
445}
446
447/// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
448/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
449/// when the corresponding mask bit is not set).
450///
451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpbh_ps)
452#[inline]
453#[target_feature(enable = "avx512bf16,avx512vl")]
454#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
455pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
456    unsafe {
457        let cvt = _mm256_cvtpbh_ps(a);
458        transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO))
459    }
460}
461
462/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
463/// elements, and store the results in dst.
464///
465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpbh_ps)
466#[inline]
467#[target_feature(enable = "avx512bf16,avx512vl")]
468#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
469pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 {
470    unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) }
471}
472
473/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
474/// elements, and store the results in dst using writemask k (elements are copied from src when the corresponding
475/// mask bit is not set).
476///
477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpbh_ps)
478#[inline]
479#[target_feature(enable = "avx512bf16,avx512vl")]
480#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
481pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 {
482    unsafe {
483        let cvt = _mm_cvtpbh_ps(a);
484        transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4()))
485    }
486}
487
488/// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
489/// elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
490/// mask bit is not set).
491///
492/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpbh_ps)
493#[inline]
494#[target_feature(enable = "avx512bf16,avx512vl")]
495#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
496pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
497    unsafe {
498        let cvt = _mm_cvtpbh_ps(a);
499        transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO))
500    }
501}
502
503/// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
504/// element, and store the result in dst.
505///
506/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsbh_ss)
507#[inline]
508#[target_feature(enable = "avx512bf16,avx512f")]
509#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
510pub fn _mm_cvtsbh_ss(a: bf16) -> f32 {
511    f32::from_bits((a.to_bits() as u32) << 16)
512}
513
514/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
515/// floating-point elements, and store the results in dst.
516///
517/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
518#[inline]
519#[target_feature(enable = "avx512bf16,avx512vl")]
520#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
521#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
522pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh {
523    _mm_mask_cvtneps_pbh(__m128bh::splat(0), !0, a)
524}
525
526/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
527/// floating-point elements, and store the results in dst using writemask k (elements are copied
528/// from src when the corresponding mask bit is not set).
529///
530/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtneps_pbh)
531#[inline]
532#[target_feature(enable = "avx512bf16,avx512vl")]
533#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
534#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
535pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh {
536    unsafe { cvtneps2bf16_128(a.as_f32x4(), src.as_i16x8(), k).as_m128bh() }
537}
538
539/// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
540/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out
541/// when the corresponding mask bit is not set).
542///
543/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtneps_pbh)
544#[inline]
545#[target_feature(enable = "avx512bf16,avx512vl")]
546#[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
547#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
548pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh {
549    _mm_mask_cvtneps_pbh(__m128bh::splat(0), k, a)
550}
551
552/// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point
553/// element, and store the result in dst.
554///
555/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtness_sbh)
556#[inline]
557#[target_feature(enable = "avx512bf16,avx512vl")]
558#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
559pub fn _mm_cvtness_sbh(a: f32) -> bf16 {
560    unsafe {
561        let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0);
562        bf16::from_bits(value)
563    }
564}
565
566#[cfg(test)]
567mod tests {
568    use crate::core_arch::simd::{f32x4, f32x8, f32x16, u16x4, u16x8, u16x16, u16x32};
569    use crate::{
570        core_arch::x86::*,
571        mem::{transmute, transmute_copy},
572    };
573    use stdarch_test::simd_test;
574
575    #[simd_test(enable = "avx512bf16,avx512vl")]
576    fn test_mm_cvtne2ps_pbh() {
           // Converts two 4-lane f32 vectors with `_mm_cvtne2ps_pbh` and compares all eight
           // u16 lanes of the result against hand-written BF16 bit patterns.
577        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
578        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; // `a_array` negated
579        let a = f32x4::from_array(a_array).as_m128();
580        let b = f32x4::from_array(b_array).as_m128();
581        let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
582        let result = *c.as_u16x8().as_array();
           // Literals are grouped as sign_exponent_mantissa (1_8_7 bits).
           // Lanes 0..=3 have the sign bit set (conversions of `b`); lanes 4..=7 are the
           // conversions of `a`.
583        #[rustfmt::skip]
584        let expected_result: [u16; 8] = [
585            0b1_10000110_0110010,
586            0b1_10000010_0101000,
587            0b1_10000000_1110000,
588            0b1_10000100_1001001,
589            0b0_10000110_0110010,
590            0b0_10000010_0101000,
591            0b0_10000000_1110000,
592            0b0_10000100_1001001,
593        ];
594        assert_eq!(result, expected_result);
595    }
596
597    #[simd_test(enable = "avx512bf16,avx512vl")]
598    fn test_mm_mask_cvtne2ps_pbh() {
           // Exercises `_mm_mask_cvtne2ps_pbh` with an all-ones mask (every lane converted)
           // and an all-zero mask (every lane copied from `src`).
599        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
600        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; // `a_array` negated
           // Fallback lanes: the positive BF16 patterns of `a_array`, repeated twice.
           // NOTE(review): lanes 4..=7 of `src` equal the converted values of `a`, so only
           // lanes 0..=3 can distinguish a `src` lane from a freshly converted one.
601        #[rustfmt::skip]
602        let src_array: [u16; 8] = [
603            0b0_10000110_0110010,
604            0b0_10000010_0101000,
605            0b0_10000000_1110000,
606            0b0_10000100_1001001,
607            0b0_10000110_0110010,
608            0b0_10000010_0101000,
609            0b0_10000000_1110000,
610            0b0_10000100_1001001,
611        ];
612        let src = u16x8::from_array(src_array).as_m128bh();
613        let a = f32x4::from_array(a_array).as_m128();
614        let b = f32x4::from_array(b_array).as_m128();
615        let k: __mmask8 = 0b1111_1111; // all eight lanes selected
616        let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
617        let result = *c.as_u16x8().as_array();
           // Same layout as the unmasked test: `b` conversions in lanes 0..=3, `a` in 4..=7.
618        #[rustfmt::skip]
619        let expected_result: [u16; 8] = [
620            0b1_10000110_0110010,
621            0b1_10000010_0101000,
622            0b1_10000000_1110000,
623            0b1_10000100_1001001,
624            0b0_10000110_0110010,
625            0b0_10000010_0101000,
626            0b0_10000000_1110000,
627            0b0_10000100_1001001,
628        ];
629        assert_eq!(result, expected_result);
630        let k = 0b0000_0000; // no lane selected
631        let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
632        let result = *c.as_u16x8().as_array();
633        let expected_result = src_array; // result must be `src` unchanged
634        assert_eq!(result, expected_result);
635    }
636
637    #[simd_test(enable = "avx512bf16,avx512vl")]
638    fn test_mm_maskz_cvtne2ps_pbh() {
           // Exercises `_mm_maskz_cvtne2ps_pbh` with an all-ones mask and with a partial
           // mask whose unselected lanes are expected to be zero.
639        let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
640        let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; // `a_array` negated
641        let a = f32x4::from_array(a_array).as_m128();
642        let b = f32x4::from_array(b_array).as_m128();
643        let k: __mmask8 = 0b1111_1111; // all eight lanes selected
644        let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
645        let result = *c.as_u16x8().as_array();
           // Literals are grouped as sign_exponent_mantissa (1_8_7 bits); `b` conversions
           // occupy lanes 0..=3 and `a` conversions lanes 4..=7.
646        #[rustfmt::skip]
647        let expected_result: [u16; 8] = [
648            0b1_10000110_0110010,
649            0b1_10000010_0101000,
650            0b1_10000000_1110000,
651            0b1_10000100_1001001,
652            0b0_10000110_0110010,
653            0b0_10000010_0101000,
654            0b0_10000000_1110000,
655            0b0_10000100_1001001,
656        ];
657        assert_eq!(result, expected_result);
658        let k = 0b0011_1100; // only bits 2..=5 set
659        let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
660        let result = *c.as_u16x8().as_array();
           // Lanes 2..=5 keep their converted value; lanes 0, 1, 6 and 7 are zero.
661        #[rustfmt::skip]
662        let expected_result: [u16; 8] = [
663            0,
664            0,
665            0b1_10000000_1110000,
666            0b1_10000100_1001001,
667            0b0_10000110_0110010,
668            0b0_10000010_0101000,
669            0,
670            0,
671        ];
672        assert_eq!(result, expected_result);
673    }
674
675    #[simd_test(enable = "avx512bf16,avx512vl")]
676    fn test_mm256_cvtne2ps_pbh() {
           // Converts two 8-lane f32 vectors with `_mm256_cvtne2ps_pbh` and compares all
           // sixteen u16 lanes of the result against hand-written BF16 bit patterns.
           // 255.11, 1000.158 and 575.575 are not exactly representable in BF16; their
           // expected patterns below decode to 255.0, 1000.0 and 576.0 respectively.
677        #[rustfmt::skip]
678        let a_array = [
679            178.125_f32,
680            10.5_f32,
681            3.75_f32,
682            50.25_f32,
683            16.5_f32,
684            255.11_f32,
685            1000.158_f32,
686            575.575_f32,
687        ];
           // `a_array` negated, lane for lane.
688        let b_array = [
689            -178.125_f32,
690            -10.5_f32,
691            -3.75_f32,
692            -50.25_f32,
693            -16.5_f32,
694            -255.11_f32,
695            -1000.158_f32,
696            -575.575_f32,
697        ];
698        let a = f32x8::from_array(a_array).as_m256();
699        let b = f32x8::from_array(b_array).as_m256();
700        let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
701        let result = *c.as_u16x16().as_array();
           // Literals are grouped as sign_exponent_mantissa (1_8_7 bits).
           // Lanes 0..=7 have the sign bit set (conversions of `b`); lanes 8..=15 are the
           // conversions of `a`.
702        #[rustfmt::skip]
703        let expected_result: [u16; 16] = [
704            0b1_10000110_0110010,
705            0b1_10000010_0101000,
706            0b1_10000000_1110000,
707            0b1_10000100_1001001,
708            0b1_10000011_0000100,
709            0b1_10000110_1111111,
710            0b1_10001000_1111010,
711            0b1_10001000_0010000,
712            0b0_10000110_0110010,
713            0b0_10000010_0101000,
714            0b0_10000000_1110000,
715            0b0_10000100_1001001,
716            0b0_10000011_0000100,
717            0b0_10000110_1111111,
718            0b0_10001000_1111010,
719            0b0_10001000_0010000,
720        ];
721        assert_eq!(result, expected_result);
722    }
723
724    #[simd_test(enable = "avx512bf16,avx512vl")]
725    fn test_mm256_mask_cvtne2ps_pbh() {
           // Exercises `_mm256_mask_cvtne2ps_pbh` with an all-ones mask (every lane
           // converted) and an all-zero mask (every lane copied from `src`).
726        #[rustfmt::skip]
727        let a_array = [
728            178.125_f32,
729            10.5_f32,
730            3.75_f32,
731            50.25_f32,
732            16.5_f32,
733            255.11_f32,
734            1000.158_f32,
735            575.575_f32,
736        ];
           // `a_array` negated, lane for lane.
737        let b_array = [
738            -178.125_f32,
739            -10.5_f32,
740            -3.75_f32,
741            -50.25_f32,
742            -16.5_f32,
743            -255.11_f32,
744            -1000.158_f32,
745            -575.575_f32,
746        ];
           // Fallback lanes: the same four positive BF16 patterns repeated four times.
747        let src_array: [u16; 16] = [
748            0b0_10000110_0110010,
749            0b0_10000010_0101000,
750            0b0_10000000_1110000,
751            0b0_10000100_1001001,
752            0b0_10000110_0110010,
753            0b0_10000010_0101000,
754            0b0_10000000_1110000,
755            0b0_10000100_1001001,
756            0b0_10000110_0110010,
757            0b0_10000010_0101000,
758            0b0_10000000_1110000,
759            0b0_10000100_1001001,
760            0b0_10000110_0110010,
761            0b0_10000010_0101000,
762            0b0_10000000_1110000,
763            0b0_10000100_1001001,
764        ];
765        let src = u16x16::from_array(src_array).as_m256bh();
766        let a = f32x8::from_array(a_array).as_m256();
767        let b = f32x8::from_array(b_array).as_m256();
768        let k: __mmask16 = 0xffff; // all sixteen lanes selected
769        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
770        let result = *c.as_u16x16().as_array();
           // `b` conversions in lanes 0..=7 (sign bit set), `a` conversions in lanes 8..=15.
771        #[rustfmt::skip]
772        let expected_result: [u16; 16] = [
773            0b1_10000110_0110010,
774            0b1_10000010_0101000,
775            0b1_10000000_1110000,
776            0b1_10000100_1001001,
777            0b1_10000011_0000100,
778            0b1_10000110_1111111,
779            0b1_10001000_1111010,
780            0b1_10001000_0010000,
781            0b0_10000110_0110010,
782            0b0_10000010_0101000,
783            0b0_10000000_1110000,
784            0b0_10000100_1001001,
785            0b0_10000011_0000100,
786            0b0_10000110_1111111,
787            0b0_10001000_1111010,
788            0b0_10001000_0010000,
789        ];
790        assert_eq!(result, expected_result);
791        let k: __mmask16 = 0; // no lane selected
792        let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
793        let result = *c.as_u16x16().as_array();
794        let expected_result = src_array; // result must be `src` unchanged
795        assert_eq!(result, expected_result);
796    }
797
798    #[simd_test(enable = "avx512bf16,avx512vl")]
799    fn test_mm256_maskz_cvtne2ps_pbh() {
           // Exercises `_mm256_maskz_cvtne2ps_pbh` with an all-ones mask and with a partial
           // mask whose unselected lanes are expected to be zero.
800        #[rustfmt::skip]
801        let a_array = [
802            178.125_f32,
803            10.5_f32,
804            3.75_f32,
805            50.25_f32,
806            16.5_f32,
807            255.11_f32,
808            1000.158_f32,
809            575.575_f32,
810        ];
           // `a_array` negated, lane for lane.
811        let b_array = [
812            -178.125_f32,
813            -10.5_f32,
814            -3.75_f32,
815            -50.25_f32,
816            -16.5_f32,
817            -255.11_f32,
818            -1000.158_f32,
819            -575.575_f32,
820        ];
821        let a = f32x8::from_array(a_array).as_m256();
822        let b = f32x8::from_array(b_array).as_m256();
823        let k: __mmask16 = 0xffff; // all sixteen lanes selected
824        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
825        let result = *c.as_u16x16().as_array();
           // `b` conversions in lanes 0..=7 (sign bit set), `a` conversions in lanes 8..=15.
826        #[rustfmt::skip]
827        let expected_result: [u16; 16] = [
828            0b1_10000110_0110010,
829            0b1_10000010_0101000,
830            0b1_10000000_1110000,
831            0b1_10000100_1001001,
832            0b1_10000011_0000100,
833            0b1_10000110_1111111,
834            0b1_10001000_1111010,
835            0b1_10001000_0010000,
836            0b0_10000110_0110010,
837            0b0_10000010_0101000,
838            0b0_10000000_1110000,
839            0b0_10000100_1001001,
840            0b0_10000011_0000100,
841            0b0_10000110_1111111,
842            0b0_10001000_1111010,
843            0b0_10001000_0010000,
844        ];
845        assert_eq!(result, expected_result);
846        let k: __mmask16 = 0b0110_1100_0011_0110; // bits 1, 2, 4, 5, 10, 11, 13, 14 set
847        let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
848        let result = *c.as_u16x16().as_array();
           // Lane i keeps its converted value when bit i of `k` is set and is zero otherwise.
849        #[rustfmt::skip]
850        let expected_result: [u16; 16] = [
851            0,
852            0b1_10000010_0101000,
853            0b1_10000000_1110000,
854            0,
855            0b1_10000011_0000100,
856            0b1_10000110_1111111,
857            0,
858            0,
859            0,
860            0,
861            0b0_10000000_1110000,
862            0b0_10000100_1001001,
863            0,
864            0b0_10000110_1111111,
865            0b0_10001000_1111010,
866            0,
867        ];
868        assert_eq!(result, expected_result);
869    }
870
871    #[simd_test(enable = "avx512bf16,avx512f")]
872    fn test_mm512_cvtne2ps_pbh() {
           // Converts two 16-lane f32 vectors with `_mm512_cvtne2ps_pbh` and compares all
           // thirty-two u16 lanes of the result against hand-written BF16 bit patterns.
           // The input is the same eight values written out twice.
873        #[rustfmt::skip]
874        let a_array = [
875            178.125_f32,
876            10.5_f32,
877            3.75_f32,
878            50.25_f32,
879            16.5_f32,
880            255.11_f32,
881            1000.158_f32,
882            575.575_f32,
883            178.125_f32,
884            10.5_f32,
885            3.75_f32,
886            50.25_f32,
887            16.5_f32,
888            255.11_f32,
889            1000.158_f32,
890            575.575_f32,
891        ];
           // `a_array` negated, lane for lane.
892        let b_array = [
893            -178.125_f32,
894            -10.5_f32,
895            -3.75_f32,
896            -50.25_f32,
897            -16.5_f32,
898            -255.11_f32,
899            -1000.158_f32,
900            -575.575_f32,
901            -178.125_f32,
902            -10.5_f32,
903            -3.75_f32,
904            -50.25_f32,
905            -16.5_f32,
906            -255.11_f32,
907            -1000.158_f32,
908            -575.575_f32,
909        ];
910        let a = f32x16::from_array(a_array).as_m512();
911        let b = f32x16::from_array(b_array).as_m512();
912        let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
913        let result = *c.as_u16x32().as_array();
           // Literals are grouped as sign_exponent_mantissa (1_8_7 bits).
           // Lanes 0..=15 have the sign bit set (conversions of `b`); lanes 16..=31 are the
           // conversions of `a`.
914        #[rustfmt::skip]
915        let expected_result: [u16; 32] = [
916            0b1_10000110_0110010,
917            0b1_10000010_0101000,
918            0b1_10000000_1110000,
919            0b1_10000100_1001001,
920            0b1_10000011_0000100,
921            0b1_10000110_1111111,
922            0b1_10001000_1111010,
923            0b1_10001000_0010000,
924            0b1_10000110_0110010,
925            0b1_10000010_0101000,
926            0b1_10000000_1110000,
927            0b1_10000100_1001001,
928            0b1_10000011_0000100,
929            0b1_10000110_1111111,
930            0b1_10001000_1111010,
931            0b1_10001000_0010000,
932            0b0_10000110_0110010,
933            0b0_10000010_0101000,
934            0b0_10000000_1110000,
935            0b0_10000100_1001001,
936            0b0_10000011_0000100,
937            0b0_10000110_1111111,
938            0b0_10001000_1111010,
939            0b0_10001000_0010000,
940            0b0_10000110_0110010,
941            0b0_10000010_0101000,
942            0b0_10000000_1110000,
943            0b0_10000100_1001001,
944            0b0_10000011_0000100,
945            0b0_10000110_1111111,
946            0b0_10001000_1111010,
947            0b0_10001000_0010000,
948        ];
949        assert_eq!(result, expected_result);
950    }
951
952    #[simd_test(enable = "avx512bf16,avx512f")]
953    fn test_mm512_mask_cvtne2ps_pbh() {
           // Exercises `_mm512_mask_cvtne2ps_pbh` with an all-ones mask (every lane
           // converted) and an all-zero mask (every lane copied from `src`).
954        #[rustfmt::skip]
955        let a_array = [
956            178.125_f32,
957            10.5_f32,
958            3.75_f32,
959            50.25_f32,
960            16.5_f32,
961            255.11_f32,
962            1000.158_f32,
963            575.575_f32,
964            178.125_f32,
965            10.5_f32,
966            3.75_f32,
967            50.25_f32,
968            16.5_f32,
969            255.11_f32,
970            1000.158_f32,
971            575.575_f32,
972        ];
           // `a_array` negated, lane for lane.
973        let b_array = [
974            -178.125_f32,
975            -10.5_f32,
976            -3.75_f32,
977            -50.25_f32,
978            -16.5_f32,
979            -255.11_f32,
980            -1000.158_f32,
981            -575.575_f32,
982            -178.125_f32,
983            -10.5_f32,
984            -3.75_f32,
985            -50.25_f32,
986            -16.5_f32,
987            -255.11_f32,
988            -1000.158_f32,
989            -575.575_f32,
990        ];
           // Fallback lanes: the same four positive BF16 patterns repeated eight times.
991        let src_array: [u16; 32] = [
992            0b0_10000110_0110010,
993            0b0_10000010_0101000,
994            0b0_10000000_1110000,
995            0b0_10000100_1001001,
996            0b0_10000110_0110010,
997            0b0_10000010_0101000,
998            0b0_10000000_1110000,
999            0b0_10000100_1001001,
1000            0b0_10000110_0110010,
1001            0b0_10000010_0101000,
1002            0b0_10000000_1110000,
1003            0b0_10000100_1001001,
1004            0b0_10000110_0110010,
1005            0b0_10000010_0101000,
1006            0b0_10000000_1110000,
1007            0b0_10000100_1001001,
1008            0b0_10000110_0110010,
1009            0b0_10000010_0101000,
1010            0b0_10000000_1110000,
1011            0b0_10000100_1001001,
1012            0b0_10000110_0110010,
1013            0b0_10000010_0101000,
1014            0b0_10000000_1110000,
1015            0b0_10000100_1001001,
1016            0b0_10000110_0110010,
1017            0b0_10000010_0101000,
1018            0b0_10000000_1110000,
1019            0b0_10000100_1001001,
1020            0b0_10000110_0110010,
1021            0b0_10000010_0101000,
1022            0b0_10000000_1110000,
1023            0b0_10000100_1001001,
1024        ];
1025        let src = u16x32::from_array(src_array).as_m512bh();
1026        let a = f32x16::from_array(a_array).as_m512();
1027        let b = f32x16::from_array(b_array).as_m512();
1028        let k: __mmask32 = 0xffffffff; // all thirty-two lanes selected
1029        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
1030        let result = *c.as_u16x32().as_array();
            // `b` conversions in lanes 0..=15 (sign bit set), `a` conversions in lanes 16..=31.
1031        #[rustfmt::skip]
1032        let expected_result: [u16; 32] = [
1033            0b1_10000110_0110010,
1034            0b1_10000010_0101000,
1035            0b1_10000000_1110000,
1036            0b1_10000100_1001001,
1037            0b1_10000011_0000100,
1038            0b1_10000110_1111111,
1039            0b1_10001000_1111010,
1040            0b1_10001000_0010000,
1041            0b1_10000110_0110010,
1042            0b1_10000010_0101000,
1043            0b1_10000000_1110000,
1044            0b1_10000100_1001001,
1045            0b1_10000011_0000100,
1046            0b1_10000110_1111111,
1047            0b1_10001000_1111010,
1048            0b1_10001000_0010000,
1049            0b0_10000110_0110010,
1050            0b0_10000010_0101000,
1051            0b0_10000000_1110000,
1052            0b0_10000100_1001001,
1053            0b0_10000011_0000100,
1054            0b0_10000110_1111111,
1055            0b0_10001000_1111010,
1056            0b0_10001000_0010000,
1057            0b0_10000110_0110010,
1058            0b0_10000010_0101000,
1059            0b0_10000000_1110000,
1060            0b0_10000100_1001001,
1061            0b0_10000011_0000100,
1062            0b0_10000110_1111111,
1063            0b0_10001000_1111010,
1064            0b0_10001000_0010000,
1065        ];
1066        assert_eq!(result, expected_result);
1067        let k: __mmask32 = 0; // no lane selected
1068        let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
1069        let result = *c.as_u16x32().as_array();
1070        let expected_result = src_array; // result must be `src` unchanged
1071        assert_eq!(result, expected_result);
1072    }
1073
1074    #[simd_test(enable = "avx512bf16,avx512f")]
1075    fn test_mm512_maskz_cvtne2ps_pbh() {
            // Exercises `_mm512_maskz_cvtne2ps_pbh` with an all-ones mask and with a partial
            // mask whose unselected lanes are expected to be zero.
1076        #[rustfmt::skip]
1077        let a_array = [
1078            178.125_f32,
1079            10.5_f32,
1080            3.75_f32,
1081            50.25_f32,
1082            16.5_f32,
1083            255.11_f32,
1084            1000.158_f32,
1085            575.575_f32,
1086            178.125_f32,
1087            10.5_f32,
1088            3.75_f32,
1089            50.25_f32,
1090            16.5_f32,
1091            255.11_f32,
1092            1000.158_f32,
1093            575.575_f32,
1094        ];
            // `a_array` negated, lane for lane.
1095        let b_array = [
1096            -178.125_f32,
1097            -10.5_f32,
1098            -3.75_f32,
1099            -50.25_f32,
1100            -16.5_f32,
1101            -255.11_f32,
1102            -1000.158_f32,
1103            -575.575_f32,
1104            -178.125_f32,
1105            -10.5_f32,
1106            -3.75_f32,
1107            -50.25_f32,
1108            -16.5_f32,
1109            -255.11_f32,
1110            -1000.158_f32,
1111            -575.575_f32,
1112        ];
1113        let a = f32x16::from_array(a_array).as_m512();
1114        let b = f32x16::from_array(b_array).as_m512();
1115        let k: __mmask32 = 0xffffffff; // all thirty-two lanes selected
1116        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
1117        let result = *c.as_u16x32().as_array();
            // `b` conversions in lanes 0..=15 (sign bit set), `a` conversions in lanes 16..=31.
1118        #[rustfmt::skip]
1119        let expected_result: [u16; 32] = [
1120            0b1_10000110_0110010,
1121            0b1_10000010_0101000,
1122            0b1_10000000_1110000,
1123            0b1_10000100_1001001,
1124            0b1_10000011_0000100,
1125            0b1_10000110_1111111,
1126            0b1_10001000_1111010,
1127            0b1_10001000_0010000,
1128            0b1_10000110_0110010,
1129            0b1_10000010_0101000,
1130            0b1_10000000_1110000,
1131            0b1_10000100_1001001,
1132            0b1_10000011_0000100,
1133            0b1_10000110_1111111,
1134            0b1_10001000_1111010,
1135            0b1_10001000_0010000,
1136            0b0_10000110_0110010,
1137            0b0_10000010_0101000,
1138            0b0_10000000_1110000,
1139            0b0_10000100_1001001,
1140            0b0_10000011_0000100,
1141            0b0_10000110_1111111,
1142            0b0_10001000_1111010,
1143            0b0_10001000_0010000,
1144            0b0_10000110_0110010,
1145            0b0_10000010_0101000,
1146            0b0_10000000_1110000,
1147            0b0_10000100_1001001,
1148            0b0_10000011_0000100,
1149            0b0_10000110_1111111,
1150            0b0_10001000_1111010,
1151            0b0_10001000_0010000,
1152        ];
1153        assert_eq!(result, expected_result);
            // Mixed mask; the rightmost binary digit is bit 0 and controls lane 0.
1154        let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110;
1155        let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
1156        let result = *c.as_u16x32().as_array();
            // Lane i keeps its converted value when bit i of `k` is set and is zero otherwise.
1157        #[rustfmt::skip]
1158        let expected_result: [u16; 32] = [
1159            0,
1160            0b1_10000010_0101000,
1161            0b1_10000000_1110000,
1162            0,
1163            0b1_10000011_0000100,
1164            0,
1165            0b1_10001000_1111010,
1166            0,
1167            0b1_10000110_0110010,
1168            0b1_10000010_0101000,
1169            0,
1170            0,
1171            0,
1172            0b1_10000110_1111111,
1173            0,
1174            0b1_10001000_0010000,
1175            0,
1176            0b0_10000010_0101000,
1177            0b0_10000000_1110000,
1178            0,
1179            0b0_10000011_0000100,
1180            0,
1181            0,
1182            0b0_10001000_0010000,
1183            0,
1184            0b0_10000010_0101000,
1185            0,
1186            0b0_10000100_1001001,
1187            0,
1188            0,
1189            0b0_10001000_1111010,
1190            0b0_10001000_0010000,
1191        ];
1192        assert_eq!(result, expected_result);
1193    }
1194
1195    #[simd_test(enable = "avx512bf16,avx512vl")]
1196    fn test_mm256_cvtneps_pbh() {
            // Converts one 8-lane f32 vector with `_mm256_cvtneps_pbh` and compares the eight
            // u16 lanes of the 128-bit result against hand-written BF16 bit patterns.
1197        #[rustfmt::skip]
1198        let a_array = [
1199            178.125_f32,
1200            10.5_f32,
1201            3.75_f32,
1202            50.25_f32,
1203            16.5_f32,
1204            255.11_f32,
1205            1000.158_f32,
1206            575.575_f32,
1207        ];
1208        let a = f32x8::from_array(a_array).as_m256();
1209        let c: __m128bh = _mm256_cvtneps_pbh(a);
1210        let result = *c.as_u16x8().as_array();
            // Literals are grouped as sign_exponent_mantissa (1_8_7 bits); lane i is the
            // conversion of `a_array[i]`. The last three patterns decode to 255.0, 1000.0
            // and 576.0, since 255.11, 1000.158 and 575.575 are not exact in BF16.
1211        #[rustfmt::skip]
1212        let expected_result: [u16; 8] = [
1213            0b0_10000110_0110010,
1214            0b0_10000010_0101000,
1215            0b0_10000000_1110000,
1216            0b0_10000100_1001001,
1217            0b0_10000011_0000100,
1218            0b0_10000110_1111111,
1219            0b0_10001000_1111010,
1220            0b0_10001000_0010000,
1221        ];
1222        assert_eq!(result, expected_result);
1223    }
1224
1225    #[simd_test(enable = "avx512bf16,avx512vl")]
1226    fn test_mm256_mask_cvtneps_pbh() {
            // Exercises `_mm256_mask_cvtneps_pbh` with an all-ones mask (every lane converted)
            // and an all-zero mask (every lane copied from `src`).
1227        #[rustfmt::skip]
1228        let a_array = [
1229            178.125_f32,
1230            10.5_f32,
1231            3.75_f32,
1232            50.25_f32,
1233            16.5_f32,
1234            255.11_f32,
1235            1000.158_f32,
1236            575.575_f32,
1237        ];
            // Fallback lanes: the expected patterns with the sign bit set, so a lane taken
            // from `src` differs from a freshly converted lane.
1238        let src_array: [u16; 8] = [
1239            0b1_10000110_0110010,
1240            0b1_10000010_0101000,
1241            0b1_10000000_1110000,
1242            0b1_10000100_1001001,
1243            0b1_10000011_0000100,
1244            0b1_10000110_1111111,
1245            0b1_10001000_1111010,
1246            0b1_10001000_0010000,
1247        ];
1248        let src = u16x8::from_array(src_array).as_m128bh();
1249        let a = f32x8::from_array(a_array).as_m256();
1250        let k: __mmask8 = 0xff; // all eight lanes selected
1251        let b = _mm256_mask_cvtneps_pbh(src, k, a);
1252        let result = *b.as_u16x8().as_array();
            // Lane i is the conversion of `a_array[i]` (sign bit clear).
1253        #[rustfmt::skip]
1254        let expected_result: [u16; 8] = [
1255            0b0_10000110_0110010,
1256            0b0_10000010_0101000,
1257            0b0_10000000_1110000,
1258            0b0_10000100_1001001,
1259            0b0_10000011_0000100,
1260            0b0_10000110_1111111,
1261            0b0_10001000_1111010,
1262            0b0_10001000_0010000,
1263        ];
1264        assert_eq!(result, expected_result);
1265        let k: __mmask8 = 0x0; // no lane selected
1266        let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a);
1267        let result = *b.as_u16x8().as_array();
1268        let expected_result: [u16; 8] = src_array; // result must be `src` unchanged
1269        assert_eq!(result, expected_result);
1270    }
1271
1272    #[simd_test(enable = "avx512bf16,avx512vl")]
1273    fn test_mm256_maskz_cvtneps_pbh() {
            // Exercises `_mm256_maskz_cvtneps_pbh` with an all-ones mask and with a partial
            // mask whose unselected lanes are expected to be zero.
1274        #[rustfmt::skip]
1275        let a_array = [
1276            178.125_f32,
1277            10.5_f32,
1278            3.75_f32,
1279            50.25_f32,
1280            16.5_f32,
1281            255.11_f32,
1282            1000.158_f32,
1283            575.575_f32,
1284        ];
1285        let a = f32x8::from_array(a_array).as_m256();
1286        let k: __mmask8 = 0xff; // all eight lanes selected
1287        let b = _mm256_maskz_cvtneps_pbh(k, a);
1288        let result = *b.as_u16x8().as_array();
            // Lane i is the conversion of `a_array[i]`; literals are grouped as
            // sign_exponent_mantissa (1_8_7 bits).
1289        #[rustfmt::skip]
1290        let expected_result: [u16; 8] = [
1291            0b0_10000110_0110010,
1292            0b0_10000010_0101000,
1293            0b0_10000000_1110000,
1294            0b0_10000100_1001001,
1295            0b0_10000011_0000100,
1296            0b0_10000110_1111111,
1297            0b0_10001000_1111010,
1298            0b0_10001000_0010000,
1299        ];
1300        assert_eq!(result, expected_result);
1301        let k: __mmask8 = 0x6; // 0b0110: only bits 1 and 2 set
1302        let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a);
1303        let result = *b.as_u16x8().as_array();
            // Lanes 1 and 2 keep the conversions of 10.5 and 3.75; every other lane is zero.
1304        let expected_result: [u16; 8] =
1305            [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
1306        assert_eq!(result, expected_result);
1307    }
1308
1309    #[simd_test(enable = "avx512bf16,avx512f")]
1310    fn test_mm512_cvtneps_pbh() {
            // Converts one 16-lane f32 vector with `_mm512_cvtneps_pbh` and compares the
            // sixteen u16 lanes of the 256-bit result against hand-written BF16 bit patterns.
            // The input is the same eight values written out twice.
1311        #[rustfmt::skip]
1312        let a_array = [
1313            178.125_f32,
1314            10.5_f32,
1315            3.75_f32,
1316            50.25_f32,
1317            16.5_f32,
1318            255.11_f32,
1319            1000.158_f32,
1320            575.575_f32,
1321            178.125_f32,
1322            10.5_f32,
1323            3.75_f32,
1324            50.25_f32,
1325            16.5_f32,
1326            255.11_f32,
1327            1000.158_f32,
1328            575.575_f32,
1329        ];
1330        let a = f32x16::from_array(a_array).as_m512();
1331        let c: __m256bh = _mm512_cvtneps_pbh(a);
1332        let result = *c.as_u16x16().as_array();
            // Lane i is the conversion of `a_array[i]`; literals are grouped as
            // sign_exponent_mantissa (1_8_7 bits).
1333        #[rustfmt::skip]
1334        let expected_result: [u16; 16] = [
1335            0b0_10000110_0110010,
1336            0b0_10000010_0101000,
1337            0b0_10000000_1110000,
1338            0b0_10000100_1001001,
1339            0b0_10000011_0000100,
1340            0b0_10000110_1111111,
1341            0b0_10001000_1111010,
1342            0b0_10001000_0010000,
1343            0b0_10000110_0110010,
1344            0b0_10000010_0101000,
1345            0b0_10000000_1110000,
1346            0b0_10000100_1001001,
1347            0b0_10000011_0000100,
1348            0b0_10000110_1111111,
1349            0b0_10001000_1111010,
1350            0b0_10001000_0010000,
1351        ];
1352        assert_eq!(result, expected_result);
1353    }
1354
1355    #[simd_test(enable = "avx512bf16,avx512f")]
1356    fn test_mm512_mask_cvtneps_pbh() {
            // Exercises `_mm512_mask_cvtneps_pbh` with an all-ones mask (every lane converted)
            // and an all-zero mask (every lane copied from `src`).
1357        #[rustfmt::skip]
1358        let a_array = [
1359            178.125_f32,
1360            10.5_f32,
1361            3.75_f32,
1362            50.25_f32,
1363            16.5_f32,
1364            255.11_f32,
1365            1000.158_f32,
1366            575.575_f32,
1367            178.125_f32,
1368            10.5_f32,
1369            3.75_f32,
1370            50.25_f32,
1371            16.5_f32,
1372            255.11_f32,
1373            1000.158_f32,
1374            575.575_f32,
1375        ];
            // Fallback lanes: the expected patterns with the sign bit set, so a lane taken
            // from `src` differs from a freshly converted lane.
1376        let src_array: [u16; 16] = [
1377            0b1_10000110_0110010,
1378            0b1_10000010_0101000,
1379            0b1_10000000_1110000,
1380            0b1_10000100_1001001,
1381            0b1_10000011_0000100,
1382            0b1_10000110_1111111,
1383            0b1_10001000_1111010,
1384            0b1_10001000_0010000,
1385            0b1_10000110_0110010,
1386            0b1_10000010_0101000,
1387            0b1_10000000_1110000,
1388            0b1_10000100_1001001,
1389            0b1_10000011_0000100,
1390            0b1_10000110_1111111,
1391            0b1_10001000_1111010,
1392            0b1_10001000_0010000,
1393        ];
1394        let src = u16x16::from_array(src_array).as_m256bh();
1395        let a = f32x16::from_array(a_array).as_m512();
1396        let k: __mmask16 = 0xffff; // all sixteen lanes selected
1397        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1398        let result = *c.as_u16x16().as_array();
            // Lane i is the conversion of `a_array[i]` (sign bit clear).
1399        #[rustfmt::skip]
1400        let expected_result: [u16; 16] = [
1401            0b0_10000110_0110010,
1402            0b0_10000010_0101000,
1403            0b0_10000000_1110000,
1404            0b0_10000100_1001001,
1405            0b0_10000011_0000100,
1406            0b0_10000110_1111111,
1407            0b0_10001000_1111010,
1408            0b0_10001000_0010000,
1409            0b0_10000110_0110010,
1410            0b0_10000010_0101000,
1411            0b0_10000000_1110000,
1412            0b0_10000100_1001001,
1413            0b0_10000011_0000100,
1414            0b0_10000110_1111111,
1415            0b0_10001000_1111010,
1416            0b0_10001000_0010000,
1417        ];
1418        assert_eq!(result, expected_result);
1419        let k: __mmask16 = 0; // no lane selected
1420        let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
1421        let result = *c.as_u16x16().as_array();
1422        let expected_result = src_array; // result must be `src` unchanged
1423        assert_eq!(result, expected_result);
1424    }
1425
1426    #[simd_test(enable = "avx512bf16,avx512f")]
1427    fn test_mm512_maskz_cvtneps_pbh() {
            // Exercises `_mm512_maskz_cvtneps_pbh` with an all-ones mask and with a partial
            // mask whose unselected lanes are expected to be zero.
1428        #[rustfmt::skip]
1429        let a_array = [
1430            178.125_f32,
1431            10.5_f32,
1432            3.75_f32,
1433            50.25_f32,
1434            16.5_f32,
1435            255.11_f32,
1436            1000.158_f32,
1437            575.575_f32,
1438            178.125_f32,
1439            10.5_f32,
1440            3.75_f32,
1441            50.25_f32,
1442            16.5_f32,
1443            255.11_f32,
1444            1000.158_f32,
1445            575.575_f32,
1446        ];
1447        let a = f32x16::from_array(a_array).as_m512();
1448        let k: __mmask16 = 0xffff; // all sixteen lanes selected
1449        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1450        let result = *c.as_u16x16().as_array();
            // Lane i is the conversion of `a_array[i]`; literals are grouped as
            // sign_exponent_mantissa (1_8_7 bits).
1451        #[rustfmt::skip]
1452        let expected_result: [u16; 16] = [
1453            0b0_10000110_0110010,
1454            0b0_10000010_0101000,
1455            0b0_10000000_1110000,
1456            0b0_10000100_1001001,
1457            0b0_10000011_0000100,
1458            0b0_10000110_1111111,
1459            0b0_10001000_1111010,
1460            0b0_10001000_0010000,
1461            0b0_10000110_0110010,
1462            0b0_10000010_0101000,
1463            0b0_10000000_1110000,
1464            0b0_10000100_1001001,
1465            0b0_10000011_0000100,
1466            0b0_10000110_1111111,
1467            0b0_10001000_1111010,
1468            0b0_10001000_0010000,
1469        ];
1470        assert_eq!(result, expected_result);
1471        let k: __mmask16 = 0x653a; // 0b0110_0101_0011_1010: bits 1, 3, 4, 5, 8, 10, 13, 14 set
1472        let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
1473        let result = *c.as_u16x16().as_array();
            // Lane i keeps its converted value when bit i of `k` is set and is zero otherwise.
1474        #[rustfmt::skip]
1475        let expected_result: [u16; 16] = [
1476            0,
1477            0b0_10000010_0101000,
1478            0,
1479            0b0_10000100_1001001,
1480            0b0_10000011_0000100,
1481            0b0_10000110_1111111,
1482            0,
1483            0,
1484            0b0_10000110_0110010,
1485            0,
1486            0b0_10000000_1110000,
1487            0,
1488            0,
1489            0b0_10000110_1111111,
1490            0b0_10001000_1111010,
1491            0,
1492        ];
1493        assert_eq!(result, expected_result);
1494    }
1495
1496    #[simd_test(enable = "avx512bf16,avx512vl")]
1497    fn test_mm_dpbf16_ps() {
            // Checks `_mm_dpbf16_ps` on BF16 operands produced by `_mm_cvtne2ps_pbh`.
1498        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1499        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1500        let a1 = f32x4::from_array(a_array).as_m128();
1501        let b1 = f32x4::from_array(b_array).as_m128();
1502        let src = f32x4::from_array([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]).as_m128();
            // The same f32 vector is passed for both halves, so `a` holds the four values of
            // `a_array` twice and every lane of `b` is -1.0.
1503        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1504        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1505        let c: __m128 = _mm_dpbf16_ps(src, a, b);
1506        let result = *c.as_f32x4().as_array();
            // Each expected lane equals src[i] minus the sum of one adjacent pair of `a`
            // lanes, e.g. 1.0 - (8.5 + 10.5) = -18.0 and 2.0 - (3.75 + 50.25) = -52.0.
1507        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1508        assert_eq!(result, expected_result);
1509    }
1510
1511    #[simd_test(enable = "avx512bf16,avx512vl")]
1512    fn test_mm_mask_dpbf16_ps() {
            // Checks `_mm_mask_dpbf16_ps` with a partial, an all-ones and an all-zero mask;
            // lanes whose mask bit is clear are expected to keep the value from `src`.
1513        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1514        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1515        let a1 = f32x4::from_array(a_array).as_m128();
1516        let b1 = f32x4::from_array(b_array).as_m128();
1517        let k: __mmask8 = 0xf3; // low nibble 0b0011: bits 0 and 1 set, bits 2 and 3 clear
1518        let src = f32x4::from_array([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]).as_m128();
            // The same f32 vector is passed for both halves, so `a` holds the four values of
            // `a_array` twice and every lane of `b` is -1.0.
1519        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1520        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1521        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1522        let result = *c.as_f32x4().as_array();
            // Lanes 0 and 1 are computed; lanes 2 and 3 keep `src` (3.0 and 4.0).
1523        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
1524        assert_eq!(result, expected_result);
1525        let k: __mmask8 = 0xff; // every lane selected
1526        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1527        let result = *c.as_f32x4().as_array();
            // Each lane equals src[i] minus the sum of one adjacent pair of `a` lanes.
1528        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1529        assert_eq!(result, expected_result);
1530        let k: __mmask8 = 0; // no lane selected
1531        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
1532        let result = *c.as_f32x4().as_array();
            // Result must be `src` unchanged.
1533        let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
1534        assert_eq!(result, expected_result);
1535    }
1536
1537    #[simd_test(enable = "avx512bf16,avx512vl")]
1538    fn test_mm_maskz_dpbf16_ps() {
1539        let a_array = [8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
1540        let b_array = [-1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
1541        let a1 = f32x4::from_array(a_array).as_m128();
1542        let b1 = f32x4::from_array(b_array).as_m128();
1543        let k: __mmask8 = 0xf3;
1544        let src = f32x4::from_array([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]).as_m128();
1545        let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
1546        let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
1547        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1548        let result = *c.as_f32x4().as_array();
1549        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0];
1550        assert_eq!(result, expected_result);
1551        let k: __mmask8 = 0xff;
1552        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1553        let result = *c.as_f32x4().as_array();
1554        let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
1555        assert_eq!(result, expected_result);
1556        let k: __mmask8 = 0;
1557        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
1558        let result = *c.as_f32x4().as_array();
1559        let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0];
1560        assert_eq!(result, expected_result);
1561    }
1562
1563    #[simd_test(enable = "avx512bf16,avx512vl")]
1564    fn test_mm256_dpbf16_ps() {
1565        #[rustfmt::skip]
1566        let a_array = [
1567            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1568        ];
1569        let b_array = [
1570            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1571        ];
1572        let a1 = f32x8::from_array(a_array).as_m256();
1573        let b1 = f32x8::from_array(b_array).as_m256();
1574        #[rustfmt::skip]
1575        let src = f32x8::from_array([
1576            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1577        ]).as_m256();
1578        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1579        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1580        let c: __m256 = _mm256_dpbf16_ps(src, a, b);
1581        let result = *c.as_f32x8().as_array();
1582        #[rustfmt::skip]
1583        let expected_result: [f32; 8] = [
1584            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1585        ];
1586        assert_eq!(result, expected_result);
1587    }
1588
1589    #[simd_test(enable = "avx512bf16,avx512vl")]
1590    fn test_mm256_mask_dpbf16_ps() {
1591        #[rustfmt::skip]
1592        let a_array = [
1593            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1594        ];
1595        let b_array = [
1596            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1597        ];
1598        let a1 = f32x8::from_array(a_array).as_m256();
1599        let b1 = f32x8::from_array(b_array).as_m256();
1600        let k: __mmask8 = 0x33;
1601        #[rustfmt::skip]
1602        let src = f32x8::from_array([
1603            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1604        ]).as_m256();
1605        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1606        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1607        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1608        let result = *c.as_f32x8().as_array();
1609        #[rustfmt::skip]
1610        let expected_result: [f32; 8] = [
1611            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1612        ];
1613        assert_eq!(result, expected_result);
1614        let k: __mmask8 = 0xff;
1615        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1616        let result = *c.as_f32x8().as_array();
1617        #[rustfmt::skip]
1618        let expected_result: [f32; 8] = [
1619            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1620        ];
1621        assert_eq!(result, expected_result);
1622        let k: __mmask8 = 0;
1623        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
1624        let result = *c.as_f32x8().as_array();
1625        #[rustfmt::skip]
1626        let expected_result: [f32; 8] = [
1627            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1628        ];
1629        assert_eq!(result, expected_result);
1630    }
1631
1632    #[simd_test(enable = "avx512bf16,avx512vl")]
1633    fn test_mm256_maskz_dpbf16_ps() {
1634        #[rustfmt::skip]
1635        let a_array = [
1636            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1637        ];
1638        let b_array = [
1639            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1640        ];
1641        let a1 = f32x8::from_array(a_array).as_m256();
1642        let b1 = f32x8::from_array(b_array).as_m256();
1643        let k: __mmask8 = 0x33;
1644        #[rustfmt::skip]
1645        let src = f32x8::from_array([
1646            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1647        ]).as_m256();
1648        let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
1649        let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
1650        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1651        let result = *c.as_f32x8().as_array();
1652        #[rustfmt::skip]
1653        let expected_result: [f32; 8] = [
1654            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1655        ];
1656        assert_eq!(result, expected_result);
1657        let k: __mmask8 = 0xff;
1658        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1659        let result = *c.as_f32x8().as_array();
1660        #[rustfmt::skip]
1661        let expected_result: [f32; 8] = [
1662            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1663        ];
1664        assert_eq!(result, expected_result);
1665        let k: __mmask8 = 0;
1666        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
1667        let result = *c.as_f32x8().as_array();
1668        let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
1669        assert_eq!(result, expected_result);
1670    }
1671
1672    #[simd_test(enable = "avx512bf16,avx512f")]
1673    fn test_mm512_dpbf16_ps() {
1674        #[rustfmt::skip]
1675        let a_array = [
1676            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1677            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1678        ];
1679        let b_array = [
1680            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1681            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1682        ];
1683        let a1 = f32x16::from_array(a_array).as_m512();
1684        let b1 = f32x16::from_array(b_array).as_m512();
1685        let src = f32x16::from_array([
1686            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1687            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1688        ])
1689        .as_m512();
1690        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1691        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1692        let c: __m512 = _mm512_dpbf16_ps(src, a, b);
1693        let result = *c.as_f32x16().as_array();
1694        #[rustfmt::skip]
1695        let expected_result: [f32; 16] = [
1696            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1697            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1698        ];
1699        assert_eq!(result, expected_result);
1700    }
1701
1702    #[simd_test(enable = "avx512bf16,avx512f")]
1703    fn test_mm512_mask_dpbf16_ps() {
1704        #[rustfmt::skip]
1705        let a_array = [
1706            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1707            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1708        ];
1709        let b_array = [
1710            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1711            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1712        ];
1713        let a1 = f32x16::from_array(a_array).as_m512();
1714        let b1 = f32x16::from_array(b_array).as_m512();
1715        let k: __mmask16 = 0x3333;
1716        #[rustfmt::skip]
1717        let src = f32x16::from_array([
1718            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1719            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1720        ]).as_m512();
1721        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1722        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1723        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1724        let result = *c.as_f32x16().as_array();
1725        #[rustfmt::skip]
1726        let expected_result: [f32; 16] = [
1727            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1728            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
1729        ];
1730        assert_eq!(result, expected_result);
1731        let k: __mmask16 = 0xffff;
1732        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1733        let result = *c.as_f32x16().as_array();
1734        #[rustfmt::skip]
1735        let expected_result: [f32; 16] = [
1736            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1737            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1738        ];
1739        assert_eq!(result, expected_result);
1740        let k: __mmask16 = 0;
1741        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
1742        let result = *c.as_f32x16().as_array();
1743        #[rustfmt::skip]
1744        let expected_result: [f32; 16] = [
1745            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1746            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1747        ];
1748        assert_eq!(result, expected_result);
1749    }
1750
1751    #[simd_test(enable = "avx512bf16,avx512f")]
1752    fn test_mm512_maskz_dpbf16_ps() {
1753        #[rustfmt::skip]
1754        let a_array = [
1755            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1756            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
1757        ];
1758        let b_array = [
1759            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1760            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
1761        ];
1762        let a1 = f32x16::from_array(a_array).as_m512();
1763        let b1 = f32x16::from_array(b_array).as_m512();
1764        let k: __mmask16 = 0x3333;
1765        #[rustfmt::skip]
1766        let src = f32x16::from_array([
1767            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
1768            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
1769        ]).as_m512();
1770        let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
1771        let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
1772        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1773        let result = *c.as_f32x16().as_array();
1774        #[rustfmt::skip]
1775        let expected_result: [f32; 16] = [
1776            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32,
1777            0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
1778        ];
1779        assert_eq!(result, expected_result);
1780        let k: __mmask16 = 0xffff;
1781        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1782        let result = *c.as_f32x16().as_array();
1783        #[rustfmt::skip]
1784        let expected_result: [f32; 16] = [
1785            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1786            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
1787        ];
1788        assert_eq!(result, expected_result);
1789        let k: __mmask16 = 0;
1790        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
1791        let result = *c.as_f32x16().as_array();
1792        #[rustfmt::skip]
1793        let expected_result: [f32; 16] = [
1794            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1795        ];
1796        assert_eq!(result, expected_result);
1797    }
1798
1799    const BF16_ONE: u16 = 0b0_01111111_0000000;
1800    const BF16_TWO: u16 = 0b0_10000000_0000000;
1801    const BF16_THREE: u16 = 0b0_10000000_1000000;
1802    const BF16_FOUR: u16 = 0b0_10000001_0000000;
1803    const BF16_FIVE: u16 = 0b0_10000001_0100000;
1804    const BF16_SIX: u16 = 0b0_10000001_1000000;
1805    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
1806    const BF16_EIGHT: u16 = 0b0_10000010_0000000;
1807
1808    #[simd_test(enable = "avx512bf16")]
1809    fn test_mm512_cvtpbh_ps() {
1810        let a = __m256bh([
1811            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1812            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1813        ]);
1814        let r = _mm512_cvtpbh_ps(a);
1815        let e = _mm512_setr_ps(
1816            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
1817        );
1818        assert_eq_m512(r, e);
1819    }
1820
1821    #[simd_test(enable = "avx512bf16")]
1822    fn test_mm512_mask_cvtpbh_ps() {
1823        let a = __m256bh([
1824            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1825            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1826        ]);
1827        let src = _mm512_setr_ps(
1828            9., 10., 11., 12., 13., 14., 15., 16., 9., 10., 11., 12., 13., 14., 15., 16.,
1829        );
1830        let k = 0b1010_1010_1010_1010;
1831        let r = _mm512_mask_cvtpbh_ps(src, k, a);
1832        let e = _mm512_setr_ps(
1833            9., 2., 11., 4., 13., 6., 15., 8., 9., 2., 11., 4., 13., 6., 15., 8.,
1834        );
1835        assert_eq_m512(r, e);
1836    }
1837
1838    #[simd_test(enable = "avx512bf16")]
1839    fn test_mm512_maskz_cvtpbh_ps() {
1840        let a = __m256bh([
1841            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1842            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1843        ]);
1844        let k = 0b1010_1010_1010_1010;
1845        let r = _mm512_maskz_cvtpbh_ps(k, a);
1846        let e = _mm512_setr_ps(
1847            0., 2., 0., 4., 0., 6., 0., 8., 0., 2., 0., 4., 0., 6., 0., 8.,
1848        );
1849        assert_eq_m512(r, e);
1850    }
1851
1852    #[simd_test(enable = "avx512bf16,avx512vl")]
1853    fn test_mm256_cvtpbh_ps() {
1854        let a = __m128bh([
1855            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1856        ]);
1857        let r = _mm256_cvtpbh_ps(a);
1858        let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
1859        assert_eq_m256(r, e);
1860    }
1861
1862    #[simd_test(enable = "avx512bf16,avx512vl")]
1863    fn test_mm256_mask_cvtpbh_ps() {
1864        let a = __m128bh([
1865            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1866        ]);
1867        let src = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
1868        let k = 0b1010_1010;
1869        let r = _mm256_mask_cvtpbh_ps(src, k, a);
1870        let e = _mm256_setr_ps(9., 2., 11., 4., 13., 6., 15., 8.);
1871        assert_eq_m256(r, e);
1872    }
1873
1874    #[simd_test(enable = "avx512bf16,avx512vl")]
1875    fn test_mm256_maskz_cvtpbh_ps() {
1876        let a = __m128bh([
1877            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
1878        ]);
1879        let k = 0b1010_1010;
1880        let r = _mm256_maskz_cvtpbh_ps(k, a);
1881        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
1882        assert_eq_m256(r, e);
1883    }
1884
1885    #[simd_test(enable = "avx512bf16,avx512vl")]
1886    fn test_mm_cvtpbh_ps() {
1887        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1888        let r = _mm_cvtpbh_ps(a);
1889        let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1890        assert_eq_m128(r, e);
1891    }
1892
1893    #[simd_test(enable = "avx512bf16,avx512vl")]
1894    fn test_mm_mask_cvtpbh_ps() {
1895        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1896        let src = _mm_setr_ps(9., 10., 11., 12.);
1897        let k = 0b1010;
1898        let r = _mm_mask_cvtpbh_ps(src, k, a);
1899        let e = _mm_setr_ps(9., 2., 11., 4.);
1900        assert_eq_m128(r, e);
1901    }
1902
1903    #[simd_test(enable = "avx512bf16,avx512vl")]
1904    fn test_mm_maskz_cvtpbh_ps() {
1905        let a = __m128bh([BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, 0, 0, 0, 0]);
1906        let k = 0b1010;
1907        let r = _mm_maskz_cvtpbh_ps(k, a);
1908        let e = _mm_setr_ps(0., 2., 0., 4.);
1909        assert_eq_m128(r, e);
1910    }
1911
1912    #[simd_test(enable = "avx512bf16")]
1913    fn test_mm_cvtsbh_ss() {
1914        let r = _mm_cvtsbh_ss(bf16::from_bits(BF16_ONE));
1915        assert_eq!(r, 1.);
1916    }
1917
1918    #[simd_test(enable = "avx512bf16,avx512vl")]
1919    fn test_mm_cvtneps_pbh() {
1920        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1921        let r: u16x4 = unsafe { transmute_copy(&_mm_cvtneps_pbh(a)) };
1922        let e = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
1923        assert_eq!(r, e);
1924    }
1925
1926    #[simd_test(enable = "avx512bf16,avx512vl")]
1927    fn test_mm_mask_cvtneps_pbh() {
1928        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1929        let src = __m128bh([5, 6, 7, 8, !0, !0, !0, !0]);
1930        let k = 0b1010;
1931        let r: u16x4 = unsafe { transmute_copy(&_mm_mask_cvtneps_pbh(src, k, a)) };
1932        let e = u16x4::new(5, BF16_TWO, 7, BF16_FOUR);
1933        assert_eq!(r, e);
1934    }
1935
1936    #[simd_test(enable = "avx512bf16,avx512vl")]
1937    fn test_mm_maskz_cvtneps_pbh() {
1938        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1939        let k = 0b1010;
1940        let r: u16x4 = unsafe { transmute_copy(&_mm_maskz_cvtneps_pbh(k, a)) };
1941        let e = u16x4::new(0, BF16_TWO, 0, BF16_FOUR);
1942        assert_eq!(r, e);
1943    }
1944
1945    #[simd_test(enable = "avx512bf16,avx512vl")]
1946    fn test_mm_cvtness_sbh() {
1947        let r = _mm_cvtness_sbh(1.);
1948        assert_eq!(r.to_bits(), BF16_ONE);
1949    }
1950}